mirror of
https://github.com/google/flatbuffers.git
synced 2026-06-02 04:04:19 +00:00
Handle \u-escaped surrogate pairs correctly in IDL parser
This commit is contained in:
@@ -236,12 +236,19 @@ CheckedError Parser::Next() {
|
||||
if(!isdigit(static_cast<const unsigned char>(*cursor_))) return NoError();
|
||||
return Error("floating point constant can\'t start with \".\"");
|
||||
case '\"':
|
||||
case '\'':
|
||||
case '\'': {
|
||||
int unicode_high_surrogate = -1;
|
||||
|
||||
while (*cursor_ != c) {
|
||||
if (*cursor_ < ' ' && *cursor_ >= 0)
|
||||
return Error("illegal character in string constant");
|
||||
if (*cursor_ == '\\') {
|
||||
cursor_++;
|
||||
if (unicode_high_surrogate != -1 &&
|
||||
*cursor_ != 'u') {
|
||||
return Error(
|
||||
"illegal Unicode sequence (unpaired high surrogate)");
|
||||
}
|
||||
switch (*cursor_) {
|
||||
case 'n': attribute_ += '\n'; cursor_++; break;
|
||||
case 't': attribute_ += '\t'; cursor_++; break;
|
||||
@@ -263,18 +270,51 @@ CheckedError Parser::Next() {
|
||||
cursor_++;
|
||||
int64_t val;
|
||||
ECHECK(ParseHexNum(4, &val));
|
||||
ToUTF8(static_cast<int>(val), &attribute_);
|
||||
if (val >= 0xD800 && val <= 0xDBFF) {
|
||||
if (unicode_high_surrogate != -1) {
|
||||
return Error(
|
||||
"illegal Unicode sequence (multiple high surrogates)");
|
||||
} else {
|
||||
unicode_high_surrogate = val;
|
||||
}
|
||||
} else if (val >= 0xDC00 && val <= 0xDFFF) {
|
||||
if (unicode_high_surrogate == -1) {
|
||||
return Error(
|
||||
"illegal Unicode sequence (unpaired low surrogate)");
|
||||
} else {
|
||||
int code_point = 0x10000 +
|
||||
((unicode_high_surrogate & 0x03FF) << 10) +
|
||||
(val & 0x03FF);
|
||||
ToUTF8(code_point, &attribute_);
|
||||
unicode_high_surrogate = -1;
|
||||
}
|
||||
} else {
|
||||
if (unicode_high_surrogate != -1) {
|
||||
return Error(
|
||||
"illegal Unicode sequence (unpaired high surrogate)");
|
||||
}
|
||||
ToUTF8(static_cast<int>(val), &attribute_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: return Error("unknown escape code in string constant");
|
||||
}
|
||||
} else { // printable chars + UTF-8 bytes
|
||||
if (unicode_high_surrogate != -1) {
|
||||
return Error(
|
||||
"illegal Unicode sequence (unpaired high surrogate)");
|
||||
}
|
||||
attribute_ += *cursor_++;
|
||||
}
|
||||
}
|
||||
if (unicode_high_surrogate != -1) {
|
||||
return Error(
|
||||
"illegal Unicode sequence (unpaired high surrogate)");
|
||||
}
|
||||
cursor_++;
|
||||
token_ = kTokenStringConstant;
|
||||
return NoError();
|
||||
}
|
||||
case '/':
|
||||
if (*cursor_ == '/') {
|
||||
const char *start = ++cursor_;
|
||||
|
||||
@@ -859,6 +859,44 @@ void UnicodeTest() {
|
||||
"\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
|
||||
}
|
||||
|
||||
void UnicodeSurrogatesTest() {
|
||||
flatbuffers::Parser parser;
|
||||
|
||||
TEST_EQ(
|
||||
parser.Parse(
|
||||
"table T { F:string (id: 0); }"
|
||||
"root_type T;"
|
||||
"{ F:\"\\uD83D\\uDCA9\"}"), true);
|
||||
auto root = flatbuffers::GetRoot<flatbuffers::Table>(
|
||||
parser.builder_.GetBufferPointer());
|
||||
auto string = root->GetPointer<flatbuffers::String *>(
|
||||
flatbuffers::FieldIndexToOffset(0));
|
||||
TEST_EQ(strcmp(string->c_str(), "\xF0\x9F\x92\xA9"), 0);
|
||||
}
|
||||
|
||||
// Runs a series of malformed \u surrogate sequences through TestError, pairing
// each input with the message fragment passed as its second argument.
// NOTE(review): TestError is defined outside this view; presumably it asserts
// that parsing fails with an error containing the given text - confirm.
void UnicodeInvalidSurrogatesTest() {
  struct BadSurrogateCase {
    const char *source;          // schema + root type + JSON payload
    const char *expected_error;  // text handed to TestError alongside it
  };

  static const BadSurrogateCase kCases[] = {
    // High surrogate immediately followed by the closing quote.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\"}",
      "unpaired high surrogate" },
    // High surrogate followed by ordinary characters.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800abcd\"}",
      "unpaired high surrogate" },
    // High surrogate followed by a non-\u escape.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\\n\"}",
      "unpaired high surrogate" },
    // Two high surrogates in a row.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\\uD800\"}",
      "multiple high surrogates" },
    // Low surrogate with no preceding high surrogate.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uDC00\"}",
      "unpaired low surrogate" },
  };

  const int case_count = static_cast<int>(sizeof(kCases) / sizeof(kCases[0]));
  for (int i = 0; i < case_count; ++i) {
    TestError(kCases[i].source, kCases[i].expected_error);
  }
}
|
||||
|
||||
void UnknownFieldsTest() {
|
||||
flatbuffers::IDLOptions opts;
|
||||
opts.skip_unexpected_fields_in_json = true;
|
||||
@@ -907,6 +945,8 @@ int main(int /*argc*/, const char * /*argv*/[]) {
|
||||
EnumStringsTest();
|
||||
IntegerOutOfRangeTest();
|
||||
UnicodeTest();
|
||||
UnicodeSurrogatesTest();
|
||||
UnicodeInvalidSurrogatesTest();
|
||||
UnknownFieldsTest();
|
||||
|
||||
if (!testing_fails) {
|
||||
|
||||
Reference in New Issue
Block a user