mirror of
https://github.com/google/flatbuffers.git
synced 2026-06-21 16:05:44 +00:00
Merge pull request #3865 from bhamiltoncx/surrogate-pairs
Handle \u-escaped surrogate pairs correctly in IDL parser
This commit is contained in:
@@ -236,12 +236,19 @@ CheckedError Parser::Next() {
|
|||||||
if(!isdigit(static_cast<const unsigned char>(*cursor_))) return NoError();
|
if(!isdigit(static_cast<const unsigned char>(*cursor_))) return NoError();
|
||||||
return Error("floating point constant can\'t start with \".\"");
|
return Error("floating point constant can\'t start with \".\"");
|
||||||
case '\"':
|
case '\"':
|
||||||
case '\'':
|
case '\'': {
|
||||||
|
int unicode_high_surrogate = -1;
|
||||||
|
|
||||||
while (*cursor_ != c) {
|
while (*cursor_ != c) {
|
||||||
if (*cursor_ < ' ' && *cursor_ >= 0)
|
if (*cursor_ < ' ' && *cursor_ >= 0)
|
||||||
return Error("illegal character in string constant");
|
return Error("illegal character in string constant");
|
||||||
if (*cursor_ == '\\') {
|
if (*cursor_ == '\\') {
|
||||||
cursor_++;
|
cursor_++;
|
||||||
|
if (unicode_high_surrogate != -1 &&
|
||||||
|
*cursor_ != 'u') {
|
||||||
|
return Error(
|
||||||
|
"illegal Unicode sequence (unpaired high surrogate)");
|
||||||
|
}
|
||||||
switch (*cursor_) {
|
switch (*cursor_) {
|
||||||
case 'n': attribute_ += '\n'; cursor_++; break;
|
case 'n': attribute_ += '\n'; cursor_++; break;
|
||||||
case 't': attribute_ += '\t'; cursor_++; break;
|
case 't': attribute_ += '\t'; cursor_++; break;
|
||||||
@@ -263,18 +270,51 @@ CheckedError Parser::Next() {
|
|||||||
cursor_++;
|
cursor_++;
|
||||||
int64_t val;
|
int64_t val;
|
||||||
ECHECK(ParseHexNum(4, &val));
|
ECHECK(ParseHexNum(4, &val));
|
||||||
ToUTF8(static_cast<int>(val), &attribute_);
|
if (val >= 0xD800 && val <= 0xDBFF) {
|
||||||
|
if (unicode_high_surrogate != -1) {
|
||||||
|
return Error(
|
||||||
|
"illegal Unicode sequence (multiple high surrogates)");
|
||||||
|
} else {
|
||||||
|
unicode_high_surrogate = val;
|
||||||
|
}
|
||||||
|
} else if (val >= 0xDC00 && val <= 0xDFFF) {
|
||||||
|
if (unicode_high_surrogate == -1) {
|
||||||
|
return Error(
|
||||||
|
"illegal Unicode sequence (unpaired low surrogate)");
|
||||||
|
} else {
|
||||||
|
int code_point = 0x10000 +
|
||||||
|
((unicode_high_surrogate & 0x03FF) << 10) +
|
||||||
|
(val & 0x03FF);
|
||||||
|
ToUTF8(code_point, &attribute_);
|
||||||
|
unicode_high_surrogate = -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (unicode_high_surrogate != -1) {
|
||||||
|
return Error(
|
||||||
|
"illegal Unicode sequence (unpaired high surrogate)");
|
||||||
|
}
|
||||||
|
ToUTF8(static_cast<int>(val), &attribute_);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: return Error("unknown escape code in string constant");
|
default: return Error("unknown escape code in string constant");
|
||||||
}
|
}
|
||||||
} else { // printable chars + UTF-8 bytes
|
} else { // printable chars + UTF-8 bytes
|
||||||
|
if (unicode_high_surrogate != -1) {
|
||||||
|
return Error(
|
||||||
|
"illegal Unicode sequence (unpaired high surrogate)");
|
||||||
|
}
|
||||||
attribute_ += *cursor_++;
|
attribute_ += *cursor_++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (unicode_high_surrogate != -1) {
|
||||||
|
return Error(
|
||||||
|
"illegal Unicode sequence (unpaired high surrogate)");
|
||||||
|
}
|
||||||
cursor_++;
|
cursor_++;
|
||||||
token_ = kTokenStringConstant;
|
token_ = kTokenStringConstant;
|
||||||
return NoError();
|
return NoError();
|
||||||
|
}
|
||||||
case '/':
|
case '/':
|
||||||
if (*cursor_ == '/') {
|
if (*cursor_ == '/') {
|
||||||
const char *start = ++cursor_;
|
const char *start = ++cursor_;
|
||||||
|
|||||||
@@ -859,6 +859,44 @@ void UnicodeTest() {
|
|||||||
"\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
|
"\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void UnicodeSurrogatesTest() {
|
||||||
|
flatbuffers::Parser parser;
|
||||||
|
|
||||||
|
TEST_EQ(
|
||||||
|
parser.Parse(
|
||||||
|
"table T { F:string (id: 0); }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\\uD83D\\uDCA9\"}"), true);
|
||||||
|
auto root = flatbuffers::GetRoot<flatbuffers::Table>(
|
||||||
|
parser.builder_.GetBufferPointer());
|
||||||
|
auto string = root->GetPointer<flatbuffers::String *>(
|
||||||
|
flatbuffers::FieldIndexToOffset(0));
|
||||||
|
TEST_EQ(strcmp(string->c_str(), "\xF0\x9F\x92\xA9"), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Feeds the parser JSON strings containing malformed \u surrogate sequences
// and checks that each one is rejected with the expected error text.
void UnicodeInvalidSurrogatesTest() {
  struct BadInput {
    const char *source;          // schema + JSON handed to TestError
    const char *expected_error;  // text the reported error must contain
  };

  static const BadInput kBadInputs[] = {
    // High surrogate immediately followed by the closing quote.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\"}",
      "unpaired high surrogate" },
    // High surrogate followed by ordinary characters.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800abcd\"}",
      "unpaired high surrogate" },
    // High surrogate followed by an escape other than \u.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\\n\"}",
      "unpaired high surrogate" },
    // Two high surrogates in a row.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\\uD800\"}",
      "multiple high surrogates" },
    // Low surrogate with no preceding high surrogate.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uDC00\"}",
      "unpaired low surrogate" },
  };

  for (const BadInput &bad : kBadInputs) {
    TestError(bad.source, bad.expected_error);
  }
}
|
||||||
|
|
||||||
void UnknownFieldsTest() {
|
void UnknownFieldsTest() {
|
||||||
flatbuffers::IDLOptions opts;
|
flatbuffers::IDLOptions opts;
|
||||||
opts.skip_unexpected_fields_in_json = true;
|
opts.skip_unexpected_fields_in_json = true;
|
||||||
@@ -907,6 +945,8 @@ int main(int /*argc*/, const char * /*argv*/[]) {
|
|||||||
EnumStringsTest();
|
EnumStringsTest();
|
||||||
IntegerOutOfRangeTest();
|
IntegerOutOfRangeTest();
|
||||||
UnicodeTest();
|
UnicodeTest();
|
||||||
|
UnicodeSurrogatesTest();
|
||||||
|
UnicodeInvalidSurrogatesTest();
|
||||||
UnknownFieldsTest();
|
UnknownFieldsTest();
|
||||||
|
|
||||||
if (!testing_fails) {
|
if (!testing_fails) {
|
||||||
|
|||||||
Reference in New Issue
Block a user