From 6704b19db65727d9afbc74e92733516693df9b18 Mon Sep 17 00:00:00 2001 From: Ben Gertzfield Date: Thu, 28 Apr 2016 12:27:38 -0700 Subject: [PATCH] Handle \u-escaped surrogate pairs correctly in IDL parser --- src/idl_parser.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++-- tests/test.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/src/idl_parser.cpp b/src/idl_parser.cpp index a606781b5..2bf99eac0 100644 --- a/src/idl_parser.cpp +++ b/src/idl_parser.cpp @@ -236,12 +236,19 @@ CheckedError Parser::Next() { if(!isdigit(static_cast(*cursor_))) return NoError(); return Error("floating point constant can\'t start with \".\""); case '\"': - case '\'': + case '\'': { + int unicode_high_surrogate = -1; + while (*cursor_ != c) { if (*cursor_ < ' ' && *cursor_ >= 0) return Error("illegal character in string constant"); if (*cursor_ == '\\') { cursor_++; + if (unicode_high_surrogate != -1 && + *cursor_ != 'u') { + return Error( + "illegal Unicode sequence (unpaired high surrogate)"); + } switch (*cursor_) { case 'n': attribute_ += '\n'; cursor_++; break; case 't': attribute_ += '\t'; cursor_++; break; @@ -263,18 +270,51 @@ CheckedError Parser::Next() { cursor_++; int64_t val; ECHECK(ParseHexNum(4, &val)); - ToUTF8(static_cast(val), &attribute_); + if (val >= 0xD800 && val <= 0xDBFF) { + if (unicode_high_surrogate != -1) { + return Error( + "illegal Unicode sequence (multiple high surrogates)"); + } else { + unicode_high_surrogate = val; + } + } else if (val >= 0xDC00 && val <= 0xDFFF) { + if (unicode_high_surrogate == -1) { + return Error( + "illegal Unicode sequence (unpaired low surrogate)"); + } else { + int code_point = 0x10000 + + ((unicode_high_surrogate & 0x03FF) << 10) + + (val & 0x03FF); + ToUTF8(code_point, &attribute_); + unicode_high_surrogate = -1; + } + } else { + if (unicode_high_surrogate != -1) { + return Error( + "illegal Unicode sequence (unpaired high surrogate)"); + } + ToUTF8(static_cast(val), &attribute_); + } break; } default: return Error("unknown escape code in string constant"); } } else { // printable chars + UTF-8 bytes + if (unicode_high_surrogate != -1) { + return Error( + "illegal Unicode sequence (unpaired high surrogate)"); + } attribute_ += *cursor_++; } } + if (unicode_high_surrogate != -1) { + return Error( + "illegal Unicode sequence (unpaired high surrogate)"); + } cursor_++; token_ = kTokenStringConstant; return NoError(); + } case '/': if (*cursor_ == '/') { const char *start = ++cursor_; diff --git a/tests/test.cpp b/tests/test.cpp index e636f1f3c..df9cf08b9 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -859,6 +859,44 @@ void UnicodeTest() { "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true); } +void UnicodeSurrogatesTest() { + flatbuffers::Parser parser; + + TEST_EQ( + parser.Parse( + "table T { F:string (id: 0); }" + "root_type T;" + "{ F:\"\\uD83D\\uDCA9\"}"), true); + auto root = flatbuffers::GetRoot( + parser.builder_.GetBufferPointer()); + auto string = root->GetPointer( + flatbuffers::FieldIndexToOffset(0)); + TEST_EQ(strcmp(string->c_str(), "\xF0\x9F\x92\xA9"), 0); +} + +void UnicodeInvalidSurrogatesTest() { + TestError( + "table T { F:string; }" + "root_type T;" + "{ F:\"\\uD800\"}", "unpaired high surrogate"); + TestError( + "table T { F:string; }" + "root_type T;" + "{ F:\"\\uD800abcd\"}", "unpaired high surrogate"); + TestError( + "table T { F:string; }" + "root_type T;" + "{ F:\"\\uD800\\n\"}", "unpaired high surrogate"); + TestError( + "table T { F:string; }" + "root_type T;" + "{ F:\"\\uD800\\uD800\"}", "multiple high surrogates"); + TestError( + "table T { F:string; }" + "root_type T;" + "{ F:\"\\uDC00\"}", "unpaired low surrogate"); +} + void UnknownFieldsTest() { flatbuffers::IDLOptions opts; opts.skip_unexpected_fields_in_json = true; @@ -907,6 +945,8 @@ int main(int /*argc*/, const char * /*argv*/[]) { EnumStringsTest(); IntegerOutOfRangeTest(); UnicodeTest(); + UnicodeSurrogatesTest(); + UnicodeInvalidSurrogatesTest(); UnknownFieldsTest(); if (!testing_fails) {