Validate UTF-8 in parsed string constants; add an allow_non_utf8 option.

* IDLOptions gains allow_non_utf8 (default false); flatc sets it via
  --allow-non-utf8.
* FromUTF8 now returns -1 for sequence lengths outside 2..4 bytes, for code
  points in the UTF-16 surrogate range U+D800..U+DFFF, and for code points
  outside the range allowed for the sequence length (overlong encodings and
  values above U+10FFFF).
* Parser::Next reports "illegal UTF-8 sequence" for a string constant that
  fails ValidateUTF8, unless allow_non_utf8 is set.
* EscapeString emits \uXXXX for every decoded code point up to U+FFFF (this
  now includes unprintable ASCII, which used to be written as \xXX) and a
  \uXXXX\uXXXX surrogate pair above that. Bytes that do not decode are written
  as \xXX only when allow_non_utf8 is set; otherwise the code asserts.
* Tests cover both modes plus a table of invalid UTF-8 inputs.

diff --git a/include/flatbuffers/idl.h b/include/flatbuffers/idl.h
index 25706e84f..5909a4e20 100644
--- a/include/flatbuffers/idl.h
+++ b/include/flatbuffers/idl.h
@@ -348,6 +348,7 @@ struct IDLOptions {
   bool escape_proto_identifiers;
   bool generate_object_based_api;
   bool union_value_namespacing;
+  bool allow_non_utf8;
 
   // Possible options for the more general generator below.
   enum Language { kJava, kCSharp, kGo, kMAX };
@@ -370,6 +371,7 @@ struct IDLOptions {
       escape_proto_identifiers(false),
       generate_object_based_api(false),
       union_value_namespacing(true),
+      allow_non_utf8(false),
       lang(IDLOptions::kJava) {}
 };
 
diff --git a/include/flatbuffers/util.h b/include/flatbuffers/util.h
index 7bd7513bb..baf5bdd37 100644
--- a/include/flatbuffers/util.h
+++ b/include/flatbuffers/util.h
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
   }
   if ((**in << len) & 0x80) return -1;  // Bit after leading 1's must be 0.
   if (!len) return *(*in)++;
+  // UTF-8 encoded values with a length are between 2 and 4 bytes.
+  if (len < 2 || len > 4) {
+    return -1;
+  }
   // Grab initial bits of the code.
   int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
   for (int i = 0; i < len - 1; i++) {
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
     ucc <<= 6;
     ucc |= *(*in)++ & 0x3F;  // Grab 6 more bits of the code.
   }
+  // UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
+  // UTF-16 surrogate pairs).
+  if (ucc >= 0xD800 && ucc <= 0xDFFF) {
+    return -1;
+  }
+  // UTF-8 must represent code points in their shortest possible encoding.
+  switch (len) {
+    case 2:
+      // Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
+      if (ucc < 0x0080 || ucc > 0x07FF) {
+        return -1;
+      }
+      break;
+    case 3:
+      // Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
+      if (ucc < 0x0800 || ucc > 0xFFFF) {
+        return -1;
+      }
+      break;
+    case 4:
+      // Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
+      if (ucc < 0x10000 || ucc > 0x10FFFF) {
+        return -1;
+      }
+      break;
+  }
   return ucc;
 }
 
diff --git a/src/flatc.cpp b/src/flatc.cpp
index b174cbd43..44ce91350 100644
--- a/src/flatc.cpp
+++ b/src/flatc.cpp
@@ -106,6 +106,9 @@ static void Error(const std::string &err, bool usage, bool show_exe_name) {
       "  --version       Print the version number of flatc and exit.\n"
       "  --strict-json   Strict JSON: field names must be / will be quoted,\n"
       "                  no trailing commas in tables/vectors.\n"
+      "  --allow-non-utf8 Pass non-UTF-8 input through parser and emit nonstandard\n"
+      "                  \\x escapes in JSON. (Default is to raise parse error on\n"
+      "                  non-UTF-8 input.)\n"
       "  --defaults-json Output fields whose value is the default when\n"
       "                  writing JSON\n"
       "  --unknown-json  Allow fields in JSON that are not defined in the\n"
@@ -184,6 +187,8 @@ int main(int argc, const char *argv[]) {
       conform_to_schema = argv[argi];
     } else if(arg == "--strict-json") {
       opts.strict_json = true;
+    } else if(arg == "--allow-non-utf8") {
+      opts.allow_non_utf8 = true;
     } else if(arg == "--no-js-exports") {
       opts.skip_js_exports = true;
     } else if(arg == "--defaults-json") {
diff --git a/src/idl_gen_text.cpp b/src/idl_gen_text.cpp
index dd96912cd..3e41a0a76 100644
--- a/src/idl_gen_text.cpp
+++ b/src/idl_gen_text.cpp
@@ -93,7 +93,7 @@ template<typename T> void PrintVector(const Vector<T> &v, Type type,
   text += "]";
 }
 
-static void EscapeString(const String &s, std::string *_text) {
+static void EscapeString(const String &s, std::string *_text, const IDLOptions& opts) {
   std::string &text = *_text;
   text += "\"";
   for (uoffset_t i = 0; i < s.size(); i++) {
@@ -113,17 +113,32 @@ static void EscapeString(const String &s, std::string *_text) {
           // Not printable ASCII data. Let's see if it's valid UTF-8 first:
           const char *utf8 = s.c_str() + i;
           int ucc = FromUTF8(&utf8);
-          if (ucc >= 0x80 && ucc <= 0xFFFF) {
-            // Parses as Unicode within JSON's \uXXXX range, so use that.
-            text += "\\u";
-            text += IntToStringHex(ucc, 4);
+          if (ucc < 0) {
+            if (opts.allow_non_utf8) {
+              text += "\\x";
+              text += IntToStringHex(static_cast<uint8_t>(c), 2);
+            } else {
+              // We previously checked for non-UTF-8 and returned a parse error,
+              // so we shouldn't reach here.
+              assert(0);
+            }
+          } else {
+            if (ucc <= 0xFFFF) {
+              // Parses as Unicode within JSON's \uXXXX range, so use that.
+              text += "\\u";
+              text += IntToStringHex(ucc, 4);
+            } else if (ucc <= 0x10FFFF) {
+              // Encode Unicode SMP values to a surrogate pair using two \u escapes.
+              uint32_t base = ucc - 0x10000;
+              uint16_t highSurrogate = (base >> 10) + 0xD800;
+              uint16_t lowSurrogate = (base & 0x03FF) + 0xDC00;
+              text += "\\u";
+              text += IntToStringHex(highSurrogate, 4);
+              text += "\\u";
+              text += IntToStringHex(lowSurrogate, 4);
+            }
             // Skip past characters recognized.
             i = static_cast<uoffset_t>(utf8 - s.c_str() - 1);
-          } else {
-            // It's either unprintable ASCII, arbitrary binary, or Unicode data
-            // that doesn't fit \uXXXX, so use \xXX escape code instead.
-            text += "\\x";
-            text += IntToStringHex(static_cast<uint8_t>(c), 2);
           }
         }
         break;
@@ -157,7 +172,7 @@ template<> void Print<const void *>(const void *val,
                           _text);
       break;
     case BASE_TYPE_STRING: {
-      EscapeString(*reinterpret_cast<const String *>(val), _text);
+      EscapeString(*reinterpret_cast<const String *>(val), _text, opts);
       break;
     }
     case BASE_TYPE_VECTOR:
diff --git a/src/idl_parser.cpp b/src/idl_parser.cpp
index b03655c92..d845b837d 100644
--- a/src/idl_parser.cpp
+++ b/src/idl_parser.cpp
@@ -61,6 +61,17 @@ static_assert(BASE_TYPE_UNION ==
 #define NEXT() ECHECK(Next())
 #define EXPECT(tok) ECHECK(Expect(tok))
 
+static bool ValidateUTF8(const std::string &str) {
+  const char *s = &str[0];
+  const char * const sEnd = s + str.length();
+  while (s < sEnd) {
+    if (FromUTF8(&s) < 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
 CheckedError Parser::Error(const std::string &msg) {
   error_ = file_being_parsed_.length() ? AbsolutePath(file_being_parsed_) : "";
   #ifdef _WIN32
@@ -320,6 +331,9 @@ CheckedError Parser::Next() {
               "illegal Unicode sequence (unpaired high surrogate)");
         }
         cursor_++;
+        if (!opts.allow_non_utf8 && !ValidateUTF8(attribute_)) {
+          return Error("illegal UTF-8 sequence");
+        }
         token_ = kTokenStringConstant;
         return NoError();
       }
diff --git a/tests/test.cpp b/tests/test.cpp
index 6ec4e678c..cd3723756 100644
--- a/tests/test.cpp
+++ b/tests/test.cpp
@@ -978,15 +978,36 @@ void IntegerOutOfRangeTest() {
 
 void UnicodeTest() {
   flatbuffers::Parser parser;
+  // Without setting allow_non_utf8 = true, we treat \x sequences as byte sequences
+  // which are then validated as UTF-8.
   TEST_EQ(parser.Parse("table T { F:string; }"
                        "root_type T;"
                        "{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
-                       "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\" }"), true);
+                       "\\u5225\\u30B5\\u30A4\\u30C8\\xE2\\x82\\xAC\\u0080\\uD83D\\uDE0E\" }"),
+          true);
   std::string jsongen;
   parser.opts.indent_step = -1;
   GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
-  TEST_EQ(jsongen == "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
-                     "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
+  TEST_EQ(jsongen,
+          std::string(
+              "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+              "\\u5225\\u30B5\\u30A4\\u30C8\\u20AC\\u0080\\uD83D\\uDE0E\"}"));
+}
+
+void UnicodeTestAllowNonUTF8() {
+  flatbuffers::Parser parser;
+  parser.opts.allow_non_utf8 = true;
+  TEST_EQ(parser.Parse("table T { F:string; }"
+                       "root_type T;"
+                       "{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+                       "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\\u0080\\uD83D\\uDE0E\" }"), true);
+  std::string jsongen;
+  parser.opts.indent_step = -1;
+  GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
+  TEST_EQ(jsongen,
+          std::string(
+              "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+              "\\u5225\\u30B5\\u30A4\\u30C8\\u0001\\x80\\u0080\\uD83D\\uDE0E\"}"));
 }
 
 void UnicodeSurrogatesTest() {
@@ -1027,6 +1048,96 @@ void UnicodeInvalidSurrogatesTest() {
     "{ F:\"\\uDC00\"}", "unpaired low surrogate");
 }
 
+void InvalidUTF8Test() {
+  // "1 byte" pattern, under min length of 2 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\x80\"}", "illegal UTF-8 sequence");
+  // 2 byte pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xDF\"}", "illegal UTF-8 sequence");
+  // 3 byte pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xEF\xBF\"}", "illegal UTF-8 sequence");
+  // 4 byte pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF7\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "5 byte" pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFB\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "6 byte" pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFD\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "7 byte" pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "5 byte" pattern, over max length of 4 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFB\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "6 byte" pattern, over max length of 4 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFD\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "7 byte" pattern, over max length of 4 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+
+  // Three invalid encodings for U+000A (\n, aka NEWLINE)
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xC0\x8A\"}", "illegal UTF-8 sequence");
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xE0\x80\x8A\"}", "illegal UTF-8 sequence");
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF0\x80\x80\x8A\"}", "illegal UTF-8 sequence");
+
+  // Two invalid encodings for U+00A9 (COPYRIGHT SYMBOL)
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xE0\x81\xA9\"}", "illegal UTF-8 sequence");
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF0\x80\x81\xA9\"}", "illegal UTF-8 sequence");
+
+  // Invalid encoding for U+20AC (EURO SYMBOL)
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF0\x82\x82\xAC\"}", "illegal UTF-8 sequence");
+
+  // UTF-16 surrogate values between U+D800 and U+DFFF cannot be encoded in UTF-8
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    // U+10400 "encoded" as U+D801 U+DC00
+    "{ F:\"\xED\xA0\x81\xED\xB0\x80\"}", "illegal UTF-8 sequence");
+}
+
 void UnknownFieldsTest() {
   flatbuffers::IDLOptions opts;
   opts.skip_unexpected_fields_in_json = true;
@@ -1105,8 +1216,10 @@ int main(int /*argc*/, const char * /*argv*/[]) {
   EnumStringsTest();
   IntegerOutOfRangeTest();
   UnicodeTest();
+  UnicodeTestAllowNonUTF8();
   UnicodeSurrogatesTest();
   UnicodeInvalidSurrogatesTest();
+  InvalidUTF8Test();
   UnknownFieldsTest();
   ParseUnionTest();
   ConformTest();