mirror of
https://github.com/google/flatbuffers.git
synced 2026-06-28 10:08:06 +00:00
Merge pull request #3975 from bhamiltoncx/validate-utf8
Allow Unicode values > U+FFFF in string literals, validate UTF-8 by default
This commit is contained in:
@@ -348,6 +348,7 @@ struct IDLOptions {
|
|||||||
bool escape_proto_identifiers;
|
bool escape_proto_identifiers;
|
||||||
bool generate_object_based_api;
|
bool generate_object_based_api;
|
||||||
bool union_value_namespacing;
|
bool union_value_namespacing;
|
||||||
|
bool allow_non_utf8;
|
||||||
|
|
||||||
// Possible options for the more general generator below.
|
// Possible options for the more general generator below.
|
||||||
enum Language { kJava, kCSharp, kGo, kMAX };
|
enum Language { kJava, kCSharp, kGo, kMAX };
|
||||||
@@ -370,6 +371,7 @@ struct IDLOptions {
|
|||||||
escape_proto_identifiers(false),
|
escape_proto_identifiers(false),
|
||||||
generate_object_based_api(false),
|
generate_object_based_api(false),
|
||||||
union_value_namespacing(true),
|
union_value_namespacing(true),
|
||||||
|
allow_non_utf8(false),
|
||||||
lang(IDLOptions::kJava) {}
|
lang(IDLOptions::kJava) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
|
|||||||
}
|
}
|
||||||
if ((**in << len) & 0x80) return -1; // Bit after leading 1's must be 0.
|
if ((**in << len) & 0x80) return -1; // Bit after leading 1's must be 0.
|
||||||
if (!len) return *(*in)++;
|
if (!len) return *(*in)++;
|
||||||
|
// UTF-8 encoded values with a length are between 2 and 4 bytes.
|
||||||
|
if (len < 2 || len > 4) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
// Grab initial bits of the code.
|
// Grab initial bits of the code.
|
||||||
int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
|
int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
|
||||||
for (int i = 0; i < len - 1; i++) {
|
for (int i = 0; i < len - 1; i++) {
|
||||||
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
|
|||||||
ucc <<= 6;
|
ucc <<= 6;
|
||||||
ucc |= *(*in)++ & 0x3F; // Grab 6 more bits of the code.
|
ucc |= *(*in)++ & 0x3F; // Grab 6 more bits of the code.
|
||||||
}
|
}
|
||||||
|
// UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
|
||||||
|
// UTF-16 surrogate pairs).
|
||||||
|
if (ucc >= 0xD800 && ucc <= 0xDFFF) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
// UTF-8 must represent code points in their shortest possible encoding.
|
||||||
|
switch (len) {
|
||||||
|
case 2:
|
||||||
|
// Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
|
||||||
|
if (ucc < 0x0080 || ucc > 0x07FF) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
// Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
|
||||||
|
if (ucc < 0x0800 || ucc > 0xFFFF) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
// Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
|
||||||
|
if (ucc < 0x10000 || ucc > 0x10FFFF) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
return ucc;
|
return ucc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -106,6 +106,9 @@ static void Error(const std::string &err, bool usage, bool show_exe_name) {
|
|||||||
" --version Print the version number of flatc and exit.\n"
|
" --version Print the version number of flatc and exit.\n"
|
||||||
" --strict-json Strict JSON: field names must be / will be quoted,\n"
|
" --strict-json Strict JSON: field names must be / will be quoted,\n"
|
||||||
" no trailing commas in tables/vectors.\n"
|
" no trailing commas in tables/vectors.\n"
|
||||||
|
" --allow-non-utf8 Pass non-UTF-8 input through parser and emit nonstandard\n"
|
||||||
|
" \\x escapes in JSON. (Default is to raise parse error on\n"
|
||||||
|
" non-UTF-8 input.)\n"
|
||||||
" --defaults-json Output fields whose value is the default when\n"
|
" --defaults-json Output fields whose value is the default when\n"
|
||||||
" writing JSON\n"
|
" writing JSON\n"
|
||||||
" --unknown-json Allow fields in JSON that are not defined in the\n"
|
" --unknown-json Allow fields in JSON that are not defined in the\n"
|
||||||
@@ -184,6 +187,8 @@ int main(int argc, const char *argv[]) {
|
|||||||
conform_to_schema = argv[argi];
|
conform_to_schema = argv[argi];
|
||||||
} else if(arg == "--strict-json") {
|
} else if(arg == "--strict-json") {
|
||||||
opts.strict_json = true;
|
opts.strict_json = true;
|
||||||
|
} else if(arg == "--allow-non-utf8") {
|
||||||
|
opts.allow_non_utf8 = true;
|
||||||
} else if(arg == "--no-js-exports") {
|
} else if(arg == "--no-js-exports") {
|
||||||
opts.skip_js_exports = true;
|
opts.skip_js_exports = true;
|
||||||
} else if(arg == "--defaults-json") {
|
} else if(arg == "--defaults-json") {
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ template<typename T> void PrintVector(const Vector<T> &v, Type type,
|
|||||||
text += "]";
|
text += "]";
|
||||||
}
|
}
|
||||||
|
|
||||||
static void EscapeString(const String &s, std::string *_text) {
|
static void EscapeString(const String &s, std::string *_text, const IDLOptions& opts) {
|
||||||
std::string &text = *_text;
|
std::string &text = *_text;
|
||||||
text += "\"";
|
text += "\"";
|
||||||
for (uoffset_t i = 0; i < s.size(); i++) {
|
for (uoffset_t i = 0; i < s.size(); i++) {
|
||||||
@@ -113,17 +113,32 @@ static void EscapeString(const String &s, std::string *_text) {
|
|||||||
// Not printable ASCII data. Let's see if it's valid UTF-8 first:
|
// Not printable ASCII data. Let's see if it's valid UTF-8 first:
|
||||||
const char *utf8 = s.c_str() + i;
|
const char *utf8 = s.c_str() + i;
|
||||||
int ucc = FromUTF8(&utf8);
|
int ucc = FromUTF8(&utf8);
|
||||||
if (ucc >= 0x80 && ucc <= 0xFFFF) {
|
if (ucc < 0) {
|
||||||
// Parses as Unicode within JSON's \uXXXX range, so use that.
|
if (opts.allow_non_utf8) {
|
||||||
text += "\\u";
|
text += "\\x";
|
||||||
text += IntToStringHex(ucc, 4);
|
text += IntToStringHex(static_cast<uint8_t>(c), 2);
|
||||||
|
} else {
|
||||||
|
// We previously checked for non-UTF-8 and returned a parse error,
|
||||||
|
// so we shouldn't reach here.
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (ucc <= 0xFFFF) {
|
||||||
|
// Parses as Unicode within JSON's \uXXXX range, so use that.
|
||||||
|
text += "\\u";
|
||||||
|
text += IntToStringHex(ucc, 4);
|
||||||
|
} else if (ucc <= 0x10FFFF) {
|
||||||
|
// Encode Unicode SMP values to a surrogate pair using two \u escapes.
|
||||||
|
uint32_t base = ucc - 0x10000;
|
||||||
|
uint16_t highSurrogate = (base >> 10) + 0xD800;
|
||||||
|
uint16_t lowSurrogate = (base & 0x03FF) + 0xDC00;
|
||||||
|
text += "\\u";
|
||||||
|
text += IntToStringHex(highSurrogate, 4);
|
||||||
|
text += "\\u";
|
||||||
|
text += IntToStringHex(lowSurrogate, 4);
|
||||||
|
}
|
||||||
// Skip past characters recognized.
|
// Skip past characters recognized.
|
||||||
i = static_cast<uoffset_t>(utf8 - s.c_str() - 1);
|
i = static_cast<uoffset_t>(utf8 - s.c_str() - 1);
|
||||||
} else {
|
|
||||||
// It's either unprintable ASCII, arbitrary binary, or Unicode data
|
|
||||||
// that doesn't fit \uXXXX, so use \xXX escape code instead.
|
|
||||||
text += "\\x";
|
|
||||||
text += IntToStringHex(static_cast<uint8_t>(c), 2);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -157,7 +172,7 @@ template<> void Print<const void *>(const void *val,
|
|||||||
_text);
|
_text);
|
||||||
break;
|
break;
|
||||||
case BASE_TYPE_STRING: {
|
case BASE_TYPE_STRING: {
|
||||||
EscapeString(*reinterpret_cast<const String *>(val), _text);
|
EscapeString(*reinterpret_cast<const String *>(val), _text, opts);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case BASE_TYPE_VECTOR:
|
case BASE_TYPE_VECTOR:
|
||||||
|
|||||||
@@ -61,6 +61,17 @@ static_assert(BASE_TYPE_UNION ==
|
|||||||
#define NEXT() ECHECK(Next())
|
#define NEXT() ECHECK(Next())
|
||||||
#define EXPECT(tok) ECHECK(Expect(tok))
|
#define EXPECT(tok) ECHECK(Expect(tok))
|
||||||
|
|
||||||
|
static bool ValidateUTF8(const std::string &str) {
|
||||||
|
const char *s = &str[0];
|
||||||
|
const char * const sEnd = s + str.length();
|
||||||
|
while (s < sEnd) {
|
||||||
|
if (FromUTF8(&s) < 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
CheckedError Parser::Error(const std::string &msg) {
|
CheckedError Parser::Error(const std::string &msg) {
|
||||||
error_ = file_being_parsed_.length() ? AbsolutePath(file_being_parsed_) : "";
|
error_ = file_being_parsed_.length() ? AbsolutePath(file_being_parsed_) : "";
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@@ -320,6 +331,9 @@ CheckedError Parser::Next() {
|
|||||||
"illegal Unicode sequence (unpaired high surrogate)");
|
"illegal Unicode sequence (unpaired high surrogate)");
|
||||||
}
|
}
|
||||||
cursor_++;
|
cursor_++;
|
||||||
|
if (!opts.allow_non_utf8 && !ValidateUTF8(attribute_)) {
|
||||||
|
return Error("illegal UTF-8 sequence");
|
||||||
|
}
|
||||||
token_ = kTokenStringConstant;
|
token_ = kTokenStringConstant;
|
||||||
return NoError();
|
return NoError();
|
||||||
}
|
}
|
||||||
|
|||||||
119
tests/test.cpp
119
tests/test.cpp
@@ -978,15 +978,36 @@ void IntegerOutOfRangeTest() {
|
|||||||
|
|
||||||
void UnicodeTest() {
|
void UnicodeTest() {
|
||||||
flatbuffers::Parser parser;
|
flatbuffers::Parser parser;
|
||||||
|
// Without setting allow_non_utf8 = true, we treat \x sequences as byte sequences
|
||||||
|
// which are then validated as UTF-8.
|
||||||
TEST_EQ(parser.Parse("table T { F:string; }"
|
TEST_EQ(parser.Parse("table T { F:string; }"
|
||||||
"root_type T;"
|
"root_type T;"
|
||||||
"{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
|
"{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
|
||||||
"\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\" }"), true);
|
"\\u5225\\u30B5\\u30A4\\u30C8\\xE2\\x82\\xAC\\u0080\\uD83D\\uDE0E\" }"),
|
||||||
|
true);
|
||||||
std::string jsongen;
|
std::string jsongen;
|
||||||
parser.opts.indent_step = -1;
|
parser.opts.indent_step = -1;
|
||||||
GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
|
GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
|
||||||
TEST_EQ(jsongen == "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
|
TEST_EQ(jsongen,
|
||||||
"\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
|
std::string(
|
||||||
|
"{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
|
||||||
|
"\\u5225\\u30B5\\u30A4\\u30C8\\u20AC\\u0080\\uD83D\\uDE0E\"}"));
|
||||||
|
}
|
||||||
|
|
||||||
|
void UnicodeTestAllowNonUTF8() {
|
||||||
|
flatbuffers::Parser parser;
|
||||||
|
parser.opts.allow_non_utf8 = true;
|
||||||
|
TEST_EQ(parser.Parse("table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
|
||||||
|
"\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\\u0080\\uD83D\\uDE0E\" }"), true);
|
||||||
|
std::string jsongen;
|
||||||
|
parser.opts.indent_step = -1;
|
||||||
|
GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
|
||||||
|
TEST_EQ(jsongen,
|
||||||
|
std::string(
|
||||||
|
"{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
|
||||||
|
"\\u5225\\u30B5\\u30A4\\u30C8\\u0001\\x80\\u0080\\uD83D\\uDE0E\"}"));
|
||||||
}
|
}
|
||||||
|
|
||||||
void UnicodeSurrogatesTest() {
|
void UnicodeSurrogatesTest() {
|
||||||
@@ -1027,6 +1048,96 @@ void UnicodeInvalidSurrogatesTest() {
|
|||||||
"{ F:\"\\uDC00\"}", "unpaired low surrogate");
|
"{ F:\"\\uDC00\"}", "unpaired low surrogate");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void InvalidUTF8Test() {
|
||||||
|
// "1 byte" pattern, under min length of 2 bytes
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\x80\"}", "illegal UTF-8 sequence");
|
||||||
|
// 2 byte pattern, string too short
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xDF\"}", "illegal UTF-8 sequence");
|
||||||
|
// 3 byte pattern, string too short
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xEF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
// 4 byte pattern, string too short
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xF7\xBF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
// "5 byte" pattern, string too short
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xFB\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
// "6 byte" pattern, string too short
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xFD\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
// "7 byte" pattern, string too short
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
// "5 byte" pattern, over max length of 4 bytes
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xFB\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
// "6 byte" pattern, over max length of 4 bytes
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xFD\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
// "7 byte" pattern, over max length of 4 bytes
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
|
||||||
|
|
||||||
|
// Three invalid encodings for U+000A (\n, aka NEWLINE)
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xC0\x8A\"}", "illegal UTF-8 sequence");
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xE0\x80\x8A\"}", "illegal UTF-8 sequence");
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xF0\x80\x80\x8A\"}", "illegal UTF-8 sequence");
|
||||||
|
|
||||||
|
// Two invalid encodings for U+00A9 (COPYRIGHT SYMBOL)
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xE0\x81\xA9\"}", "illegal UTF-8 sequence");
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xF0\x80\x81\xA9\"}", "illegal UTF-8 sequence");
|
||||||
|
|
||||||
|
// Invalid encoding for U+20AC (EURO SYMBOL)
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
"{ F:\"\xF0\x82\x82\xAC\"}", "illegal UTF-8 sequence");
|
||||||
|
|
||||||
|
// UTF-16 surrogate values between U+D800 and U+DFFF cannot be encoded in UTF-8
|
||||||
|
TestError(
|
||||||
|
"table T { F:string; }"
|
||||||
|
"root_type T;"
|
||||||
|
// U+10400 "encoded" as U+D801 U+DC00
|
||||||
|
"{ F:\"\xED\xA0\x81\xED\xB0\x80\"}", "illegal UTF-8 sequence");
|
||||||
|
}
|
||||||
|
|
||||||
void UnknownFieldsTest() {
|
void UnknownFieldsTest() {
|
||||||
flatbuffers::IDLOptions opts;
|
flatbuffers::IDLOptions opts;
|
||||||
opts.skip_unexpected_fields_in_json = true;
|
opts.skip_unexpected_fields_in_json = true;
|
||||||
@@ -1105,8 +1216,10 @@ int main(int /*argc*/, const char * /*argv*/[]) {
|
|||||||
EnumStringsTest();
|
EnumStringsTest();
|
||||||
IntegerOutOfRangeTest();
|
IntegerOutOfRangeTest();
|
||||||
UnicodeTest();
|
UnicodeTest();
|
||||||
|
UnicodeTestAllowNonUTF8();
|
||||||
UnicodeSurrogatesTest();
|
UnicodeSurrogatesTest();
|
||||||
UnicodeInvalidSurrogatesTest();
|
UnicodeInvalidSurrogatesTest();
|
||||||
|
InvalidUTF8Test();
|
||||||
UnknownFieldsTest();
|
UnknownFieldsTest();
|
||||||
ParseUnionTest();
|
ParseUnionTest();
|
||||||
ConformTest();
|
ConformTest();
|
||||||
|
|||||||
Reference in New Issue
Block a user