mirror of
https://github.com/google/flatbuffers.git
synced 2026-06-09 14:46:26 +00:00
Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parse
This commit is contained in:
@@ -348,6 +348,7 @@ struct IDLOptions {
|
||||
bool escape_proto_identifiers;
|
||||
bool generate_object_based_api;
|
||||
bool union_value_namespacing;
|
||||
bool allow_non_utf8;
|
||||
|
||||
// Possible options for the more general generator below.
|
||||
enum Language { kJava, kCSharp, kGo, kMAX };
|
||||
@@ -370,6 +371,7 @@ struct IDLOptions {
|
||||
escape_proto_identifiers(false),
|
||||
generate_object_based_api(false),
|
||||
union_value_namespacing(true),
|
||||
allow_non_utf8(false),
|
||||
lang(IDLOptions::kJava) {}
|
||||
};
|
||||
|
||||
|
||||
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
|
||||
}
|
||||
if ((**in << len) & 0x80) return -1; // Bit after leading 1's must be 0.
|
||||
if (!len) return *(*in)++;
|
||||
// UTF-8 encoded values with a length are between 2 and 4 bytes.
|
||||
if (len < 2 || len > 4) {
|
||||
return -1;
|
||||
}
|
||||
// Grab initial bits of the code.
|
||||
int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
|
||||
for (int i = 0; i < len - 1; i++) {
|
||||
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
|
||||
ucc <<= 6;
|
||||
ucc |= *(*in)++ & 0x3F; // Grab 6 more bits of the code.
|
||||
}
|
||||
// UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
|
||||
// UTF-16 surrogate pairs).
|
||||
if (ucc >= 0xD800 && ucc <= 0xDFFF) {
|
||||
return -1;
|
||||
}
|
||||
// UTF-8 must represent code points in their shortest possible encoding.
|
||||
switch (len) {
|
||||
case 2:
|
||||
// Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
|
||||
if (ucc < 0x0080 || ucc > 0x07FF) {
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
// Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
|
||||
if (ucc < 0x0800 || ucc > 0xFFFF) {
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
// Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
|
||||
if (ucc < 0x10000 || ucc > 0x10FFFF) {
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return ucc;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user