Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parser

This commit is contained in:
Ben Hamilton
2016-08-01 14:04:51 -07:00
parent d70f5ac6b0
commit f6416d8471
6 changed files with 193 additions and 14 deletions

View File

@@ -348,6 +348,7 @@ struct IDLOptions {
bool escape_proto_identifiers;
bool generate_object_based_api;
bool union_value_namespacing;
bool allow_non_utf8;
// Possible options for the more general generator below.
enum Language { kJava, kCSharp, kGo, kMAX };
@@ -370,6 +371,7 @@ struct IDLOptions {
escape_proto_identifiers(false),
generate_object_based_api(false),
union_value_namespacing(true),
allow_non_utf8(false),
lang(IDLOptions::kJava) {}
};

View File

@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
}
if ((**in << len) & 0x80) return -1; // Bit after leading 1's must be 0.
if (!len) return *(*in)++;
// UTF-8 encoded values with a length are between 2 and 4 bytes.
if (len < 2 || len > 4) {
return -1;
}
// Grab initial bits of the code.
int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
for (int i = 0; i < len - 1; i++) {
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
ucc <<= 6;
ucc |= *(*in)++ & 0x3F; // Grab 6 more bits of the code.
}
// UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
// UTF-16 surrogate pairs).
if (ucc >= 0xD800 && ucc <= 0xDFFF) {
return -1;
}
// UTF-8 must represent code points in their shortest possible encoding.
switch (len) {
case 2:
// Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
if (ucc < 0x0080 || ucc > 0x07FF) {
return -1;
}
break;
case 3:
// Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
if (ucc < 0x0800 || ucc > 0xFFFF) {
return -1;
}
break;
case 4:
// Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
if (ucc < 0x10000 || ucc > 0x10FFFF) {
return -1;
}
break;
}
return ucc;
}