Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parse

This commit is contained in:
Ben Hamilton
2016-08-01 14:04:51 -07:00
parent d70f5ac6b0
commit f6416d8471
6 changed files with 193 additions and 14 deletions

View File

@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
}
if ((**in << len) & 0x80) return -1; // Bit after leading 1's must be 0.
if (!len) return *(*in)++;
// UTF-8 encoded values with a length are between 2 and 4 bytes.
if (len < 2 || len > 4) {
return -1;
}
// Grab initial bits of the code.
int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
for (int i = 0; i < len - 1; i++) {
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
ucc <<= 6;
ucc |= *(*in)++ & 0x3F; // Grab 6 more bits of the code.
}
// UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
// UTF-16 surrogate pairs).
if (ucc >= 0xD800 && ucc <= 0xDFFF) {
return -1;
}
// UTF-8 must represent code points in their shortest possible encoding.
switch (len) {
case 2:
// Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
if (ucc < 0x0080 || ucc > 0x07FF) {
return -1;
}
break;
case 3:
// Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
if (ucc < 0x0800 || ucc > 0xFFFF) {
return -1;
}
break;
case 4:
// Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
if (ucc < 0x10000 || ucc > 0x10FFFF) {
return -1;
}
break;
}
return ucc;
}