Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parser

2026-06-15 08:48:52 +00:00 · 2016-08-01 14:04:51 -07:00
parent d70f5ac6b0
commit f6416d8471
6 changed files with 193 additions and 14 deletions
--- a/include/flatbuffers/util.h
+++ b/include/flatbuffers/util.h
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
  }
  if ((**in << len) & 0x80) return -1;  // Bit after leading 1's must be 0.
  if (!len) return *(*in)++;
+  // A multi-byte UTF-8 sequence is between 2 and 4 bytes long.
+  if (len < 2 || len > 4) {
+    return -1;
+  }
  // Grab initial bits of the code.
  int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
  for (int i = 0; i < len - 1; i++) {
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
    ucc <<= 6;
    ucc |= *(*in)++ & 0x3F;  // Grab 6 more bits of the code.
  }
+  // UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
+  // UTF-16 surrogate pairs).
+  if (ucc >= 0xD800 && ucc <= 0xDFFF) {
+    return -1;
+  }
+  // UTF-8 must represent code points in their shortest possible encoding.
+  switch (len) {
+    case 2:
+      // Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
+      if (ucc < 0x0080 || ucc > 0x07FF) {
+        return -1;
+      }
+      break;
+    case 3:
+      // Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
+      if (ucc < 0x0800 || ucc > 0xFFFF) {
+        return -1;
+      }
+      break;
+    case 4:
+      // Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
+      if (ucc < 0x10000 || ucc > 0x10FFFF) {
+        return -1;
+      }
+      break;
+  }
  return ucc;
 }