Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parser

2026-06-09 14:46:26 +00:00 · 2016-08-01 14:04:51 -07:00
parent d70f5ac6b0
commit f6416d8471
6 changed files with 193 additions and 14 deletions
--- a/include/flatbuffers/idl.h
+++ b/include/flatbuffers/idl.h
@@ -348,6 +348,7 @@ struct IDLOptions {
  bool escape_proto_identifiers;
  bool generate_object_based_api;
  bool union_value_namespacing;
+  bool allow_non_utf8;

  // Possible options for the more general generator below.
  enum Language { kJava, kCSharp, kGo, kMAX };
@@ -370,6 +371,7 @@ struct IDLOptions {
      escape_proto_identifiers(false),
      generate_object_based_api(false),
      union_value_namespacing(true),
+      allow_non_utf8(false),
      lang(IDLOptions::kJava) {}
 };

--- a/include/flatbuffers/util.h
+++ b/include/flatbuffers/util.h
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
  }
  if ((**in << len) & 0x80) return -1;  // Bit after leading 1's must be 0.
  if (!len) return *(*in)++;
+  // Multi-byte UTF-8 sequences are between 2 and 4 bytes long.
+  if (len < 2 || len > 4) {
+    return -1;
+  }
  // Grab initial bits of the code.
  int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
  for (int i = 0; i < len - 1; i++) {
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
    ucc <<= 6;
    ucc |= *(*in)++ & 0x3F;  // Grab 6 more bits of the code.
  }
+  // Well-formed UTF-8 must not encode code points U+D800 through U+DFFF
+  // (reserved for UTF-16 surrogate pairs).
+  if (ucc >= 0xD800 && ucc <= 0xDFFF) {
+    return -1;
+  }
+  // UTF-8 must represent code points in their shortest possible encoding.
+  switch (len) {
+    case 2:
+      // Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
+      if (ucc < 0x0080 || ucc > 0x07FF) {
+        return -1;
+      }
+      break;
+    case 3:
+      // Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
+      if (ucc < 0x0800 || ucc > 0xFFFF) {
+        return -1;
+      }
+      break;
+    case 4:
+      // Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
+      if (ucc < 0x10000 || ucc > 0x10FFFF) {
+        return -1;
+      }
+      break;
+  }
  return ucc;
 }