diff --git a/.gitignore b/.gitignore
index 7959fe842..13a68f835 100755
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,7 @@ flatsampletext
snapshot.sh
tests/go_gen
CMakeLists.txt.user
+CMakeScripts/**
+build/Xcode/FlatBuffers.xcodeproj/project.xcworkspace/**
+build/Xcode/FlatBuffers.xcodeproj/xcuserdata/**
+
diff --git a/docs/html/md__schemas.html b/docs/html/md__schemas.html
index 243851b17..6662ed3a3 100644
--- a/docs/html/md__schemas.html
+++ b/docs/html/md__schemas.html
@@ -144,6 +144,20 @@ root_type Monster;
It accepts field names with and without quotes, like many JSON parsers already do. It outputs them without quotes as well, though can be made to output them using the strict_json flag.
If a field has an enum type, the parser will recognize symbolic enum values (with or without quotes) instead of numbers, e.g. field: EnumVal. If a field is of integral type, you can still use symbolic names, but values need to be prefixed with their type and need to be quoted, e.g. field: "Enum.EnumVal". For enums representing flags, you may place multiple inside a string separated by spaces to OR them, e.g. field: "EnumVal1 EnumVal2" or field: "Enum.EnumVal1 Enum.EnumVal2".
+When parsing JSON, it recognizes the following escape codes in strings:
+
+\n - linefeed.
+\t - tab.
+\r - carriage return.
+\b - backspace.
+\f - form feed.
+\" - double quote.
+\\ - backslash.
+\/ - forward slash.
+\uXXXX - 16-bit Unicode code point, converted to the equivalent UTF-8 representation.
+\xXX - 8-bit binary hexadecimal number XX. This is the only one that is not in the JSON spec (see http://json.org/), but is needed to be able to encode arbitrary binary in strings to text and back without losing information (e.g. the byte 0xFF can't be represented in standard JSON).
+
+It also generates these escape codes back again when generating JSON from a binary representation.
Gotchas
Schemas and version control
FlatBuffers relies on new field declarations being added at the end, and earlier declarations to not be removed, but be marked deprecated when needed. We think this is an improvement over the manual number assignment that happens in Protocol Buffers (and which is still an option using the id attribute mentioned above).
diff --git a/docs/source/Schemas.md b/docs/source/Schemas.md
index 3aab1f534..f3afdfa6d 100755
--- a/docs/source/Schemas.md
+++ b/docs/source/Schemas.md
@@ -268,6 +268,26 @@ JSON:
separated by spaces to OR them, e.g.
`field: "EnumVal1 EnumVal2"` or `field: "Enum.EnumVal1 Enum.EnumVal2"`.
+When parsing JSON, it recognizes the following escape codes in strings:
+
+- `\n` - linefeed.
+- `\t` - tab.
+- `\r` - carriage return.
+- `\b` - backspace.
+- `\f` - form feed.
+- `\"` - double quote.
+- `\\` - backslash.
+- `\/` - forward slash.
+- `\uXXXX` - 16-bit Unicode code point, converted to the equivalent UTF-8
+  representation.
+- `\xXX` - 8-bit binary hexadecimal number XX. This is the only one that is
+ not in the JSON spec (see http://json.org/), but is needed to be able to
+ encode arbitrary binary in strings to text and back without losing
+ information (e.g. the byte 0xFF can't be represented in standard JSON).
+
+It also generates these escape codes back again when generating JSON from a
+binary representation.
+
## Gotchas
### Schemas and version control
diff --git a/include/flatbuffers/idl.h b/include/flatbuffers/idl.h
index 162501cbf..8c92d9fe8 100644
--- a/include/flatbuffers/idl.h
+++ b/include/flatbuffers/idl.h
@@ -276,6 +276,7 @@ class Parser {
void MarkGenerated();
private:
+ int64_t ParseHexNum(int nibbles);
void Next();
bool IsNext(int t);
void Expect(int t);
diff --git a/include/flatbuffers/util.h b/include/flatbuffers/util.h
index 34cf3b80b..fa1b60ccf 100644
--- a/include/flatbuffers/util.h
+++ b/include/flatbuffers/util.h
@@ -44,13 +44,11 @@ template<> inline std::string NumToString(unsigned char t) {
}
// Convert an integer value to a hexadecimal string.
-// The returned string length is the number of nibbles in
-// the supplied value prefixed by 0 digits. For example,
-// IntToStringHex(static_cast(0x23)) returns the
-// string "00000023".
-template std::string IntToStringHex(T i) {
+// The returned string length is always xdigits long, prefixed by 0 digits.
+// For example, IntToStringHex(0x23, 8) returns the string "00000023".
+inline std::string IntToStringHex(int i, int xdigits) {
std::stringstream ss;
- ss << std::setw(sizeof(T) * 2)
+ ss << std::setw(xdigits)
<< std::setfill('0')
<< std::hex
<< std::uppercase
@@ -59,11 +57,11 @@ template std::string IntToStringHex(T i) {
}
// Portable implementation of strtoull().
-inline int64_t StringToInt(const char *str) {
+inline int64_t StringToInt(const char *str, int base = 10) {  // base: radix, default 10
#ifdef _MSC_VER
- return _strtoui64(str, nullptr, 10);
+ return _strtoui64(str, nullptr, base);
#else
- return strtoull(str, nullptr, 10);
+ return strtoull(str, nullptr, base);  // NOTE(review): unsigned result returned as int64_t
#endif
}
@@ -126,6 +124,60 @@ inline std::string StripFileName(const std::string &filepath) {
return i != std::string::npos ? filepath.substr(0, i + 1) : "";
}
+// Conversion functions between Unicode code points and UTF-8.
+
+// Appends the UTF-8 encoding of the Unicode code point ucc to *out.
+// Returns the number of bytes appended (1 to 6).
+inline int ToUTF8(uint32_t ucc, std::string *out) {
+ assert(!(ucc & 0x80000000)); // Top bit can't be set: at most 31 bits fit.
+ // 6 possible encodings: http://en.wikipedia.org/wiki/UTF-8
+ for (int i = 0; i < 6; i++) {
+ // Max bits encoding i can represent: 7, 11, 16, 21, 26, 31.
+ uint32_t max_bits = 6 + i * 5 + static_cast(!i);
+ if (ucc < (1 << max_bits)) { // does it fit? NOTE(review): i == 5 gives 1 << 31 on an int; confirm 1u is not needed.
+ // Bits left over for the continuation bytes, which hold 6 bits each.
+ uint32_t remain_bits = i * 6;
+ // Store first byte: length-marker bits on top, then the high bits of ucc.
+ (*out) += static_cast((0xFE << (max_bits - remain_bits)) |
+ (ucc >> remain_bits));
+ // Store remaining bytes, most significant first, as 10xxxxxx.
+ for (int j = i - 1; j >= 0; j--) {
+ (*out) += static_cast(((ucc >> (j * 6)) & 0x3F) | 0x80);
+ }
+ return i + 1; // Return the number of bytes added.
+ }
+ }
+ assert(0); // Unreachable while the assert at the top holds.
+ return -1;
+}
+
+// Decodes the UTF-8 sequence at the start of *in into a Unicode code point
+// and advances *in past the bytes that were consumed.
+// Returns -1 upon corrupt UTF-8 encoding; *in may then have been advanced
+// part of the way through the sequence, so ignore it in this case.
+// NOTE(review): a lone continuation byte (10xxxxxx) is decoded, not rejected.
+inline int FromUTF8(const char **in) {
+ int len = 0;
+ // Count leading 1 bits (at most 6): the length of the sequence in bytes.
+ for (int mask = 0x80; mask >= 0x04; mask >>= 1) {
+ if (**in & mask) {
+ len++;
+ } else {
+ break;
+ }
+ }
+ if ((**in << len) & 0x80) return -1; // Bit after leading 1's must be 0.
+ if (!len) return *(*in)++; // Top bit clear: a single-byte character.
+ // Grab the payload bits of the first byte (its low 7 - len bits).
+ int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
+ for (int i = 0; i < len - 1; i++) {
+ if ((**in & 0xC0) != 0x80) return -1; // Upper bits must be 1 0.
+ ucc <<= 6;
+ ucc |= *(*in)++ & 0x3F; // Grab 6 more bits of the code.
+ }
+ return ucc;
+}
+
} // namespace flatbuffers
#endif // FLATBUFFERS_UTIL_H_
diff --git a/src/idl_gen_text.cpp b/src/idl_gen_text.cpp
index a9e2f6c7a..0fd4674b1 100644
--- a/src/idl_gen_text.cpp
+++ b/src/idl_gen_text.cpp
@@ -28,8 +28,12 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
// If indentation is less than 0, that indicates we don't want any newlines
// either.
-const char *NewLine(int indent_step) {
- return indent_step >= 0 ? "\n" : "";
+const char *NewLine(const GeneratorOptions &opts) {
+ return opts.indent_step >= 0 ? "\n" : "";  // Negative step: no line breaks at all.
+}
+
+int Indent(const GeneratorOptions &opts) {
+ return std::max(opts.indent_step, 0);  // Clamp so a negative step adds no indentation.
}
// Output an identifier with or without quotes depending on strictness.
@@ -65,21 +69,21 @@ template void PrintVector(const Vector &v, Type type,
std::string *_text) {
std::string &text = *_text;
text += "[";
- text += NewLine(opts.indent_step);
+ text += NewLine(opts);
for (uoffset_t i = 0; i < v.Length(); i++) {
if (i) {
text += ",";
- text += NewLine(opts.indent_step);
+ text += NewLine(opts);
}
- text.append(indent + opts.indent_step, ' ');
+ text.append(indent + Indent(opts), ' ');
if (IsStruct(type))
Print(v.GetStructFromOffset(i * type.struct_def->bytesize), type,
- indent + opts.indent_step, nullptr, opts, _text);
+ indent + Indent(opts), nullptr, opts, _text);
else
- Print(v.Get(i), type, indent + opts.indent_step, nullptr,
+ Print(v.Get(i), type, indent + Indent(opts), nullptr,
opts, _text);
}
- text += NewLine(opts.indent_step);
+ text += NewLine(opts);
text.append(indent, ' ');
text += "]";
}
@@ -93,15 +97,28 @@ static void EscapeString(const String &s, std::string *_text) {
case '\n': text += "\\n"; break;
case '\t': text += "\\t"; break;
case '\r': text += "\\r"; break;
+ case '\b': text += "\\b"; break;
+ case '\f': text += "\\f"; break;
case '\"': text += "\\\""; break;
case '\\': text += "\\\\"; break;
default:
if (c >= ' ' && c <= '~') {
text += c;
} else {
- auto u = static_cast(c);
- text += "\\x";
- text += IntToStringHex(u);
+ // Not printable ASCII data. Let's see if it's valid UTF-8 first:
+ const char *utf8 = s.c_str() + i;
+ int ucc = FromUTF8(&utf8);
+ if (ucc >= 0x80 && ucc <= 0xFFFF) {
+ // Parses as Unicode within JSON's \uXXXX range, so use that.
+ text += "\\u";
+ text += IntToStringHex(ucc, 4);
+ i = utf8 - s.c_str() - 1; // Skip past characters recognized.
+ } else {
+ // It's either unprintable ASCII, arbitrary binary, or Unicode data
+ // that doesn't fit \uXXXX, so use \xXX escape code instead.
+ text += "\\x";
+ text += IntToStringHex(static_cast(c), 2);
+ }
}
break;
}
@@ -202,15 +219,15 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
if (fieldout++) {
text += ",";
}
- text += NewLine(opts.indent_step);
- text.append(indent + opts.indent_step, ' ');
+ text += NewLine(opts);
+ text.append(indent + Indent(opts), ' ');
OutputIdentifier(fd.name, opts, _text);
text += ": ";
switch (fd.value.type.base_type) {
#define FLATBUFFERS_TD(ENUM, IDLTYPE, CTYPE, JTYPE, GTYPE) \
case BASE_TYPE_ ## ENUM: \
GenField(fd, table, struct_def.fixed, \
- opts, indent + opts.indent_step, _text); \
+ opts, indent + Indent(opts), _text); \
break;
FLATBUFFERS_GEN_TYPES_SCALAR(FLATBUFFERS_TD)
#undef FLATBUFFERS_TD
@@ -219,7 +236,7 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
case BASE_TYPE_ ## ENUM:
FLATBUFFERS_GEN_TYPES_POINTER(FLATBUFFERS_TD)
#undef FLATBUFFERS_TD
- GenFieldOffset(fd, table, struct_def.fixed, indent + opts.indent_step,
+ GenFieldOffset(fd, table, struct_def.fixed, indent + Indent(opts),
union_sd, opts, _text);
break;
}
@@ -231,7 +248,7 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
}
}
}
- text += NewLine(opts.indent_step);
+ text += NewLine(opts);
text.append(indent, ' ');
text += "}";
}
@@ -247,7 +264,7 @@ void GenerateText(const Parser &parser, const void *flatbuffer,
0,
opts,
_text);
- text += NewLine(opts.indent_step);
+ text += NewLine(opts);
}
} // namespace flatbuffers
diff --git a/src/idl_parser.cpp b/src/idl_parser.cpp
index 555d5e6ba..2eb6441dd 100644
--- a/src/idl_parser.cpp
+++ b/src/idl_parser.cpp
@@ -115,6 +115,17 @@ static std::string TokenToString(int t) {
}
}
+// Parses exactly `nibbles` hex digits starting at cursor_ into a number and
+// advances cursor_ past them. Reports an error through Error() if any of
+// those characters is not a hex digit.
+int64_t Parser::ParseHexNum(int nibbles) {
+  for (int i = 0; i < nibbles; i++)
+    // Cast first: passing a negative char to isxdigit() is undefined.
+    if (!isxdigit(static_cast<unsigned char>(cursor_[i])))
+      Error("escape code must be followed by " + NumToString(nibbles) +
+            " hex digits");
+  // Convert only the validated digits. Handing cursor_ itself to StringToInt
+  // would also swallow any hex digits that follow the escape, so that e.g.
+  // "\x41BC" would be read as 0x41BC instead of 0x41 followed by "BC".
+  auto val = StringToInt(std::string(cursor_, cursor_ + nibbles).c_str(), 16);
+  cursor_ += nibbles;
+  return val;
+}
+
void Parser::Next() {
doc_comment_.clear();
bool seen_newline = false;
@@ -142,8 +153,21 @@ void Parser::Next() {
case 'n': attribute_ += '\n'; cursor_++; break;
case 't': attribute_ += '\t'; cursor_++; break;
case 'r': attribute_ += '\r'; cursor_++; break;
+ case 'b': attribute_ += '\b'; cursor_++; break;
+ case 'f': attribute_ += '\f'; cursor_++; break;
case '\"': attribute_ += '\"'; cursor_++; break;
case '\\': attribute_ += '\\'; cursor_++; break;
+ case '/': attribute_ += '/'; cursor_++; break;
+ case 'x': { // Not in the JSON standard
+ cursor_++;
+ attribute_ += static_cast(ParseHexNum(2));
+ break;
+ }
+ case 'u': {
+ cursor_++;
+ ToUTF8(static_cast(ParseHexNum(4)), &attribute_);
+ break;
+ }
default: Error("unknown escape code in string constant"); break;
}
} else { // printable chars + UTF-8 bytes
diff --git a/tests/test.cpp b/tests/test.cpp
index 3129dfdbb..07545a0a8 100644
--- a/tests/test.cpp
+++ b/tests/test.cpp
@@ -516,7 +516,19 @@ void EnumStringsTest() {
"{ F:[ \"E.C\", \"E.A E.B E.C\" ] }", ""), true);
}
-
+void UnicodeTest() {  // Round trip of escape codes: JSON text -> binary -> JSON text.
+ flatbuffers::Parser parser;
+ TEST_EQ(parser.Parse("table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+ "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\" }", ""), true);
+ std::string jsongen;
+ flatbuffers::GeneratorOptions opts;
+ opts.indent_step = -1;  // Negative step: output without newlines.
+ GenerateText(parser, parser.builder_.GetBufferPointer(), opts, &jsongen);  // Regenerate text from the parsed buffer.
+ TEST_EQ(jsongen == "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+ "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);  // Bytes 0x01 and 0x80 are expected back as hex escapes.
+}
int main(int /*argc*/, const char * /*argv*/[]) {
// Run our various test suites:
@@ -534,6 +546,7 @@ int main(int /*argc*/, const char * /*argv*/[]) {
ErrorTest();
ScientificTest();
EnumStringsTest();
+ UnicodeTest();
if (!testing_fails) {
TEST_OUTPUT_LINE("ALL TESTS PASSED");