mirror of
https://github.com/google/flatbuffers.git
synced 2026-06-11 07:27:27 +00:00
Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parse
This commit is contained in:
@@ -106,6 +106,9 @@ static void Error(const std::string &err, bool usage, bool show_exe_name) {
|
||||
" --version Print the version number of flatc and exit.\n"
|
||||
" --strict-json Strict JSON: field names must be / will be quoted,\n"
|
||||
" no trailing commas in tables/vectors.\n"
|
||||
" --allow-non-utf8 Pass non-UTF-8 input through parser and emit nonstandard\n"
|
||||
" \\x escapes in JSON. (Default is to raise parse error on\n"
|
||||
" non-UTF-8 input.)\n"
|
||||
" --defaults-json Output fields whose value is the default when\n"
|
||||
" writing JSON\n"
|
||||
" --unknown-json Allow fields in JSON that are not defined in the\n"
|
||||
@@ -184,6 +187,8 @@ int main(int argc, const char *argv[]) {
|
||||
conform_to_schema = argv[argi];
|
||||
} else if(arg == "--strict-json") {
|
||||
opts.strict_json = true;
|
||||
} else if(arg == "--allow-non-utf8") {
|
||||
opts.allow_non_utf8 = true;
|
||||
} else if(arg == "--no-js-exports") {
|
||||
opts.skip_js_exports = true;
|
||||
} else if(arg == "--defaults-json") {
|
||||
|
||||
@@ -93,7 +93,7 @@ template<typename T> void PrintVector(const Vector<T> &v, Type type,
|
||||
text += "]";
|
||||
}
|
||||
|
||||
static void EscapeString(const String &s, std::string *_text) {
|
||||
static void EscapeString(const String &s, std::string *_text, const IDLOptions& opts) {
|
||||
std::string &text = *_text;
|
||||
text += "\"";
|
||||
for (uoffset_t i = 0; i < s.size(); i++) {
|
||||
@@ -113,17 +113,32 @@ static void EscapeString(const String &s, std::string *_text) {
|
||||
// Not printable ASCII data. Let's see if it's valid UTF-8 first:
|
||||
const char *utf8 = s.c_str() + i;
|
||||
int ucc = FromUTF8(&utf8);
|
||||
if (ucc >= 0x80 && ucc <= 0xFFFF) {
|
||||
// Parses as Unicode within JSON's \uXXXX range, so use that.
|
||||
text += "\\u";
|
||||
text += IntToStringHex(ucc, 4);
|
||||
if (ucc < 0) {
|
||||
if (opts.allow_non_utf8) {
|
||||
text += "\\x";
|
||||
text += IntToStringHex(static_cast<uint8_t>(c), 2);
|
||||
} else {
|
||||
// We previously checked for non-UTF-8 and returned a parse error,
|
||||
// so we shouldn't reach here.
|
||||
assert(0);
|
||||
}
|
||||
} else {
|
||||
if (ucc <= 0xFFFF) {
|
||||
// Parses as Unicode within JSON's \uXXXX range, so use that.
|
||||
text += "\\u";
|
||||
text += IntToStringHex(ucc, 4);
|
||||
} else if (ucc <= 0x10FFFF) {
|
||||
// Encode Unicode SMP values to a surrogate pair using two \u escapes.
|
||||
uint32_t base = ucc - 0x10000;
|
||||
uint16_t highSurrogate = (base >> 10) + 0xD800;
|
||||
uint16_t lowSurrogate = (base & 0x03FF) + 0xDC00;
|
||||
text += "\\u";
|
||||
text += IntToStringHex(highSurrogate, 4);
|
||||
text += "\\u";
|
||||
text += IntToStringHex(lowSurrogate, 4);
|
||||
}
|
||||
// Skip past characters recognized.
|
||||
i = static_cast<uoffset_t>(utf8 - s.c_str() - 1);
|
||||
} else {
|
||||
// It's either unprintable ASCII, arbitrary binary, or Unicode data
|
||||
// that doesn't fit \uXXXX, so use \xXX escape code instead.
|
||||
text += "\\x";
|
||||
text += IntToStringHex(static_cast<uint8_t>(c), 2);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -157,7 +172,7 @@ template<> void Print<const void *>(const void *val,
|
||||
_text);
|
||||
break;
|
||||
case BASE_TYPE_STRING: {
|
||||
EscapeString(*reinterpret_cast<const String *>(val), _text);
|
||||
EscapeString(*reinterpret_cast<const String *>(val), _text, opts);
|
||||
break;
|
||||
}
|
||||
case BASE_TYPE_VECTOR:
|
||||
|
||||
@@ -61,6 +61,17 @@ static_assert(BASE_TYPE_UNION ==
|
||||
#define NEXT() ECHECK(Next())
|
||||
#define EXPECT(tok) ECHECK(Expect(tok))
|
||||
|
||||
static bool ValidateUTF8(const std::string &str) {
|
||||
const char *s = &str[0];
|
||||
const char * const sEnd = s + str.length();
|
||||
while (s < sEnd) {
|
||||
if (FromUTF8(&s) < 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
CheckedError Parser::Error(const std::string &msg) {
|
||||
error_ = file_being_parsed_.length() ? AbsolutePath(file_being_parsed_) : "";
|
||||
#ifdef _WIN32
|
||||
@@ -320,6 +331,9 @@ CheckedError Parser::Next() {
|
||||
"illegal Unicode sequence (unpaired high surrogate)");
|
||||
}
|
||||
cursor_++;
|
||||
if (!opts.allow_non_utf8 && !ValidateUTF8(attribute_)) {
|
||||
return Error("illegal UTF-8 sequence");
|
||||
}
|
||||
token_ = kTokenStringConstant;
|
||||
return NoError();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user