Refactoring of idl_parser (#4948)

* Refactoring of numbers parser

More accurate parsing of float and double.
Support hexadecimal floats.
Check for "out-of-range" values of uint64 fields.
Check correctness of default values and metadata.

* Remove locale-independent code strtod/strtof from PR #4948.

* small optimization

* Add is_(ascii) functions

* is_ascii cleanup

* Fix format conversion

* Refine number parser

* Make code compatible with Android build

* Remove unnecessary suppression of warning C4127
This commit is contained in:
Vladimir Glavnyy
2018-10-12 00:37:47 +07:00
committed by Wouter van Oortmerssen
parent 53ce80ce91
commit 4ed6fafdfa
11 changed files with 880 additions and 206 deletions

View File

@@ -1,4 +1,4 @@
/*
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -119,57 +119,22 @@ CheckedError Parser::RecurseError() {
NumToString(FLATBUFFERS_MAX_PARSING_DEPTH) + " reached");
}
inline std::string OutOfRangeErrorMsg(int64_t val, const std::string &op,
int64_t limit) {
const std::string cause = NumToString(val) + op + NumToString(limit);
return "constant does not fit (" + cause + ")";
// Produces the error for a malformed numeric literal by forwarding to Error()
// a message of the form: invalid number: "<number>"<msg>
// `msg` is appended verbatim after the closing quote (it may be empty).
CheckedError Parser::InvalidNumber(const char *number, const std::string &msg) {
  const std::string quoted = "\"" + std::string(number) + "\"";
  return Error("invalid number: " + quoted + msg);
}
// Ensure that integer values we parse fit inside the declared integer type.
CheckedError Parser::CheckInRange(int64_t val, int64_t min, int64_t max) {
if (val < min)
return Error(OutOfRangeErrorMsg(val, " < ", min));
else if (val > max)
return Error(OutOfRangeErrorMsg(val, " > ", max));
else
return NoError();
}
// atot: templated version of atoi/atof: convert a string to an instance of T.
template<typename T>
inline CheckedError atot(const char *s, Parser &parser, T *val) {
int64_t i = StringToInt(s);
const int64_t min = flatbuffers::numeric_limits<T>::min();
const int64_t max = flatbuffers::numeric_limits<T>::max();
*val = (T)i; // Assign this first to make ASAN happy.
return parser.CheckInRange(i, min, max);
}
template<>
inline CheckedError atot<uint64_t>(const char *s, Parser &parser,
uint64_t *val) {
(void)parser;
*val = StringToUInt(s);
return NoError();
}
template<>
inline CheckedError atot<bool>(const char *s, Parser &parser, bool *val) {
(void)parser;
*val = 0 != atoi(s);
return NoError();
}
template<>
inline CheckedError atot<float>(const char *s, Parser &parser, float *val) {
(void)parser;
*val = static_cast<float>(strtod(s, nullptr));
return NoError();
}
template<>
inline CheckedError atot<double>(const char *s, Parser &parser, double *val) {
(void)parser;
*val = strtod(s, nullptr);
return NoError();
}
auto done = StringToNumber(s, val);
if (done) return NoError();
return parser.InvalidNumber(
s, (0 == *val)
? ""
: (", constant does not fit [" +
NumToString(flatbuffers::numeric_limits<T>::lowest()) + "; " +
NumToString(flatbuffers::numeric_limits<T>::max()) + "]"));
}
template<>
inline CheckedError atot<Offset<void>>(const char *s, Parser &parser,
Offset<void> *val) {
@@ -239,8 +204,9 @@ std::string Parser::TokenToStringId(int t) const {
// Parses exactly nibbles worth of hex digits into a number, or error.
CheckedError Parser::ParseHexNum(int nibbles, uint64_t *val) {
FLATBUFFERS_ASSERT(nibbles > 0);
for (int i = 0; i < nibbles; i++)
if (!isxdigit(static_cast<unsigned char>(cursor_[i])))
if (!is_xdigit(cursor_[i]))
return Error("escape code must be followed by " + NumToString(nibbles) +
" hex digits");
std::string target(cursor_, cursor_ + nibbles);
@@ -261,14 +227,15 @@ CheckedError Parser::SkipByteOrderMark() {
return NoError();
}
bool IsIdentifierStart(char c) {
return isalpha(static_cast<unsigned char>(c)) || c == '_';
static inline bool IsIdentifierStart(char c) {
return is_alpha(c) || (c == '_');
}
CheckedError Parser::Next() {
doc_comment_.clear();
bool seen_newline = cursor_ == source_;
attribute_.clear();
attr_is_trivial_ascii_string_ = true;
for (;;) {
char c = *cursor_++;
token_ = c;
@@ -294,10 +261,6 @@ CheckedError Parser::Next() {
case ':':
case ';':
case '=': return NoError();
case '.':
if (!isdigit(static_cast<unsigned char>(*cursor_)))
return NoError();
return Error("floating point constant can\'t start with \".\"");
case '\"':
case '\'': {
int unicode_high_surrogate = -1;
@@ -306,6 +269,7 @@ CheckedError Parser::Next() {
if (*cursor_ < ' ' && static_cast<signed char>(*cursor_) >= 0)
return Error("illegal character in string constant");
if (*cursor_ == '\\') {
attr_is_trivial_ascii_string_ = false; // has escape sequence
cursor_++;
if (unicode_high_surrogate != -1 && *cursor_ != 'u') {
return Error(
@@ -393,6 +357,9 @@ CheckedError Parser::Next() {
return Error(
"illegal Unicode sequence (unpaired high surrogate)");
}
// reset if non-printable
attr_is_trivial_ascii_string_ &= check_in_range(*cursor_, ' ', '~');
attribute_ += *cursor_++;
}
}
@@ -400,7 +367,8 @@ CheckedError Parser::Next() {
return Error("illegal Unicode sequence (unpaired high surrogate)");
}
cursor_++;
if (!opts.allow_non_utf8 && !ValidateUTF8(attribute_)) {
if (!attr_is_trivial_ascii_string_ && !opts.allow_non_utf8 &&
!ValidateUTF8(attribute_)) {
return Error("illegal UTF-8 sequence");
}
token_ = kTokenStringConstant;
@@ -430,55 +398,69 @@ CheckedError Parser::Next() {
}
// fall thru
default:
if (IsIdentifierStart(c)) {
const auto has_sign = (c == '+') || (c == '-');
// '-'/'+' and following identifier - can be a predefined constant like:
// NAN, INF, PI, etc.
if (IsIdentifierStart(c) || (has_sign && IsIdentifierStart(*cursor_))) {
// Collect all chars of an identifier:
const char *start = cursor_ - 1;
while (isalnum(static_cast<unsigned char>(*cursor_)) || *cursor_ == '_')
cursor_++;
while (IsIdentifierStart(*cursor_) || is_digit(*cursor_)) cursor_++;
attribute_.append(start, cursor_);
token_ = kTokenIdentifier;
token_ = has_sign ? kTokenStringConstant : kTokenIdentifier;
return NoError();
} else if (isdigit(static_cast<unsigned char>(c)) || c == '-') {
const char *start = cursor_ - 1;
if (c == '-' && *cursor_ == '0' &&
(cursor_[1] == 'x' || cursor_[1] == 'X')) {
++start;
++cursor_;
attribute_.append(&c, &c + 1);
c = '0';
}
auto dot_lvl = (c == '.') ? 0 : 1; // dot_lvl==0 <=> exactly one '.' seen
if (!dot_lvl && !is_digit(*cursor_)) return NoError(); // enum?
// Parser accepts hexadecimal-floating-literal (see C++ 5.13.4).
if (is_digit(c) || has_sign || !dot_lvl) {
const auto start = cursor_ - 1;
auto start_digits = !is_digit(c) ? cursor_ : cursor_ - 1;
if (!is_digit(c) && is_digit(*cursor_)){
start_digits = cursor_; // see digit in cursor_ position
c = *cursor_++;
}
if (c == '0' && (*cursor_ == 'x' || *cursor_ == 'X')) {
cursor_++;
while (isxdigit(static_cast<unsigned char>(*cursor_))) cursor_++;
attribute_.append(start + 2, cursor_);
attribute_ = NumToString(static_cast<int64_t>(
StringToUInt(attribute_.c_str(), nullptr, 16)));
token_ = kTokenIntegerConstant;
return NoError();
}
while (isdigit(static_cast<unsigned char>(*cursor_))) cursor_++;
if (*cursor_ == '.' || *cursor_ == 'e' || *cursor_ == 'E') {
if (*cursor_ == '.') {
cursor_++;
while (isdigit(static_cast<unsigned char>(*cursor_))) cursor_++;
// hex-float can't begin with '.'
auto use_hex = dot_lvl && (c == '0') && is_alpha_char(*cursor_, 'X');
if (use_hex) start_digits = ++cursor_; // '0x' is the prefix, skip it
// Read an integer number or the mantissa of a floating-point number.
do {
if (use_hex) {
while (is_xdigit(*cursor_)) cursor_++;
} else {
while (is_digit(*cursor_)) cursor_++;
}
// See if this float has a scientific notation suffix. Both JSON
// and C++ (through strtod() we use) have the same format:
if (*cursor_ == 'e' || *cursor_ == 'E') {
} while ((*cursor_ == '.') && (++cursor_) && (--dot_lvl >= 0));
// Exponent of float-point number.
if ((dot_lvl >= 0) && (cursor_ > start_digits)) {
// The exponent suffix of hexadecimal float number is mandatory.
if (use_hex && !dot_lvl) start_digits = cursor_;
if ((use_hex && is_alpha_char(*cursor_, 'P')) ||
is_alpha_char(*cursor_, 'E')) {
dot_lvl = 0; // Emulate dot to signal about float-point number.
cursor_++;
if (*cursor_ == '+' || *cursor_ == '-') cursor_++;
while (isdigit(static_cast<unsigned char>(*cursor_))) cursor_++;
start_digits = cursor_; // the exponent-part has to have digits
// Exponent is decimal integer number
while (is_digit(*cursor_)) cursor_++;
if (*cursor_ == '.') {
cursor_++; // If see a dot treat it as part of invalid number.
dot_lvl = -1; // Fall thru to Error().
}
}
token_ = kTokenFloatConstant;
} else {
token_ = kTokenIntegerConstant;
}
attribute_.append(start, cursor_);
return NoError();
// Finalize.
if ((dot_lvl >= 0) && (cursor_ > start_digits)) {
attribute_.append(start, cursor_);
token_ = dot_lvl ? kTokenIntegerConstant : kTokenFloatConstant;
return NoError();
} else {
return Error("invalid number: " + std::string(start, cursor_));
}
}
std::string ch;
ch = c;
if (c < ' ' || c > '~') ch = "code: " + NumToString(c);
if (false == check_in_range(c, ' ', '~')) ch = "code: " + NumToString(c);
return Error("illegal character: " + ch);
}
}
@@ -674,7 +656,7 @@ CheckedError Parser::ParseField(StructDef &struct_def) {
(struct_def.fixed && field->value.constant != "0"))
return Error(
"default values currently only supported for scalars in tables");
ECHECK(ParseSingleValue(&field->name, field->value));
ECHECK(ParseSingleValue(&field->name, field->value, true));
}
if (type.enum_def &&
!type.enum_def->is_union &&
@@ -684,9 +666,20 @@ CheckedError Parser::ParseField(StructDef &struct_def) {
return Error("default value of " + field->value.constant + " for field " +
name + " is not part of enum " + type.enum_def->name);
}
// Append .0 if the value does not have it (skip hex and scientific floats).
// This suffix is needed for generated C++ code.
if (IsFloat(type.base_type)) {
if (!strpbrk(field->value.constant.c_str(), ".eE"))
auto &text = field->value.constant;
FLATBUFFERS_ASSERT(false == text.empty());
auto s = text.c_str();
while(*s == ' ') s++;
if (*s == '-' || *s == '+') s++;
// 1) A float constant (nan, inf, pi, etc) is a kind of identifier.
// 2) A float number doesn't need ".0" at the end if it has an exponent.
if ((false == IsIdentifierStart(*s)) &&
(std::string::npos == field->value.constant.find_first_of(".eEpP"))) {
field->value.constant += ".0";
}
}
if (type.enum_def && IsScalar(type.base_type) && !struct_def.fixed &&
@@ -915,11 +908,13 @@ CheckedError Parser::ParseAnyValue(Value &val, FieldDef *field,
(token_ == kTokenIdentifier || token_ == kTokenStringConstant)) {
ECHECK(ParseHash(val, field));
} else {
ECHECK(ParseSingleValue(field ? &field->name : nullptr, val));
ECHECK(ParseSingleValue(field ? &field->name : nullptr, val, false));
}
break;
}
default: ECHECK(ParseSingleValue(field ? &field->name : nullptr, val)); break;
default:
ECHECK(ParseSingleValue(field ? &field->name : nullptr, val, false));
break;
}
return NoError();
}
@@ -994,7 +989,8 @@ CheckedError Parser::ParseTable(const StructDef &struct_def, std::string *value,
ECHECK(parser->SkipAnyJsonValue());
}
} else {
if (parser->IsIdent("null")) {
if (parser->IsIdent("null") &&
!IsScalar(field->value.type.base_type)) {
ECHECK(parser->Next()); // Ignore this field.
} else {
Value val = field->value;
@@ -1252,7 +1248,7 @@ CheckedError Parser::ParseMetaData(SymbolTable<Value> *attributes) {
attributes->Add(name, e);
if (Is(':')) {
NEXT();
ECHECK(ParseSingleValue(&name, *e));
ECHECK(ParseSingleValue(&name, *e, true));
}
if (Is(')')) {
NEXT();
@@ -1264,23 +1260,40 @@ CheckedError Parser::ParseMetaData(SymbolTable<Value> *attributes) {
return NoError();
}
CheckedError Parser::TryTypedValue(const std::string *name, int dtoken, bool check, Value &e,
BaseType req, bool *destmatch) {
CheckedError Parser::TryTypedValue(const std::string *name, int dtoken,
bool check, Value &e, BaseType req,
bool *destmatch) {
bool match = dtoken == token_;
if (match) {
FLATBUFFERS_ASSERT(*destmatch == false);
*destmatch = true;
e.constant = attribute_;
// Check token match
if (!check) {
if (e.type.base_type == BASE_TYPE_NONE) {
e.type.base_type = req;
} else {
return Error(std::string("type mismatch: expecting: ") +
kTypeNames[e.type.base_type] +
", found: " + kTypeNames[req] +
", name: " + (name ? *name : "") +
", value: " + e.constant);
return Error(
std::string("type mismatch: expecting: ") +
kTypeNames[e.type.base_type] + ", found: " + kTypeNames[req] +
", name: " + (name ? *name : "") + ", value: " + e.constant);
}
}
// The exponent suffix of hexadecimal float-point number is mandatory.
// A hex-integer constant is forbidden as an initializer of float number.
if ((kTokenFloatConstant != dtoken) && IsFloat(e.type.base_type)) {
const auto &s = e.constant;
const auto k = s.find_first_of("0123456789.");
if ((std::string::npos != k) && (s.length() > (k + 1)) &&
(s.at(k) == '0' && is_alpha_char(s.at(k + 1), 'X')) &&
(std::string::npos == s.find_first_of("pP", k + 2))) {
return Error(
"invalid number, the exponent suffix of hexadecimal "
"floating-point literals is mandatory: \"" +
s + "\"");
}
}
NEXT();
}
return NoError();
@@ -1375,20 +1388,29 @@ CheckedError Parser::TokenError() {
return Error("cannot parse value starting with: " + TokenToStringId(token_));
}
CheckedError Parser::ParseSingleValue(const std::string *name, Value &e) {
CheckedError Parser::ParseSingleValue(const std::string *name, Value &e,
bool check_now) {
// First see if this could be a conversion function:
if (token_ == kTokenIdentifier && *cursor_ == '(') {
auto functionname = attribute_;
// todo: Extract processing of conversion functions to ParseFunction.
const auto functionname = attribute_;
if (!IsFloat(e.type.base_type)) {
return Error(functionname + ": type of argument mismatch, expecting: " +
kTypeNames[BASE_TYPE_DOUBLE] +
", found: " + kTypeNames[e.type.base_type] +
", name: " + (name ? *name : "") + ", value: " + e.constant);
}
NEXT();
EXPECT('(');
ECHECK(ParseSingleValue(name, e));
ECHECK(Recurse([&]() { return ParseSingleValue(name, e, false); }));
EXPECT(')');
// calculate with double precision
double x, y = 0.0;
ECHECK(atot(e.constant.c_str(), *this, &x));
auto func_match = false;
// clang-format off
#define FLATBUFFERS_FN_DOUBLE(name, op) \
if (functionname == name) { \
auto x = strtod(e.constant.c_str(), nullptr); \
e.constant = NumToString(op); \
}
if (!func_match && functionname == name) { y = op; func_match = true; }
FLATBUFFERS_FN_DOUBLE("deg", x / kPi * 180);
FLATBUFFERS_FN_DOUBLE("rad", x * kPi / 180);
FLATBUFFERS_FN_DOUBLE("sin", sin(x));
@@ -1400,47 +1422,108 @@ CheckedError Parser::ParseSingleValue(const std::string *name, Value &e) {
// TODO(wvo): add more useful conversion functions here.
#undef FLATBUFFERS_FN_DOUBLE
// clang-format on
// Then check if this could be a string/identifier enum value:
} else if (e.type.base_type != BASE_TYPE_STRING &&
e.type.base_type != BASE_TYPE_BOOL &&
e.type.base_type != BASE_TYPE_NONE &&
(token_ == kTokenIdentifier || token_ == kTokenStringConstant)) {
if (IsIdentifierStart(attribute_[0])) { // Enum value.
if (true != func_match) {
return Error(std::string("Unknown conversion function: ") + functionname +
", field name: " + (name ? *name : "") +
", value: " + e.constant);
}
e.constant = NumToString(y);
return NoError();
}
auto match = false;
// clang-format off
#define TRY_ECHECK(force, dtoken, check, req) \
if (!match && ((check) || IsConstTrue(force))) \
ECHECK(TryTypedValue(name, dtoken, check, e, req, &match))
// clang-format on
if (token_ == kTokenStringConstant || token_ == kTokenIdentifier) {
const auto kTokenStringOrIdent = token_;
// The string type is a most probable type, check it first.
TRY_ECHECK(false, kTokenStringConstant,
e.type.base_type == BASE_TYPE_STRING, BASE_TYPE_STRING);
// avoid escaped and non-ascii in the string
if ((token_ == kTokenStringConstant) && IsScalar(e.type.base_type) &&
!attr_is_trivial_ascii_string_) {
return Error(
std::string("type mismatch or invalid value, an initializer of "
"non-string field must be trivial ASCII string: type: ") +
kTypeNames[e.type.base_type] + ", name: " + (name ? *name : "") +
", value: " + attribute_);
}
// A boolean as true/false. Boolean as Integer check below.
if (!match && IsBool(e.type.base_type)) {
auto is_true = attribute_ == "true";
if (is_true || attribute_ == "false") {
attribute_ = is_true ? "1" : "0";
// accepts both kTokenStringConstant and kTokenIdentifier
TRY_ECHECK(false, kTokenStringOrIdent, IsBool(e.type.base_type),
BASE_TYPE_BOOL);
}
}
// Check if this could be a string/identifier enum value.
// Enum can have only true integer base type.
if (!match && IsInteger(e.type.base_type) && !IsBool(e.type.base_type) &&
IsIdentifierStart(*attribute_.c_str())) {
int64_t val;
ECHECK(ParseEnumFromString(e.type, &val));
e.constant = NumToString(val);
NEXT();
} else { // Numeric constant in string.
if (IsInteger(e.type.base_type)) {
char *end;
e.constant = NumToString(StringToInt(attribute_.c_str(), &end));
if (*end) return Error("invalid integer: " + attribute_);
} else if (IsFloat(e.type.base_type)) {
char *end;
e.constant = NumToString(strtod(attribute_.c_str(), &end));
if (*end) return Error("invalid float: " + attribute_);
} else {
FLATBUFFERS_ASSERT(0); // Shouldn't happen, we covered all types.
e.constant = "0";
}
NEXT();
match = true;
}
// float/integer number in string
if ((token_ == kTokenStringConstant) && IsScalar(e.type.base_type)) {
// remove trailing whitespaces from attribute_
auto last = attribute_.find_last_not_of(' ');
if (std::string::npos != last) // has non-whitespace
attribute_.resize(last + 1);
}
// Float numbers or nan, inf, pi, etc.
TRY_ECHECK(false, kTokenStringOrIdent, IsFloat(e.type.base_type),
BASE_TYPE_FLOAT);
// An integer constant in string.
TRY_ECHECK(false, kTokenStringOrIdent, IsInteger(e.type.base_type),
BASE_TYPE_INT);
// Unknown tokens will be interpreted as string type.
TRY_ECHECK(true, kTokenStringConstant, e.type.base_type == BASE_TYPE_STRING,
BASE_TYPE_STRING);
} else {
bool match = false;
ECHECK(TryTypedValue(name, kTokenIntegerConstant, IsScalar(e.type.base_type), e,
BASE_TYPE_INT, &match));
ECHECK(TryTypedValue(name, kTokenFloatConstant, IsFloat(e.type.base_type), e,
BASE_TYPE_FLOAT, &match));
ECHECK(TryTypedValue(name, kTokenStringConstant,
e.type.base_type == BASE_TYPE_STRING, e,
BASE_TYPE_STRING, &match));
auto istrue = IsIdent("true");
if (istrue || IsIdent("false")) {
attribute_ = NumToString(istrue);
ECHECK(TryTypedValue(name, kTokenIdentifier, IsBool(e.type.base_type), e,
BASE_TYPE_BOOL, &match));
// Try a float number.
TRY_ECHECK(false, kTokenFloatConstant, IsFloat(e.type.base_type),
BASE_TYPE_FLOAT);
// Integer token can init any scalar (integer or float).
TRY_ECHECK(true, kTokenIntegerConstant, IsScalar(e.type.base_type),
BASE_TYPE_INT);
}
#undef TRY_ECHECK
if (!match) return TokenError();
// The check_now flag must be true when parsing a fbs-schema.
// This flag forces a check of default scalar values or metadata of a field.
// For the JSON parser the flag should be false.
// If it is set for JSON, each value will be checked twice (see ParseTable).
if (check_now && IsScalar(e.type.base_type)) {
// "re-pack" an integer scalar to remove any ambiguities like leading zeros
// which can be treated as octal-literal (idl_gen_cpp/GenDefaultConstant).
const auto repack = IsInteger(e.type.base_type);
switch (e.type.base_type) {
// clang-format off
#define FLATBUFFERS_TD(ENUM, IDLTYPE, \
CTYPE, JTYPE, GTYPE, NTYPE, PTYPE, RTYPE) \
case BASE_TYPE_ ## ENUM: {\
CTYPE val; \
ECHECK(atot(e.constant.c_str(), *this, &val)); \
if(repack) e.constant = NumToString(val); \
break; }
FLATBUFFERS_GEN_TYPES_SCALAR(FLATBUFFERS_TD);
#undef FLATBUFFERS_TD
default: break;
// clang-format on
}
if (!match) return TokenError();
}
return NoError();
}
@@ -1565,7 +1648,7 @@ CheckedError Parser::ParseEnum(bool is_union, EnumDef **dest) {
}
if (Is('=')) {
NEXT();
ev.value = StringToInt(attribute_.c_str());
ECHECK(atot(attribute_.c_str(), *this, &ev.value));
EXPECT(kTokenIntegerConstant);
if (!opts.proto_mode && prevsize &&
enum_def->vals.vec[prevsize - 1]->value >= ev.value)
@@ -2451,6 +2534,9 @@ CheckedError Parser::DoParse(const char *source, const char **include_paths,
? file_identifier_.c_str()
: nullptr);
}
// Check that JSON file doesn't contain more objects or IDL directives.
// Comments after JSON are allowed.
EXPECT(kTokenEof);
} else if (IsIdent("enum")) {
ECHECK(ParseEnum(false, nullptr));
} else if (IsIdent("union")) {
@@ -2606,7 +2692,9 @@ Offset<reflection::Field> FieldDef::Serialize(FlatBufferBuilder *builder,
return reflection::CreateField(
*builder, builder->CreateString(name), value.type.Serialize(builder), id,
value.offset,
// Is uint64>max(int64) tested?
IsInteger(value.type.base_type) ? StringToInt(value.constant.c_str()) : 0,
// result may be platform-dependent if underlying is float (not double)
IsFloat(value.type.base_type) ? strtod(value.constant.c_str(), nullptr)
: 0.0,
deprecated, required, key, SerializeAttributes(builder, parser),