base/json/json_reader.cc - Issue 10035042: Rewrite base::JSONReader to be 35-40% faster, depending on the input string.

Unified Diff: base/json/json_reader.cc

Issue 10035042: Rewrite base::JSONReader to be 35-40% faster, depending on the input string. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Really fix Windows, address comments. Created 8 years, 7 months ago.

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: base/json/json_reader.cc

diff --git a/base/json/json_reader.cc b/base/json/json_reader.cc

index 34574788671043c836dc88746810634de6719c32..fb1459b4ceb33aa42225c431888c8f2767a5dc53 100644

--- a/base/json/json_reader.cc

+++ b/base/json/json_reader.cc

@@ -4,73 +4,17 @@

#include "base/json/json_reader.h"

-#include "base/float_util.h"

+#include "base/json/json_parser.h"

#include "base/logging.h"

-#include "base/memory/scoped_ptr.h"

-#include "base/stringprintf.h"

-#include "base/string_number_conversions.h"

-#include "base/string_piece.h"

-#include "base/string_util.h"

-#include "base/third_party/icu/icu_utf.h"

-#include "base/utf_string_conversions.h"

-#include "base/values.h"

-namespace {

-const char kNullString[] = "null";

-const char kTrueString[] = "true";

-const char kFalseString[] = "false";

-const int kStackLimit = 100;

-// A helper method for ParseNumberToken. It reads an int from the end of

-// token. The method returns false if there is no valid integer at the end of

-// the token.

-bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {

- char first = token.NextChar();

- int len = 0;

- // Read in more digits.

- char c = first;

- while ('\0' != c && IsAsciiDigit(c)) {

- ++token.length;

- ++len;

- c = token.NextChar();

- }

- // We need at least 1 digit.

- if (len == 0)

- return false;

- if (!can_have_leading_zeros && len > 1 && '0' == first)

- return false;

- return true;

-// A helper method for ParseStringToken. It reads |digits| hex digits from the

-// token. If the sequence of digits is not valid (contains other characters),

-// the method returns false.

-bool ReadHexDigits(base::JSONReader::Token& token, int digits) {

- for (int i = 1; i <= digits; ++i) {

- char c = *(token.begin + token.length + i);

- if (c == '\0' || !IsHexDigit(c))

- return false;

- }

- token.length += digits;

- return true;

-} // namespace

namespace base {

-const char* JSONReader::kBadRootElementType =

- "Root value must be an array or object.";

const char* JSONReader::kInvalidEscape =

"Invalid escape sequence.";

const char* JSONReader::kSyntaxError =

"Syntax error.";

+const char* JSONReader::kUnexpectedToken =

+ "Unexpected token.";

const char* JSONReader::kTrailingComma =

"Trailing comma not allowed.";

const char* JSONReader::kTooMuchNesting =

@@ -83,24 +27,27 @@ const char* JSONReader::kUnquotedDictionaryKey =

"Dictionary keys must be quoted.";

JSONReader::JSONReader()

- : start_pos_(NULL),

- json_pos_(NULL),

- end_pos_(NULL),

- stack_depth_(0),

- allow_trailing_comma_(false),

- error_code_(JSON_NO_ERROR),

- error_line_(0),

- error_col_(0) {}

+ : parser_(new internal::JSONParser(JSON_PARSE_RFC)) {

+JSONReader::JSONReader(int options)

+ : parser_(new internal::JSONParser(options)) {

+JSONReader::~JSONReader() {

// static

Value* JSONReader::Read(const std::string& json) {

- return Read(json, JSON_PARSE_RFC);

+ internal::JSONParser parser(JSON_PARSE_RFC);

+ return parser.Parse(json);

}

// static

Value* JSONReader::Read(const std::string& json,

int options) {

- return ReadAndReturnError(json, options, NULL, NULL);

+ internal::JSONParser parser(options);

+ return parser.Parse(json);

}

// static

@@ -108,16 +55,15 @@ Value* JSONReader::ReadAndReturnError(const std::string& json,

int options,

int* error_code_out,

std::string* error_msg_out) {

- JSONReader reader = JSONReader();

- Value* root = reader.JsonToValue(json, false,

- (options & JSON_ALLOW_TRAILING_COMMAS) != 0);

+ internal::JSONParser parser(options);

+ Value* root = parser.Parse(json);

if (root)

return root;

if (error_code_out)

- *error_code_out = reader.error_code();

+ *error_code_out = parser.error_code();

if (error_msg_out)

- *error_msg_out = reader.GetErrorMessage();

+ *error_msg_out = parser.GetErrorMessage();

return NULL;

}

@@ -127,12 +73,12 @@ std::string JSONReader::ErrorCodeToString(JsonParseError error_code) {

switch (error_code) {

case JSON_NO_ERROR:

return std::string();

- case JSON_BAD_ROOT_ELEMENT_TYPE:

- return kBadRootElementType;

case JSON_INVALID_ESCAPE:

return kInvalidEscape;

case JSON_SYNTAX_ERROR:

return kSyntaxError;

+ case JSON_UNEXPECTED_TOKEN:

+ return kUnexpectedToken;

case JSON_TRAILING_COMMA:

return kTrailingComma;

case JSON_TOO_MUCH_NESTING:

@@ -149,586 +95,16 @@ std::string JSONReader::ErrorCodeToString(JsonParseError error_code) {

}

-std::string JSONReader::GetErrorMessage() const {

- return FormatErrorMessage(error_line_, error_col_,

- ErrorCodeToString(error_code_));

-Value* JSONReader::JsonToValue(const std::string& json, bool check_root,

- bool allow_trailing_comma) {

- // The input must be in UTF-8.

- if (!IsStringUTF8(json.data())) {

- error_code_ = JSON_UNSUPPORTED_ENCODING;

- return NULL;

- }

- start_pos_ = json.data();

- end_pos_ = start_pos_ + json.size();

- // When the input JSON string starts with a UTF-8 Byte-Order-Mark (U+FEFF)

- // or <0xEF 0xBB 0xBF>, advance the start position to avoid the

- // JSONReader::BuildValue() function from mis-treating a Unicode BOM as an

- // invalid character and returning NULL.

- if (json.size() >= 3 && static_cast<uint8>(start_pos_[0]) == 0xEF &&

- static_cast<uint8>(start_pos_[1]) == 0xBB &&

- static_cast<uint8>(start_pos_[2]) == 0xBF) {

- start_pos_ += 3;

- }

- json_pos_ = start_pos_;

- allow_trailing_comma_ = allow_trailing_comma;

- stack_depth_ = 0;

- error_code_ = JSON_NO_ERROR;

- scoped_ptr<Value> root(BuildValue(check_root));

- if (root.get()) {

- if (ParseToken().type == Token::END_OF_INPUT) {

- return root.release();

- } else {

- SetErrorCode(JSON_UNEXPECTED_DATA_AFTER_ROOT, json_pos_);

- }

- // Default to calling errors "syntax errors".

- if (error_code_ == 0)

- SetErrorCode(JSON_SYNTAX_ERROR, json_pos_);

- return NULL;

-// static

-std::string JSONReader::FormatErrorMessage(int line, int column,

- const std::string& description) {

- if (line || column) {

- return base::StringPrintf(

- "Line: %i, column: %i, %s", line, column, description.c_str());

- }

- return description;

+Value* JSONReader::ReadToValue(const std::string& json) {

+ return parser_->Parse(json);

}

-Value* JSONReader::BuildValue(bool is_root) {

- ++stack_depth_;

- if (stack_depth_ > kStackLimit) {

- SetErrorCode(JSON_TOO_MUCH_NESTING, json_pos_);

- return NULL;

- }

- Token token = ParseToken();

- // The root token must be an array or an object.

- if (is_root && token.type != Token::OBJECT_BEGIN &&

- token.type != Token::ARRAY_BEGIN) {

- SetErrorCode(JSON_BAD_ROOT_ELEMENT_TYPE, json_pos_);

- return NULL;

- }

- scoped_ptr<Value> node;

- switch (token.type) {

- case Token::END_OF_INPUT:

- case Token::INVALID_TOKEN:

- return NULL;

- case Token::NULL_TOKEN:

- node.reset(Value::CreateNullValue());

- break;

- case Token::BOOL_TRUE:

- node.reset(Value::CreateBooleanValue(true));

- break;

- case Token::BOOL_FALSE:

- node.reset(Value::CreateBooleanValue(false));

- break;

- case Token::NUMBER:

- node.reset(DecodeNumber(token));

- if (!node.get())

- return NULL;

- break;

- case Token::STRING:

- node.reset(DecodeString(token));

- if (!node.get())

- return NULL;

- break;

- case Token::ARRAY_BEGIN:

- {

- json_pos_ += token.length;

- token = ParseToken();

- node.reset(new ListValue());

- while (token.type != Token::ARRAY_END) {

- Value* array_node = BuildValue(false);

- if (!array_node)

- return NULL;

- static_cast<ListValue*>(node.get())->Append(array_node);

- // After a list value, we expect a comma or the end of the list.

- token = ParseToken();

- if (token.type == Token::LIST_SEPARATOR) {

- json_pos_ += token.length;

- token = ParseToken();

- // Trailing commas are invalid according to the JSON RFC, but some

- // consumers need the parsing leniency, so handle accordingly.

- if (token.type == Token::ARRAY_END) {

- if (!allow_trailing_comma_) {

- SetErrorCode(JSON_TRAILING_COMMA, json_pos_);

- return NULL;

- }

- // Trailing comma OK, stop parsing the Array.

- break;

- }

- } else if (token.type != Token::ARRAY_END) {

- // Unexpected value after list value. Bail out.

- return NULL;

- }

- if (token.type != Token::ARRAY_END) {

- return NULL;

- }

- break;

- }

- case Token::OBJECT_BEGIN:

- {

- json_pos_ += token.length;

- token = ParseToken();

- node.reset(new DictionaryValue);

- while (token.type != Token::OBJECT_END) {

- if (token.type != Token::STRING) {

- SetErrorCode(JSON_UNQUOTED_DICTIONARY_KEY, json_pos_);

- return NULL;

- }

- scoped_ptr<Value> dict_key_value(DecodeString(token));

- if (!dict_key_value.get())

- return NULL;

- // Convert the key into a std::string.

- std::string dict_key;

- bool success = dict_key_value->GetAsString(&dict_key);

- DCHECK(success);

- json_pos_ += token.length;

- token = ParseToken();

- if (token.type != Token::OBJECT_PAIR_SEPARATOR)

- return NULL;

- json_pos_ += token.length;

- token = ParseToken();

- Value* dict_value = BuildValue(false);

- if (!dict_value)

- return NULL;

- static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion(

- dict_key, dict_value);

- // After a key/value pair, we expect a comma or the end of the

- // object.

- token = ParseToken();

- if (token.type == Token::LIST_SEPARATOR) {

- json_pos_ += token.length;

- token = ParseToken();

- // Trailing commas are invalid according to the JSON RFC, but some

- // consumers need the parsing leniency, so handle accordingly.

- if (token.type == Token::OBJECT_END) {

- if (!allow_trailing_comma_) {

- SetErrorCode(JSON_TRAILING_COMMA, json_pos_);

- return NULL;

- }

- // Trailing comma OK, stop parsing the Object.

- break;

- }

- } else if (token.type != Token::OBJECT_END) {

- // Unexpected value after last object value. Bail out.

- return NULL;

- }

- if (token.type != Token::OBJECT_END)

- return NULL;

- break;

- }

- default:

- // We got a token that's not a value.

- return NULL;

- }

- json_pos_ += token.length;

- --stack_depth_;

- return node.release();

+JSONReader::JsonParseError JSONReader::error_code() const {

+ return parser_->error_code();

}

-JSONReader::Token JSONReader::ParseNumberToken() {

- // We just grab the number here. We validate the size in DecodeNumber.

- // According to RFC4627, a valid number is: [minus] int [frac] [exp]

- Token token(Token::NUMBER, json_pos_, 0);

- char c = *json_pos_;

- if ('-' == c) {

- ++token.length;

- c = token.NextChar();

- }

- if (!ReadInt(token, false))

- return Token::CreateInvalidToken();

- // Optional fraction part

- c = token.NextChar();

- if ('.' == c) {

- ++token.length;

- if (!ReadInt(token, true))

- return Token::CreateInvalidToken();

- c = token.NextChar();

- }

- // Optional exponent part

- if ('e' == c || 'E' == c) {

- ++token.length;

- c = token.NextChar();

- if ('-' == c || '+' == c) {

- ++token.length;

- c = token.NextChar();

- }

- if (!ReadInt(token, true))

- return Token::CreateInvalidToken();

- }

- return token;

-Value* JSONReader::DecodeNumber(const Token& token) {

- const std::string num_string(token.begin, token.length);

- int num_int;

- if (StringToInt(num_string, &num_int))

- return Value::CreateIntegerValue(num_int);

- double num_double;

- if (StringToDouble(num_string, &num_double) && base::IsFinite(num_double))

- return Value::CreateDoubleValue(num_double);

- return NULL;

-JSONReader::Token JSONReader::ParseStringToken() {

- Token token(Token::STRING, json_pos_, 1);

- char c = token.NextChar();

- while (json_pos_ + token.length < end_pos_) {

- if ('\\' == c) {

- ++token.length;

- c = token.NextChar();

- // Make sure the escaped char is valid.

- switch (c) {

- case 'x':

- if (!ReadHexDigits(token, 2)) {

- SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);

- return Token::CreateInvalidToken();

- }

- break;

- case 'u':

- if (!ReadHexDigits(token, 4)) {

- SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);

- return Token::CreateInvalidToken();

- }

- break;

- case '\\':

- case '/':

- case 'b':

- case 'f':

- case 'n':

- case 'r':

- case 't':

- case 'v':

- case '"':

- break;

- default:

- SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);

- return Token::CreateInvalidToken();

- }

- } else if ('"' == c) {

- ++token.length;

- return token;

- }

- ++token.length;

- c = token.NextChar();

- }

- return Token::CreateInvalidToken();

-Value* JSONReader::DecodeString(const Token& token) {

- std::string decoded_str;

- decoded_str.reserve(token.length - 2);

- for (int i = 1; i < token.length - 1; ++i) {

- char c = *(token.begin + i);

- if ('\\' == c) {

- ++i;

- c = *(token.begin + i);

- switch (c) {

- case '"':

- case '/':

- case '\\':

- decoded_str.push_back(c);

- break;

- case 'b':

- decoded_str.push_back('\b');

- break;

- case 'f':

- decoded_str.push_back('\f');

- break;

- case 'n':

- decoded_str.push_back('\n');

- break;

- case 'r':

- decoded_str.push_back('\r');

- break;

- case 't':

- decoded_str.push_back('\t');

- break;

- case 'v':

- decoded_str.push_back('\v');

- break;

- case 'x': {

- if (i + 2 >= token.length)

- return NULL;

- int hex_digit = 0;

- if (!HexStringToInt(StringPiece(token.begin + i + 1, 2), &hex_digit))

- return NULL;

- decoded_str.push_back(hex_digit);

- i += 2;

- break;

- }

- case 'u':

- if (!ConvertUTF16Units(token, &i, &decoded_str))

- return NULL;

- break;

- default:

- // We should only have valid strings at this point. If not,

- // ParseStringToken didn't do its job.

- NOTREACHED();

- return NULL;

- }

- } else {

- // Not escaped

- decoded_str.push_back(c);

- }

- return Value::CreateStringValue(decoded_str);

-bool JSONReader::ConvertUTF16Units(const Token& token,

- int* i,

- std::string* dest_string) {

- if (*i + 4 >= token.length)

- return false;

- // This is a 32-bit field because the shift operations in the

- // conversion process below cause MSVC to error about "data loss."

- // This only stores UTF-16 code units, though.

- // Consume the UTF-16 code unit, which may be a high surrogate.

- int code_unit16_high = 0;

- if (!HexStringToInt(StringPiece(token.begin + *i + 1, 4), &code_unit16_high))

- return false;

- *i += 4;

- // If this is a high surrogate, consume the next code unit to get the

- // low surrogate.

- int code_unit16_low = 0;

- if (CBU16_IS_SURROGATE(code_unit16_high)) {

- // Make sure this is the high surrogate. If not, it's an encoding

- // error.

- if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))

- return false;

- // Make sure that the token has more characters to consume the

- // lower surrogate.

- if (*i + 6 >= token.length)

- return false;

- if (*(++(*i) + token.begin) != '\\' || *(++(*i) + token.begin) != 'u')

- return false;

- if (!HexStringToInt(StringPiece(token.begin + *i + 1, 4), &code_unit16_low))

- return false;

- *i += 4;

- if (!CBU16_IS_SURROGATE(code_unit16_low) ||

- !CBU16_IS_TRAIL(code_unit16_low)) {

- return false;

- }

- } else if (!CBU16_IS_SINGLE(code_unit16_high)) {

- // If this is not a code point, it's an encoding error.

- return false;

- }

- // Convert the UTF-16 code units to a code point and then to a UTF-8

- // code unit sequence.

- char code_point[8] = { 0 };

- size_t offset = 0;

- if (!code_unit16_low) {

- CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high);

- } else {

- uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high,

- code_unit16_low);

- offset = 0;

- CBU8_APPEND_UNSAFE(code_point, offset, code_unit32);

- }

- dest_string->append(code_point);

- return true;

-JSONReader::Token JSONReader::ParseToken() {

- EatWhitespaceAndComments();

- Token token(Token::INVALID_TOKEN, 0, 0);

- switch (*json_pos_) {

- case '\0':

- token.type = Token::END_OF_INPUT;

- break;

- case 'n':

- if (NextStringMatch(kNullString, arraysize(kNullString) - 1))

- token = Token(Token::NULL_TOKEN, json_pos_, 4);

- break;

- case 't':

- if (NextStringMatch(kTrueString, arraysize(kTrueString) - 1))

- token = Token(Token::BOOL_TRUE, json_pos_, 4);

- break;

- case 'f':

- if (NextStringMatch(kFalseString, arraysize(kFalseString) - 1))

- token = Token(Token::BOOL_FALSE, json_pos_, 5);

- break;

- case '[':

- token = Token(Token::ARRAY_BEGIN, json_pos_, 1);

- break;

- case ']':

- token = Token(Token::ARRAY_END, json_pos_, 1);

- break;

- case ',':

- token = Token(Token::LIST_SEPARATOR, json_pos_, 1);

- break;

- case '{':

- token = Token(Token::OBJECT_BEGIN, json_pos_, 1);

- break;

- case '}':

- token = Token(Token::OBJECT_END, json_pos_, 1);

- break;

- case ':':

- token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1);

- break;

- case '0':

- case '1':

- case '2':

- case '3':

- case '4':

- case '5':

- case '6':

- case '7':

- case '8':

- case '9':

- case '-':

- token = ParseNumberToken();

- break;

- case '"':

- token = ParseStringToken();

- break;

- }

- return token;

-void JSONReader::EatWhitespaceAndComments() {

- while (json_pos_ != end_pos_) {

- switch (*json_pos_) {

- case ' ':

- case '\n':

- case '\r':

- case '\t':

- ++json_pos_;

- break;

- case '/':

- // TODO(tc): This isn't in the RFC so it should be a parser flag.

- if (!EatComment())

- return;

- break;

- default:

- // Not a whitespace char, just exit.

- return;

- }

-bool JSONReader::EatComment() {

- if ('/' != *json_pos_)

- return false;

- char next_char = *(json_pos_ + 1);

- if ('/' == next_char) {

- // Line comment, read until \n or \r

- json_pos_ += 2;

- while (json_pos_ != end_pos_) {

- switch (*json_pos_) {

- case '\n':

- case '\r':

- ++json_pos_;

- return true;

- default:

- ++json_pos_;

- }

- } else if ('*' == next_char) {

- // Block comment, read until */

- json_pos_ += 2;

- while (json_pos_ != end_pos_) {

- if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {

- json_pos_ += 2;

- return true;

- }

- ++json_pos_;

- }

- } else {

- return false;

- }

- return true;

-bool JSONReader::NextStringMatch(const char* str, size_t length) {

- return strncmp(json_pos_, str, length) == 0;

-void JSONReader::SetErrorCode(JsonParseError error,

- const char* error_pos) {

- int line_number = 1;

- int column_number = 1;

- // Figure out the line and column the error occurred at.

- for (const char* pos = start_pos_; pos != error_pos; ++pos) {

- if (pos > end_pos_) {

- NOTREACHED();

- return;

- }

- if (*pos == '\n') {

- ++line_number;

- column_number = 1;

- } else {

- ++column_number;

- }

- error_line_ = line_number;

- error_col_ = column_number;

- error_code_ = error;

+std::string JSONReader::GetErrorMessage() const {

+ return parser_->GetErrorMessage();

}

} // namespace base

« no previous file with comments | « base/json/json_reader.h ('k') | base/json/json_reader_unittest.cc » ('j') | no next file with comments »