base/json/json_reader.cc - Issue 9801007: Improve JSONReader performance by up to 55% by using std::string instead of wstring.

Unified Diff: base/json/json_reader.cc

Issue 9801007: Improve JSONReader performance by up to 55% by using std::string instead of wstring. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Pass Windows tests Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: base/json/json_reader.cc

diff --git a/base/json/json_reader.cc b/base/json/json_reader.cc

index bbaf5fb349bcf396d9fa26f6ba0269faac0d9315..e428ff2b6a37664d752c67178506470fbdccdb6f 100644

--- a/base/json/json_reader.cc

+++ b/base/json/json_reader.cc

@@ -10,14 +10,15 @@

#include "base/stringprintf.h"

#include "base/string_number_conversions.h"

#include "base/string_util.h"

+#include "base/third_party/icu/icu_utf.h"

#include "base/utf_string_conversions.h"

#include "base/values.h"

namespace {

-const wchar_t kNullString[] = L"null";

-const wchar_t kTrueString[] = L"true";

-const wchar_t kFalseString[] = L"false";

+const char kNullString[] = "null";

+const char kTrueString[] = "true";

+const char kFalseString[] = "false";

const int kStackLimit = 100;

@@ -25,11 +26,11 @@ const int kStackLimit = 100;

// token. The method returns false if there is no valid integer at the end of

// the token.

bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {

- wchar_t first = token.NextChar();

+ char first = token.NextChar();

int len = 0;

// Read in more digits.

- wchar_t c = first;

+ char c = first;

while ('\0' != c && IsAsciiDigit(c)) {

++token.length;

++len;

@@ -50,7 +51,7 @@ bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {

// the method returns false.

bool ReadHexDigits(base::JSONReader::Token& token, int digits) {

for (int i = 1; i <= digits; ++i) {

- wchar_t c = *(token.begin + token.length + i);

+ char c = *(token.begin + token.length + i);

if (c == '\0' || !IsHexDigit(c))

return false;

}

@@ -83,6 +84,7 @@ const char* JSONReader::kUnquotedDictionaryKey =

JSONReader::JSONReader()

: start_pos_(NULL),

json_pos_(NULL),

+ end_pos_(NULL),

stack_depth_(0),

allow_trailing_comma_(false),

error_code_(JSON_NO_ERROR),

@@ -148,23 +150,21 @@ std::string JSONReader::GetErrorMessage() const {

Value* JSONReader::JsonToValue(const std::string& json, bool check_root,

bool allow_trailing_comma) {

// The input must be in UTF-8.

- if (!IsStringUTF8(json.c_str())) {

+ if (!IsStringUTF8(json.data())) {

error_code_ = JSON_UNSUPPORTED_ENCODING;

return NULL;

}

- // The conversion from UTF8 to wstring removes null bytes for us

- // (a good thing).

- std::wstring json_wide(UTF8ToWide(json));

- start_pos_ = json_wide.c_str();

- // When the input JSON string starts with a UTF-8 Byte-Order-Mark

- // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode

- // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from

- // mis-treating a Unicode BOM as an invalid character and returning NULL,

- // skip a converted Unicode BOM if it exists.

- if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {

- ++start_pos_;

+ start_pos_ = json.data();

+ end_pos_ = start_pos_ + json.size();

+ // When the input JSON string starts with a UTF-8 Byte-Order-Mark (U+FEFF,

+ // encoded as the bytes <0xEF 0xBB 0xBF>), advance the start position so

+ // that JSONReader::BuildValue() does not mistake the BOM for an invalid

+ // character and return NULL.

+ if (json.size() >= 3 && start_pos_[0] == 0xEF &&

+ start_pos_[1] == 0xBB && start_pos_[2] == 0xBF) {

+ start_pos_ += 3;

}

json_pos_ = start_pos_;

@@ -356,7 +356,7 @@ JSONReader::Token JSONReader::ParseNumberToken() {

// We just grab the number here. We validate the size in DecodeNumber.

// According to RFC4627, a valid number is: [minus] int [frac] [exp]

Token token(Token::NUMBER, json_pos_, 0);

- wchar_t c = *json_pos_;

+ char c = *json_pos_;

if ('-' == c) {

++token.length;

c = token.NextChar();

@@ -390,15 +390,14 @@ JSONReader::Token JSONReader::ParseNumberToken() {

}

Value* JSONReader::DecodeNumber(const Token& token) {

- const std::wstring num_string(token.begin, token.length);

+ const std::string num_string(token.begin, token.length);

int num_int;

- if (StringToInt(WideToUTF8(num_string), &num_int))

+ if (StringToInt(num_string, &num_int))

return Value::CreateIntegerValue(num_int);

double num_double;

- if (StringToDouble(WideToUTF8(num_string), &num_double) &&

- base::IsFinite(num_double))

+ if (StringToDouble(num_string, &num_double) && base::IsFinite(num_double))

return Value::CreateDoubleValue(num_double);

return NULL;

@@ -406,8 +405,8 @@ Value* JSONReader::DecodeNumber(const Token& token) {

JSONReader::Token JSONReader::ParseStringToken() {

Token token(Token::STRING, json_pos_, 1);

- wchar_t c = token.NextChar();

- while ('\0' != c) {

+ char c = token.NextChar();

+ while (json_pos_ + token.length < end_pos_) {

if ('\\' == c) {

++token.length;

c = token.NextChar();

@@ -450,11 +449,11 @@ JSONReader::Token JSONReader::ParseStringToken() {

}

Value* JSONReader::DecodeString(const Token& token) {

- std::wstring decoded_str;

+ std::string decoded_str;

decoded_str.reserve(token.length - 2);

for (int i = 1; i < token.length - 1; ++i) {

- wchar_t c = *(token.begin + i);

+ char c = *(token.begin + i);

if ('\\' == c) {

++i;

c = *(token.begin + i);

@@ -488,13 +487,66 @@ Value* JSONReader::DecodeString(const Token& token) {

HexDigitToInt(*(token.begin + i + 2)));

i += 2;

break;

- case 'u':

- decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) +

- (HexDigitToInt(*(token.begin + i + 2)) << 8) +

- (HexDigitToInt(*(token.begin + i + 3)) << 4) +

- HexDigitToInt(*(token.begin + i + 4)));

+ case 'u': {

+ // Consume the UTF-16 code unit, which may be a high surrogate.

brettw 2012/03/22 05:51:42 This case is getting kind of crazy long, can you s

Robert Sesek 2012/03/22 17:59:57 Done.

+ if (i + 4 >= token.length)

+ return NULL;

+ uint32 code_unit16_high =

+ (HexDigitToInt(*(token.begin + i + 1)) << 12) +

+ (HexDigitToInt(*(token.begin + i + 2)) << 8) +

+ (HexDigitToInt(*(token.begin + i + 3)) << 4) +

+ HexDigitToInt(*(token.begin + i + 4));

i += 4;

+ // If this is a high surrogate, consume the next code unit to get the

+ // low surrogate.

+ // This is a 32-bit field because the shift operations in the

+ // conversion process below cause MSVC to error about "data loss."

+ // This only stores UTF-16 code units, though.

+ uint32 code_unit16_low = 0;

+ if (CBU16_IS_SURROGATE(code_unit16_high)) {

+ // Make sure this is the high surrogate. If not, it's an encoding

+ // error.

+ if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))

+ return NULL;

brettw 2012/03/22 05:51:42 You changed the behavior of this function by faili

Robert Sesek 2012/03/22 17:59:57 I think we do. The JSON spec states what's legal i

+ // Make sure that the token has more characters to consume the

+ // lower surrogate.

+ if (i + 6 >= token.length)

+ return NULL;

+ if (*(++i + token.begin) != '\\' || *(++i + token.begin) != 'u')

+ return NULL;

+ code_unit16_low =

+ (HexDigitToInt(*(token.begin + i + 1)) << 12) +

brettw 2012/03/22 05:51:42 It's sort of annoying that this is duplicated twic

Robert Sesek 2012/03/22 17:59:57 Done. Can you explain what you mean by "be strict"

+ (HexDigitToInt(*(token.begin + i + 2)) << 8) +

+ (HexDigitToInt(*(token.begin + i + 3)) << 4) +

+ HexDigitToInt(*(token.begin + i + 4));

+ if (!CBU16_IS_SURROGATE(code_unit16_low) ||

+ !CBU16_IS_TRAIL(code_unit16_low)) {

+ return NULL;

+ }

+ i += 4;

+ } else if (!CBU16_IS_SINGLE(code_unit16_high)) {

+ // If this is not a code point, it's an encoding error.

+ return NULL;

+ }

+ // Convert the UTF-16 code units to a code point and then to a UTF-8

+ // code unit sequence.

+ char code_point[8] = { 0 };

+ size_t offset = 0;

+ if (!code_unit16_low) {

+ CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high);

+ } else {

+ uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high,

+ code_unit16_low);

+ offset = 0;

+ CBU8_APPEND_UNSAFE(code_point, offset, code_unit32);

+ }

+ decoded_str.append(code_point);

break;

+ }

default:

// We should only have valid strings at this point. If not,

@@ -507,7 +559,7 @@ Value* JSONReader::DecodeString(const Token& token) {

decoded_str.push_back(c);

}

- return Value::CreateStringValue(WideToUTF16Hack(decoded_str));

+ return Value::CreateStringValue(decoded_str);

}

JSONReader::Token JSONReader::ParseToken() {

@@ -580,7 +632,7 @@ JSONReader::Token JSONReader::ParseToken() {

}

void JSONReader::EatWhitespaceAndComments() {

- while ('\0' != *json_pos_) {

+ while (json_pos_ != end_pos_) {

switch (*json_pos_) {

case ' ':

case '\n':

@@ -604,11 +656,11 @@ bool JSONReader::EatComment() {

if ('/' != *json_pos_)

return false;

- wchar_t next_char = *(json_pos_ + 1);

+ char next_char = *(json_pos_ + 1);

if ('/' == next_char) {

// Line comment, read until \n or \r

json_pos_ += 2;

- while ('\0' != *json_pos_) {

+ while (json_pos_ != end_pos_) {

switch (*json_pos_) {

case '\n':

case '\r':

@@ -621,7 +673,7 @@ bool JSONReader::EatComment() {

} else if ('*' == next_char) {

// Block comment, read until */

json_pos_ += 2;

- while ('\0' != *json_pos_) {

+ while (json_pos_ != end_pos_) {

if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {

json_pos_ += 2;

return true;

@@ -634,18 +686,18 @@ bool JSONReader::EatComment() {

return true;

}

-bool JSONReader::NextStringMatch(const wchar_t* str, size_t length) {

- return wcsncmp(json_pos_, str, length) == 0;

+bool JSONReader::NextStringMatch(const char* str, size_t length) {

+ return strncmp(json_pos_, str, length) == 0;

}

void JSONReader::SetErrorCode(JsonParseError error,

- const wchar_t* error_pos) {

+ const char* error_pos) {

int line_number = 1;

int column_number = 1;

// Figure out the line and column the error occurred at.

- for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {

- if (*pos == '\0') {

+ for (const char* pos = start_pos_; pos != error_pos; ++pos) {

+ if (pos > end_pos_) {

NOTREACHED();

return;

}

« no previous file with comments | « base/json/json_reader.h ('k') | base/json/json_reader_unittest.cc » ('j') | no next file with comments »