Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(288)

Side by Side Diff: base/json/json_reader.cc

Issue 9801007: Improve JSONReader performance by up to 55% by using std::string instead of wstring. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Pass Windows tests Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/json/json_reader.h ('k') | base/json/json_reader_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/json/json_reader.h" 5 #include "base/json/json_reader.h"
6 6
7 #include "base/float_util.h" 7 #include "base/float_util.h"
8 #include "base/logging.h" 8 #include "base/logging.h"
9 #include "base/memory/scoped_ptr.h" 9 #include "base/memory/scoped_ptr.h"
10 #include "base/stringprintf.h" 10 #include "base/stringprintf.h"
11 #include "base/string_number_conversions.h" 11 #include "base/string_number_conversions.h"
12 #include "base/string_util.h" 12 #include "base/string_util.h"
13 #include "base/third_party/icu/icu_utf.h"
13 #include "base/utf_string_conversions.h" 14 #include "base/utf_string_conversions.h"
14 #include "base/values.h" 15 #include "base/values.h"
15 16
16 namespace { 17 namespace {
17 18
18 const wchar_t kNullString[] = L"null"; 19 const char kNullString[] = "null";
19 const wchar_t kTrueString[] = L"true"; 20 const char kTrueString[] = "true";
20 const wchar_t kFalseString[] = L"false"; 21 const char kFalseString[] = "false";
21 22
22 const int kStackLimit = 100; 23 const int kStackLimit = 100;
23 24
24 // A helper method for ParseNumberToken. It reads an int from the end of 25 // A helper method for ParseNumberToken. It reads an int from the end of
25 // token. The method returns false if there is no valid integer at the end of 26 // token. The method returns false if there is no valid integer at the end of
26 // the token. 27 // the token.
27 bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) { 28 bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {
28 wchar_t first = token.NextChar(); 29 char first = token.NextChar();
29 int len = 0; 30 int len = 0;
30 31
31 // Read in more digits. 32 // Read in more digits.
32 wchar_t c = first; 33 char c = first;
33 while ('\0' != c && IsAsciiDigit(c)) { 34 while ('\0' != c && IsAsciiDigit(c)) {
34 ++token.length; 35 ++token.length;
35 ++len; 36 ++len;
36 c = token.NextChar(); 37 c = token.NextChar();
37 } 38 }
38 // We need at least 1 digit. 39 // We need at least 1 digit.
39 if (len == 0) 40 if (len == 0)
40 return false; 41 return false;
41 42
42 if (!can_have_leading_zeros && len > 1 && '0' == first) 43 if (!can_have_leading_zeros && len > 1 && '0' == first)
43 return false; 44 return false;
44 45
45 return true; 46 return true;
46 } 47 }
47 48
48 // A helper method for ParseStringToken. It reads |digits| hex digits from the 49 // A helper method for ParseStringToken. It reads |digits| hex digits from the
49 // token. If the sequence of digits is not valid (contains other characters), 50 // token. If the sequence of digits is not valid (contains other characters),
50 // the method returns false. 51 // the method returns false.
51 bool ReadHexDigits(base::JSONReader::Token& token, int digits) { 52 bool ReadHexDigits(base::JSONReader::Token& token, int digits) {
52 for (int i = 1; i <= digits; ++i) { 53 for (int i = 1; i <= digits; ++i) {
53 wchar_t c = *(token.begin + token.length + i); 54 char c = *(token.begin + token.length + i);
54 if (c == '\0' || !IsHexDigit(c)) 55 if (c == '\0' || !IsHexDigit(c))
55 return false; 56 return false;
56 } 57 }
57 58
58 token.length += digits; 59 token.length += digits;
59 return true; 60 return true;
60 } 61 }
61 62
62 } // namespace 63 } // namespace
63 64
(...skipping 12 matching lines...) Expand all
76 const char* JSONReader::kUnexpectedDataAfterRoot = 77 const char* JSONReader::kUnexpectedDataAfterRoot =
77 "Unexpected data after root element."; 78 "Unexpected data after root element.";
78 const char* JSONReader::kUnsupportedEncoding = 79 const char* JSONReader::kUnsupportedEncoding =
79 "Unsupported encoding. JSON must be UTF-8."; 80 "Unsupported encoding. JSON must be UTF-8.";
80 const char* JSONReader::kUnquotedDictionaryKey = 81 const char* JSONReader::kUnquotedDictionaryKey =
81 "Dictionary keys must be quoted."; 82 "Dictionary keys must be quoted.";
82 83
83 JSONReader::JSONReader() 84 JSONReader::JSONReader()
84 : start_pos_(NULL), 85 : start_pos_(NULL),
85 json_pos_(NULL), 86 json_pos_(NULL),
87 end_pos_(NULL),
86 stack_depth_(0), 88 stack_depth_(0),
87 allow_trailing_comma_(false), 89 allow_trailing_comma_(false),
88 error_code_(JSON_NO_ERROR), 90 error_code_(JSON_NO_ERROR),
89 error_line_(0), 91 error_line_(0),
90 error_col_(0) {} 92 error_col_(0) {}
91 93
92 // static 94 // static
93 Value* JSONReader::Read(const std::string& json, 95 Value* JSONReader::Read(const std::string& json,
94 bool allow_trailing_comma) { 96 bool allow_trailing_comma) {
95 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL); 97 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL);
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
141 } 143 }
142 144
143 std::string JSONReader::GetErrorMessage() const { 145 std::string JSONReader::GetErrorMessage() const {
144 return FormatErrorMessage(error_line_, error_col_, 146 return FormatErrorMessage(error_line_, error_col_,
145 ErrorCodeToString(error_code_)); 147 ErrorCodeToString(error_code_));
146 } 148 }
147 149
148 Value* JSONReader::JsonToValue(const std::string& json, bool check_root, 150 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
149 bool allow_trailing_comma) { 151 bool allow_trailing_comma) {
150 // The input must be in UTF-8. 152 // The input must be in UTF-8.
151 if (!IsStringUTF8(json.c_str())) { 153 if (!IsStringUTF8(json.data())) {
152 error_code_ = JSON_UNSUPPORTED_ENCODING; 154 error_code_ = JSON_UNSUPPORTED_ENCODING;
153 return NULL; 155 return NULL;
154 } 156 }
155 157
156 // The conversion from UTF8 to wstring removes null bytes for us 158 start_pos_ = json.data();
157 // (a good thing). 159 end_pos_ = start_pos_ + json.size();
158 std::wstring json_wide(UTF8ToWide(json));
159 start_pos_ = json_wide.c_str();
160 160
161 // When the input JSON string starts with a UTF-8 Byte-Order-Mark 161 // When the input JSON string starts with a UTF-8 Byte-Order-Mark (U+FEFF)
162 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode 162 // or <0xEF 0xBB 0xBF>, advance the start position to avoid the
163 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from 163 // JSONReader::BuildValue() function from mis-treating a Unicode BOM as an
164 // mis-treating a Unicode BOM as an invalid character and returning NULL, 164 // invalid character and returning NULL.
165 // skip a converted Unicode BOM if it exists. 165 if (json.size() >= 3 && start_pos_[0] == 0xEF &&
166 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) { 166 start_pos_[1] == 0xBB && start_pos_[2] == 0xBF) {
167 ++start_pos_; 167 start_pos_ += 3;
168 } 168 }
169 169
170 json_pos_ = start_pos_; 170 json_pos_ = start_pos_;
171 allow_trailing_comma_ = allow_trailing_comma; 171 allow_trailing_comma_ = allow_trailing_comma;
172 stack_depth_ = 0; 172 stack_depth_ = 0;
173 error_code_ = JSON_NO_ERROR; 173 error_code_ = JSON_NO_ERROR;
174 174
175 scoped_ptr<Value> root(BuildValue(check_root)); 175 scoped_ptr<Value> root(BuildValue(check_root));
176 if (root.get()) { 176 if (root.get()) {
177 if (ParseToken().type == Token::END_OF_INPUT) { 177 if (ParseToken().type == Token::END_OF_INPUT) {
(...skipping 171 matching lines...) Expand 10 before | Expand all | Expand 10 after
349 json_pos_ += token.length; 349 json_pos_ += token.length;
350 350
351 --stack_depth_; 351 --stack_depth_;
352 return node.release(); 352 return node.release();
353 } 353 }
354 354
355 JSONReader::Token JSONReader::ParseNumberToken() { 355 JSONReader::Token JSONReader::ParseNumberToken() {
356 // We just grab the number here. We validate the size in DecodeNumber. 356 // We just grab the number here. We validate the size in DecodeNumber.
357 // According to RFC4627, a valid number is: [minus] int [frac] [exp] 357 // According to RFC4627, a valid number is: [minus] int [frac] [exp]
358 Token token(Token::NUMBER, json_pos_, 0); 358 Token token(Token::NUMBER, json_pos_, 0);
359 wchar_t c = *json_pos_; 359 char c = *json_pos_;
360 if ('-' == c) { 360 if ('-' == c) {
361 ++token.length; 361 ++token.length;
362 c = token.NextChar(); 362 c = token.NextChar();
363 } 363 }
364 364
365 if (!ReadInt(token, false)) 365 if (!ReadInt(token, false))
366 return Token::CreateInvalidToken(); 366 return Token::CreateInvalidToken();
367 367
368 // Optional fraction part 368 // Optional fraction part
369 c = token.NextChar(); 369 c = token.NextChar();
(...skipping 13 matching lines...) Expand all
383 c = token.NextChar(); 383 c = token.NextChar();
384 } 384 }
385 if (!ReadInt(token, true)) 385 if (!ReadInt(token, true))
386 return Token::CreateInvalidToken(); 386 return Token::CreateInvalidToken();
387 } 387 }
388 388
389 return token; 389 return token;
390 } 390 }
391 391
392 Value* JSONReader::DecodeNumber(const Token& token) { 392 Value* JSONReader::DecodeNumber(const Token& token) {
393 const std::wstring num_string(token.begin, token.length); 393 const std::string num_string(token.begin, token.length);
394 394
395 int num_int; 395 int num_int;
396 if (StringToInt(WideToUTF8(num_string), &num_int)) 396 if (StringToInt(num_string, &num_int))
397 return Value::CreateIntegerValue(num_int); 397 return Value::CreateIntegerValue(num_int);
398 398
399 double num_double; 399 double num_double;
400 if (StringToDouble(WideToUTF8(num_string), &num_double) && 400 if (StringToDouble(num_string, &num_double) && base::IsFinite(num_double))
401 base::IsFinite(num_double))
402 return Value::CreateDoubleValue(num_double); 401 return Value::CreateDoubleValue(num_double);
403 402
404 return NULL; 403 return NULL;
405 } 404 }
406 405
407 JSONReader::Token JSONReader::ParseStringToken() { 406 JSONReader::Token JSONReader::ParseStringToken() {
408 Token token(Token::STRING, json_pos_, 1); 407 Token token(Token::STRING, json_pos_, 1);
409 wchar_t c = token.NextChar(); 408 char c = token.NextChar();
410 while ('\0' != c) { 409 while (json_pos_ + token.length < end_pos_) {
411 if ('\\' == c) { 410 if ('\\' == c) {
412 ++token.length; 411 ++token.length;
413 c = token.NextChar(); 412 c = token.NextChar();
414 // Make sure the escaped char is valid. 413 // Make sure the escaped char is valid.
415 switch (c) { 414 switch (c) {
416 case 'x': 415 case 'x':
417 if (!ReadHexDigits(token, 2)) { 416 if (!ReadHexDigits(token, 2)) {
418 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 417 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
419 return Token::CreateInvalidToken(); 418 return Token::CreateInvalidToken();
420 } 419 }
(...skipping 22 matching lines...) Expand all
443 ++token.length; 442 ++token.length;
444 return token; 443 return token;
445 } 444 }
446 ++token.length; 445 ++token.length;
447 c = token.NextChar(); 446 c = token.NextChar();
448 } 447 }
449 return Token::CreateInvalidToken(); 448 return Token::CreateInvalidToken();
450 } 449 }
451 450
452 Value* JSONReader::DecodeString(const Token& token) { 451 Value* JSONReader::DecodeString(const Token& token) {
453 std::wstring decoded_str; 452 std::string decoded_str;
454 decoded_str.reserve(token.length - 2); 453 decoded_str.reserve(token.length - 2);
455 454
456 for (int i = 1; i < token.length - 1; ++i) { 455 for (int i = 1; i < token.length - 1; ++i) {
457 wchar_t c = *(token.begin + i); 456 char c = *(token.begin + i);
458 if ('\\' == c) { 457 if ('\\' == c) {
459 ++i; 458 ++i;
460 c = *(token.begin + i); 459 c = *(token.begin + i);
461 switch (c) { 460 switch (c) {
462 case '"': 461 case '"':
463 case '/': 462 case '/':
464 case '\\': 463 case '\\':
465 decoded_str.push_back(c); 464 decoded_str.push_back(c);
466 break; 465 break;
467 case 'b': 466 case 'b':
(...skipping 13 matching lines...) Expand all
481 break; 480 break;
482 case 'v': 481 case 'v':
483 decoded_str.push_back('\v'); 482 decoded_str.push_back('\v');
484 break; 483 break;
485 484
486 case 'x': 485 case 'x':
487 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) + 486 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +
488 HexDigitToInt(*(token.begin + i + 2))); 487 HexDigitToInt(*(token.begin + i + 2)));
489 i += 2; 488 i += 2;
490 break; 489 break;
491 case 'u': 490 case 'u': {
492 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) + 491 // Consume the UTF-16 code unit, which may be a high surrogate.
brettw 2012/03/22 05:51:42 This case is getting kind of crazy long, can you s
Robert Sesek 2012/03/22 17:59:57 Done.
493 (HexDigitToInt(*(token.begin + i + 2)) << 8) + 492 if (i + 4 >= token.length)
494 (HexDigitToInt(*(token.begin + i + 3)) << 4) + 493 return NULL;
495 HexDigitToInt(*(token.begin + i + 4))); 494 uint32 code_unit16_high =
495 (HexDigitToInt(*(token.begin + i + 1)) << 12) +
496 (HexDigitToInt(*(token.begin + i + 2)) << 8) +
497 (HexDigitToInt(*(token.begin + i + 3)) << 4) +
498 HexDigitToInt(*(token.begin + i + 4));
496 i += 4; 499 i += 4;
500
501 // If this is a high surrogate, consume the next code unit to get the
502 // low surrogate.
503 // This is a 32-bit field because the shift operations in the
504 // conversion process below cause MSVC to error about "data loss."
505 // This only stores UTF-16 code units, though.
506 uint32 code_unit16_low = 0;
507 if (CBU16_IS_SURROGATE(code_unit16_high)) {
508 // Make sure this is the high surrogate. If not, it's an encoding
509 // error.
510 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
511 return NULL;
brettw 2012/03/22 05:51:42 You changed the behavior of this function by faili
Robert Sesek 2012/03/22 17:59:57 I think we do. The JSON spec states what's legal i
512
513 // Make sure that the token has more characters to consume the
514 // lower surrogate.
515 if (i + 6 >= token.length)
516 return NULL;
517 if (*(++i + token.begin) != '\\' || *(++i + token.begin) != 'u')
518 return NULL;
519
520 code_unit16_low =
521 (HexDigitToInt(*(token.begin + i + 1)) << 12) +
brettw 2012/03/22 05:51:42 It's sort of annoying that this is duplicated twic
Robert Sesek 2012/03/22 17:59:57 Done. Can you explain what you mean by "be strict"
522 (HexDigitToInt(*(token.begin + i + 2)) << 8) +
523 (HexDigitToInt(*(token.begin + i + 3)) << 4) +
524 HexDigitToInt(*(token.begin + i + 4));
525 if (!CBU16_IS_SURROGATE(code_unit16_low) ||
526 !CBU16_IS_TRAIL(code_unit16_low)) {
527 return NULL;
528 }
529 i += 4;
530 } else if (!CBU16_IS_SINGLE(code_unit16_high)) {
531 // If this is not a code point, it's an encoding error.
532 return NULL;
533 }
534
535 // Convert the UTF-16 code units to a code point and then to a UTF-8
536 // code unit sequence.
537 char code_point[8] = { 0 };
538 size_t offset = 0;
539 if (!code_unit16_low) {
540 CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high);
541 } else {
542 uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high,
543 code_unit16_low);
544 offset = 0;
545 CBU8_APPEND_UNSAFE(code_point, offset, code_unit32);
546 }
547 decoded_str.append(code_point);
497 break; 548 break;
549 }
498 550
499 default: 551 default:
500 // We should only have valid strings at this point. If not, 552 // We should only have valid strings at this point. If not,
501 // ParseStringToken didn't do its job. 553 // ParseStringToken didn't do its job.
502 NOTREACHED(); 554 NOTREACHED();
503 return NULL; 555 return NULL;
504 } 556 }
505 } else { 557 } else {
506 // Not escaped 558 // Not escaped
507 decoded_str.push_back(c); 559 decoded_str.push_back(c);
508 } 560 }
509 } 561 }
510 return Value::CreateStringValue(WideToUTF16Hack(decoded_str)); 562 return Value::CreateStringValue(decoded_str);
511 } 563 }
512 564
513 JSONReader::Token JSONReader::ParseToken() { 565 JSONReader::Token JSONReader::ParseToken() {
514 EatWhitespaceAndComments(); 566 EatWhitespaceAndComments();
515 567
516 Token token(Token::INVALID_TOKEN, 0, 0); 568 Token token(Token::INVALID_TOKEN, 0, 0);
517 switch (*json_pos_) { 569 switch (*json_pos_) {
518 case '\0': 570 case '\0':
519 token.type = Token::END_OF_INPUT; 571 token.type = Token::END_OF_INPUT;
520 break; 572 break;
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
573 break; 625 break;
574 626
575 case '"': 627 case '"':
576 token = ParseStringToken(); 628 token = ParseStringToken();
577 break; 629 break;
578 } 630 }
579 return token; 631 return token;
580 } 632 }
581 633
582 void JSONReader::EatWhitespaceAndComments() { 634 void JSONReader::EatWhitespaceAndComments() {
583 while ('\0' != *json_pos_) { 635 while (json_pos_ != end_pos_) {
584 switch (*json_pos_) { 636 switch (*json_pos_) {
585 case ' ': 637 case ' ':
586 case '\n': 638 case '\n':
587 case '\r': 639 case '\r':
588 case '\t': 640 case '\t':
589 ++json_pos_; 641 ++json_pos_;
590 break; 642 break;
591 case '/': 643 case '/':
592 // TODO(tc): This isn't in the RFC so it should be a parser flag. 644 // TODO(tc): This isn't in the RFC so it should be a parser flag.
593 if (!EatComment()) 645 if (!EatComment())
594 return; 646 return;
595 break; 647 break;
596 default: 648 default:
597 // Not a whitespace char, just exit. 649 // Not a whitespace char, just exit.
598 return; 650 return;
599 } 651 }
600 } 652 }
601 } 653 }
602 654
603 bool JSONReader::EatComment() { 655 bool JSONReader::EatComment() {
604 if ('/' != *json_pos_) 656 if ('/' != *json_pos_)
605 return false; 657 return false;
606 658
607 wchar_t next_char = *(json_pos_ + 1); 659 char next_char = *(json_pos_ + 1);
608 if ('/' == next_char) { 660 if ('/' == next_char) {
609 // Line comment, read until \n or \r 661 // Line comment, read until \n or \r
610 json_pos_ += 2; 662 json_pos_ += 2;
611 while ('\0' != *json_pos_) { 663 while (json_pos_ != end_pos_) {
612 switch (*json_pos_) { 664 switch (*json_pos_) {
613 case '\n': 665 case '\n':
614 case '\r': 666 case '\r':
615 ++json_pos_; 667 ++json_pos_;
616 return true; 668 return true;
617 default: 669 default:
618 ++json_pos_; 670 ++json_pos_;
619 } 671 }
620 } 672 }
621 } else if ('*' == next_char) { 673 } else if ('*' == next_char) {
622 // Block comment, read until */ 674 // Block comment, read until */
623 json_pos_ += 2; 675 json_pos_ += 2;
624 while ('\0' != *json_pos_) { 676 while (json_pos_ != end_pos_) {
625 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) { 677 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
626 json_pos_ += 2; 678 json_pos_ += 2;
627 return true; 679 return true;
628 } 680 }
629 ++json_pos_; 681 ++json_pos_;
630 } 682 }
631 } else { 683 } else {
632 return false; 684 return false;
633 } 685 }
634 return true; 686 return true;
635 } 687 }
636 688
637 bool JSONReader::NextStringMatch(const wchar_t* str, size_t length) { 689 bool JSONReader::NextStringMatch(const char* str, size_t length) {
638 return wcsncmp(json_pos_, str, length) == 0; 690 return strncmp(json_pos_, str, length) == 0;
639 } 691 }
640 692
641 void JSONReader::SetErrorCode(JsonParseError error, 693 void JSONReader::SetErrorCode(JsonParseError error,
642 const wchar_t* error_pos) { 694 const char* error_pos) {
643 int line_number = 1; 695 int line_number = 1;
644 int column_number = 1; 696 int column_number = 1;
645 697
646 // Figure out the line and column the error occurred at. 698 // Figure out the line and column the error occurred at.
647 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) { 699 for (const char* pos = start_pos_; pos != error_pos; ++pos) {
648 if (*pos == '\0') { 700 if (pos > end_pos_) {
649 NOTREACHED(); 701 NOTREACHED();
650 return; 702 return;
651 } 703 }
652 704
653 if (*pos == '\n') { 705 if (*pos == '\n') {
654 ++line_number; 706 ++line_number;
655 column_number = 1; 707 column_number = 1;
656 } else { 708 } else {
657 ++column_number; 709 ++column_number;
658 } 710 }
659 } 711 }
660 712
661 error_line_ = line_number; 713 error_line_ = line_number;
662 error_col_ = column_number; 714 error_col_ = column_number;
663 error_code_ = error; 715 error_code_ = error;
664 } 716 }
665 717
666 } // namespace base 718 } // namespace base
OLDNEW
« no previous file with comments | « base/json/json_reader.h ('k') | base/json/json_reader_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698