Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(39)

Side by Side Diff: src/unicode-inl.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 // Copyright 2007-2010 the V8 project authors. All rights reserved. 1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
71 } else { 71 } else {
72 entries_[c & kMask] = CacheEntry(c, 0); 72 entries_[c & kMask] = CacheEntry(c, 0);
73 return 0; 73 return 0;
74 } 74 }
75 } else { 75 } else {
76 return length; 76 return length;
77 } 77 }
78 } 78 }
79 79
80 80
81 unsigned Utf8::Encode(char* str, uchar c) { 81 unsigned Utf8::Encode(char* str, uchar c, int previous) {
82 static const int kMask = ~(1 << 6); 82 static const int kMask = ~(1 << 6);
83 if (c <= kMaxOneByteChar) { 83 if (c <= kMaxOneByteChar) {
84 str[0] = c; 84 str[0] = c;
85 return 1; 85 return 1;
86 } else if (c <= kMaxTwoByteChar) { 86 } else if (c <= kMaxTwoByteChar) {
87 str[0] = 0xC0 | (c >> 6); 87 str[0] = 0xC0 | (c >> 6);
88 str[1] = 0x80 | (c & kMask); 88 str[1] = 0x80 | (c & kMask);
89 return 2; 89 return 2;
90 } else if (c <= kMaxThreeByteChar) { 90 } else if (c <= kMaxThreeByteChar) {
91 if (Utf16::IsTrailSurrogate(c) &&
92 Utf16::IsLeadSurrogate(previous)) {
93 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
94 return Encode(str - kUnmatchedSize,
95 Utf16::CombineSurrogatePair(previous, c),
96 Utf16::kNoPreviousCharacter) - kUnmatchedSize;
97 }
91 str[0] = 0xE0 | (c >> 12); 98 str[0] = 0xE0 | (c >> 12);
92 str[1] = 0x80 | ((c >> 6) & kMask); 99 str[1] = 0x80 | ((c >> 6) & kMask);
93 str[2] = 0x80 | (c & kMask); 100 str[2] = 0x80 | (c & kMask);
94 return 3; 101 return 3;
95 } else { 102 } else {
96 str[0] = 0xF0 | (c >> 18); 103 str[0] = 0xF0 | (c >> 18);
97 str[1] = 0x80 | ((c >> 12) & kMask); 104 str[1] = 0x80 | ((c >> 12) & kMask);
98 str[2] = 0x80 | ((c >> 6) & kMask); 105 str[2] = 0x80 | ((c >> 6) & kMask);
99 str[3] = 0x80 | (c & kMask); 106 str[3] = 0x80 | (c & kMask);
100 return 4; 107 return 4;
101 } 108 }
102 } 109 }
103 110
104 111
105 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { 112 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
106 if (length <= 0) return kBadChar; 113 if (length <= 0) return kBadChar;
107 byte first = bytes[0]; 114 byte first = bytes[0];
108 // Characters between 0000 and 0007F are encoded as a single character 115 // Characters between 0000 and 0007F are encoded as a single character
109 if (first <= kMaxOneByteChar) { 116 if (first <= kMaxOneByteChar) {
110 *cursor += 1; 117 *cursor += 1;
111 return first; 118 return first;
112 } 119 }
113 return CalculateValue(bytes, length, cursor); 120 return CalculateValue(bytes, length, cursor);
114 } 121 }
115 122
116 unsigned Utf8::Length(uchar c) { 123 unsigned Utf8::Length(uchar c, int previous) {
117 if (c <= kMaxOneByteChar) { 124 if (c <= kMaxOneByteChar) {
118 return 1; 125 return 1;
119 } else if (c <= kMaxTwoByteChar) { 126 } else if (c <= kMaxTwoByteChar) {
120 return 2; 127 return 2;
121 } else if (c <= kMaxThreeByteChar) { 128 } else if (c <= kMaxThreeByteChar) {
129 if (Utf16::IsTrailSurrogate(c) &&
130 Utf16::IsLeadSurrogate(previous)) {
131 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
132 }
122 return 3; 133 return 3;
123 } else { 134 } else {
124 return 4; 135 return 4;
125 } 136 }
126 } 137 }
127 138
128 uchar CharacterStream::GetNext() { 139 uchar CharacterStream::GetNext() {
129 uchar result = DecodeCharacter(buffer_, &cursor_); 140 uchar result = DecodeCharacter(buffer_, &cursor_);
130 if (remaining_ == 1) { 141 if (remaining_ == 1) {
131 cursor_ = 0; 142 cursor_ = 0;
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after
229 240
230 template <unsigned s> 241 template <unsigned s>
231 Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length) 242 Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)
232 : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data, 243 : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data,
233 length)) { 244 length)) {
234 } 245 }
235 246
236 } // namespace unibrow 247 } // namespace unibrow
237 248
238 #endif // V8_UNICODE_INL_H_ 249 #endif // V8_UNICODE_INL_H_
OLD | NEW
« src/unicode.h ('K') | « src/unicode.cc ('k') | src/x64/regexp-macro-assembler-x64.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698