OLD | NEW |
1 // Copyright 2007-2010 the V8 project authors. All rights reserved. | 1 // Copyright 2007-2010 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
71 } else { | 71 } else { |
72 entries_[c & kMask] = CacheEntry(c, 0); | 72 entries_[c & kMask] = CacheEntry(c, 0); |
73 return 0; | 73 return 0; |
74 } | 74 } |
75 } else { | 75 } else { |
76 return length; | 76 return length; |
77 } | 77 } |
78 } | 78 } |
79 | 79 |
80 | 80 |
// Utf8::Encode -- stores the UTF-8 encoding of `c` starting at `str` and
// returns a byte count. Rows below are a side-by-side review view: the left
// column is the old revision, the right column is the new revision, which
// adds the `previous` parameter.
//
//   str       destination buffer; this function performs no bounds checking.
//   c         the character value to encode.
//   previous  (new revision only) the value that preceded `c`; the recursive
//             call below passes Utf16::kNoPreviousCharacter for "none".
//
// Returns 1, 2, 3 or 4 for a plain encoding. On the surrogate-pair path of
// the new revision it returns the recursive result minus
// kSizeOfUnmatchedSurrogate, i.e. the net change in output length rather
// than the number of bytes written.
81 unsigned Utf8::Encode(char* str, uchar c) { | 81 unsigned Utf8::Encode(char* str, uchar c, int previous) { |
// kMask clears bit 6 only. Continuation bytes are built as
// `0x80 | (value & kMask)`: bit 7 is forced on by the OR, bit 6 is forced off
// by the mask, and any bits above bit 7 are dropped when the int result is
// narrowed into the char slot.
82 static const int kMask = ~(1 << 6); | 82 static const int kMask = ~(1 << 6); |
83 if (c <= kMaxOneByteChar) { | 83 if (c <= kMaxOneByteChar) { |
84 str[0] = c; | 84 str[0] = c; |
85 return 1; | 85 return 1; |
86 } else if (c <= kMaxTwoByteChar) { | 86 } else if (c <= kMaxTwoByteChar) { |
87 str[0] = 0xC0 | (c >> 6); | 87 str[0] = 0xC0 | (c >> 6); |
88 str[1] = 0x80 | (c & kMask); | 88 str[1] = 0x80 | (c & kMask); |
89 return 2; | 89 return 2; |
90 } else if (c <= kMaxThreeByteChar) { | 90 } else if (c <= kMaxThreeByteChar) { |
// New revision only: if `c` is a trail surrogate and `previous` was a lead
// surrogate, combine them into one code point and encode that instead,
// starting kSizeOfUnmatchedSurrogate bytes BEFORE `str`.
// NOTE(review): presumably the caller has already emitted the lead surrogate
// as an unmatched sequence of that size immediately before `str`, so this
// overwrites it in place -- confirm against callers; those bytes must be
// writable.
| 91 if (Utf16::IsTrailSurrogate(c) && |
| 92 Utf16::IsLeadSurrogate(previous)) { |
| 93 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
| 94 return Encode(str - kUnmatchedSize, |
| 95 Utf16::CombineSurrogatePair(previous, c), |
// Passing kNoPreviousCharacter keeps the recursive call from taking the
// surrogate path again; subtracting kUnmatchedSize turns the byte count of
// the combined encoding into a net delta relative to `str`.
| 96 Utf16::kNoPreviousCharacter) - kUnmatchedSize; |
| 97 } |
// Plain three-byte form: lead byte 0xE0 | top bits, then two continuation
// bytes.
91 str[0] = 0xE0 | (c >> 12); | 98 str[0] = 0xE0 | (c >> 12); |
92 str[1] = 0x80 | ((c >> 6) & kMask); | 99 str[1] = 0x80 | ((c >> 6) & kMask); |
93 str[2] = 0x80 | (c & kMask); | 100 str[2] = 0x80 | (c & kMask); |
94 return 3; | 101 return 3; |
95 } else { | 102 } else { |
// Everything above kMaxThreeByteChar: lead byte 0xF0 | top bits, then three
// continuation bytes.
96 str[0] = 0xF0 | (c >> 18); | 103 str[0] = 0xF0 | (c >> 18); |
97 str[1] = 0x80 | ((c >> 12) & kMask); | 104 str[1] = 0x80 | ((c >> 12) & kMask); |
98 str[2] = 0x80 | ((c >> 6) & kMask); | 105 str[2] = 0x80 | ((c >> 6) & kMask); |
99 str[3] = 0x80 | (c & kMask); | 106 str[3] = 0x80 | (c & kMask); |
100 return 4; | 107 return 4; |
101 } | 108 } |
102 } | 109 } |
103 | 110 |
104 | 111 |
// Utf8::ValueOf -- decodes one character from `bytes` (identical in the old
// and new columns of this review view).
//
//   bytes   input bytes; only bytes[0] is inspected here.
//   length  number of bytes available.
//   cursor  in/out position; advanced by 1 on the single-byte fast path.
//
// Returns kBadChar when `length` is zero, the first byte itself when it is
// <= kMaxOneByteChar, and otherwise whatever CalculateValue returns for the
// same arguments.
105 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { | 112 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { |
// `length` is unsigned, so `<= 0` can only be true for zero. *cursor is left
// untouched on this path.
106 if (length <= 0) return kBadChar; | 113 if (length <= 0) return kBadChar; |
107 byte first = bytes[0]; | 114 byte first = bytes[0]; |
108 // Characters between 0000 and 0007F are encoded as a single character | 115 // Characters between 0000 and 0007F are encoded as a single character |
109 if (first <= kMaxOneByteChar) { | 116 if (first <= kMaxOneByteChar) { |
110 *cursor += 1; | 117 *cursor += 1; |
111 return first; | 118 return first; |
112 } | 119 } |
// Any other first byte is handed to CalculateValue, which receives `cursor`
// and is therefore responsible for any further advance.
113 return CalculateValue(bytes, length, cursor); | 120 return CalculateValue(bytes, length, cursor); |
114 } | 121 } |
115 | 122 |
// Utf8::Length -- reports the byte count for `c` without writing anything.
// Left column is the old revision; the right column is the new revision,
// which adds the `previous` parameter.
//
//   c         the character value to measure.
//   previous  (new revision only) the value that preceded `c`.
//
// Returns 1, 2, 3 or 4 by comparing `c` against kMaxOneByteChar,
// kMaxTwoByteChar and kMaxThreeByteChar, with one exception in the new
// revision described below.
116 unsigned Utf8::Length(uchar c) { | 123 unsigned Utf8::Length(uchar c, int previous) { |
117 if (c <= kMaxOneByteChar) { | 124 if (c <= kMaxOneByteChar) { |
118 return 1; | 125 return 1; |
119 } else if (c <= kMaxTwoByteChar) { | 126 } else if (c <= kMaxTwoByteChar) { |
120 return 2; | 127 return 2; |
121 } else if (c <= kMaxThreeByteChar) { | 128 } else if (c <= kMaxThreeByteChar) { |
// New revision only: a trail surrogate that follows a lead surrogate is
// counted as kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates
// instead of 3. The test is the same one Utf8::Encode uses for its
// surrogate-pair path.
// NOTE(review): presumably this equals the net delta Encode returns for the
// same pair -- confirm against the constants' definitions.
| 129 if (Utf16::IsTrailSurrogate(c) && |
| 130 Utf16::IsLeadSurrogate(previous)) { |
| 131 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; |
| 132 } |
122 return 3; | 133 return 3; |
123 } else { | 134 } else { |
124 return 4; | 135 return 4; |
125 } | 136 } |
126 } | 137 } |
127 | 138 |
128 uchar CharacterStream::GetNext() { | 139 uchar CharacterStream::GetNext() { |
129 uchar result = DecodeCharacter(buffer_, &cursor_); | 140 uchar result = DecodeCharacter(buffer_, &cursor_); |
130 if (remaining_ == 1) { | 141 if (remaining_ == 1) { |
131 cursor_ = 0; | 142 cursor_ = 0; |
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
229 | 240 |
// Utf8InputBuffer<s> constructor (identical in the old and new columns).
// Wraps (`data`, `length`) in a Buffer<const char*> and forwards it to the
// InputBuffer<Utf8, Buffer<const char*>, s> base; the constructor body itself
// is empty.
230 template <unsigned s> | 241 template <unsigned s> |
231 Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length) | 242 Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length) |
232 : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data, | 243 : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data, |
233 length)) { | 244 length)) { |
234 } | 245 } |
235 | 246 |
236 } // namespace unibrow | 247 } // namespace unibrow |
237 | 248 |
238 #endif // V8_UNICODE_INL_H_ | 249 #endif // V8_UNICODE_INL_H_ |
OLD | NEW |