OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 18 matching lines...) Expand all Loading... |
29 | 29 |
30 #include "scanner-character-streams.h" | 30 #include "scanner-character-streams.h" |
31 | 31 |
32 #include "handles.h" | 32 #include "handles.h" |
33 #include "unicode-inl.h" | 33 #include "unicode-inl.h" |
34 | 34 |
35 namespace v8 { | 35 namespace v8 { |
36 namespace internal { | 36 namespace internal { |
37 | 37 |
38 // ---------------------------------------------------------------------------- | 38 // ---------------------------------------------------------------------------- |
39 // BufferedUC16CharacterStreams | 39 // BufferedUtf16CharacterStreams |
40 | 40 |
// Review note: every row pairs the old revision (UC16 names, left column)
// with the new revision (Utf16 names, right column) of the same source line.
// Constructor: runs the base-class constructor and clears pushback_limit_,
// then points buffer_cursor_ and buffer_end_ at buffer_, so the buffer starts
// out empty and the first read has to fill it.
41 BufferedUC16CharacterStream::BufferedUC16CharacterStream() | 41 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() |
42 : UC16CharacterStream(), | 42 : Utf16CharacterStream(), |
43 pushback_limit_(NULL) { | 43 pushback_limit_(NULL) { |
44 // Initialize buffer as being empty. First read will fill the buffer. | 44 // Initialize buffer as being empty. First read will fill the buffer. |
45 buffer_cursor_ = buffer_; | 45 buffer_cursor_ = buffer_; |
46 buffer_end_ = buffer_; | 46 buffer_end_ = buffer_; |
47 } | 47 } |
48 | 48 |
// Destructor: the body is empty.
49 BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { } | 49 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { } |
50 | 50 |
// Un-reads one character and moves pos_ back by one.
//  - kEndOfInput: nothing is stored, only pos_ is decremented.
//  - Fast path (pushback_limit_ is NULL and the cursor is past the start of
//    buffer_): the cursor steps back one slot and that slot is overwritten
//    with the character narrowed to uc16.
//  - Anything else is handed to SlowPushBack.
// NOTE(review): the static_cast to uc16 drops the upper bits of a uc32 value;
// presumably callers push back one UTF-16 code unit at a time - confirm.
51 void BufferedUC16CharacterStream::PushBack(uc32 character) { | 51 void BufferedUtf16CharacterStream::PushBack(uc32 character) { |
52 if (character == kEndOfInput) { | 52 if (character == kEndOfInput) { |
53 pos_--; | 53 pos_--; |
54 return; | 54 return; |
55 } | 55 } |
56 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { | 56 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { |
57 // buffer_ is writable, buffer_cursor_ is const pointer. | 57 // buffer_ is writable, buffer_cursor_ is const pointer. |
58 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); | 58 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); |
59 pos_--; | 59 pos_--; |
60 return; | 60 return; |
61 } | 61 } |
62 SlowPushBack(static_cast<uc16>(character)); | 62 SlowPushBack(static_cast<uc16>(character)); |
63 } | 63 } |
64 | 64 |
65 | 65 |
// Pushback path used when the fast path in PushBack does not apply.
// Stores one code unit just below buffer_cursor_ and decrements pos_.
66 void BufferedUC16CharacterStream::SlowPushBack(uc16 character) { | 66 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) { |
67 // In pushback mode, the end of the buffer contains pushback, | 67 // In pushback mode, the end of the buffer contains pushback, |
68 // and the start of the buffer (from buffer start to pushback_limit_) | 68 // and the start of the buffer (from buffer start to pushback_limit_) |
69 // contains valid data that comes just after the pushback. | 69 // contains valid data that comes just after the pushback. |
70 // We NULL the pushback_limit_ if pushing all the way back to the | 70 // We NULL the pushback_limit_ if pushing all the way back to the |
71 // start of the buffer. | 71 // start of the buffer. |
72 | 72 |
73 if (pushback_limit_ == NULL) { | 73 if (pushback_limit_ == NULL) { |
74 // Enter pushback mode. | 74 // Enter pushback mode. |
// The previous end of valid data is remembered in pushback_limit_; cursor and
// end both move to buffer_ + kBufferSize so pushback grows downward from there.
75 pushback_limit_ = buffer_end_; | 75 pushback_limit_ = buffer_end_; |
76 buffer_end_ = buffer_ + kBufferSize; | 76 buffer_end_ = buffer_ + kBufferSize; |
77 buffer_cursor_ = buffer_end_; | 77 buffer_cursor_ = buffer_end_; |
78 } | 78 } |
79 // Ensure that there is room for at least one pushback. | 79 // Ensure that there is room for at least one pushback. |
80 ASSERT(buffer_cursor_ > buffer_); | 80 ASSERT(buffer_cursor_ > buffer_); |
81 ASSERT(pos_ > 0); | 81 ASSERT(pos_ > 0); |
82 buffer_[--buffer_cursor_ - buffer_] = character; | 82 buffer_[--buffer_cursor_ - buffer_] = character; |
// Reaching buffer_ means the whole buffer now holds pushback, so pushback mode
// ends. Otherwise, if the write went below pushback_limit_, the limit is
// lowered to the cursor because that part of the old data was overwritten.
83 if (buffer_cursor_ == buffer_) { | 83 if (buffer_cursor_ == buffer_) { |
84 pushback_limit_ = NULL; | 84 pushback_limit_ = NULL; |
85 } else if (buffer_cursor_ < pushback_limit_) { | 85 } else if (buffer_cursor_ < pushback_limit_) { |
86 pushback_limit_ = buffer_cursor_; | 86 pushback_limit_ = buffer_cursor_; |
87 } | 87 } |
88 pos_--; | 88 pos_--; |
89 } | 89 } |
90 | 90 |
91 | 91 |
// Makes the next run of code units available starting at buffer_.
// The cursor is always reset to the start of the buffer. When pushback mode
// is active, buffer_end_ is restored from pushback_limit_ and any data still
// left at the start of the buffer is reused without refilling. Otherwise the
// buffer is refilled via FillBuffer(pos_, kBufferSize).
// Returns true when at least one code unit is available after the call.
92 bool BufferedUC16CharacterStream::ReadBlock() { | 92 bool BufferedUtf16CharacterStream::ReadBlock() { |
93 buffer_cursor_ = buffer_; | 93 buffer_cursor_ = buffer_; |
94 if (pushback_limit_ != NULL) { | 94 if (pushback_limit_ != NULL) { |
95 // Leave pushback mode. | 95 // Leave pushback mode. |
96 buffer_end_ = pushback_limit_; | 96 buffer_end_ = pushback_limit_; |
97 pushback_limit_ = NULL; | 97 pushback_limit_ = NULL; |
98 // If there were any valid characters left at the | 98 // If there were any valid characters left at the |
99 // start of the buffer, use those. | 99 // start of the buffer, use those. |
100 if (buffer_cursor_ < buffer_end_) return true; | 100 if (buffer_cursor_ < buffer_end_) return true; |
101 // Otherwise read a new block. | 101 // Otherwise read a new block. |
102 } | 102 } |
// FillBuffer reports how many code units it produced; zero means no more data.
103 unsigned length = FillBuffer(pos_, kBufferSize); | 103 unsigned length = FillBuffer(pos_, kBufferSize); |
104 buffer_end_ = buffer_ + length; | 104 buffer_end_ = buffer_ + length; |
105 return length > 0; | 105 return length > 0; |
106 } | 106 } |
107 | 107 |
108 | 108 |
// Seeks forward by delta code units when the request cannot be served from
// the current buffer contents. Pushback state is dropped first, then the work
// is delegated to BufferSeekForward, whose result is returned unchanged.
109 unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) { | 109 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) { |
110 // Leave pushback mode (i.e., ignore that there might be valid data | 110 // Leave pushback mode (i.e., ignore that there might be valid data |
111 // in the buffer before the pushback_limit_ point). | 111 // in the buffer before the pushback_limit_ point). |
112 pushback_limit_ = NULL; | 112 pushback_limit_ = NULL; |
113 return BufferSeekForward(delta); | 113 return BufferSeekForward(delta); |
114 } | 114 } |
115 | 115 |
116 // ---------------------------------------------------------------------------- | 116 // ---------------------------------------------------------------------------- |
117 // GenericStringUC16CharacterStream | 117 // GenericStringUtf16CharacterStream |
118 | 118 |
119 | 119 |
// Constructor: keeps the string handle in string_ and uses end_position as
// the stream length (length_). The buffer starts out empty and pos_ starts at
// start_position. The ASSERT requires end_position >= start_position.
// No base-class initializer is spelled out in this initializer list.
120 GenericStringUC16CharacterStream::GenericStringUC16CharacterStream( | 120 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( |
121 Handle<String> data, | 121 Handle<String> data, |
122 unsigned start_position, | 122 unsigned start_position, |
123 unsigned end_position) | 123 unsigned end_position) |
124 : string_(data), | 124 : string_(data), |
125 length_(end_position) { | 125 length_(end_position) { |
126 ASSERT(end_position >= start_position); | 126 ASSERT(end_position >= start_position); |
127 buffer_cursor_ = buffer_; | 127 buffer_cursor_ = buffer_; |
128 buffer_end_ = buffer_; | 128 buffer_end_ = buffer_; |
129 pos_ = start_position; | 129 pos_ = start_position; |
130 } | 130 } |
131 | 131 |
132 | 132 |
// Destructor: the body is empty.
133 GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { } | 133 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } |
134 | 134 |
135 | 135 |
// Advances pos_ by delta, clamped to length_, then refills the buffer from
// the new position via ReadBlock. Returns how far pos_ actually moved.
// NOTE(review): pos_ + delta looks like unsigned arithmetic and would wrap for
// a very large delta before Min is applied - confirm callers bound delta.
136 unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) { | 136 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) { |
137 unsigned old_pos = pos_; | 137 unsigned old_pos = pos_; |
138 pos_ = Min(pos_ + delta, length_); | 138 pos_ = Min(pos_ + delta, length_); |
139 ReadBlock(); | 139 ReadBlock(); |
140 return pos_ - old_pos; | 140 return pos_ - old_pos; |
141 } | 141 } |
142 | 142 |
143 | 143 |
// Copies up to length code units of the string, starting at from_pos, into
// buffer_ using String::WriteToFlat. Returns the number of units copied, or 0
// when from_pos is at or beyond length_. The request is shortened so it never
// reads past length_.
// NOTE(review): from_pos + length is unsigned and could wrap for huge values;
// the only caller visible in this file (ReadBlock) passes kBufferSize.
144 unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos, | 144 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos, |
145 unsigned length) { | 145 unsigned length) { |
146 if (from_pos >= length_) return 0; | 146 if (from_pos >= length_) return 0; |
147 if (from_pos + length > length_) { | 147 if (from_pos + length > length_) { |
148 length = length_ - from_pos; | 148 length = length_ - from_pos; |
149 } | 149 } |
150 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); | 150 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); |
151 return length; | 151 return length; |
152 } | 152 } |
153 | 153 |
154 | 154 |
155 // ---------------------------------------------------------------------------- | 155 // ---------------------------------------------------------------------------- |
156 // Utf8ToUC16CharacterStream | 156 // Utf8ToUtf16CharacterStream |
// Constructor: records the UTF-8 byte range (data, length), starts both the
// raw byte offset and the raw character position at 0, and immediately calls
// ReadBlock to fill the buffer.
157 Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data, | 157 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, |
158 unsigned length) | 158 unsigned length) |
159 : BufferedUC16CharacterStream(), | 159 : BufferedUtf16CharacterStream(), |
160 raw_data_(data), | 160 raw_data_(data), |
161 raw_data_length_(length), | 161 raw_data_length_(length), |
162 raw_data_pos_(0), | 162 raw_data_pos_(0), |
163 raw_character_position_(0) { | 163 raw_character_position_(0) { |
164 ReadBlock(); | 164 ReadBlock(); |
165 } | 165 } |
166 | 166 |
167 | 167 |
// Destructor: the body is empty.
168 Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { } | 168 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } |
169 | 169 |
170 | 170 |
// Seeks forward by delta code units: repositions the raw UTF-8 cursor with
// SetRawPosition, adopts whatever character position was actually reached as
// pos_, and refills the buffer. Returns the distance actually moved, which is
// smaller than delta when SetRawPosition stops early at the end of the data.
171 unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) { | 171 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { |
172 unsigned old_pos = pos_; | 172 unsigned old_pos = pos_; |
173 unsigned target_pos = pos_ + delta; | 173 unsigned target_pos = pos_ + delta; |
174 SetRawPosition(target_pos); | 174 SetRawPosition(target_pos); |
175 pos_ = raw_character_position_; | 175 pos_ = raw_character_position_; |
176 ReadBlock(); | 176 ReadBlock(); |
177 return pos_ - old_pos; | 177 return pos_ - old_pos; |
178 } | 178 } |
179 | 179 |
180 | 180 |
// Decodes UTF-8 bytes starting at character position char_position into
// buffer_ and returns the number of UTF-16 code units written. Returns 0 when
// char_position cannot be reached. On success raw_character_position_ ends up
// at char_position plus the number of units written.
// Difference between the columns: the old revision replaces any character
// above 0xffff with kBadChar, while the new revision emits it as a lead and
// trail surrogate (two code units).
// NOTE(review): in the new revision the loop bound is length - 1 so that a
// two-unit write always fits. As length is unsigned, a length of 0 would wrap
// to a huge bound; the only caller visible in this file passes kBufferSize.
// A further consequence is that the last slot stays unused unless the final
// character written was a surrogate pair.
181 unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position, | 181 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position, |
182 unsigned length) { | 182 unsigned length) { |
183 static const unibrow::uchar kMaxUC16Character = 0xffff; | 183 static const unibrow::uchar kMaxUtf16Character = 0xffff; |
184 SetRawPosition(char_position); | 184 SetRawPosition(char_position); |
185 if (raw_character_position_ != char_position) { | 185 if (raw_character_position_ != char_position) { |
186 // char_position was not a valid position in the stream (hit the end | 186 // char_position was not a valid position in the stream (hit the end |
187 // while spooling to it). | 187 // while spooling to it). |
188 return 0u; | 188 return 0u; |
189 } | 189 } |
190 unsigned i = 0; | 190 unsigned i = 0; |
191 while (i < length) { | 191 while (i < length - 1) { |
192 if (raw_data_pos_ == raw_data_length_) break; | 192 if (raw_data_pos_ == raw_data_length_) break; |
193 unibrow::uchar c = raw_data_[raw_data_pos_]; | 193 unibrow::uchar c = raw_data_[raw_data_pos_]; |
// Single-byte characters are consumed directly; everything else is decoded by
// Utf8::CalculateValue, which also advances raw_data_pos_ through the pointer.
194 if (c <= unibrow::Utf8::kMaxOneByteChar) { | 194 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
195 raw_data_pos_++; | 195 raw_data_pos_++; |
196 } else { | 196 } else { |
197 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, | 197 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, |
198 raw_data_length_ - raw_data_pos_, | 198 raw_data_length_ - raw_data_pos_, |
199 &raw_data_pos_); | 199 &raw_data_pos_); |
200 // Don't allow characters outside of the BMP. | |
201 if (c > kMaxUC16Character) { | |
202 c = unibrow::Utf8::kBadChar; | |
203 } | |
204 } | 200 } |
205 buffer_[i++] = static_cast<uc16>(c); | 201 if (c > kMaxUtf16Character) { |
| 202 buffer_[i++] = unibrow::Utf16::LeadSurrogate(c); |
| 203 buffer_[i++] = unibrow::Utf16::TrailSurrogate(c); |
| 204 } else { |
| 205 buffer_[i++] = static_cast<uc16>(c); |
| 206 } |
206 } | 207 } |
207 raw_character_position_ = char_position + i; | 208 raw_character_position_ = char_position + i; |
208 return i; | 209 return i; |
209 } | 210 } |
210 | 211 |
211 | 212 |
// Bit patterns for classifying a UTF-8 byte by its top two bits: in UTF-8 a
// byte of the form 10xxxxxx is a continuation byte and 11xxxxxx starts a
// multi-byte sequence.
// NOTE(review): the helpers that use these constants sit in the collapsed part
// of this diff; presumably they test (byte & mask) against the other two
// values - confirm there.
212 static const byte kUtf8MultiByteMask = 0xC0; | 213 static const byte kUtf8MultiByteMask = 0xC0; |
213 static const byte kUtf8MultiByteCharStart = 0xC0; | 214 static const byte kUtf8MultiByteCharStart = 0xC0; |
214 static const byte kUtf8MultiByteCharFollower = 0x80; | 215 static const byte kUtf8MultiByteCharFollower = 0x80; |
215 | 216 |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
259 // 3 if value in range 0xF0 .. 0xF7. | 260 // 3 if value in range 0xF0 .. 0xF7. |
260 // Encode that in a single value. | 261 // Encode that in a single value. |
261 unsigned additional_bytes = | 262 unsigned additional_bytes = |
262 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | 263 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; |
263 *cursor += additional_bytes; | 264 *cursor += additional_bytes; |
264 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | 265 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); |
265 } | 266 } |
266 } | 267 } |
267 | 268 |
268 | 269 |
// Moves the raw UTF-8 cursor (raw_data_pos_) and its matching character
// position (raw_character_position_) toward target_position, one UTF-8
// character per step, backward or forward as needed.
// Difference between the columns: the new revision counts a step that moved
// the byte cursor by exactly 4 bytes as two character positions, since a
// 4-byte sequence becomes a surrogate pair in the UTF-16 view.
// When spooling forward hits the end of the data the function returns early
// with raw_character_position_ below target_position; FillBuffer detects this
// by comparing the two values.
// NOTE(review): old_pos is declared int while raw_data_pos_ appears to be
// unsigned (it is compared with the unsigned length) - confirm the mixed
// arithmetic is intended.
// NOTE(review): a target that falls inside a surrogate pair makes the position
// step past the target, and this is only guarded by ASSERT; confirm callers
// never request such a position in builds where ASSERT is compiled out.
269 void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) { | 270 // This can't set a raw position between two surrogate pairs, since there |
| 271 // is no position in the UTF8 stream that corresponds to that. This assumes |
| 272 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If |
| 273 // it is illegally coded as two 3 byte sequences then there is no problem here. |
| 274 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) { |
270 if (raw_character_position_ > target_position) { | 275 if (raw_character_position_ > target_position) { |
271 // Spool backwards in utf8 buffer. | 276 // Spool backwards in utf8 buffer. |
272 do { | 277 do { |
| 278 int old_pos = raw_data_pos_; |
273 Utf8CharacterBack(raw_data_, &raw_data_pos_); | 279 Utf8CharacterBack(raw_data_, &raw_data_pos_); |
274 raw_character_position_--; | 280 raw_character_position_--; |
| 281 ASSERT(old_pos - raw_data_pos_ <= 4); |
| 282 // Step back over both code units for surrogate pairs. |
| 283 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; |
275 } while (raw_character_position_ > target_position); | 284 } while (raw_character_position_ > target_position); |
| 285 // No surrogate pair splitting. |
| 286 ASSERT(raw_character_position_ == target_position); |
276 return; | 287 return; |
277 } | 288 } |
278 // Spool forwards in the utf8 buffer. | 289 // Spool forwards in the utf8 buffer. |
279 while (raw_character_position_ < target_position) { | 290 while (raw_character_position_ < target_position) { |
280 if (raw_data_pos_ == raw_data_length_) return; | 291 if (raw_data_pos_ == raw_data_length_) return; |
| 292 int old_pos = raw_data_pos_; |
281 Utf8CharacterForward(raw_data_, &raw_data_pos_); | 293 Utf8CharacterForward(raw_data_, &raw_data_pos_); |
282 raw_character_position_++; | 294 raw_character_position_++; |
| 295 ASSERT(raw_data_pos_ - old_pos <= 4); |
| 296 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; |
283 } | 297 } |
| 298 // No surrogate pair splitting. |
| 299 ASSERT(raw_character_position_ == target_position); |
284 } | 300 } |
285 | 301 |
286 | 302 |
287 // ---------------------------------------------------------------------------- | 303 // ---------------------------------------------------------------------------- |
288 // ExternalTwoByteStringUC16CharacterStream | 304 // ExternalTwoByteStringUtf16CharacterStream |
289 | 305 |
// Destructor: the body is empty.
290 ExternalTwoByteStringUC16CharacterStream:: | 306 ExternalTwoByteStringUtf16CharacterStream:: |
291 ~ExternalTwoByteStringUC16CharacterStream() { } | 307 ~ExternalTwoByteStringUtf16CharacterStream() { } |
292 | 308 |
293 | 309 |
// Constructor: keeps the string handle in source_ and obtains a pointer to the
// two-byte data at start_position. The cursor and end pointers are set
// directly into that data (no copy into a separate buffer is made here),
// spanning end_position - start_position code units. pos_ starts at
// start_position.
// NOTE(review): the statement on row 302/318 ends with a comma, not a
// semicolon, so the two assignments form one comma expression. Both still run
// in order, but this looks like a typo for a semicolon.
// NOTE(review): there is no check that end_position >= start_position here,
// unlike the ASSERT in the generic string stream constructor - confirm callers
// guarantee it.
294 ExternalTwoByteStringUC16CharacterStream | 310 ExternalTwoByteStringUtf16CharacterStream |
295 ::ExternalTwoByteStringUC16CharacterStream( | 311 ::ExternalTwoByteStringUtf16CharacterStream( |
296 Handle<ExternalTwoByteString> data, | 312 Handle<ExternalTwoByteString> data, |
297 int start_position, | 313 int start_position, |
298 int end_position) | 314 int end_position) |
299 : UC16CharacterStream(), | 315 : Utf16CharacterStream(), |
300 source_(data), | 316 source_(data), |
301 raw_data_(data->GetTwoByteData(start_position)) { | 317 raw_data_(data->GetTwoByteData(start_position)) { |
302 buffer_cursor_ = raw_data_, | 318 buffer_cursor_ = raw_data_, |
303 buffer_end_ = raw_data_ + (end_position - start_position); | 319 buffer_end_ = raw_data_ + (end_position - start_position); |
304 pos_ = start_position; | 320 pos_ = start_position; |
305 } | 321 } |
306 | 322 |
307 } } // namespace v8::internal | 323 } } // namespace v8::internal |
OLD | NEW |