Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/scanner-character-streams.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 18 matching lines...) Expand all
29 29
30 #include "scanner-character-streams.h" 30 #include "scanner-character-streams.h"
31 31
32 #include "handles.h" 32 #include "handles.h"
33 #include "unicode-inl.h" 33 #include "unicode-inl.h"
34 34
35 namespace v8 { 35 namespace v8 {
36 namespace internal { 36 namespace internal {
37 37
38 // ---------------------------------------------------------------------------- 38 // ----------------------------------------------------------------------------
39 // BufferedUC16CharacterStreams 39 // BufferedUtf16CharacterStreams
40 40
41 BufferedUC16CharacterStream::BufferedUC16CharacterStream() 41 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
42 : UC16CharacterStream(), 42 : Utf16CharacterStream(),
43 pushback_limit_(NULL) { 43 pushback_limit_(NULL) {
44 // Initialize buffer as being empty. First read will fill the buffer. 44 // Initialize buffer as being empty. First read will fill the buffer.
45 buffer_cursor_ = buffer_; 45 buffer_cursor_ = buffer_;
46 buffer_end_ = buffer_; 46 buffer_end_ = buffer_;
47 } 47 }
48 48
49 BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { } 49 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
50 50
51 void BufferedUC16CharacterStream::PushBack(uc32 character) { 51 void BufferedUtf16CharacterStream::PushBack(uc32 character) {
52 if (character == kEndOfInput) { 52 if (character == kEndOfInput) {
53 pos_--; 53 pos_--;
54 return; 54 return;
55 } 55 }
56 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { 56 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
57 // buffer_ is writable, buffer_cursor_ is const pointer. 57 // buffer_ is writable, buffer_cursor_ is const pointer.
58 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); 58 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
59 pos_--; 59 pos_--;
60 return; 60 return;
61 } 61 }
62 SlowPushBack(static_cast<uc16>(character)); 62 SlowPushBack(static_cast<uc16>(character));
63 } 63 }
64 64
65 65
66 void BufferedUC16CharacterStream::SlowPushBack(uc16 character) { 66 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
67 // In pushback mode, the end of the buffer contains pushback, 67 // In pushback mode, the end of the buffer contains pushback,
68 // and the start of the buffer (from buffer start to pushback_limit_) 68 // and the start of the buffer (from buffer start to pushback_limit_)
69 // contains valid data that comes just after the pushback. 69 // contains valid data that comes just after the pushback.
70 // We NULL the pushback_limit_ if pushing all the way back to the 70 // We NULL the pushback_limit_ if pushing all the way back to the
71 // start of the buffer. 71 // start of the buffer.
72 72
73 if (pushback_limit_ == NULL) { 73 if (pushback_limit_ == NULL) {
74 // Enter pushback mode. 74 // Enter pushback mode.
75 pushback_limit_ = buffer_end_; 75 pushback_limit_ = buffer_end_;
76 buffer_end_ = buffer_ + kBufferSize; 76 buffer_end_ = buffer_ + kBufferSize;
77 buffer_cursor_ = buffer_end_; 77 buffer_cursor_ = buffer_end_;
78 } 78 }
79 // Ensure that there is room for at least one pushback. 79 // Ensure that there is room for at least one pushback.
80 ASSERT(buffer_cursor_ > buffer_); 80 ASSERT(buffer_cursor_ > buffer_);
81 ASSERT(pos_ > 0); 81 ASSERT(pos_ > 0);
82 buffer_[--buffer_cursor_ - buffer_] = character; 82 buffer_[--buffer_cursor_ - buffer_] = character;
83 if (buffer_cursor_ == buffer_) { 83 if (buffer_cursor_ == buffer_) {
84 pushback_limit_ = NULL; 84 pushback_limit_ = NULL;
85 } else if (buffer_cursor_ < pushback_limit_) { 85 } else if (buffer_cursor_ < pushback_limit_) {
86 pushback_limit_ = buffer_cursor_; 86 pushback_limit_ = buffer_cursor_;
87 } 87 }
88 pos_--; 88 pos_--;
89 } 89 }
90 90
91 91
92 bool BufferedUC16CharacterStream::ReadBlock() { 92 bool BufferedUtf16CharacterStream::ReadBlock() {
93 buffer_cursor_ = buffer_; 93 buffer_cursor_ = buffer_;
94 if (pushback_limit_ != NULL) { 94 if (pushback_limit_ != NULL) {
95 // Leave pushback mode. 95 // Leave pushback mode.
96 buffer_end_ = pushback_limit_; 96 buffer_end_ = pushback_limit_;
97 pushback_limit_ = NULL; 97 pushback_limit_ = NULL;
98 // If there were any valid characters left at the 98 // If there were any valid characters left at the
99 // start of the buffer, use those. 99 // start of the buffer, use those.
100 if (buffer_cursor_ < buffer_end_) return true; 100 if (buffer_cursor_ < buffer_end_) return true;
101 // Otherwise read a new block. 101 // Otherwise read a new block.
102 } 102 }
103 unsigned length = FillBuffer(pos_, kBufferSize); 103 unsigned length = FillBuffer(pos_, kBufferSize);
104 buffer_end_ = buffer_ + length; 104 buffer_end_ = buffer_ + length;
105 return length > 0; 105 return length > 0;
106 } 106 }
107 107
108 108
109 unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) { 109 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
110 // Leave pushback mode (i.e., ignore that there might be valid data 110 // Leave pushback mode (i.e., ignore that there might be valid data
111 // in the buffer before the pushback_limit_ point). 111 // in the buffer before the pushback_limit_ point).
112 pushback_limit_ = NULL; 112 pushback_limit_ = NULL;
113 return BufferSeekForward(delta); 113 return BufferSeekForward(delta);
114 } 114 }
115 115
116 // ---------------------------------------------------------------------------- 116 // ----------------------------------------------------------------------------
117 // GenericStringUC16CharacterStream 117 // GenericStringUtf16CharacterStream
118 118
119 119
120 GenericStringUC16CharacterStream::GenericStringUC16CharacterStream( 120 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
121 Handle<String> data, 121 Handle<String> data,
122 unsigned start_position, 122 unsigned start_position,
123 unsigned end_position) 123 unsigned end_position)
124 : string_(data), 124 : string_(data),
125 length_(end_position) { 125 length_(end_position) {
126 ASSERT(end_position >= start_position); 126 ASSERT(end_position >= start_position);
127 buffer_cursor_ = buffer_; 127 buffer_cursor_ = buffer_;
128 buffer_end_ = buffer_; 128 buffer_end_ = buffer_;
129 pos_ = start_position; 129 pos_ = start_position;
130 } 130 }
131 131
132 132
133 GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { } 133 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
134 134
135 135
136 unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) { 136 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
137 unsigned old_pos = pos_; 137 unsigned old_pos = pos_;
138 pos_ = Min(pos_ + delta, length_); 138 pos_ = Min(pos_ + delta, length_);
139 ReadBlock(); 139 ReadBlock();
140 return pos_ - old_pos; 140 return pos_ - old_pos;
141 } 141 }
142 142
143 143
144 unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos, 144 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
145 unsigned length) { 145 unsigned length) {
146 if (from_pos >= length_) return 0; 146 if (from_pos >= length_) return 0;
147 if (from_pos + length > length_) { 147 if (from_pos + length > length_) {
148 length = length_ - from_pos; 148 length = length_ - from_pos;
149 } 149 }
150 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); 150 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
151 return length; 151 return length;
152 } 152 }
153 153
154 154
155 // ---------------------------------------------------------------------------- 155 // ----------------------------------------------------------------------------
156 // Utf8ToUC16CharacterStream 156 // Utf8ToUtf16CharacterStream
157 Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data, 157 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
158 unsigned length) 158 unsigned length)
159 : BufferedUC16CharacterStream(), 159 : BufferedUtf16CharacterStream(),
160 raw_data_(data), 160 raw_data_(data),
161 raw_data_length_(length), 161 raw_data_length_(length),
162 raw_data_pos_(0), 162 raw_data_pos_(0),
163 raw_character_position_(0) { 163 raw_character_position_(0) {
164 ReadBlock(); 164 ReadBlock();
165 } 165 }
166 166
167 167
168 Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { } 168 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
169 169
170 170
171 unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) { 171 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
172 unsigned old_pos = pos_; 172 unsigned old_pos = pos_;
173 unsigned target_pos = pos_ + delta; 173 unsigned target_pos = pos_ + delta;
174 SetRawPosition(target_pos); 174 SetRawPosition(target_pos);
175 pos_ = raw_character_position_; 175 pos_ = raw_character_position_;
176 ReadBlock(); 176 ReadBlock();
177 return pos_ - old_pos; 177 return pos_ - old_pos;
178 } 178 }
179 179
180 180
181 unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position, 181 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
182 unsigned length) { 182 unsigned length) {
183 static const unibrow::uchar kMaxUC16Character = 0xffff; 183 static const unibrow::uchar kMaxUtf16Character = 0xffff;
184 SetRawPosition(char_position); 184 SetRawPosition(char_position);
185 if (raw_character_position_ != char_position) { 185 if (raw_character_position_ != char_position) {
186 // char_position was not a valid position in the stream (hit the end 186 // char_position was not a valid position in the stream (hit the end
187 // while spooling to it). 187 // while spooling to it).
188 return 0u; 188 return 0u;
189 } 189 }
190 unsigned i = 0; 190 unsigned i = 0;
191 while (i < length) { 191 while (i < length - 1) {
192 if (raw_data_pos_ == raw_data_length_) break; 192 if (raw_data_pos_ == raw_data_length_) break;
193 unibrow::uchar c = raw_data_[raw_data_pos_]; 193 unibrow::uchar c = raw_data_[raw_data_pos_];
194 if (c <= unibrow::Utf8::kMaxOneByteChar) { 194 if (c <= unibrow::Utf8::kMaxOneByteChar) {
195 raw_data_pos_++; 195 raw_data_pos_++;
196 } else { 196 } else {
197 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, 197 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
198 raw_data_length_ - raw_data_pos_, 198 raw_data_length_ - raw_data_pos_,
199 &raw_data_pos_); 199 &raw_data_pos_);
200 // Don't allow characters outside of the BMP.
201 if (c > kMaxUC16Character) {
202 c = unibrow::Utf8::kBadChar;
203 }
204 } 200 }
205 buffer_[i++] = static_cast<uc16>(c); 201 if (c > kMaxUtf16Character) {
202 buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
203 buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
204 } else {
205 buffer_[i++] = static_cast<uc16>(c);
206 }
206 } 207 }
207 raw_character_position_ = char_position + i; 208 raw_character_position_ = char_position + i;
208 return i; 209 return i;
209 } 210 }
210 211
211 212
212 static const byte kUtf8MultiByteMask = 0xC0; 213 static const byte kUtf8MultiByteMask = 0xC0;
213 static const byte kUtf8MultiByteCharStart = 0xC0; 214 static const byte kUtf8MultiByteCharStart = 0xC0;
214 static const byte kUtf8MultiByteCharFollower = 0x80; 215 static const byte kUtf8MultiByteCharFollower = 0x80;
215 216
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
259 // 3 if value in range 0xF0 .. 0xF7. 260 // 3 if value in range 0xF0 .. 0xF7.
260 // Encode that in a single value. 261 // Encode that in a single value.
261 unsigned additional_bytes = 262 unsigned additional_bytes =
262 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; 263 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
263 *cursor += additional_bytes; 264 *cursor += additional_bytes;
264 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); 265 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
265 } 266 }
266 } 267 }
267 268
268 269
269 void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) { 270 // This can't set a raw position between the two halves of a surrogate pair,
271 // since there is no position in the UTF-8 stream that corresponds to that. This assumes
272 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If
273 // it is illegally coded as two 3 byte sequences then there is no problem here.
274 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
270 if (raw_character_position_ > target_position) { 275 if (raw_character_position_ > target_position) {
271 // Spool backwards in utf8 buffer. 276 // Spool backwards in utf8 buffer.
272 do { 277 do {
278 int old_pos = raw_data_pos_;
273 Utf8CharacterBack(raw_data_, &raw_data_pos_); 279 Utf8CharacterBack(raw_data_, &raw_data_pos_);
274 raw_character_position_--; 280 raw_character_position_--;
281 ASSERT(old_pos - raw_data_pos_ <= 4);
282 // Step back over both code units for surrogate pairs.
283 if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
275 } while (raw_character_position_ > target_position); 284 } while (raw_character_position_ > target_position);
285 // No surrogate pair splitting.
286 ASSERT(raw_character_position_ == target_position);
276 return; 287 return;
277 } 288 }
278 // Spool forwards in the utf8 buffer. 289 // Spool forwards in the utf8 buffer.
279 while (raw_character_position_ < target_position) { 290 while (raw_character_position_ < target_position) {
280 if (raw_data_pos_ == raw_data_length_) return; 291 if (raw_data_pos_ == raw_data_length_) return;
292 int old_pos = raw_data_pos_;
281 Utf8CharacterForward(raw_data_, &raw_data_pos_); 293 Utf8CharacterForward(raw_data_, &raw_data_pos_);
282 raw_character_position_++; 294 raw_character_position_++;
295 ASSERT(raw_data_pos_ - old_pos <= 4);
296 if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
283 } 297 }
298 // No surrogate pair splitting.
299 ASSERT(raw_character_position_ == target_position);
284 } 300 }
285 301
286 302
287 // ---------------------------------------------------------------------------- 303 // ----------------------------------------------------------------------------
288 // ExternalTwoByteStringUC16CharacterStream 304 // ExternalTwoByteStringUtf16CharacterStream
289 305
290 ExternalTwoByteStringUC16CharacterStream:: 306 ExternalTwoByteStringUtf16CharacterStream::
291 ~ExternalTwoByteStringUC16CharacterStream() { } 307 ~ExternalTwoByteStringUtf16CharacterStream() { }
292 308
293 309
294 ExternalTwoByteStringUC16CharacterStream 310 ExternalTwoByteStringUtf16CharacterStream
295 ::ExternalTwoByteStringUC16CharacterStream( 311 ::ExternalTwoByteStringUtf16CharacterStream(
296 Handle<ExternalTwoByteString> data, 312 Handle<ExternalTwoByteString> data,
297 int start_position, 313 int start_position,
298 int end_position) 314 int end_position)
299 : UC16CharacterStream(), 315 : Utf16CharacterStream(),
300 source_(data), 316 source_(data),
301 raw_data_(data->GetTwoByteData(start_position)) { 317 raw_data_(data->GetTwoByteData(start_position)) {
302 buffer_cursor_ = raw_data_, 318 buffer_cursor_ = raw_data_,
303 buffer_end_ = raw_data_ + (end_position - start_position); 319 buffer_end_ = raw_data_ + (end_position - start_position);
304 pos_ = start_position; 320 pos_ = start_position;
305 } 321 }
306 322
307 } } // namespace v8::internal 323 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698