Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(203)

Side by Side Diff: src/unicode.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after
93 CacheEntry entries_[kSize]; 93 CacheEntry entries_[kSize];
94 }; 94 };
95 95
96 class UnicodeData { 96 class UnicodeData {
97 private: 97 private:
98 friend class Test; 98 friend class Test;
99 static int GetByteCount(); 99 static int GetByteCount();
100 static const uchar kMaxCodePoint; 100 static const uchar kMaxCodePoint;
101 }; 101 };
102 102
103 // --- U t f 8 --- 103 // --- U t f 8 a n d 16 ---
104 104
105 template <typename Data> 105 template <typename Data>
106 class Buffer { 106 class Buffer {
107 public: 107 public:
108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
109 inline Buffer() : data_(0), length_(0) { } 109 inline Buffer() : data_(0), length_(0) { }
110 Data data() { return data_; } 110 Data data() { return data_; }
111 unsigned length() { return length_; } 111 unsigned length() { return length_; }
112 private: 112 private:
113 Data data_; 113 Data data_;
114 unsigned length_; 114 unsigned length_;
115 }; 115 };
116 116
117
118 class Utf16 {
119 public:
120 static inline bool IsLeadSurrogate(int32_t code) {
121 if (code == kNoPreviousCharacter) return false;
rossberg 2012/03/12 10:55:05 I still think this is implied by the bit masking b
Erik Corry 2012/03/12 12:34:10 Yes, I think that would be too implicit.
122 return (code & 0xfc00) == 0xd800;
123 }
124 static inline bool IsTrailSurrogate(int32_t code) {
125 if (code == kNoPreviousCharacter) return false;
126 return (code & 0xfc00) == 0xdc00;
127 }
128
129 static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) {
130 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
131 }
132 static const int kNoPreviousCharacter = -1;
rossberg 2012/03/12 10:55:05 int32_t?
Erik Corry 2012/03/12 12:34:10 Done.
133 static const uchar kMaxNonSurrogateCharCode = 0xffff;
134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
135 // of UTF-8 data. The special case where the unit is a surrogate
136 // trail produces 1 byte net, because the encoding of the pair is
137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate
138 // can be reclaimed.
139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
141 // The illegality stems from the surrogate not being part of a pair.
142 static const int kUtf8BytesToCodeASurrogate = 3;
143 static inline uchar LeadSurrogate(int32_t char_code) {
144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
145 }
146 static inline uchar TrailSurrogate(int32_t char_code) {
147 return 0xdc00 + (char_code & 0x3ff);
148 }
149 };
150
151
117 class Utf8 { 152 class Utf8 {
118 public: 153 public:
119 static inline uchar Length(uchar chr); 154 static inline uchar Length(uchar chr, int previous);
120 static inline unsigned Encode(char* out, uchar c); 155 static inline unsigned Encode(
156 char* out, uchar c, int previous);
121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, 157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
122 unsigned capacity, unsigned* chars_read, unsigned* offset); 158 unsigned capacity, unsigned* chars_read, unsigned* offset);
123 static uchar CalculateValue(const byte* str, 159 static uchar CalculateValue(const byte* str,
124 unsigned length, 160 unsigned length,
125 unsigned* cursor); 161 unsigned* cursor);
126 static const uchar kBadChar = 0xFFFD; 162 static const uchar kBadChar = 0xFFFD;
127 static const unsigned kMaxEncodedSize = 4; 163 static const unsigned kMaxEncodedSize = 4;
128 static const unsigned kMaxOneByteChar = 0x7f; 164 static const unsigned kMaxOneByteChar = 0x7f;
129 static const unsigned kMaxTwoByteChar = 0x7ff; 165 static const unsigned kMaxTwoByteChar = 0x7ff;
130 static const unsigned kMaxThreeByteChar = 0xffff; 166 static const unsigned kMaxThreeByteChar = 0xffff;
131 static const unsigned kMaxFourByteChar = 0x1fffff; 167 static const unsigned kMaxFourByteChar = 0x1fffff;
132 168
169 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
170 // that match are coded as a 4 byte UTF-8 sequence.
171 static const unsigned kBytesSavedByCombiningSurrogates = 2;
172 static const unsigned kSizeOfUnmatchedSurrogate = 3;
173
133 private: 174 private:
134 template <unsigned s> friend class Utf8InputBuffer; 175 template <unsigned s> friend class Utf8InputBuffer;
135 friend class Test; 176 friend class Test;
136 static inline uchar ValueOf(const byte* str, 177 static inline uchar ValueOf(const byte* str,
137 unsigned length, 178 unsigned length,
138 unsigned* cursor); 179 unsigned* cursor);
139 }; 180 };
140 181
141 // --- C h a r a c t e r S t r e a m --- 182 // --- C h a r a c t e r S t r e a m ---
142 183
143 class CharacterStream { 184 class CharacterStream {
144 public: 185 public:
145 inline uchar GetNext(); 186 inline uchar GetNext();
146 inline bool has_more() { return remaining_ != 0; } 187 inline bool has_more() { return remaining_ != 0; }
147 // Note that default implementation is not efficient. 188 // Note that default implementation is not efficient.
148 virtual void Seek(unsigned); 189 virtual void Seek(unsigned);
149 unsigned Length(); 190 unsigned Length();
191 unsigned Utf16Length();
150 virtual ~CharacterStream() { } 192 virtual ~CharacterStream() { }
151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, 193 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
152 unsigned& offset); 194 unsigned& offset);
153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, 195 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
154 unsigned capacity, unsigned& offset); 196 unsigned capacity, unsigned& offset);
155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, 197 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
156 unsigned capacity, unsigned& offset); 198 unsigned capacity, unsigned& offset);
157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); 199 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
158 virtual void Rewind() = 0; 200 virtual void Rewind() = 0;
201
159 protected: 202 protected:
160 virtual void FillBuffer() = 0; 203 virtual void FillBuffer() = 0;
161 // The number of characters left in the current buffer 204 // The number of characters left in the current buffer
162 unsigned remaining_; 205 unsigned remaining_;
163 // The current offset within the buffer 206 // The current offset within the buffer
164 unsigned cursor_; 207 unsigned cursor_;
165 // The buffer containing the decoded characters. 208 // The buffer containing the decoded characters.
166 const byte* buffer_; 209 const byte* buffer_;
167 }; 210 };
168 211
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
271 static const int kMaxWidth = 1; 314 static const int kMaxWidth = 1;
272 static int Convert(uchar c, 315 static int Convert(uchar c,
273 uchar n, 316 uchar n,
274 uchar* result, 317 uchar* result,
275 bool* allow_caching_ptr); 318 bool* allow_caching_ptr);
276 }; 319 };
277 320
278 } // namespace unibrow 321 } // namespace unibrow
279 322
280 #endif // V8_UNICODE_H_ 323 #endif // V8_UNICODE_H_
OLDNEW
« src/handles.cc ('K') | « src/scanner-character-streams.cc ('k') | src/unicode.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698