 Chromium Code Reviews
 Chromium Code Reviews Issue 9600009:
  Fix input and output to handle UTF16 surrogate pairs.  (Closed) 
  Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
    
  
    Issue 9600009:
  Fix input and output to handle UTF16 surrogate pairs.  (Closed) 
  Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

| OLD | NEW | 
|---|---|
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. | 
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without | 
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are | 
| 4 // met: | 4 // met: | 
| 5 // | 5 // | 
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright | 
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. | 
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above | 
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following | 
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided | 
| (...skipping 96 matching lines...) | |
| 107 public: | 107 public: | 
| 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } | 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } | 
| 109 inline Buffer() : data_(0), length_(0) { } | 109 inline Buffer() : data_(0), length_(0) { } | 
| 110 Data data() { return data_; } | 110 Data data() { return data_; } | 
| 111 unsigned length() { return length_; } | 111 unsigned length() { return length_; } | 
| 112 private: | 112 private: | 
| 113 Data data_; | 113 Data data_; | 
| 114 unsigned length_; | 114 unsigned length_; | 
| 115 }; | 115 }; | 
| 116 | 116 | 
| 117 | |
| 118 class Utf16 { | |
| 
rossberg
2012/03/07 13:32:47
Nit: this doesn't quite fit into the above Utf8 section… [comment truncated]
 
Erik Corry
2012/03/11 19:29:22
Done.
 | |
| 119 public: | |
| 120 static inline bool IsLeadSurrogate(uchar code) { | |
| 121 return (code & 0xfc00) == 0xd800; | |
| 122 } | |
| 123 static inline bool IsTrailSurrogate(uchar code) { | |
| 124 return (code & 0xfc00) == 0xdc00; | |
| 125 } | |
| 126 static inline int CombineSurrogatePair(uchar lead, uchar trail) { | |
| 
rossberg
2012/03/07 13:32:47
Isn't int32_t more accurate as result type?
 
Erik Corry
2012/03/11 19:29:22
Done.
 | |
| 127 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); | |
| 128 } | |
| 129 static const uchar kMaxNonSurrogateCharCode = 0xffff; | |
| 130 static inline uchar LeadSurrogate(int char_code) { | |
| 
rossberg
2012/03/07 13:32:47
Similar here (and below), isn't char_code an int32_t?
 
Erik Corry
2012/03/11 19:29:22
Done.
 | |
| 131 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | |
| 132 } | |
| 133 static inline uchar TrailSurrogate(int char_code) { | |
| 134 return 0xdc00 + (char_code & 0x3ff); | |
| 135 } | |
| 136 }; | |
| 137 | |
| 138 | |
| 117 class Utf8 { | 139 class Utf8 { | 
| 118 public: | 140 public: | 
| 119 static inline uchar Length(uchar chr); | 141 static inline uchar Length(uchar chr, int previous); | 
| 120 static inline unsigned Encode(char* out, uchar c); | 142 static inline unsigned Encode( | 
| 143 char* out, uchar c, int previous); | |
| 121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, | 144 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, | 
| 122 unsigned capacity, unsigned* chars_read, unsigned* offset); | 145 unsigned capacity, unsigned* chars_read, unsigned* offset); | 
| 123 static uchar CalculateValue(const byte* str, | 146 static uchar CalculateValue(const byte* str, | 
| 124 unsigned length, | 147 unsigned length, | 
| 125 unsigned* cursor); | 148 unsigned* cursor); | 
| 126 static const uchar kBadChar = 0xFFFD; | 149 static const uchar kBadChar = 0xFFFD; | 
| 127 static const unsigned kMaxEncodedSize = 4; | 150 static const unsigned kMaxEncodedSize = 4; | 
| 128 static const unsigned kMaxOneByteChar = 0x7f; | 151 static const unsigned kMaxOneByteChar = 0x7f; | 
| 129 static const unsigned kMaxTwoByteChar = 0x7ff; | 152 static const unsigned kMaxTwoByteChar = 0x7ff; | 
| 130 static const unsigned kMaxThreeByteChar = 0xffff; | 153 static const unsigned kMaxThreeByteChar = 0xffff; | 
| 131 static const unsigned kMaxFourByteChar = 0x1fffff; | 154 static const unsigned kMaxFourByteChar = 0x1fffff; | 
| 132 | 155 | 
| 156 static const int kNoPreviousCharacter = -1; | |
| 157 | |
| 158 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together | |
| 159 // that match are coded as a 4 byte UTF-8 sequence. | |
| 160 static const unsigned kBytesSavedByCombiningSurrogates = 2; | |
| 161 static const unsigned kSizeOfUnmatchedSurrogate = 3; | |
| 162 | |
| 133 private: | 163 private: | 
| 134 template <unsigned s> friend class Utf8InputBuffer; | 164 template <unsigned s> friend class Utf8InputBuffer; | 
| 135 friend class Test; | 165 friend class Test; | 
| 136 static inline uchar ValueOf(const byte* str, | 166 static inline uchar ValueOf(const byte* str, | 
| 137 unsigned length, | 167 unsigned length, | 
| 138 unsigned* cursor); | 168 unsigned* cursor); | 
| 139 }; | 169 }; | 
| 140 | 170 | 
| 141 // --- C h a r a c t e r S t r e a m --- | 171 // --- C h a r a c t e r S t r e a m --- | 
| 142 | 172 | 
| 143 class CharacterStream { | 173 class CharacterStream { | 
| 144 public: | 174 public: | 
| 145 inline uchar GetNext(); | 175 inline uchar GetNext(); | 
| 146 inline bool has_more() { return remaining_ != 0; } | 176 inline bool has_more() { return remaining_ != 0; } | 
| 147 // Note that default implementation is not efficient. | 177 // Note that default implementation is not efficient. | 
| 148 virtual void Seek(unsigned); | 178 virtual void Seek(unsigned); | 
| 149 unsigned Length(); | 179 unsigned Length(); | 
| 180 unsigned Utf16Length(); | |
| 150 virtual ~CharacterStream() { } | 181 virtual ~CharacterStream() { } | 
| 151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, | 182 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, | 
| 152 unsigned& offset); | 183 unsigned& offset); | 
| 153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, | 184 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, | 
| 154 unsigned capacity, unsigned& offset); | 185 unsigned capacity, unsigned& offset); | 
| 155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, | 186 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, | 
| 156 unsigned capacity, unsigned& offset); | 187 unsigned capacity, unsigned& offset); | 
| 157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); | 188 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); | 
| 158 virtual void Rewind() = 0; | 189 virtual void Rewind() = 0; | 
| 190 | |
| 159 protected: | 191 protected: | 
| 160 virtual void FillBuffer() = 0; | 192 virtual void FillBuffer() = 0; | 
| 161 // The number of characters left in the current buffer | 193 // The number of characters left in the current buffer | 
| 162 unsigned remaining_; | 194 unsigned remaining_; | 
| 163 // The current offset within the buffer | 195 // The current offset within the buffer | 
| 164 unsigned cursor_; | 196 unsigned cursor_; | 
| 165 // The buffer containing the decoded characters. | 197 // The buffer containing the decoded characters. | 
| 166 const byte* buffer_; | 198 const byte* buffer_; | 
| 167 }; | 199 }; | 
| 168 | 200 | 
| (...skipping 102 matching lines...) | |
| 271 static const int kMaxWidth = 1; | 303 static const int kMaxWidth = 1; | 
| 272 static int Convert(uchar c, | 304 static int Convert(uchar c, | 
| 273 uchar n, | 305 uchar n, | 
| 274 uchar* result, | 306 uchar* result, | 
| 275 bool* allow_caching_ptr); | 307 bool* allow_caching_ptr); | 
| 276 }; | 308 }; | 
| 277 | 309 | 
| 278 } // namespace unibrow | 310 } // namespace unibrow | 
| 279 | 311 | 
| 280 #endif // V8_UNICODE_H_ | 312 #endif // V8_UNICODE_H_ | 
| OLD | NEW |