src/unicode.h - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Unified Diff: src/unicode.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/unicode.h

===================================================================

--- src/unicode.h (revision 10944)

+++ src/unicode.h (working copy)

@@ -100,7 +100,7 @@

static const uchar kMaxCodePoint;

};

-// --- U t f 8 ---

+// --- U t f 8 a n d 16 ---

template <typename Data>

class Buffer {

@@ -114,10 +114,46 @@

unsigned length_;

};

+class Utf16 {

+ public:

+ static inline bool IsLeadSurrogate(int32_t code) {

+ if (code == kNoPreviousCharacter) return false;

rossberg 2012/03/12 10:55:05 I still think this is implied by the bit masking b

Erik Corry 2012/03/12 12:34:10 Yes, I think that would be too implicit.

+ return (code & 0xfc00) == 0xd800;

+ }

+ static inline bool IsTrailSurrogate(int32_t code) {

+ if (code == kNoPreviousCharacter) return false;

+ return (code & 0xfc00) == 0xdc00;

+ }

+ static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) {

+ return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);

+ }

+ static const int kNoPreviousCharacter = -1;

rossberg 2012/03/12 10:55:05 int32_t?

Erik Corry 2012/03/12 12:34:10 Done.

+ static const uchar kMaxNonSurrogateCharCode = 0xffff;

+ // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes

+ // of UTF-8 data. The special case where the unit is a surrogate

+ // trail produces 1 byte net, because the encoding of the pair is

+ // 4 bytes and the 3 bytes that were used to encode the lead surrogate

+ // can be reclaimed.

+ static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;

+ // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.

+ // The illegality stems from the surrogate not being part of a pair.

+ static const int kUtf8BytesToCodeASurrogate = 3;

+ static inline uchar LeadSurrogate(int32_t char_code) {

+ return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);

+ }

+ static inline uchar TrailSurrogate(int32_t char_code) {

+ return 0xdc00 + (char_code & 0x3ff);

+ }

+};

class Utf8 {

public:

- static inline uchar Length(uchar chr);

- static inline unsigned Encode(char* out, uchar c);

+ static inline uchar Length(uchar chr, int previous);

+ static inline unsigned Encode(

+ char* out, uchar c, int previous);

static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,

unsigned capacity, unsigned* chars_read, unsigned* offset);

static uchar CalculateValue(const byte* str,

@@ -130,6 +166,11 @@

static const unsigned kMaxThreeByteChar = 0xffff;

static const unsigned kMaxFourByteChar = 0x1fffff;

+ // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together

+ // that match are coded as a 4 byte UTF-8 sequence.

+ static const unsigned kBytesSavedByCombiningSurrogates = 2;

+ static const unsigned kSizeOfUnmatchedSurrogate = 3;

private:

template <unsigned s> friend class Utf8InputBuffer;

friend class Test;

@@ -147,6 +188,7 @@

// Note that default implementation is not efficient.

virtual void Seek(unsigned);

unsigned Length();

+ unsigned Utf16Length();

virtual ~CharacterStream() { }

static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,

unsigned& offset);

@@ -156,6 +198,7 @@

unsigned capacity, unsigned& offset);

static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);

virtual void Rewind() = 0;

protected:

virtual void FillBuffer() = 0;

// The number of characters left in the current buffer

« src/handles.cc ('K') | « src/scanner-character-streams.cc ('k') | src/unicode.cc » ('j') | no next file with comments »