src/unicode-inl.h - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Side by Side Diff: src/unicode-inl.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2007-2010 the V8 project authors. All rights reserved.	1 // Copyright 2007-2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
71 } else {	71 } else {

72 entries_[c & kMask] = CacheEntry(c, 0);	72 entries_[c & kMask] = CacheEntry(c, 0);

73 return 0;	73 return 0;

74 }	74 }

75 } else {	75 } else {

76 return length;	76 return length;

77 }	77 }

78 }	78 }

79	79

80	80

81 unsigned Utf8::Encode(char* str, uchar c) {	81 unsigned Utf8::Encode(char* str, uchar c, int previous) {

82 static const int kMask = ~(1 << 6);	82 static const int kMask = ~(1 << 6);

83 if (c <= kMaxOneByteChar) {	83 if (c <= kMaxOneByteChar) {

84 str[0] = c;	84 str[0] = c;

85 return 1;	85 return 1;

86 } else if (c <= kMaxTwoByteChar) {	86 } else if (c <= kMaxTwoByteChar) {

87 str[0] = 0xC0 | (c >> 6);	87 str[0] = 0xC0 | (c >> 6);

88 str[1] = 0x80 | (c & kMask);	88 str[1] = 0x80 | (c & kMask);

89 return 2;	89 return 2;

90 } else if (c <= kMaxThreeByteChar) {	90 } else if (c <= kMaxThreeByteChar) {

	91 if (Utf16::IsTrailSurrogate(c) &&

	92 Utf16::IsLeadSurrogate(previous)) {

	93 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;

	94 return Encode(str - kUnmatchedSize,

	95 Utf16::CombineSurrogatePair(previous, c),

	96 Utf16::kNoPreviousCharacter) - kUnmatchedSize;

	97 }

91 str[0] = 0xE0 | (c >> 12);	98 str[0] = 0xE0 | (c >> 12);

92 str[1] = 0x80 | ((c >> 6) & kMask);	99 str[1] = 0x80 | ((c >> 6) & kMask);

93 str[2] = 0x80 | (c & kMask);	100 str[2] = 0x80 | (c & kMask);

94 return 3;	101 return 3;

95 } else {	102 } else {

96 str[0] = 0xF0 | (c >> 18);	103 str[0] = 0xF0 | (c >> 18);

97 str[1] = 0x80 | ((c >> 12) & kMask);	104 str[1] = 0x80 | ((c >> 12) & kMask);

98 str[2] = 0x80 | ((c >> 6) & kMask);	105 str[2] = 0x80 | ((c >> 6) & kMask);

99 str[3] = 0x80 | (c & kMask);	106 str[3] = 0x80 | (c & kMask);

100 return 4;	107 return 4;

101 }	108 }

102 }	109 }

103	110

104	111

105 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {	112 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {

106 if (length <= 0) return kBadChar;	113 if (length <= 0) return kBadChar;

107 byte first = bytes[0];	114 byte first = bytes[0];

108 // Characters between 0000 and 0007F are encoded as a single character	115 // Characters between 0000 and 0007F are encoded as a single character

109 if (first <= kMaxOneByteChar) {	116 if (first <= kMaxOneByteChar) {

110 *cursor += 1;	117 *cursor += 1;

111 return first;	118 return first;

112 }	119 }

113 return CalculateValue(bytes, length, cursor);	120 return CalculateValue(bytes, length, cursor);

114 }	121 }

115	122

116 unsigned Utf8::Length(uchar c) {	123 unsigned Utf8::Length(uchar c, int previous) {

117 if (c <= kMaxOneByteChar) {	124 if (c <= kMaxOneByteChar) {

118 return 1;	125 return 1;

119 } else if (c <= kMaxTwoByteChar) {	126 } else if (c <= kMaxTwoByteChar) {

120 return 2;	127 return 2;

121 } else if (c <= kMaxThreeByteChar) {	128 } else if (c <= kMaxThreeByteChar) {

	129 if (Utf16::IsTrailSurrogate(c) &&

	130 Utf16::IsLeadSurrogate(previous)) {

	131 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;

	132 }

122 return 3;	133 return 3;

123 } else {	134 } else {

124 return 4;	135 return 4;

125 }	136 }

126 }	137 }

127	138

128 uchar CharacterStream::GetNext() {	139 uchar CharacterStream::GetNext() {

129 uchar result = DecodeCharacter(buffer_, &cursor_);	140 uchar result = DecodeCharacter(buffer_, &cursor_);

130 if (remaining_ == 1) {	141 if (remaining_ == 1) {

131 cursor_ = 0;	142 cursor_ = 0;

(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
229	240

230 template <unsigned s>	241 template <unsigned s>

231 Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)	242 Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)

232 : InputBuffer<Utf8, Buffer<const char>, s>(Buffer<const char>(data,	243 : InputBuffer<Utf8, Buffer<const char>, s>(Buffer<const char>(data,

233 length)) {	244 length)) {

234 }	245 }

235	246

236 } // namespace unibrow	247 } // namespace unibrow

237	248

238 #endif // V8_UNICODE_INL_H_	249 #endif // V8_UNICODE_INL_H_

OLD	NEW

« src/unicode.h ('K') | « src/unicode.cc ('k') | src/x64/regexp-macro-assembler-x64.cc » ('j') | no next file with comments »