OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2007 Apple Inc. All rights reserved. | 2 * Copyright (C) 2007 Apple Inc. All rights reserved. |
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> | 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> |
4 * | 4 * |
5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
7 * are met: | 7 * are met: |
8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
11 * notice, this list of conditions and the following disclaimer in the | 11 * notice, this list of conditions and the following disclaimer in the |
12 * documentation and/or other materials provided with the distribution. | 12 * documentation and/or other materials provided with the distribution. |
13 * | 13 * |
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY | 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR | 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 */ | 25 */ |
26 | 26 |
27 #include "config.h" | 27 #include "config.h" |
28 #include "wtf/unicode/UTF8.h" | 28 #include "wtf/unicode/UTF8.h" |
29 | 29 |
30 #include "wtf/ASCIICType.h" | 30 #include "wtf/ASCIICType.h" |
31 #include "wtf/StringHasher.h" | 31 #include "wtf/StringHasher.h" |
32 #include "wtf/unicode/CharacterNames.h" | 32 #include "wtf/unicode/CharacterNames.h" |
33 | 33 |
34 namespace WTF { | 34 namespace WTF { |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
119 } | 119 } |
120 | 120 |
121 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed | 121 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed |
122 // into the first byte, depending on how many bytes follow. There are | 122 // into the first byte, depending on how many bytes follow. There are |
123 // as many entries in this table as there are UTF-8 sequence types. | 123 // as many entries in this table as there are UTF-8 sequence types. |
124 // (I.e., one byte sequence, two byte... etc.). Remember that sequences | 124 // (I.e., one byte sequence, two byte... etc.). Remember that sequences |
125 // for *legal* UTF-8 will be 4 or fewer bytes total. | 125 // for *legal* UTF-8 will be 4 or fewer bytes total. |
// Indexed by the total number of bytes in the encoded sequence: the encoders
// in this file OR `firstByteMark[bytesToWrite]` into the lead byte. For
// example index 2 yields 0xC0, index 3 yields 0xE0 and index 4 yields 0xF0.
// NOTE(review): index 0 appears to be a placeholder so that the byte count can
// be used directly as the index -- confirm no caller passes 0.
126 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0x
F8, 0xFC }; | 126 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0x
F8, 0xFC }; |
127 | 127 |
128 ConversionResult convertLatin1ToUTF8( | 128 ConversionResult convertLatin1ToUTF8( |
129 const LChar** sourceStart, const LChar* sou
rceEnd, | 129 const LChar** sourceStart, const LChar* sou
rceEnd, |
130 char** targetStart, char* targetEnd) | 130 char** targetStart, char* targetEnd) |
131 { | 131 { |
132 ConversionResult result = conversionOK; | 132 ConversionResult result = conversionOK; |
133 const LChar* source = *sourceStart; | 133 const LChar* source = *sourceStart; |
134 char* target = *targetStart; | 134 char* target = *targetStart; |
135 while (source < sourceEnd) { | 135 while (source < sourceEnd) { |
136 UChar32 ch; | 136 UChar32 ch; |
137 unsigned short bytesToWrite = 0; | 137 unsigned short bytesToWrite = 0; |
138 const UChar32 byteMask = 0xBF; | 138 const UChar32 byteMask = 0xBF; |
139 const UChar32 byteMark = 0x80; | 139 const UChar32 byteMark = 0x80; |
140 const LChar* oldSource = source; // In case we have to back up because o
f target overflow. | 140 const LChar* oldSource = source; // In case we have to back up because o
f target overflow. |
141 ch = static_cast<unsigned short>(*source++); | 141 ch = static_cast<unsigned short>(*source++); |
142 | 142 |
143 // Figure out how many bytes the result will require | 143 // Figure out how many bytes the result will require |
144 if (ch < (UChar32)0x80) | 144 if (ch < (UChar32)0x80) |
145 bytesToWrite = 1; | 145 bytesToWrite = 1; |
146 else | 146 else |
147 bytesToWrite = 2; | 147 bytesToWrite = 2; |
148 | 148 |
149 target += bytesToWrite; | 149 target += bytesToWrite; |
(...skipping 11 matching lines...) Expand all Loading... |
161 *--target = (char)(ch | firstByteMark[bytesToWrite]); | 161 *--target = (char)(ch | firstByteMark[bytesToWrite]); |
162 } | 162 } |
163 target += bytesToWrite; | 163 target += bytesToWrite; |
164 } | 164 } |
165 *sourceStart = source; | 165 *sourceStart = source; |
166 *targetStart = target; | 166 *targetStart = target; |
167 return result; | 167 return result; |
168 } | 168 } |
169 | 169 |
170 ConversionResult convertUTF16ToUTF8( | 170 ConversionResult convertUTF16ToUTF8( |
171 const UChar** sourceStart, const UChar* sourceEnd, | 171 const UChar** sourceStart, const UChar* sourceEnd, |
172 char** targetStart, char* targetEnd, bool strict) | 172 char** targetStart, char* targetEnd, bool strict) |
173 { | 173 { |
174 ConversionResult result = conversionOK; | 174 ConversionResult result = conversionOK; |
175 const UChar* source = *sourceStart; | 175 const UChar* source = *sourceStart; |
176 char* target = *targetStart; | 176 char* target = *targetStart; |
177 while (source < sourceEnd) { | 177 while (source < sourceEnd) { |
178 UChar32 ch; | 178 UChar32 ch; |
179 unsigned short bytesToWrite = 0; | 179 unsigned short bytesToWrite = 0; |
180 const UChar32 byteMask = 0xBF; | 180 const UChar32 byteMask = 0xBF; |
181 const UChar32 byteMark = 0x80; | 181 const UChar32 byteMark = 0x80; |
182 const UChar* oldSource = source; // In case we have to back up because o
f target overflow. | 182 const UChar* oldSource = source; // In case we have to back up because o
f target overflow. |
183 ch = static_cast<unsigned short>(*source++); | 183 ch = static_cast<unsigned short>(*source++); |
184 // If we have a surrogate pair, convert to UChar32 first. | 184 // If we have a surrogate pair, convert to UChar32 first. |
185 if (ch >= 0xD800 && ch <= 0xDBFF) { | 185 if (ch >= 0xD800 && ch <= 0xDBFF) { |
186 // If the 16 bits following the high surrogate are in the source buf
fer... | 186 // If the 16 bits following the high surrogate are in the source buf
fer... |
187 if (source < sourceEnd) { | 187 if (source < sourceEnd) { |
188 UChar32 ch2 = static_cast<unsigned short>(*source); | 188 UChar32 ch2 = static_cast<unsigned short>(*source); |
189 // If it's a low surrogate, convert to UChar32. | 189 // If it's a low surrogate, convert to UChar32. |
190 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | 190 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
191 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | 191 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; |
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
288 case 4: character += static_cast<unsigned char>(*sequence++); character
<<= 6; | 288 case 4: character += static_cast<unsigned char>(*sequence++); character
<<= 6; |
289 case 3: character += static_cast<unsigned char>(*sequence++); character
<<= 6; | 289 case 3: character += static_cast<unsigned char>(*sequence++); character
<<= 6; |
290 case 2: character += static_cast<unsigned char>(*sequence++); character
<<= 6; | 290 case 2: character += static_cast<unsigned char>(*sequence++); character
<<= 6; |
291 case 1: character += static_cast<unsigned char>(*sequence++); | 291 case 1: character += static_cast<unsigned char>(*sequence++); |
292 } | 292 } |
293 | 293 |
294 return character - offsetsFromUTF8[length - 1]; | 294 return character - offsetsFromUTF8[length - 1]; |
295 } | 295 } |
296 | 296 |
297 ConversionResult convertUTF8ToUTF16( | 297 ConversionResult convertUTF8ToUTF16( |
298 const char** sourceStart, const char* sourceEnd, | 298 const char** sourceStart, const char* sourceEnd, |
299 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) | 299 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) |
300 { | 300 { |
301 ConversionResult result = conversionOK; | 301 ConversionResult result = conversionOK; |
302 const char* source = *sourceStart; | 302 const char* source = *sourceStart; |
303 UChar* target = *targetStart; | 303 UChar* target = *targetStart; |
304 UChar orAllData = 0; | 304 UChar orAllData = 0; |
305 while (source < sourceEnd) { | 305 while (source < sourceEnd) { |
306 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | 306 int utf8SequenceLength = inlineUTF8SequenceLength(*source); |
307 if (sourceEnd - source < utf8SequenceLength) { | 307 if (sourceEnd - source < utf8SequenceLength) { |
308 result = sourceExhausted; | 308 result = sourceExhausted; |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
462 return equalWithUTF8Internal(a, aEnd, b, bEnd); | 462 return equalWithUTF8Internal(a, aEnd, b, bEnd); |
463 } | 463 } |
464 | 464 |
// Thin wrapper taking an LChar pointer pair (a, aEnd) and a char pointer pair
// (b, bEnd): it forwards all four pointers to equalWithUTF8Internal and
// returns that function's result unchanged.
// NOTE(review): the actual comparison rules live in equalWithUTF8Internal,
// which is not visible in this view -- presumably it decodes the UTF-8 side
// and compares code units; confirm against its definition.
465 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const
char* bEnd) | 465 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const
char* bEnd) |
466 { | 466 { |
467 return equalWithUTF8Internal(a, aEnd, b, bEnd); | 467 return equalWithUTF8Internal(a, aEnd, b, bEnd); |
468 } | 468 } |
469 | 469 |
470 } // namespace Unicode | 470 } // namespace Unicode |
471 } // namespace WTF | 471 } // namespace WTF |
OLD | NEW |