Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: Source/core/fetch/TextResourceDecoder.cpp

Issue 23623012: Move TextResourceDecoder from loader/ to fetch/ (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Created 7 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « Source/core/fetch/TextResourceDecoder.h ('k') | Source/core/fetch/XSLStyleSheetResource.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
11 This library is distributed in the hope that it will be useful, 11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details. 14 Library General Public License for more details.
15 15
16 You should have received a copy of the GNU Library General Public License 16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to 17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA. 19 Boston, MA 02110-1301, USA.
20 */ 20 */
21 21
22 22
23 #include "config.h" 23 #include "config.h"
24 #include "core/loader/TextResourceDecoder.h" 24 #include "core/fetch/TextResourceDecoder.h"
25 25
26 #include "HTMLNames.h" 26 #include "HTMLNames.h"
27 #include "core/dom/DOMImplementation.h" 27 #include "core/dom/DOMImplementation.h"
28 #include "core/html/parser/HTMLMetaCharsetParser.h" 28 #include "core/html/parser/HTMLMetaCharsetParser.h"
29 #include "core/platform/text/TextEncodingDetector.h" 29 #include "core/platform/text/TextEncodingDetector.h"
30 #include "wtf/StringExtras.h" 30 #include "wtf/StringExtras.h"
31 #include "wtf/text/TextCodec.h" 31 #include "wtf/text/TextCodec.h"
32 #include "wtf/text/TextEncoding.h" 32 #include "wtf/text/TextEncoding.h"
33 #include "wtf/text/TextEncodingRegistry.h" 33 #include "wtf/text/TextEncodingRegistry.h"
34 34
(...skipping 133 matching lines...) Expand 10 before | Expand all | Expand 10 after
168 int euc = 0; 168 int euc = 0;
169 169
170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str); 170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
171 171
172 code = ASCII; 172 code = ASCII;
173 173
174 i = 0; 174 i = 0;
175 while (i < size) { 175 while (i < size) {
176 if (ptr[i] == ESC && (size - i >= 3)) { 176 if (ptr[i] == ESC && (size - i >= 3)) {
177 if (bytesEqual(str + i + 1, '$', 'B') 177 if (bytesEqual(str + i + 1, '$', 'B')
178 || bytesEqual(str + i + 1, '(', 'B') 178 || bytesEqual(str + i + 1, '(', 'B')
179 || bytesEqual(str + i + 1, '$', '@') 179 || bytesEqual(str + i + 1, '$', '@')
180 || bytesEqual(str + i + 1, '(', 'J')) { 180 || bytesEqual(str + i + 1, '(', 'J')) {
181 code = JIS; 181 code = JIS;
182 goto breakBreak; 182 goto breakBreak;
183 } 183 }
184 if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')' , 'I')) { 184 if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')' , 'I')) {
185 code = JIS; 185 code = JIS;
186 i += 3; 186 i += 3;
187 } else { 187 } else {
188 i++; 188 i++;
189 } 189 }
190 bfr = false; 190 bfr = false;
191 bfk = 0; 191 bfk = 0;
192 } else { 192 } else {
193 if (ptr[i] < 0x20) { 193 if (ptr[i] < 0x20) {
194 bfr = false; 194 bfr = false;
195 bfk = 0; 195 bfk = 0;
196 /* ?? check kudokuten ?? && ?? hiragana ?? */ 196 /* ?? check kudokuten ?? && ?? hiragana ?? */
197 if ((i >= 2) && (ptr[i - 2] == 0x81) 197 if ((i >= 2) && (ptr[i - 2] == 0x81)
198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) { 198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
199 code = SJIS; 199 code = SJIS;
200 sjis += 100; /* kudokuten */ 200 sjis += 100; /* kudokuten */
201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1) 201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1) && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
202 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
203 code = EUC; 202 code = EUC;
204 euc += 100; /* kudokuten */ 203 euc += 100; /* kudokuten */
205 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) { 204 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
206 sjis += 40; /* hiragana */ 205 sjis += 40; /* hiragana */
207 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) { 206 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
208 euc += 40; /* hiragana */ 207 euc += 40; /* hiragana */
209 } 208 }
210 } else { 209 } else {
211 /* ?? check hiragana or katana ?? */ 210 /* ?? check hiragana or katana ?? */
212 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) { 211 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
213 sjis++; /* hiragana */ 212 sjis++; /* hiragana */
214 } else if ((size - i > 1) && (ptr[i] == 0x83) 213 } else if ((size - i > 1) && (ptr[i] == 0x83) && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
215 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
216 sjis++; /* katakana */ 214 sjis++; /* katakana */
217 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) { 215 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
218 euc++; /* hiragana */ 216 euc++; /* hiragana */
219 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) { 217 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
220 euc++; /* katakana */ 218 euc++; /* katakana */
221 } 219 }
222 if (bfr) { 220 if (bfr) {
223 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanj i(ptr[i - 1])) { 221 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanj i(ptr[i - 1])) {
224 code = SJIS; 222 code = SJIS;
225 goto breakBreak; 223 goto breakBreak;
226 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc) )) { 224 }
225
226 if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) & & ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
227 code = SJIS; 227 code = SJIS;
228 goto breakBreak; 228 goto breakBreak;
229 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) { 229 }
230
231 if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
230 code = EUC; 232 code = EUC;
231 goto breakBreak; 233 goto breakBreak;
232 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) { 234 }
235
236 if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) & & (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
233 code = EUC; 237 code = EUC;
234 goto breakBreak; 238 goto breakBreak;
235 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && ( 0x8e == ptr[i - 1])) { 239 }
240
241 if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
236 code = SJIS; 242 code = SJIS;
237 goto breakBreak; 243 goto breakBreak;
238 } else if (ptr[i] <= 0x7f) { 244 }
245
246 if (ptr[i] <= 0x7f) {
239 code = SJIS; 247 code = SJIS;
240 goto breakBreak; 248 goto breakBreak;
241 } else {
242 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
243 euc++; /* sjis hankaku kana kigo */
244 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
245 ; /* sjis hankaku kana */
246 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
247 euc++;
248 } else if (0x8e == ptr[i]) {
249 euc++;
250 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
251 sjis++;
252 }
253 bfr = false;
254 bfk = 0;
255 } 249 }
250
251 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
252 euc++; /* sjis hankaku kana kigo */
253 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
254 /* sjis hankaku kana */
255 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
256 euc++;
257 } else if (0x8e == ptr[i]) {
258 euc++;
259 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
260 sjis++;
261 }
262
263 bfr = false;
264 bfk = 0;
256 } else if (0x8e == ptr[i]) { 265 } else if (0x8e == ptr[i]) {
257 if (size - i <= 1) { 266 if (size - i <= 1) {
258 ; 267
259 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) { 268 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
260 /* EUC KANA or SJIS KANJI */ 269 /* EUC KANA or SJIS KANJI */
261 if (bfk == 1) { 270 if (bfk == 1) {
262 euc += 100; 271 euc += 100;
263 } 272 }
264 bfk++; 273 bfk++;
265 i++; 274 i++;
266 } else { 275 } else {
267 /* SJIS only */ 276 /* SJIS only */
268 code = SJIS; 277 code = SJIS;
269 goto breakBreak; 278 goto breakBreak;
270 } 279 }
271 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) { 280 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
272 /* SJIS only */ 281 /* SJIS only */
273 code = SJIS; 282 code = SJIS;
274 if ((size - i >= 1) 283 if ((size - i >= 1) && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e) || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc)))
275 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
276 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
277 goto breakBreak; 284 goto breakBreak;
278 }
279 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) { 285 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
280 /* EUC only */ 286 /* EUC only */
281 code = EUC; 287 code = EUC;
282 if ((size - i >= 1) 288 if ((size - i >= 1) && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe))
283 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
284 goto breakBreak; 289 goto breakBreak;
285 }
286 } else if (ptr[i] <= 0x7f) { 290 } else if (ptr[i] <= 0x7f) {
287 ; 291
288 } else { 292 } else {
289 bfr = true; 293 bfr = true;
290 bfk = 0; 294 bfk = 0;
291 } 295 }
292 } 296 }
293 i++; 297 i++;
294 } 298 }
295 } 299 }
296 if (code == ASCII) { 300 if (code == ASCII) {
297 if (sjis > euc) { 301 if (sjis > euc) {
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
346 } 350 }
347 351
348 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, Encodin gSource source) 352 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, Encodin gSource source)
349 { 353 {
350 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings). 354 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
351 if (!encoding.isValid()) 355 if (!encoding.isValid())
352 return; 356 return;
353 357
354 // When encoding comes from meta tag (i.e. it cannot be XML files sent via X HR), 358 // When encoding comes from meta tag (i.e. it cannot be XML files sent via X HR),
355 // treat x-user-defined as windows-1252 (bug 18270) 359 // treat x-user-defined as windows-1252 (bug 18270)
356 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-def ined") == 0) 360 if (source == EncodingFromMetaTag && !strcasecmp(encoding.name(), "x-user-de fined"))
357 m_encoding = "windows-1252"; 361 m_encoding = "windows-1252";
358 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset) 362 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
359 m_encoding = encoding.closestByteBasedEquivalent(); 363 m_encoding = encoding.closestByteBasedEquivalent();
360 else 364 else
361 m_encoding = encoding; 365 m_encoding = encoding;
362 366
363 m_codec.clear(); 367 m_codec.clear();
364 m_source = source; 368 m_source = source;
365 } 369 }
366 370
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
426 size_t buf2Len = len; 430 size_t buf2Len = len;
427 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer. data()); 431 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer. data());
428 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); 432 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
429 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0; 433 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0;
430 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0; 434 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0;
431 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0; 435 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0;
432 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; 436 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
433 437
434 // Check for the BOM. 438 // Check for the BOM.
435 if (c1 == 0xFF && c2 == 0xFE) { 439 if (c1 == 0xFF && c2 == 0xFE) {
436 if (c3 != 0 || c4 != 0) { 440 if (c3 || c4) {
437 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 441 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
438 lengthOfBOM = 2; 442 lengthOfBOM = 2;
439 } else { 443 } else {
440 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 444 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
441 lengthOfBOM = 4; 445 lengthOfBOM = 4;
442 } 446 }
443 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { 447 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
444 setEncoding(UTF8Encoding(), AutoDetectedEncoding); 448 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
445 lengthOfBOM = 3; 449 lengthOfBOM = 3;
446 } else if (c1 == 0xFE && c2 == 0xFF) { 450 } else if (c1 == 0xFE && c2 == 0xFF) {
447 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 451 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
448 lengthOfBOM = 2; 452 lengthOfBOM = 2;
449 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) { 453 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {
450 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 454 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
451 lengthOfBOM = 4; 455 lengthOfBOM = 4;
452 } 456 }
453 457
454 if (lengthOfBOM || bufferLength + len >= 4) 458 if (lengthOfBOM || bufferLength + len >= 4)
455 m_checkedForBOM = true; 459 m_checkedForBOM = true;
456 460
457 return lengthOfBOM; 461 return lengthOfBOM;
458 } 462 }
459 463
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
527 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>') 531 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
528 ++xmlDeclarationEnd; 532 ++xmlDeclarationEnd;
529 if (xmlDeclarationEnd == pEnd) 533 if (xmlDeclarationEnd == pEnd)
530 return false; 534 return false;
531 // No need for +1, because we have an extra "?" to lose at the end of XM L declaration. 535 // No need for +1, because we have an extra "?" to lose at the end of XM L declaration.
532 int len = 0; 536 int len = 0;
533 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len); 537 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
534 if (pos != -1) 538 if (pos != -1)
535 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader) ; 539 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader) ;
536 // continue looking for a charset - it may be specified in an HTTP-Equiv meta 540 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
537 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0)) 541 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0)) {
538 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 542 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
539 else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x')) 543 } else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x')) {
540 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 544 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
541 else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0)) 545 } else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0)) {
542 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 546 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
543 else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?')) 547 } else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?')) {
544 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 548 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
549 }
545 550
546 m_checkedForXMLCharset = true; 551 m_checkedForXMLCharset = true;
547 return true; 552 return true;
548 } 553 }
549 554
550 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length) 555 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
551 { 556 {
552 if (m_source == UserChosenEncoding || m_source == EncodingFromHTTPHeader || m_source == AutoDetectedEncoding) { 557 if (m_source == UserChosenEncoding || m_source == EncodingFromHTTPHeader || m_source == AutoDetectedEncoding) {
553 m_checkedForMetaCharset = true; 558 m_checkedForMetaCharset = true;
554 return; 559 return;
555 } 560 }
556 561
557 if (!m_charsetParser) 562 if (!m_charsetParser)
558 m_charsetParser = HTMLMetaCharsetParser::create(); 563 m_charsetParser = HTMLMetaCharsetParser::create();
559 564
560 if (!m_charsetParser->checkForMetaCharset(data, length)) 565 if (!m_charsetParser->checkForMetaCharset(data, length))
561 return; 566 return;
562 567
563 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); 568 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
564 m_charsetParser.clear(); 569 m_charsetParser.clear();
565 m_checkedForMetaCharset = true; 570 m_checkedForMetaCharset = true;
566 return; 571 return;
567 } 572 }
568 573
569 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len) 574 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
570 { 575 {
571 switch (KanjiCode::judge(data, len)) { 576 switch (KanjiCode::judge(data, len)) {
572 case KanjiCode::JIS: 577 case KanjiCode::JIS:
573 setEncoding("ISO-2022-JP", EncodingFromContentSniffing); 578 setEncoding("ISO-2022-JP", EncodingFromContentSniffing);
574 break; 579 break;
575 case KanjiCode::EUC: 580 case KanjiCode::EUC:
576 setEncoding("EUC-JP", EncodingFromContentSniffing); 581 setEncoding("EUC-JP", EncodingFromContentSniffing);
577 break; 582 break;
578 case KanjiCode::SJIS: 583 case KanjiCode::SJIS:
579 setEncoding("Shift_JIS", EncodingFromContentSniffing); 584 setEncoding("Shift_JIS", EncodingFromContentSniffing);
580 break; 585 break;
581 case KanjiCode::ASCII: 586 case KanjiCode::ASCII:
582 case KanjiCode::UTF16: 587 case KanjiCode::UTF16:
583 case KanjiCode::UTF8: 588 case KanjiCode::UTF8:
584 break; 589 break;
585 } 590 }
586 } 591 }
587 592
588 // We use the encoding detector in two cases: 593 // We use the encoding detector in two cases:
589 // 1. Encoding detector is turned ON and no other encoding source is 594 // 1. Encoding detector is turned ON and no other encoding source is
590 // available (that is, it's DefaultEncoding). 595 // available (that is, it's DefaultEncoding).
591 // 2. Encoding detector is turned ON and the encoding is set to 596 // 2. Encoding detector is turned ON and the encoding is set to
592 // the encoding of the parent frame, which is also auto-detected. 597 // the encoding of the parent frame, which is also auto-detected.
593 // Note that condition #2 is NOT satisfied unless parent-child frame 598 // Note that condition #2 is NOT satisfied unless parent-child frame
594 // relationship is compliant to the same-origin policy. If they're from 599 // relationship is compliant to the same-origin policy. If they're from
595 // different domains, |m_source| would not be set to EncodingFromParentFrame 600 // different domains, |m_source| would not be set to EncodingFromParentFrame
596 // in the first place. 601 // in the first place.
597 bool TextResourceDecoder::shouldAutoDetect() const 602 bool TextResourceDecoder::shouldAutoDetect() const
598 { 603 {
599 // Just checking m_hintEncoding suffices here because it's only set 604 // Just checking m_hintEncoding suffices here because it's only set
600 // in setHintEncoding when the source is AutoDetectedEncoding. 605 // in setHintEncoding when the source is AutoDetectedEncoding.
601 return m_usesEncodingDetector 606 return m_usesEncodingDetector
602 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)); 607 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
603 } 608 }
604 609
605 String TextResourceDecoder::decode(const char* data, size_t len) 610 String TextResourceDecoder::decode(const char* data, size_t len)
606 { 611 {
607 size_t lengthOfBOM = 0; 612 size_t lengthOfBOM = 0;
608 if (!m_checkedForBOM) 613 if (!m_checkedForBOM)
609 lengthOfBOM = checkForBOM(data, len); 614 lengthOfBOM = checkForBOM(data, len);
610 615
611 bool movedDataToBuffer = false; 616 bool movedDataToBuffer = false;
612 617
613 if (m_contentType == CSS && !m_checkedForCSSCharset) 618 if (m_contentType == CSS && !m_checkedForCSSCharset) {
614 if (!checkForCSSCharset(data, len, movedDataToBuffer)) 619 if (!checkForCSSCharset(data, len, movedDataToBuffer))
615 return emptyString(); 620 return emptyString();
621 }
616 622
617 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForXMLChars et) 623 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForXMLChars et) {
618 if (!checkForXMLCharset(data, len, movedDataToBuffer)) 624 if (!checkForXMLCharset(data, len, movedDataToBuffer))
619 return emptyString(); 625 return emptyString();
626 }
620 627
621 // FIXME: It would be more efficient to move this logic below checkForMetaCh arset because 628 // FIXME: It would be more efficient to move this logic below checkForMetaCh arset because
622 // checkForMetaCharset can overrule these detections. 629 // checkForMetaCharset can overrule these detections.
623 if (shouldAutoDetect()) { 630 if (shouldAutoDetect()) {
624 if (m_encoding.isJapanese()) 631 if (m_encoding.isJapanese()) {
625 detectJapaneseEncoding(data, len); // FIXME: We should use detectTex tEncoding() for all languages. 632 detectJapaneseEncoding(data, len); // FIXME: We should use detectTex tEncoding() for all languages.
626 else { 633 } else {
627 WTF::TextEncoding detectedEncoding; 634 WTF::TextEncoding detectedEncoding;
628 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding) ) 635 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding) )
629 setEncoding(detectedEncoding, EncodingFromContentSniffing); 636 setEncoding(detectedEncoding, EncodingFromContentSniffing);
630 } 637 }
631 } 638 }
632 639
633 ASSERT(m_encoding.isValid()); 640 ASSERT(m_encoding.isValid());
634 641
635 const char* dataForDecode = data + lengthOfBOM; 642 const char* dataForDecode = data + lengthOfBOM;
636 size_t lengthForDecode = len - lengthOfBOM; 643 size_t lengthForDecode = len - lengthOfBOM;
(...skipping 16 matching lines...) Expand all
653 m_codec = newTextCodec(m_encoding); 660 m_codec = newTextCodec(m_encoding);
654 661
655 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_con tentType == XML && !m_useLenientXMLDecoding, m_sawError); 662 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_con tentType == XML && !m_useLenientXMLDecoding, m_sawError);
656 663
657 m_buffer.clear(); 664 m_buffer.clear();
658 return result; 665 return result;
659 } 666 }
660 667
661 String TextResourceDecoder::flush() 668 String TextResourceDecoder::flush()
662 { 669 {
663 // If we can not identify the encoding even after a document is completely 670 // If we can not identify the encoding even after a document is completely
664 // loaded, we need to detect the encoding if other conditions for 671 // loaded, we need to detect the encoding if other conditions for
665 // autodetection is satisfied. 672 // autodetection is satisfied.
666 if (m_buffer.size() && shouldAutoDetect() 673 if (m_buffer.size() && shouldAutoDetect()
667 && ((!m_checkedForXMLCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) { 674 && ((!m_checkedForXMLCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
668 WTF::TextEncoding detectedEncoding; 675 WTF::TextEncoding detectedEncoding;
669 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding)) 676 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
670 setEncoding(detectedEncoding, EncodingFromContentSniffing); 677 setEncoding(detectedEncoding, EncodingFromContentSniffing);
671 } 678 }
672 679
673 if (!m_codec) 680 if (!m_codec)
674 m_codec = newTextCodec(m_encoding); 681 m_codec = newTextCodec(m_encoding);
675 682
676 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_co ntentType == XML && !m_useLenientXMLDecoding, m_sawError); 683 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_co ntentType == XML && !m_useLenientXMLDecoding, m_sawError);
677 m_buffer.clear(); 684 m_buffer.clear();
678 m_codec.clear(); 685 m_codec.clear();
679 m_checkedForBOM = false; // Skip BOM again when re-decoding. 686 m_checkedForBOM = false; // Skip BOM again when re-decoding.
680 return result; 687 return result;
681 } 688 }
682 689
683 } 690 }
OLDNEW
« no previous file with comments | « Source/core/fetch/TextResourceDecoder.h ('k') | Source/core/fetch/XSLStyleSheetResource.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698