Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(210)

Side by Side Diff: Source/core/loader/TextResourceDecoder.cpp

Issue 23623012: Move TextResourceDecoder from loader/ to fetch/ (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Created 7 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20 */
21
22
23 #include "config.h"
24 #include "core/loader/TextResourceDecoder.h"
25
26 #include "HTMLNames.h"
27 #include "core/dom/DOMImplementation.h"
28 #include "core/html/parser/HTMLMetaCharsetParser.h"
29 #include "core/platform/text/TextEncodingDetector.h"
30 #include "wtf/StringExtras.h"
31 #include "wtf/text/TextCodec.h"
32 #include "wtf/text/TextEncoding.h"
33 #include "wtf/text/TextEncodingRegistry.h"
34
35 using namespace WTF;
36
37 namespace WebCore {
38
39 using namespace HTMLNames;
40
// Returns true when the first |count| bytes at |p| are identical to
// |expected|. Bytes are compared front to back and the scan stops at the
// first difference, so nothing past a mismatching byte is read.
static inline bool bytesMatch(const char* p, const char* expected, size_t count)
{
    for (size_t index = 0; index < count; ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}

// bytesEqual(p, b0, b1, ...) reports whether the bytes starting at |p| equal
// the listed byte values, in order. Overloads exist for 2, 3, 5, 6, 8 and 10
// bytes. The caller must guarantee that |p| is readable up to the first
// mismatching byte (or the full pattern length on a match).
static inline bool bytesEqual(const char* p, char b0, char b1)
{
    const char expected[] = { b0, b1 };
    return bytesMatch(p, expected, sizeof(expected));
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2)
{
    const char expected[] = { b0, b1, b2 };
    return bytesMatch(p, expected, sizeof(expected));
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
{
    const char expected[] = { b0, b1, b2, b3, b4 };
    return bytesMatch(p, expected, sizeof(expected));
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5 };
    return bytesMatch(p, expected, sizeof(expected));
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7 };
    return bytesMatch(p, expected, sizeof(expected));
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 };
    return bytesMatch(p, expected, sizeof(expected));
}
70
71 // You might think we should put these find functions elsewhere, perhaps with the
72 // similar functions that operate on UChar, but arguably only the decoder has
73 // a reason to process strings of char rather than UChar.
74
// Searches the first |subjectLength| bytes of |subject| for the
// NUL-terminated string |target|.
// Returns the index of the first occurrence, or -1 when |target| does not
// occur (including when it is longer than the subject). An empty |target|
// matches at index 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t needleLength = strlen(target);
    if (subjectLength < needleLength)
        return -1;

    // Last index at which the whole needle still fits inside the subject.
    const size_t lastStart = subjectLength - needleLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        size_t matched = 0;
        while (matched < needleLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == needleLength)
            return static_cast<int>(start);
    }
    return -1;
}
93
94 static WTF::TextEncoding findTextEncoding(const char* encodingName, int length)
95 {
96 Vector<char, 64> buffer(length + 1);
97 memcpy(buffer.data(), encodingName, length);
98 buffer[length] = '\0';
99 return buffer.data();
100 }
101
// Static-only helper used to guess which Japanese encoding a byte stream uses.
// judge() performs the guess; sjisMap classifies single byte values and is
// queried through ISkanji() / ISkana().
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Inspects |length| bytes at |str| and returns the guessed encoding type.
    static Type judge(const char* str, int length);

    // Escape byte that introduces the escape sequences judge() looks for.
    static const int ESC = 0x1b;

    // Per-byte classification flags: bit 0 (value 1) is tested by ISkanji(),
    // bit 1 (value 2) is tested by ISkana().
    static const unsigned char sjisMap[256];

    // Non-zero when |code| is a byte value whose sjisMap entry has bit 0 set.
    // Values of 0x100 and above always yield 0.
    static int ISkanji(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 1 : 0;
    }

    // Non-zero when |code| is a byte value whose sjisMap entry has bit 1 set.
    // Values of 0x100 and above always yield 0.
    static int ISkana(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 2 : 0;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
140
141 /*
142 * EUC-JP is
143 * [0xa1 - 0xfe][0xa1 - 0xfe]
144 * 0x8e[0xa1 - 0xfe](SS2)
145 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
146 *
147 * Shift_Jis is
148 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
149 *
150 * Shift_Jis Hankaku Kana is
151 * [0xa1 - 0xdf]
152 */
153
154 /*
155 * KanjiCode::judge() is based on judge_jcode() from jvim
156 * http://hp.vector.co.jp/authors/VA003457/vim/
157 *
158 * Special Thanks to Kenichi Tsuchida
159 */
160
161 enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
162 {
163 enum Type code;
164 int i;
165 int bfr = false; /* Kana Moji */
166 int bfk = 0; /* EUC Kana */
167 int sjis = 0;
168 int euc = 0;
169
170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
171
172 code = ASCII;
173
174 i = 0;
175 while (i < size) {
176 if (ptr[i] == ESC && (size - i >= 3)) {
177 if (bytesEqual(str + i + 1, '$', 'B')
178 || bytesEqual(str + i + 1, '(', 'B')
179 || bytesEqual(str + i + 1, '$', '@')
180 || bytesEqual(str + i + 1, '(', 'J')) {
181 code = JIS;
182 goto breakBreak;
183 }
184 if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')' , 'I')) {
185 code = JIS;
186 i += 3;
187 } else {
188 i++;
189 }
190 bfr = false;
191 bfk = 0;
192 } else {
193 if (ptr[i] < 0x20) {
194 bfr = false;
195 bfk = 0;
196 /* ?? check kudokuten ?? && ?? hiragana ?? */
197 if ((i >= 2) && (ptr[i - 2] == 0x81)
198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
199 code = SJIS;
200 sjis += 100; /* kudokuten */
201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
202 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
203 code = EUC;
204 euc += 100; /* kudokuten */
205 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
206 sjis += 40; /* hiragana */
207 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
208 euc += 40; /* hiragana */
209 }
210 } else {
211 /* ?? check hiragana or katana ?? */
212 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
213 sjis++; /* hiragana */
214 } else if ((size - i > 1) && (ptr[i] == 0x83)
215 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
216 sjis++; /* katakana */
217 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
218 euc++; /* hiragana */
219 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
220 euc++; /* katakana */
221 }
222 if (bfr) {
223 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanj i(ptr[i - 1])) {
224 code = SJIS;
225 goto breakBreak;
226 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc) )) {
227 code = SJIS;
228 goto breakBreak;
229 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
230 code = EUC;
231 goto breakBreak;
232 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
233 code = EUC;
234 goto breakBreak;
235 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && ( 0x8e == ptr[i - 1])) {
236 code = SJIS;
237 goto breakBreak;
238 } else if (ptr[i] <= 0x7f) {
239 code = SJIS;
240 goto breakBreak;
241 } else {
242 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
243 euc++; /* sjis hankaku kana kigo */
244 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
245 ; /* sjis hankaku kana */
246 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
247 euc++;
248 } else if (0x8e == ptr[i]) {
249 euc++;
250 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
251 sjis++;
252 }
253 bfr = false;
254 bfk = 0;
255 }
256 } else if (0x8e == ptr[i]) {
257 if (size - i <= 1) {
258 ;
259 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
260 /* EUC KANA or SJIS KANJI */
261 if (bfk == 1) {
262 euc += 100;
263 }
264 bfk++;
265 i++;
266 } else {
267 /* SJIS only */
268 code = SJIS;
269 goto breakBreak;
270 }
271 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
272 /* SJIS only */
273 code = SJIS;
274 if ((size - i >= 1)
275 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
276 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
277 goto breakBreak;
278 }
279 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
280 /* EUC only */
281 code = EUC;
282 if ((size - i >= 1)
283 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
284 goto breakBreak;
285 }
286 } else if (ptr[i] <= 0x7f) {
287 ;
288 } else {
289 bfr = true;
290 bfk = 0;
291 }
292 }
293 i++;
294 }
295 }
296 if (code == ASCII) {
297 if (sjis > euc) {
298 code = SJIS;
299 } else if (sjis < euc) {
300 code = EUC;
301 }
302 }
303 breakBreak:
304 return (code);
305 }
306
307 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
308 {
309 if (equalIgnoringCase(mimeType, "text/css"))
310 return CSS;
311 if (equalIgnoringCase(mimeType, "text/html"))
312 return HTML;
313 if (DOMImplementation::isXMLMIMEType(mimeType))
314 return XML;
315 return PlainText;
316 }
317
318 const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType conten tType, const WTF::TextEncoding& specifiedDefaultEncoding)
319 {
320 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
321 // for text/xml. This matches Firefox.
322 if (contentType == XML)
323 return UTF8Encoding();
324 if (!specifiedDefaultEncoding.isValid())
325 return Latin1Encoding();
326 return specifiedDefaultEncoding;
327 }
328
329 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::Text Encoding& specifiedDefaultEncoding, bool usesEncodingDetector)
330 : m_contentType(determineContentType(mimeType))
331 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
332 , m_source(DefaultEncoding)
333 , m_hintEncoding(0)
334 , m_checkedForBOM(false)
335 , m_checkedForCSSCharset(false)
336 , m_checkedForXMLCharset(false)
337 , m_checkedForMetaCharset(false)
338 , m_useLenientXMLDecoding(false)
339 , m_sawError(false)
340 , m_usesEncodingDetector(usesEncodingDetector)
341 {
342 }
343
344 TextResourceDecoder::~TextResourceDecoder()
345 {
346 }
347
348 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, Encodin gSource source)
349 {
350 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
351 if (!encoding.isValid())
352 return;
353
354 // When encoding comes from meta tag (i.e. it cannot be XML files sent via X HR),
355 // treat x-user-defined as windows-1252 (bug 18270)
356 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-def ined") == 0)
357 m_encoding = "windows-1252";
358 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
359 m_encoding = encoding.closestByteBasedEquivalent();
360 else
361 m_encoding = encoding;
362
363 m_codec.clear();
364 m_source = source;
365 }
366
367 // Returns the position of the encoding string.
368 static int findXMLEncoding(const char* str, int len, int& encodingLength)
369 {
370 int pos = find(str, len, "encoding");
371 if (pos == -1)
372 return -1;
373 pos += 8;
374
375 // Skip spaces and stray control characters.
376 while (pos < len && str[pos] <= ' ')
377 ++pos;
378
379 // Skip equals sign.
380 if (pos >= len || str[pos] != '=')
381 return -1;
382 ++pos;
383
384 // Skip spaces and stray control characters.
385 while (pos < len && str[pos] <= ' ')
386 ++pos;
387
388 // Skip quotation mark.
389 if (pos >= len)
390 return - 1;
391 char quoteMark = str[pos];
392 if (quoteMark != '"' && quoteMark != '\'')
393 return -1;
394 ++pos;
395
396 // Find the trailing quotation mark.
397 int end = pos;
398 while (end < len && str[end] != quoteMark)
399 ++end;
400 if (end >= len)
401 return -1;
402
403 encodingLength = end - pos;
404 return pos;
405 }
406
// Advances |pos| past any run of tabs and spaces, never moving beyond
// |dataEnd|. Returns true if there is more to parse, i.e. |pos| did not end
// up equal to |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        if (*pos != '\t' && *pos != ' ')
            break;
    }
    return pos != dataEnd;
}
414
415 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
416 {
417 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure s ign of a Unicode encoding.
418 // We let it override even a user-chosen encoding.
419 ASSERT(!m_checkedForBOM);
420
421 size_t lengthOfBOM = 0;
422
423 size_t bufferLength = m_buffer.size();
424
425 size_t buf1Len = bufferLength;
426 size_t buf2Len = len;
427 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer. data());
428 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
429 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0;
430 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0;
431 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b uf2++) : 0;
432 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
433
434 // Check for the BOM.
435 if (c1 == 0xFF && c2 == 0xFE) {
436 if (c3 != 0 || c4 != 0) {
437 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
438 lengthOfBOM = 2;
439 } else {
440 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
441 lengthOfBOM = 4;
442 }
443 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
444 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
445 lengthOfBOM = 3;
446 } else if (c1 == 0xFE && c2 == 0xFF) {
447 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
448 lengthOfBOM = 2;
449 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
450 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
451 lengthOfBOM = 4;
452 }
453
454 if (lengthOfBOM || bufferLength + len >= 4)
455 m_checkedForBOM = true;
456
457 return lengthOfBOM;
458 }
459
460 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
461 {
462 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
463 m_checkedForCSSCharset = true;
464 return true;
465 }
466
467 size_t oldSize = m_buffer.size();
468 m_buffer.grow(oldSize + len);
469 memcpy(m_buffer.data() + oldSize, data, len);
470
471 movedDataToBuffer = true;
472
473 if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13
474 return false;
475
476 const char* dataStart = m_buffer.data();
477 const char* dataEnd = dataStart + m_buffer.size();
478
479 if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {
480 dataStart += 10;
481 const char* pos = dataStart;
482
483 while (pos < dataEnd && *pos != '"')
484 ++pos;
485 if (pos == dataEnd)
486 return false;
487
488 int encodingNameLength = pos - dataStart;
489
490 ++pos;
491
492 if (*pos == ';')
493 setEncoding(findTextEncoding(dataStart, encodingNameLength), Encodin gFromCSSCharset);
494 }
495
496 m_checkedForCSSCharset = true;
497 return true;
498 }
499
500 bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer)
501 {
502 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
503 m_checkedForXMLCharset = true;
504 return true;
505 }
506
507 // This is not completely efficient, since the function might go
508 // through the HTML head several times.
509
510 size_t oldSize = m_buffer.size();
511 m_buffer.grow(oldSize + len);
512 memcpy(m_buffer.data() + oldSize, data, len);
513
514 movedDataToBuffer = true;
515
516 const char* ptr = m_buffer.data();
517 const char* pEnd = ptr + m_buffer.size();
518
519 // Is there enough data available to check for XML declaration?
520 if (m_buffer.size() < 8)
521 return false;
522
523 // Handle XML declaration, which can have encoding in it. This encoding is h onored even for HTML documents.
524 // It is an error for an XML declaration not to be at the start of an XML do cument, and it is ignored in HTML documents in such case.
525 if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {
526 const char* xmlDeclarationEnd = ptr;
527 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
528 ++xmlDeclarationEnd;
529 if (xmlDeclarationEnd == pEnd)
530 return false;
531 // No need for +1, because we have an extra "?" to lose at the end of XM L declaration.
532 int len = 0;
533 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
534 if (pos != -1)
535 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader) ;
536 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
537 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0))
538 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
539 else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x'))
540 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
541 else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0))
542 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
543 else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?'))
544 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
545
546 m_checkedForXMLCharset = true;
547 return true;
548 }
549
550 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
551 {
552 if (m_source == UserChosenEncoding || m_source == EncodingFromHTTPHeader || m_source == AutoDetectedEncoding) {
553 m_checkedForMetaCharset = true;
554 return;
555 }
556
557 if (!m_charsetParser)
558 m_charsetParser = HTMLMetaCharsetParser::create();
559
560 if (!m_charsetParser->checkForMetaCharset(data, length))
561 return;
562
563 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
564 m_charsetParser.clear();
565 m_checkedForMetaCharset = true;
566 return;
567 }
568
569 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
570 {
571 switch (KanjiCode::judge(data, len)) {
572 case KanjiCode::JIS:
573 setEncoding("ISO-2022-JP", EncodingFromContentSniffing);
574 break;
575 case KanjiCode::EUC:
576 setEncoding("EUC-JP", EncodingFromContentSniffing);
577 break;
578 case KanjiCode::SJIS:
579 setEncoding("Shift_JIS", EncodingFromContentSniffing);
580 break;
581 case KanjiCode::ASCII:
582 case KanjiCode::UTF16:
583 case KanjiCode::UTF8:
584 break;
585 }
586 }
587
588 // We use the encoding detector in two cases:
589 // 1. Encoding detector is turned ON and no other encoding source is
590 // available (that is, it's DefaultEncoding).
591 // 2. Encoding detector is turned ON and the encoding is set to
592 // the encoding of the parent frame, which is also auto-detected.
593 // Note that condition #2 is NOT satisfied unless parent-child frame
594 // relationship is compliant to the same-origin policy. If they're from
595 // different domains, |m_source| would not be set to EncodingFromParentFrame
596 // in the first place.
597 bool TextResourceDecoder::shouldAutoDetect() const
598 {
599 // Just checking m_hintEncoding suffices here because it's only set
600 // in setHintEncoding when the source is AutoDetectedEncoding.
601 return m_usesEncodingDetector
602 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
603 }
604
605 String TextResourceDecoder::decode(const char* data, size_t len)
606 {
607 size_t lengthOfBOM = 0;
608 if (!m_checkedForBOM)
609 lengthOfBOM = checkForBOM(data, len);
610
611 bool movedDataToBuffer = false;
612
613 if (m_contentType == CSS && !m_checkedForCSSCharset)
614 if (!checkForCSSCharset(data, len, movedDataToBuffer))
615 return emptyString();
616
617 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForXMLChars et)
618 if (!checkForXMLCharset(data, len, movedDataToBuffer))
619 return emptyString();
620
621 // FIXME: It would be more efficient to move this logic below checkForMetaCh arset because
622 // checkForMetaCharset can overrule these detections.
623 if (shouldAutoDetect()) {
624 if (m_encoding.isJapanese())
625 detectJapaneseEncoding(data, len); // FIXME: We should use detectTex tEncoding() for all languages.
626 else {
627 WTF::TextEncoding detectedEncoding;
628 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding) )
629 setEncoding(detectedEncoding, EncodingFromContentSniffing);
630 }
631 }
632
633 ASSERT(m_encoding.isValid());
634
635 const char* dataForDecode = data + lengthOfBOM;
636 size_t lengthForDecode = len - lengthOfBOM;
637
638 if (!m_buffer.isEmpty()) {
639 if (!movedDataToBuffer) {
640 size_t oldSize = m_buffer.size();
641 m_buffer.grow(oldSize + len);
642 memcpy(m_buffer.data() + oldSize, data, len);
643 }
644
645 dataForDecode = m_buffer.data() + lengthOfBOM;
646 lengthForDecode = m_buffer.size() - lengthOfBOM;
647 }
648
649 if (m_contentType == HTML && !m_checkedForMetaCharset)
650 checkForMetaCharset(dataForDecode, lengthForDecode);
651
652 if (!m_codec)
653 m_codec = newTextCodec(m_encoding);
654
655 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_con tentType == XML && !m_useLenientXMLDecoding, m_sawError);
656
657 m_buffer.clear();
658 return result;
659 }
660
661 String TextResourceDecoder::flush()
662 {
663 // If we can not identify the encoding even after a document is completely
664 // loaded, we need to detect the encoding if other conditions for
665 // autodetection is satisfied.
666 if (m_buffer.size() && shouldAutoDetect()
667 && ((!m_checkedForXMLCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
668 WTF::TextEncoding detectedEncoding;
669 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
670 setEncoding(detectedEncoding, EncodingFromContentSniffing);
671 }
672
673 if (!m_codec)
674 m_codec = newTextCodec(m_encoding);
675
676 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_co ntentType == XML && !m_useLenientXMLDecoding, m_sawError);
677 m_buffer.clear();
678 m_codec.clear();
679 m_checkedForBOM = false; // Skip BOM again when re-decoding.
680 return result;
681 }
682
683 }
OLDNEW
« no previous file with comments | « Source/core/loader/TextResourceDecoder.h ('k') | Source/core/loader/TextResourceDecoderBuilder.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698