Source/core/loader/TextResourceDecoder.cpp - Issue 23623012: Move TextResourceDecoder from loader/ to fetch/

Side by Side Diff: Source/core/loader/TextResourceDecoder.cpp

Issue 23623012: Move TextResourceDecoder from loader/ to fetch/ (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Created 7 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /*

2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)

3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.

4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

5

6 This library is free software; you can redistribute it and/or

7 modify it under the terms of the GNU Library General Public

8 License as published by the Free Software Foundation; either

9 version 2 of the License, or (at your option) any later version.

10

11 This library is distributed in the hope that it will be useful,

12 but WITHOUT ANY WARRANTY; without even the implied warranty of

13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

14 Library General Public License for more details.

15

16 You should have received a copy of the GNU Library General Public License

17 along with this library; see the file COPYING.LIB. If not, write to

18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,

19 Boston, MA 02110-1301, USA.

20 */

21

22

23 #include "config.h"

24 #include "core/loader/TextResourceDecoder.h"

25

26 #include "HTMLNames.h"

27 #include "core/dom/DOMImplementation.h"

28 #include "core/html/parser/HTMLMetaCharsetParser.h"

29 #include "core/platform/text/TextEncodingDetector.h"

30 #include "wtf/StringExtras.h"

31 #include "wtf/text/TextCodec.h"

32 #include "wtf/text/TextEncoding.h"

33 #include "wtf/text/TextEncodingRegistry.h"

34

35 using namespace WTF;

36

37 namespace WebCore {

38

39 using namespace HTMLNames;

40

// Family of fixed-width byte comparisons: each overload reports whether the
// bytes starting at |p| equal the given sequence, comparing left to right and
// stopping at the first mismatch. Callers must guarantee that |p| addresses at
// least as many bytes as the overload compares. Wider overloads are expressed
// in terms of the narrower ones so the left-to-right order is kept.
static inline bool bytesEqual(const char* p, char b0, char b1)
{
    if (p[0] != b0)
        return false;
    return p[1] == b1;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2)
{
    return bytesEqual(p, b0, b1) && p[2] == b2;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
{
    return bytesEqual(p, b0, b1, b2) && p[3] == b3 && p[4] == b4;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
{
    return bytesEqual(p, b0, b1, b2, b3, b4) && p[5] == b5;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
{
    return bytesEqual(p, b0, b1, b2, b3, b4, b5) && p[6] == b6 && p[7] == b7;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
{
    return bytesEqual(p, b0, b1, b2, b3, b4, b5, b6, b7) && p[8] == b8 && p[9] == b9;
}

70

71 // You might think we should put these find functions elsewhere, perhaps with the

72 // similar functions that operate on UChar, but arguably only the decoder has

73 // a reason to process strings of char rather than UChar.

74

// Searches the first |subjectLength| bytes of |subject| (which need not be
// NUL-terminated) for the NUL-terminated string |target|.
// Returns the offset of the first occurrence, or -1 if there is none. An empty
// |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (subjectLength < targetLength)
        return -1;

    // Last offset at which |target| still fits entirely inside |subject|.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        if (!memcmp(subject + start, target, targetLength))
            return static_cast<int>(start);
    }
    return -1;
}

93

94 static WTF::TextEncoding findTextEncoding(const char* encodingName, int length)

95 {

96 Vector<char, 64> buffer(length + 1);

97 memcpy(buffer.data(), encodingName, length);

98 buffer[length] = '\0';

99 return buffer.data();

100 }

101

// Heuristic classifier for Japanese byte streams; see judge().
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Guesses which encoding the |length| bytes at |str| use.
    static enum Type judge(const char* str, int length);

    // Escape byte that introduces the ISO-2022-JP sequences judge() looks for.
    static const int ESC = 0x1b;

    // Per-byte flags consulted by ISkanji() (bit value 1) and ISkana() (bit value 2).
    static const unsigned char sjisMap[256];

    // Non-zero when |code| is a single byte value whose table entry has bit
    // value 1 set; always 0 for codes of 0x100 and above.
    static int ISkanji(int code)
    {
        return code >= 0x100 ? 0 : sjisMap[code & 0xff] & 1;
    }

    // Non-zero when |code| is a single byte value whose table entry has bit
    // value 2 set; always 0 for codes of 0x100 and above.
    static int ISkana(int code)
    {
        return code >= 0x100 ? 0 : sjisMap[code & 0xff] & 2;
    }
};

121

122 const unsigned char KanjiCode::sjisMap[256] = {

123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

126 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

131 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

132 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

133 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

134 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

135 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

136 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

137 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

138 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0

139 };

140

141 /*

142 * EUC-JP is

143 * [0xa1 - 0xfe][0xa1 - 0xfe]

144 * 0x8e[0xa1 - 0xfe](SS2)

145 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)

146 *

147 * Shift_Jis is

148 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]

149 *

150 * Shift_Jis Hankaku Kana is

151 * [0xa1 - 0xdf]

152 */

153

154 /*

155 * KanjiCode::judge() is based on judge_jcode() from jvim

156 * http://hp.vector.co.jp/authors/VA003457/vim/

157 *

158 * Special Thanks to Kenichi Tsuchida

159 */

160

161 enum KanjiCode::Type KanjiCode::judge(const char* str, int size)

162 {

163 enum Type code;

164 int i;

165 int bfr = false; /* Kana Moji */

166 int bfk = 0; /* EUC Kana */

167 int sjis = 0;

168 int euc = 0;

169

170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

171

172 code = ASCII;

173

174 i = 0;

175 while (i < size) {

176 if (ptr[i] == ESC && (size - i >= 3)) {

177 if (bytesEqual(str + i + 1, '$', 'B')

178 \|\| bytesEqual(str + i + 1, '(', 'B')

179 \|\| bytesEqual(str + i + 1, '$', '@')

180 \|\| bytesEqual(str + i + 1, '(', 'J')) {

181 code = JIS;

182 goto breakBreak;

183 }

184 if (bytesEqual(str + i + 1, '(', 'I') \|\| bytesEqual(str + i + 1, ')' , 'I')) {

185 code = JIS;

186 i += 3;

187 } else {

188 i++;

189 }

190 bfr = false;

191 bfk = 0;

192 } else {

193 if (ptr[i] < 0x20) {

194 bfr = false;

195 bfk = 0;

196 /* ?? check kudokuten ?? && ?? hiragana ?? */

197 if ((i >= 2) && (ptr[i - 2] == 0x81)

198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {

199 code = SJIS;

200 sjis += 100; /* kudokuten */

201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)

202 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {

203 code = EUC;

204 euc += 100; /* kudokuten */

205 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {

206 sjis += 40; /* hiragana */

207 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {

208 euc += 40; /* hiragana */

209 }

210 } else {

211 /* ?? check hiragana or katana ?? */

212 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {

213 sjis++; /* hiragana */

214 } else if ((size - i > 1) && (ptr[i] == 0x83)

215 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {

216 sjis++; /* katakana */

217 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {

218 euc++; /* hiragana */

219 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {

220 euc++; /* katakana */

221 }

222 if (bfr) {

223 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanj i(ptr[i - 1])) {

224 code = SJIS;

225 goto breakBreak;

226 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) \|\| (0x7e < ptr[i] && ptr[i] <= 0xfc) )) {

227 code = SJIS;

228 goto breakBreak;

229 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {

230 code = EUC;

231 goto breakBreak;

232 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {

233 code = EUC;

234 goto breakBreak;

235 } else if ((i >= 1) && (ptr[i] < 0xa0 \|\| 0xdf < ptr[i]) && ( 0x8e == ptr[i - 1])) {

236 code = SJIS;

237 goto breakBreak;

238 } else if (ptr[i] <= 0x7f) {

239 code = SJIS;

240 goto breakBreak;

241 } else {

242 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {

243 euc++; /* sjis hankaku kana kigo */

244 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {

245 ; /* sjis hankaku kana */

246 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {

247 euc++;

248 } else if (0x8e == ptr[i]) {

249 euc++;

250 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {

251 sjis++;

252 }

253 bfr = false;

254 bfk = 0;

255 }

256 } else if (0x8e == ptr[i]) {

257 if (size - i <= 1) {

258 ;

259 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {

260 /* EUC KANA or SJIS KANJI */

261 if (bfk == 1) {

262 euc += 100;

263 }

264 bfk++;

265 i++;

266 } else {

267 /* SJIS only */

268 code = SJIS;

269 goto breakBreak;

270 }

271 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {

272 /* SJIS only */

273 code = SJIS;

274 if ((size - i >= 1)

275 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)

276 \|\| (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {

277 goto breakBreak;

278 }

279 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {

280 /* EUC only */

281 code = EUC;

282 if ((size - i >= 1)

283 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {

284 goto breakBreak;

285 }

286 } else if (ptr[i] <= 0x7f) {

287 ;

288 } else {

289 bfr = true;

290 bfk = 0;

291 }

292 }

293 i++;

294 }

295 }

296 if (code == ASCII) {

297 if (sjis > euc) {

298 code = SJIS;

299 } else if (sjis < euc) {

300 code = EUC;

301 }

302 }

303 breakBreak:

304 return (code);

305 }

306

307 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)

308 {

309 if (equalIgnoringCase(mimeType, "text/css"))

310 return CSS;

311 if (equalIgnoringCase(mimeType, "text/html"))

312 return HTML;

313 if (DOMImplementation::isXMLMIMEType(mimeType))

314 return XML;

315 return PlainText;

316 }

317

318 const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType conten tType, const WTF::TextEncoding& specifiedDefaultEncoding)

319 {

320 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII

321 // for text/xml. This matches Firefox.

322 if (contentType == XML)

323 return UTF8Encoding();

324 if (!specifiedDefaultEncoding.isValid())

325 return Latin1Encoding();

326 return specifiedDefaultEncoding;

327 }

328

329 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::Text Encoding& specifiedDefaultEncoding, bool usesEncodingDetector)

330 : m_contentType(determineContentType(mimeType))

331 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))

332 , m_source(DefaultEncoding)

333 , m_hintEncoding(0)

334 , m_checkedForBOM(false)

335 , m_checkedForCSSCharset(false)

336 , m_checkedForXMLCharset(false)

337 , m_checkedForMetaCharset(false)

338 , m_useLenientXMLDecoding(false)

339 , m_sawError(false)

340 , m_usesEncodingDetector(usesEncodingDetector)

341 {

342 }

343

344 TextResourceDecoder::~TextResourceDecoder()

345 {

346 }

347

348 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, Encodin gSource source)

349 {

350 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).

351 if (!encoding.isValid())

352 return;

353

354 // When encoding comes from meta tag (i.e. it cannot be XML files sent via X HR),

355 // treat x-user-defined as windows-1252 (bug 18270)

356 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-def ined") == 0)

357 m_encoding = "windows-1252";

358 else if (source == EncodingFromMetaTag \|\| source == EncodingFromXMLHeader \|\| source == EncodingFromCSSCharset)

359 m_encoding = encoding.closestByteBasedEquivalent();

360 else

361 m_encoding = encoding;

362

363 m_codec.clear();

364 m_source = source;

365 }

366

367 // Returns the position of the encoding string.

368 static int findXMLEncoding(const char* str, int len, int& encodingLength)

369 {

370 int pos = find(str, len, "encoding");

371 if (pos == -1)

372 return -1;

373 pos += 8;

374

375 // Skip spaces and stray control characters.

376 while (pos < len && str[pos] <= ' ')

377 ++pos;

378

379 // Skip equals sign.

380 if (pos >= len \|\| str[pos] != '=')

381 return -1;

382 ++pos;

383

384 // Skip spaces and stray control characters.

385 while (pos < len && str[pos] <= ' ')

386 ++pos;

387

388 // Skip quotation mark.

389 if (pos >= len)

390 return - 1;

391 char quoteMark = str[pos];

392 if (quoteMark != '"' && quoteMark != '\'')

393 return -1;

394 ++pos;

395

396 // Find the trailing quotation mark.

397 int end = pos;

398 while (end < len && str[end] != quoteMark)

399 ++end;

400 if (end >= len)

401 return -1;

402

403 encodingLength = end - pos;

404 return pos;

405 }

406

// Advances |pos| past any run of tab and space characters, never moving beyond
// |dataEnd|. |pos| is updated in place.
// Returns true if there is more to parse, i.e. |pos| stopped before |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    while (pos < dataEnd && (*pos == '\t' || *pos == ' '))
        ++pos;
    return pos != dataEnd;
}

414

415 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)

416 {

417 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure s ign of a Unicode encoding.

418 // We let it override even a user-chosen encoding.

419 ASSERT(!m_checkedForBOM);

420

421 size_t lengthOfBOM = 0;

422

423 size_t bufferLength = m_buffer.size();

424

425 size_t buf1Len = bufferLength;

426 size_t buf2Len = len;

427 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer. data());

428 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

429 unsigned char c1 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, b uf2++) : 0;

430 unsigned char c2 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, b uf2++) : 0;

431 unsigned char c3 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, b uf2++) : 0;

432 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

433

434 // Check for the BOM.

435 if (c1 == 0xFF && c2 == 0xFE) {

436 if (c3 != 0 \|\| c4 != 0) {

437 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

438 lengthOfBOM = 2;

439 } else {

440 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

441 lengthOfBOM = 4;

442 }

443 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {

444 setEncoding(UTF8Encoding(), AutoDetectedEncoding);

445 lengthOfBOM = 3;

446 } else if (c1 == 0xFE && c2 == 0xFF) {

447 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

448 lengthOfBOM = 2;

449 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {

450 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

451 lengthOfBOM = 4;

452 }

453

454 if (lengthOfBOM \|\| bufferLength + len >= 4)

455 m_checkedForBOM = true;

456

457 return lengthOfBOM;

458 }

459

460 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)

461 {

462 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {

463 m_checkedForCSSCharset = true;

464 return true;

465 }

466

467 size_t oldSize = m_buffer.size();

468 m_buffer.grow(oldSize + len);

469 memcpy(m_buffer.data() + oldSize, data, len);

470

471 movedDataToBuffer = true;

472

473 if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13

474 return false;

475

476 const char* dataStart = m_buffer.data();

477 const char* dataEnd = dataStart + m_buffer.size();

478

479 if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {

480 dataStart += 10;

481 const char* pos = dataStart;

482

483 while (pos < dataEnd && *pos != '"')

484 ++pos;

485 if (pos == dataEnd)

486 return false;

487

488 int encodingNameLength = pos - dataStart;

489

490 ++pos;

491

492 if (*pos == ';')

493 setEncoding(findTextEncoding(dataStart, encodingNameLength), Encodin gFromCSSCharset);

494 }

495

496 m_checkedForCSSCharset = true;

497 return true;

498 }

499

500 bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer)

501 {

502 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {

503 m_checkedForXMLCharset = true;

504 return true;

505 }

506

507 // This is not completely efficient, since the function might go

508 // through the HTML head several times.

509

510 size_t oldSize = m_buffer.size();

511 m_buffer.grow(oldSize + len);

512 memcpy(m_buffer.data() + oldSize, data, len);

513

514 movedDataToBuffer = true;

515

516 const char* ptr = m_buffer.data();

517 const char* pEnd = ptr + m_buffer.size();

518

519 // Is there enough data available to check for XML declaration?

520 if (m_buffer.size() < 8)

521 return false;

522

523 // Handle XML declaration, which can have encoding in it. This encoding is h onored even for HTML documents.

524 // It is an error for an XML declaration not to be at the start of an XML do cument, and it is ignored in HTML documents in such case.

525 if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {

526 const char* xmlDeclarationEnd = ptr;

527 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')

528 ++xmlDeclarationEnd;

529 if (xmlDeclarationEnd == pEnd)

530 return false;

531 // No need for +1, because we have an extra "?" to lose at the end of XM L declaration.

532 int len = 0;

533 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);

534 if (pos != -1)

535 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader) ;

536 // continue looking for a charset - it may be specified in an HTTP-Equiv meta

537 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0))

538 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

539 else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x'))

540 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

541 else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0))

542 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

543 else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?'))

544 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

545

546 m_checkedForXMLCharset = true;

547 return true;

548 }

549

550 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)

551 {

552 if (m_source == UserChosenEncoding \|\| m_source == EncodingFromHTTPHeader \|\| m_source == AutoDetectedEncoding) {

553 m_checkedForMetaCharset = true;

554 return;

555 }

556

557 if (!m_charsetParser)

558 m_charsetParser = HTMLMetaCharsetParser::create();

559

560 if (!m_charsetParser->checkForMetaCharset(data, length))

561 return;

562

563 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);

564 m_charsetParser.clear();

565 m_checkedForMetaCharset = true;

566 return;

567 }

568

569 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)

570 {

571 switch (KanjiCode::judge(data, len)) {

572 case KanjiCode::JIS:

573 setEncoding("ISO-2022-JP", EncodingFromContentSniffing);

574 break;

575 case KanjiCode::EUC:

576 setEncoding("EUC-JP", EncodingFromContentSniffing);

577 break;

578 case KanjiCode::SJIS:

579 setEncoding("Shift_JIS", EncodingFromContentSniffing);

580 break;

581 case KanjiCode::ASCII:

582 case KanjiCode::UTF16:

583 case KanjiCode::UTF8:

584 break;

585 }

586 }

587

588 // We use the encoding detector in two cases:

589 // 1. Encoding detector is turned ON and no other encoding source is

590 // available (that is, it's DefaultEncoding).

591 // 2. Encoding detector is turned ON and the encoding is set to

592 // the encoding of the parent frame, which is also auto-detected.

593 // Note that condition #2 is NOT satisfied unless parent-child frame

594 // relationship is compliant to the same-origin policy. If they're from

595 // different domains, \|m_source\| would not be set to EncodingFromParentFrame

596 // in the first place.

597 bool TextResourceDecoder::shouldAutoDetect() const

598 {

599 // Just checking m_hintEncoding suffices here because it's only set

600 // in setHintEncoding when the source is AutoDetectedEncoding.

601 return m_usesEncodingDetector

602 && (m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_hintEncoding));

603 }

604

605 String TextResourceDecoder::decode(const char* data, size_t len)

606 {

607 size_t lengthOfBOM = 0;

608 if (!m_checkedForBOM)

609 lengthOfBOM = checkForBOM(data, len);

610

611 bool movedDataToBuffer = false;

612

613 if (m_contentType == CSS && !m_checkedForCSSCharset)

614 if (!checkForCSSCharset(data, len, movedDataToBuffer))

615 return emptyString();

616

617 if ((m_contentType == HTML \|\| m_contentType == XML) && !m_checkedForXMLChars et)

618 if (!checkForXMLCharset(data, len, movedDataToBuffer))

619 return emptyString();

620

621 // FIXME: It would be more efficient to move this logic below checkForMetaCh arset because

622 // checkForMetaCharset can overrule these detections.

623 if (shouldAutoDetect()) {

624 if (m_encoding.isJapanese())

625 detectJapaneseEncoding(data, len); // FIXME: We should use detectTex tEncoding() for all languages.

626 else {

627 WTF::TextEncoding detectedEncoding;

628 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding) )

629 setEncoding(detectedEncoding, EncodingFromContentSniffing);

630 }

631 }

632

633 ASSERT(m_encoding.isValid());

634

635 const char* dataForDecode = data + lengthOfBOM;

636 size_t lengthForDecode = len - lengthOfBOM;

637

638 if (!m_buffer.isEmpty()) {

639 if (!movedDataToBuffer) {

640 size_t oldSize = m_buffer.size();

641 m_buffer.grow(oldSize + len);

642 memcpy(m_buffer.data() + oldSize, data, len);

643 }

644

645 dataForDecode = m_buffer.data() + lengthOfBOM;

646 lengthForDecode = m_buffer.size() - lengthOfBOM;

647 }

648

649 if (m_contentType == HTML && !m_checkedForMetaCharset)

650 checkForMetaCharset(dataForDecode, lengthForDecode);

651

652 if (!m_codec)

653 m_codec = newTextCodec(m_encoding);

654

655 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_con tentType == XML && !m_useLenientXMLDecoding, m_sawError);

656

657 m_buffer.clear();

658 return result;

659 }

660

661 String TextResourceDecoder::flush()

662 {

663 // If we can not identify the encoding even after a document is completely

664 // loaded, we need to detect the encoding if other conditions for

665 // autodetection is satisfied.

666 if (m_buffer.size() && shouldAutoDetect()

667 && ((!m_checkedForXMLCharset && (m_contentType == HTML \|\| m_contentType == XML)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSS)))) {

668 WTF::TextEncoding detectedEncoding;

669 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))

670 setEncoding(detectedEncoding, EncodingFromContentSniffing);

671 }

672

673 if (!m_codec)

674 m_codec = newTextCodec(m_encoding);

675

676 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_co ntentType == XML && !m_useLenientXMLDecoding, m_sawError);

677 m_buffer.clear();

678 m_codec.clear();

679 m_checkedForBOM = false; // Skip BOM again when re-decoding.

680 return result;

681 }

682

683 }

OLD	NEW

« no previous file with comments | « Source/core/loader/TextResourceDecoder.h ('k') | Source/core/loader/TextResourceDecoderBuilder.h » ('j') | no next file with comments »