Source/core/fetch/TextResourceDecoder.cpp - Issue 23623012: Move TextResourceDecoder from loader/ to fetch/

Side by Side Diff: Source/core/fetch/TextResourceDecoder.cpp

Issue 23623012: Move TextResourceDecoder from loader/ to fetch/ (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Created 7 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)	2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)

3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.	3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.

4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)	4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

5	5

6 This library is free software; you can redistribute it and/or	6 This library is free software; you can redistribute it and/or

7 modify it under the terms of the GNU Library General Public	7 modify it under the terms of the GNU Library General Public

8 License as published by the Free Software Foundation; either	8 License as published by the Free Software Foundation; either

9 version 2 of the License, or (at your option) any later version.	9 version 2 of the License, or (at your option) any later version.

10	10

11 This library is distributed in the hope that it will be useful,	11 This library is distributed in the hope that it will be useful,

12 but WITHOUT ANY WARRANTY; without even the implied warranty of	12 but WITHOUT ANY WARRANTY; without even the implied warranty of

13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU	13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

14 Library General Public License for more details.	14 Library General Public License for more details.

15	15

16 You should have received a copy of the GNU Library General Public License	16 You should have received a copy of the GNU Library General Public License

17 along with this library; see the file COPYING.LIB. If not, write to	17 along with this library; see the file COPYING.LIB. If not, write to

18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,	18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,

19 Boston, MA 02110-1301, USA.	19 Boston, MA 02110-1301, USA.

20 */	20 */

21	21

22	22

23 #include "config.h"	23 #include "config.h"

24 #include "core/loader/TextResourceDecoder.h"	24 #include "core/fetch/TextResourceDecoder.h"

25	25

26 #include "HTMLNames.h"	26 #include "HTMLNames.h"

27 #include "core/dom/DOMImplementation.h"	27 #include "core/dom/DOMImplementation.h"

28 #include "core/html/parser/HTMLMetaCharsetParser.h"	28 #include "core/html/parser/HTMLMetaCharsetParser.h"

29 #include "core/platform/text/TextEncodingDetector.h"	29 #include "core/platform/text/TextEncodingDetector.h"

30 #include "wtf/StringExtras.h"	30 #include "wtf/StringExtras.h"

31 #include "wtf/text/TextCodec.h"	31 #include "wtf/text/TextCodec.h"

32 #include "wtf/text/TextEncoding.h"	32 #include "wtf/text/TextEncoding.h"

33 #include "wtf/text/TextEncodingRegistry.h"	33 #include "wtf/text/TextEncodingRegistry.h"

34	34

(...skipping 133 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
168 int euc = 0;	168 int euc = 0;

169	169

170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);	170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

171	171

172 code = ASCII;	172 code = ASCII;

173	173

174 i = 0;	174 i = 0;

175 while (i < size) {	175 while (i < size) {

176 if (ptr[i] == ESC && (size - i >= 3)) {	176 if (ptr[i] == ESC && (size - i >= 3)) {

177 if (bytesEqual(str + i + 1, '$', 'B')	177 if (bytesEqual(str + i + 1, '$', 'B')

178 \|\| bytesEqual(str + i + 1, '(', 'B')	178 \|\| bytesEqual(str + i + 1, '(', 'B')

179 \|\| bytesEqual(str + i + 1, '$', '@')	179 \|\| bytesEqual(str + i + 1, '$', '@')

180 \|\| bytesEqual(str + i + 1, '(', 'J')) {	180 \|\| bytesEqual(str + i + 1, '(', 'J')) {

181 code = JIS;	181 code = JIS;

182 goto breakBreak;	182 goto breakBreak;

183 }	183 }

184 if (bytesEqual(str + i + 1, '(', 'I') \|\| bytesEqual(str + i + 1, ')' , 'I')) {	184 if (bytesEqual(str + i + 1, '(', 'I') \|\| bytesEqual(str + i + 1, ')' , 'I')) {

185 code = JIS;	185 code = JIS;

186 i += 3;	186 i += 3;

187 } else {	187 } else {

188 i++;	188 i++;

189 }	189 }

190 bfr = false;	190 bfr = false;

191 bfk = 0;	191 bfk = 0;

192 } else {	192 } else {

193 if (ptr[i] < 0x20) {	193 if (ptr[i] < 0x20) {

194 bfr = false;	194 bfr = false;

195 bfk = 0;	195 bfk = 0;

196 /* ?? check kudokuten ?? && ?? hiragana ?? */	196 /* ?? check kudokuten ?? && ?? hiragana ?? */

197 if ((i >= 2) && (ptr[i - 2] == 0x81)	197 if ((i >= 2) && (ptr[i - 2] == 0x81)

198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {	198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {

199 code = SJIS;	199 code = SJIS;

200 sjis += 100; /* kudokuten */	200 sjis += 100; /* kudokuten */

201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)	201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1) && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {

202 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {

203 code = EUC;	202 code = EUC;

204 euc += 100; /* kudokuten */	203 euc += 100; /* kudokuten */

205 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {	204 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {

206 sjis += 40; /* hiragana */	205 sjis += 40; /* hiragana */

207 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {	206 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {

208 euc += 40; /* hiragana */	207 euc += 40; /* hiragana */

209 }	208 }

210 } else {	209 } else {

211 /* ?? check hiragana or katana ?? */	210 /* ?? check hiragana or katana ?? */

212 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {	211 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {

213 sjis++; /* hiragana */	212 sjis++; /* hiragana */

214 } else if ((size - i > 1) && (ptr[i] == 0x83)	213 } else if ((size - i > 1) && (ptr[i] == 0x83) && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {

215 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {

216 sjis++; /* katakana */	214 sjis++; /* katakana */

217 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {	215 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {

218 euc++; /* hiragana */	216 euc++; /* hiragana */

219 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {	217 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {

220 euc++; /* katakana */	218 euc++; /* katakana */

221 }	219 }

222 if (bfr) {	220 if (bfr) {

223 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {	221 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {

224 code = SJIS;	222 code = SJIS;

225 goto breakBreak;	223 goto breakBreak;

226 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) \|\| (0x7e < ptr[i] && ptr[i] <= 0xfc) )) {	224 }

	225

	226 if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) \|\| (0x7e < ptr[i] && ptr[i] <= 0xfc))) {

227 code = SJIS;	227 code = SJIS;

228 goto breakBreak;	228 goto breakBreak;

229 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {	229 }

	230

	231 if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {

230 code = EUC;	232 code = EUC;

231 goto breakBreak;	233 goto breakBreak;

232 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {	234 }

	235

	236 if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {

233 code = EUC;	237 code = EUC;

234 goto breakBreak;	238 goto breakBreak;

235 } else if ((i >= 1) && (ptr[i] < 0xa0 \|\| 0xdf < ptr[i]) && ( 0x8e == ptr[i - 1])) {	239 }

	240

	241 if ((i >= 1) && (ptr[i] < 0xa0 \|\| 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {

236 code = SJIS;	242 code = SJIS;

237 goto breakBreak;	243 goto breakBreak;

238 } else if (ptr[i] <= 0x7f) {	244 }

	245

	246 if (ptr[i] <= 0x7f) {

239 code = SJIS;	247 code = SJIS;

240 goto breakBreak;	248 goto breakBreak;

241 } else {

242 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {

243 euc++; /* sjis hankaku kana kigo */

244 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {

245 ; /* sjis hankaku kana */

246 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {

247 euc++;

248 } else if (0x8e == ptr[i]) {

249 euc++;

250 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {

251 sjis++;

252 }

253 bfr = false;

254 bfk = 0;

255 }	249 }

	250

	251 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {

	252 euc++; /* sjis hankaku kana kigo */

	253 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {

	254 /* sjis hankaku kana */

	255 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {

	256 euc++;

	257 } else if (0x8e == ptr[i]) {

	258 euc++;

	259 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {

	260 sjis++;

	261 }

	262

	263 bfr = false;

	264 bfk = 0;

256 } else if (0x8e == ptr[i]) {	265 } else if (0x8e == ptr[i]) {

257 if (size - i <= 1) {	266 if (size - i <= 1) {

258 ;	267

259 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {	268 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {

260 /* EUC KANA or SJIS KANJI */	269 /* EUC KANA or SJIS KANJI */

261 if (bfk == 1) {	270 if (bfk == 1) {

262 euc += 100;	271 euc += 100;

263 }	272 }

264 bfk++;	273 bfk++;

265 i++;	274 i++;

266 } else {	275 } else {

267 /* SJIS only */	276 /* SJIS only */

268 code = SJIS;	277 code = SJIS;

269 goto breakBreak;	278 goto breakBreak;

270 }	279 }

271 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {	280 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {

272 /* SJIS only */	281 /* SJIS only */

273 code = SJIS;	282 code = SJIS;

274 if ((size - i >= 1)	283 if ((size - i >= 1) && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e) \|\| (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc)))

275 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)

276 \|\| (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {

277 goto breakBreak;	284 goto breakBreak;

278 }

279 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {	285 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {

280 /* EUC only */	286 /* EUC only */

281 code = EUC;	287 code = EUC;

282 if ((size - i >= 1)	288 if ((size - i >= 1) && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe))

283 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {

284 goto breakBreak;	289 goto breakBreak;

285 }

286 } else if (ptr[i] <= 0x7f) {	290 } else if (ptr[i] <= 0x7f) {

287 ;	291

288 } else {	292 } else {

289 bfr = true;	293 bfr = true;

290 bfk = 0;	294 bfk = 0;

291 }	295 }

292 }	296 }

293 i++;	297 i++;

294 }	298 }

295 }	299 }

296 if (code == ASCII) {	300 if (code == ASCII) {

297 if (sjis > euc) {	301 if (sjis > euc) {

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
346 }	350 }

347	351

348 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)	352 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)

349 {	353 {

350 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).	354 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).

351 if (!encoding.isValid())	355 if (!encoding.isValid())

352 return;	356 return;

353	357

354 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),	358 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),

355 // treat x-user-defined as windows-1252 (bug 18270)	359 // treat x-user-defined as windows-1252 (bug 18270)

356 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)	360 if (source == EncodingFromMetaTag && !strcasecmp(encoding.name(), "x-user-defined"))

357 m_encoding = "windows-1252";	361 m_encoding = "windows-1252";

358 else if (source == EncodingFromMetaTag \|\| source == EncodingFromXMLHeader \|\| source == EncodingFromCSSCharset)	362 else if (source == EncodingFromMetaTag \|\| source == EncodingFromXMLHeader \|\| source == EncodingFromCSSCharset)

359 m_encoding = encoding.closestByteBasedEquivalent();	363 m_encoding = encoding.closestByteBasedEquivalent();

360 else	364 else

361 m_encoding = encoding;	365 m_encoding = encoding;

362	366

363 m_codec.clear();	367 m_codec.clear();

364 m_source = source;	368 m_source = source;

365 }	369 }

366	370

(...skipping 59 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
426 size_t buf2Len = len;	430 size_t buf2Len = len;

427 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());	431 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());

428 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);	432 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

429 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	433 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

430 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	434 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

431 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	435 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

432 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;	436 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

433	437

434 // Check for the BOM.	438 // Check for the BOM.

435 if (c1 == 0xFF && c2 == 0xFE) {	439 if (c1 == 0xFF && c2 == 0xFE) {

436 if (c3 != 0 \|\| c4 != 0) {	440 if (c3 \|\| c4) {

437 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);	441 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

438 lengthOfBOM = 2;	442 lengthOfBOM = 2;

439 } else {	443 } else {

440 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);	444 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

441 lengthOfBOM = 4;	445 lengthOfBOM = 4;

442 }	446 }

443 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {	447 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {

444 setEncoding(UTF8Encoding(), AutoDetectedEncoding);	448 setEncoding(UTF8Encoding(), AutoDetectedEncoding);

445 lengthOfBOM = 3;	449 lengthOfBOM = 3;

446 } else if (c1 == 0xFE && c2 == 0xFF) {	450 } else if (c1 == 0xFE && c2 == 0xFF) {

447 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);	451 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

448 lengthOfBOM = 2;	452 lengthOfBOM = 2;

449 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {	453 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {

450 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);	454 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

451 lengthOfBOM = 4;	455 lengthOfBOM = 4;

452 }	456 }

453	457

454 if (lengthOfBOM \|\| bufferLength + len >= 4)	458 if (lengthOfBOM \|\| bufferLength + len >= 4)

455 m_checkedForBOM = true;	459 m_checkedForBOM = true;

456	460

457 return lengthOfBOM;	461 return lengthOfBOM;

458 }	462 }

459	463

(...skipping 67 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
527 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')	531 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')

528 ++xmlDeclarationEnd;	532 ++xmlDeclarationEnd;

529 if (xmlDeclarationEnd == pEnd)	533 if (xmlDeclarationEnd == pEnd)

530 return false;	534 return false;

531 // No need for +1, because we have an extra "?" to lose at the end of XML declaration.	535 // No need for +1, because we have an extra "?" to lose at the end of XML declaration.

532 int len = 0;	536 int len = 0;

533 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);	537 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);

534 if (pos != -1)	538 if (pos != -1)

535 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);	539 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);

536 // continue looking for a charset - it may be specified in an HTTP-Equiv meta	540 // continue looking for a charset - it may be specified in an HTTP-Equiv meta

537 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0))	541 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0)) {

538 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);	542 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

539 else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x'))	543 } else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x')) {

540 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);	544 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

541 else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0))	545 } else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0)) {

542 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);	546 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

543 else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?'))	547 } else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?')) {

544 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);	548 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

	549 }

545	550

546 m_checkedForXMLCharset = true;	551 m_checkedForXMLCharset = true;

547 return true;	552 return true;

548 }	553 }

549	554

550 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)	555 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)

551 {	556 {

552 if (m_source == UserChosenEncoding \|\| m_source == EncodingFromHTTPHeader \|\| m_source == AutoDetectedEncoding) {	557 if (m_source == UserChosenEncoding \|\| m_source == EncodingFromHTTPHeader \|\| m_source == AutoDetectedEncoding) {

553 m_checkedForMetaCharset = true;	558 m_checkedForMetaCharset = true;

554 return;	559 return;

555 }	560 }

556	561

557 if (!m_charsetParser)	562 if (!m_charsetParser)

558 m_charsetParser = HTMLMetaCharsetParser::create();	563 m_charsetParser = HTMLMetaCharsetParser::create();

559	564

560 if (!m_charsetParser->checkForMetaCharset(data, length))	565 if (!m_charsetParser->checkForMetaCharset(data, length))

561 return;	566 return;

562	567

563 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);	568 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);

564 m_charsetParser.clear();	569 m_charsetParser.clear();

565 m_checkedForMetaCharset = true;	570 m_checkedForMetaCharset = true;

566 return;	571 return;

567 }	572 }

568	573

569 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)	574 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)

570 {	575 {

571 switch (KanjiCode::judge(data, len)) {	576 switch (KanjiCode::judge(data, len)) {

572 case KanjiCode::JIS:	577 case KanjiCode::JIS:

573 setEncoding("ISO-2022-JP", EncodingFromContentSniffing);	578 setEncoding("ISO-2022-JP", EncodingFromContentSniffing);

574 break;	579 break;

575 case KanjiCode::EUC:	580 case KanjiCode::EUC:

576 setEncoding("EUC-JP", EncodingFromContentSniffing);	581 setEncoding("EUC-JP", EncodingFromContentSniffing);

577 break;	582 break;

578 case KanjiCode::SJIS:	583 case KanjiCode::SJIS:

579 setEncoding("Shift_JIS", EncodingFromContentSniffing);	584 setEncoding("Shift_JIS", EncodingFromContentSniffing);

580 break;	585 break;

581 case KanjiCode::ASCII:	586 case KanjiCode::ASCII:

582 case KanjiCode::UTF16:	587 case KanjiCode::UTF16:

583 case KanjiCode::UTF8:	588 case KanjiCode::UTF8:

584 break;	589 break;

585 }	590 }

586 }	591 }

587	592

588 // We use the encoding detector in two cases:	593 // We use the encoding detector in two cases:

589 // 1. Encoding detector is turned ON and no other encoding source is	594 // 1. Encoding detector is turned ON and no other encoding source is

590 // available (that is, it's DefaultEncoding).	595 // available (that is, it's DefaultEncoding).

591 // 2. Encoding detector is turned ON and the encoding is set to	596 // 2. Encoding detector is turned ON and the encoding is set to

592 // the encoding of the parent frame, which is also auto-detected.	597 // the encoding of the parent frame, which is also auto-detected.

593 // Note that condition #2 is NOT satisfied unless parent-child frame	598 // Note that condition #2 is NOT satisfied unless parent-child frame

594 // relationship is compliant to the same-origin policy. If they're from	599 // relationship is compliant to the same-origin policy. If they're from

595 // different domains, \|m_source\| would not be set to EncodingFromParentFrame	600 // different domains, \|m_source\| would not be set to EncodingFromParentFrame

596 // in the first place.	601 // in the first place.

597 bool TextResourceDecoder::shouldAutoDetect() const	602 bool TextResourceDecoder::shouldAutoDetect() const

598 {	603 {

599 // Just checking m_hintEncoding suffices here because it's only set	604 // Just checking m_hintEncoding suffices here because it's only set

600 // in setHintEncoding when the source is AutoDetectedEncoding.	605 // in setHintEncoding when the source is AutoDetectedEncoding.

601 return m_usesEncodingDetector	606 return m_usesEncodingDetector

602 && (m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_hintEncoding));	607 && (m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_hintEncoding));

603 }	608 }

604	609

605 String TextResourceDecoder::decode(const char* data, size_t len)	610 String TextResourceDecoder::decode(const char* data, size_t len)

606 {	611 {

607 size_t lengthOfBOM = 0;	612 size_t lengthOfBOM = 0;

608 if (!m_checkedForBOM)	613 if (!m_checkedForBOM)

609 lengthOfBOM = checkForBOM(data, len);	614 lengthOfBOM = checkForBOM(data, len);

610	615

611 bool movedDataToBuffer = false;	616 bool movedDataToBuffer = false;

612	617

613 if (m_contentType == CSS && !m_checkedForCSSCharset)	618 if (m_contentType == CSS && !m_checkedForCSSCharset) {

614 if (!checkForCSSCharset(data, len, movedDataToBuffer))	619 if (!checkForCSSCharset(data, len, movedDataToBuffer))

615 return emptyString();	620 return emptyString();

	621 }

616	622

617 if ((m_contentType == HTML \|\| m_contentType == XML) && !m_checkedForXMLCharset)	623 if ((m_contentType == HTML \|\| m_contentType == XML) && !m_checkedForXMLCharset) {

618 if (!checkForXMLCharset(data, len, movedDataToBuffer))	624 if (!checkForXMLCharset(data, len, movedDataToBuffer))

619 return emptyString();	625 return emptyString();

	626 }

620	627

621 // FIXME: It would be more efficient to move this logic below checkForMetaCharset because	628 // FIXME: It would be more efficient to move this logic below checkForMetaCharset because

622 // checkForMetaCharset can overrule these detections.	629 // checkForMetaCharset can overrule these detections.

623 if (shouldAutoDetect()) {	630 if (shouldAutoDetect()) {

624 if (m_encoding.isJapanese())	631 if (m_encoding.isJapanese()) {

625 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.	632 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.

626 else {	633 } else {

627 WTF::TextEncoding detectedEncoding;	634 WTF::TextEncoding detectedEncoding;

628 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))	635 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))

629 setEncoding(detectedEncoding, EncodingFromContentSniffing);	636 setEncoding(detectedEncoding, EncodingFromContentSniffing);

630 }	637 }

631 }	638 }

632	639

633 ASSERT(m_encoding.isValid());	640 ASSERT(m_encoding.isValid());

634	641

635 const char* dataForDecode = data + lengthOfBOM;	642 const char* dataForDecode = data + lengthOfBOM;

636 size_t lengthForDecode = len - lengthOfBOM;	643 size_t lengthForDecode = len - lengthOfBOM;

(...skipping 16 matching lines...) Expand all Loading...
653 m_codec = newTextCodec(m_encoding);	660 m_codec = newTextCodec(m_encoding);

654	661

655 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);	662 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);

656	663

657 m_buffer.clear();	664 m_buffer.clear();

658 return result;	665 return result;

659 }	666 }

660	667

661 String TextResourceDecoder::flush()	668 String TextResourceDecoder::flush()

662 {	669 {

663 // If we can not identify the encoding even after a document is completely	670 // If we can not identify the encoding even after a document is completely

664 // loaded, we need to detect the encoding if other conditions for	671 // loaded, we need to detect the encoding if other conditions for

665 // autodetection is satisfied.	672 // autodetection is satisfied.

666 if (m_buffer.size() && shouldAutoDetect()	673 if (m_buffer.size() && shouldAutoDetect()

667 && ((!m_checkedForXMLCharset && (m_contentType == HTML \|\| m_contentType == XML)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSS)))) {	674 && ((!m_checkedForXMLCharset && (m_contentType == HTML \|\| m_contentType == XML)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSS)))) {

668 WTF::TextEncoding detectedEncoding;	675 WTF::TextEncoding detectedEncoding;

669 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))	676 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))

670 setEncoding(detectedEncoding, EncodingFromContentSniffing);	677 setEncoding(detectedEncoding, EncodingFromContentSniffing);

671 }	678 }

672	679

673 if (!m_codec)	680 if (!m_codec)

674 m_codec = newTextCodec(m_encoding);	681 m_codec = newTextCodec(m_encoding);

675	682

676 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);	683 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);

677 m_buffer.clear();	684 m_buffer.clear();

678 m_codec.clear();	685 m_codec.clear();

679 m_checkedForBOM = false; // Skip BOM again when re-decoding.	686 m_checkedForBOM = false; // Skip BOM again when re-decoding.

680 return result;	687 return result;

681 }	688 }

682	689

683 }	690 }

OLD	NEW

« no previous file with comments | « Source/core/fetch/TextResourceDecoder.h ('k') | Source/core/fetch/XSLStyleSheetResource.cpp » ('j') | no next file with comments »