third_party/requests/packages/charade/universaldetector.py - Issue 24076010: Add 'requests' library to third_party.

Side by Side Diff: third_party/requests/packages/charade/universaldetector.py

Issue 24076010: Add 'requests' library to third_party. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/swarm_client

Patch Set: Created 7 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 ######################## BEGIN LICENSE BLOCK ########################

	2 # The Original Code is Mozilla Universal charset detector code.

	3 #

	4 # The Initial Developer of the Original Code is

	5 # Netscape Communications Corporation.

	6 # Portions created by the Initial Developer are Copyright (C) 2001

	7 # the Initial Developer. All Rights Reserved.

	8 #

	9 # Contributor(s):

	10 # Mark Pilgrim - port to Python

	11 # Shy Shalom - original C code

	12 #

	13 # This library is free software; you can redistribute it and/or

	14 # modify it under the terms of the GNU Lesser General Public

	15 # License as published by the Free Software Foundation; either

	16 # version 2.1 of the License, or (at your option) any later version.

	17 #

	18 # This library is distributed in the hope that it will be useful,

	19 # but WITHOUT ANY WARRANTY; without even the implied warranty of

	20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

	21 # Lesser General Public License for more details.

	22 #

	23 # You should have received a copy of the GNU Lesser General Public

	24 # License along with this library; if not, write to the Free Software

	25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

	26 # 02110-1301 USA

	27 ######################### END LICENSE BLOCK #########################

	28

	29 from . import constants

	30 import sys

	31 import codecs

	32 from .latin1prober import Latin1Prober # windows-1252

	33 from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets

	34 from .sbcsgroupprober import SBCSGroupProber # single-byte character sets

	35 from .escprober import EscCharSetProber # ISO-2022, etc.

	36 import re

	37

	38 MINIMUM_THRESHOLD = 0.20

	39 ePureAscii = 0

	40 eEscAscii = 1

	41 eHighbyte = 2

	42

	43

	44 class UniversalDetector:

	45 def __init__(self):

	46 self._highBitDetector = re.compile(b'[\x80-\xFF]')

	47 self._escDetector = re.compile(b'(\033\|~{)')

	48 self._mEscCharSetProber = None

	49 self._mCharSetProbers = []

	50 self.reset()

	51

	52 def reset(self):

	53 self.result = {'encoding': None, 'confidence': 0.0}

	54 self.done = False

	55 self._mStart = True

	56 self._mGotData = False

	57 self._mInputState = ePureAscii

	58 self._mLastChar = b''

	59 if self._mEscCharSetProber:

	60 self._mEscCharSetProber.reset()

	61 for prober in self._mCharSetProbers:

	62 prober.reset()

	63

	64 def feed(self, aBuf):

	65 if self.done:

	66 return

	67

	68 aLen = len(aBuf)

	69 if not aLen:

	70 return

	71

	72 if not self._mGotData:

	73 # If the data starts with BOM, we know it is UTF

	74 if aBuf[:3] == codecs.BOM:

	75 # EF BB BF UTF-8 with BOM

	76 self.result = {'encoding': "UTF-8", 'confidence': 1.0}

	77 elif aBuf[:4] == codecs.BOM_UTF32_LE:

	78 # FF FE 00 00 UTF-32, little-endian BOM

	79 self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}

	80 elif aBuf[:4] == codecs.BOM_UTF32_BE:

	81 # 00 00 FE FF UTF-32, big-endian BOM

	82 self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}

	83 elif aBuf[:4] == b'\xFE\xFF\x00\x00':

	84 # FE FF 00 00 UCS-4, unusual octet order BOM (3412)

	85 self.result = {

	86 'encoding': "X-ISO-10646-UCS-4-3412",

	87 'confidence': 1.0

	88 }

	89 elif aBuf[:4] == b'\x00\x00\xFF\xFE':

	90 # 00 00 FF FE UCS-4, unusual octet order BOM (2143)

	91 self.result = {

	92 'encoding': "X-ISO-10646-UCS-4-2143",

	93 'confidence': 1.0

	94 }

	95 elif aBuf[:2] == codecs.BOM_LE:

	96 # FF FE UTF-16, little endian BOM

	97 self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}

	98 elif aBuf[:2] == codecs.BOM_BE:

	99 # FE FF UTF-16, big endian BOM

	100 self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}

	101

	102 self._mGotData = True

	103 if self.result['encoding'] and (self.result['confidence'] > 0.0):

	104 self.done = True

	105 return

	106

	107 if self._mInputState == ePureAscii:

	108 if self._highBitDetector.search(aBuf):

	109 self._mInputState = eHighbyte

	110 elif ((self._mInputState == ePureAscii) and

	111 self._escDetector.search(self._mLastChar + aBuf)):

	112 self._mInputState = eEscAscii

	113

	114 self._mLastChar = aBuf[-1:]

	115

	116 if self._mInputState == eEscAscii:

	117 if not self._mEscCharSetProber:

	118 self._mEscCharSetProber = EscCharSetProber()

	119 if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:

	120 self.result = {

	121 'encoding': self._mEscCharSetProber.get_charset_name(),

	122 'confidence': self._mEscCharSetProber.get_confidence()

	123 }

	124 self.done = True

	125 elif self._mInputState == eHighbyte:

	126 if not self._mCharSetProbers:

	127 self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),

	128 Latin1Prober()]

	129 for prober in self._mCharSetProbers:

	130 if prober.feed(aBuf) == constants.eFoundIt:

	131 self.result = {'encoding': prober.get_charset_name(),

	132 'confidence': prober.get_confidence()}

	133 self.done = True

	134 break

	135

	136 def close(self):

	137 if self.done:

	138 return

	139 if not self._mGotData:

	140 if constants._debug:

	141 sys.stderr.write('no data received!\n')

	142 return

	143 self.done = True

	144

	145 if self._mInputState == ePureAscii:

	146 self.result = {'encoding': 'ascii', 'confidence': 1.0}

	147 return self.result

	148

	149 if self._mInputState == eHighbyte:

	150 proberConfidence = None

	151 maxProberConfidence = 0.0

	152 maxProber = None

	153 for prober in self._mCharSetProbers:

	154 if not prober:

	155 continue

	156 proberConfidence = prober.get_confidence()

	157 if proberConfidence > maxProberConfidence:

	158 maxProberConfidence = proberConfidence

	159 maxProber = prober

	160 if maxProber and (maxProberConfidence > MINIMUM_THRESHOLD):

	161 self.result = {'encoding': maxProber.get_charset_name(),

	162 'confidence': maxProber.get_confidence()}

	163 return self.result

	164

	165 if constants._debug:

	166 sys.stderr.write('no probers hit minimum threshhold\n')

	167 for prober in self._mCharSetProbers[0].mProbers:

	168 if not prober:

	169 continue

	170 sys.stderr.write('%s confidence = %s\n' %

	171 (prober.get_charset_name(),

	172 prober.get_confidence()))

OLD	NEW

« third_party/requests/adapters.py ('K') | « third_party/requests/packages/charade/sjisprober.py ('k') | third_party/requests/packages/charade/utf8prober.py » ('j') | no next file with comments »