third_party/cython/src/Cython/Compiler/StringEncoding.py - Issue 385073004: Adding cython v0.20.2 in third-party.

Side by Side Diff: third_party/cython/src/Cython/Compiler/StringEncoding.py

Issue 385073004: Adding cython v0.20.2 in third-party. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Reference cython dev list thread. Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 #

	2 # Cython -- encoding related tools

	3 #

	4

	5 import re

	6 import sys

	7

	8 if sys.version_info[0] >= 3:

	9 _unicode, _str, _bytes = str, str, bytes

	10 IS_PYTHON3 = True

	11 else:

	12 _unicode, _str, _bytes = unicode, str, str

	13 IS_PYTHON3 = False

	14

	15 empty_bytes = _bytes()

	16 empty_unicode = _unicode()

	17

	18 join_bytes = empty_bytes.join

	19

	20 class UnicodeLiteralBuilder(object):

	21 """Assemble a unicode string.

	22 """

	23 def __init__(self):

	24 self.chars = []

	25

	26 def append(self, characters):

	27 if isinstance(characters, _bytes):

	28 # this came from a Py2 string literal in the parser code

	29 characters = characters.decode("ASCII")

	30 assert isinstance(characters, _unicode), str(type(characters))

	31 self.chars.append(characters)

	32

	33 if sys.maxunicode == 65535:

	34 def append_charval(self, char_number):

	35 if char_number > 65535:

	36 # wide Unicode character on narrow platform => replace

	37 # by surrogate pair

	38 char_number -= 0x10000

	39 self.chars.append( unichr((char_number // 1024) + 0xD800) )

	40 self.chars.append( unichr((char_number % 1024) + 0xDC00) )

	41 else:

	42 self.chars.append( unichr(char_number) )

	43 else:

	44 def append_charval(self, char_number):

	45 self.chars.append( unichr(char_number) )

	46

	47 def append_uescape(self, char_number, escape_string):

	48 self.append_charval(char_number)

	49

	50 def getstring(self):

	51 return EncodedString(u''.join(self.chars))

	52

	53 def getstrings(self):

	54 return (None, self.getstring())

	55

	56

	57 class BytesLiteralBuilder(object):

	58 """Assemble a byte string or char value.

	59 """

	60 def __init__(self, target_encoding):

	61 self.chars = []

	62 self.target_encoding = target_encoding

	63

	64 def append(self, characters):

	65 if isinstance(characters, _unicode):

	66 characters = characters.encode(self.target_encoding)

	67 assert isinstance(characters, _bytes), str(type(characters))

	68 self.chars.append(characters)

	69

	70 def append_charval(self, char_number):

	71 self.chars.append( unichr(char_number).encode('ISO-8859-1') )

	72

	73 def append_uescape(self, char_number, escape_string):

	74 self.append(escape_string)

	75

	76 def getstring(self):

	77 # this must return a byte string!

	78 s = BytesLiteral(join_bytes(self.chars))

	79 s.encoding = self.target_encoding

	80 return s

	81

	82 def getchar(self):

	83 # this must return a byte string!

	84 return self.getstring()

	85

	86 def getstrings(self):

	87 return (self.getstring(), None)

	88

	89 class StrLiteralBuilder(object):

	90 """Assemble both a bytes and a unicode representation of a string.

	91 """

	92 def __init__(self, target_encoding):

	93 self._bytes = BytesLiteralBuilder(target_encoding)

	94 self._unicode = UnicodeLiteralBuilder()

	95

	96 def append(self, characters):

	97 self._bytes.append(characters)

	98 self._unicode.append(characters)

	99

	100 def append_charval(self, char_number):

	101 self._bytes.append_charval(char_number)

	102 self._unicode.append_charval(char_number)

	103

	104 def append_uescape(self, char_number, escape_string):

	105 self._bytes.append(escape_string)

	106 self._unicode.append_charval(char_number)

	107

	108 def getstrings(self):

	109 return (self._bytes.getstring(), self._unicode.getstring())

	110

	111

	112 class EncodedString(_unicode):

	113 # unicode string subclass to keep track of the original encoding.

	114 # 'encoding' is None for unicode strings and the source encoding

	115 # otherwise

	116 encoding = None

	117

	118 def __deepcopy__(self, memo):

	119 return self

	120

	121 def byteencode(self):

	122 assert self.encoding is not None

	123 return self.encode(self.encoding)

	124

	125 def utf8encode(self):

	126 assert self.encoding is None

	127 return self.encode("UTF-8")

	128

	129 @property

	130 def is_unicode(self):

	131 return self.encoding is None

	132

	133 def contains_surrogates(self):

	134 return string_contains_surrogates(self)

	135

	136

	137 def string_contains_surrogates(ustring):

	138 """

	139 Check if the unicode string contains surrogate code points

	140 on a CPython platform with wide (UCS-4) or narrow (UTF-16)

	141 Unicode, i.e. characters that would be spelled as two

	142 separate code units on a narrow platform.

	143 """

	144 for c in map(ord, ustring):

	145 if c > 65535: # can only happen on wide platforms

	146 return True

	147 if 0xD800 <= c <= 0xDFFF:

	148 return True

	149 return False

	150

	151

	152 class BytesLiteral(_bytes):

	153 # bytes subclass that is compatible with EncodedString

	154 encoding = None

	155

	156 def __deepcopy__(self, memo):

	157 return self

	158

	159 def byteencode(self):

	160 if IS_PYTHON3:

	161 return _bytes(self)

	162 else:

	163 # fake-recode the string to make it a plain bytes object

	164 return self.decode('ISO-8859-1').encode('ISO-8859-1')

	165

	166 def utf8encode(self):

	167 assert False, "this is not a unicode string: %r" % self

	168

	169 def __str__(self):

	170 """Fake-decode the byte string to unicode to support %

	171 formatting of unicode strings.

	172 """

	173 return self.decode('ISO-8859-1')

	174

	175 is_unicode = False

	176

	177

	178 char_from_escape_sequence = {

	179 r'\a' : u'\a',

	180 r'\b' : u'\b',

	181 r'\f' : u'\f',

	182 r'\n' : u'\n',

	183 r'\r' : u'\r',

	184 r'\t' : u'\t',

	185 r'\v' : u'\v',

	186 }.get

	187

	188 _c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))

	189

	190

	191 def _to_escape_sequence(s):

	192 if s in '\n\r\t':

	193 return repr(s)[1:-1]

	194 elif s == '"':

	195 return r'\"'

	196 elif s == '\\':

	197 return r'\\'

	198 else:

	199 # within a character sequence, oct passes much better than hex

	200 return ''.join(['\\%03o' % ord(c) for c in s])

	201

	202

	203 def _build_specials_replacer():

	204 subexps = []

	205 replacements = {}

	206 for special in _c_special:

	207 regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])

	208 subexps.append(regexp)

	209 replacements[special.encode('ASCII')] = _to_escape_sequence(special).enc ode('ASCII')

	210 sub = re.compile(('(%s)' % '\|'.join(subexps)).encode('ASCII')).sub

	211 def replace_specials(m):

	212 return replacements[m.group(1)]

	213 def replace(s):

	214 return sub(replace_specials, s)

	215 return replace

	216

	217 _replace_specials = _build_specials_replacer()

	218

	219

	220 def escape_char(c):

	221 if IS_PYTHON3:

	222 c = c.decode('ISO-8859-1')

	223 if c in '\n\r\t\\':

	224 return repr(c)[1:-1]

	225 elif c == "'":

	226 return "\\'"

	227 n = ord(c)

	228 if n < 32 or n > 127:

	229 # hex works well for characters

	230 return "\\x%02X" % n

	231 else:

	232 return c

	233

	234 def escape_byte_string(s):

	235 """Escape a byte string so that it can be written into C code.

	236 Note that this returns a Unicode string instead which, when

	237 encoded as ISO-8859-1, will result in the correct byte sequence

	238 being written.

	239 """

	240 s = _replace_specials(s)

	241 try:

	242 return s.decode("ASCII") # trial decoding: plain ASCII => done

	243 except UnicodeDecodeError:

	244 pass

	245 if IS_PYTHON3:

	246 s_new = bytearray()

	247 append, extend = s_new.append, s_new.extend

	248 for b in s:

	249 if b >= 128:

	250 extend(('\\%3o' % b).encode('ASCII'))

	251 else:

	252 append(b)

	253 return s_new.decode('ISO-8859-1')

	254 else:

	255 l = []

	256 append = l.append

	257 for c in s:

	258 o = ord(c)

	259 if o >= 128:

	260 append('\\%3o' % o)

	261 else:

	262 append(c)

	263 return join_bytes(l).decode('ISO-8859-1')

	264

	265 def split_string_literal(s, limit=2000):

	266 # MSVC can't handle long string literals.

	267 if len(s) < limit:

	268 return s

	269 else:

	270 start = 0

	271 chunks = []

	272 while start < len(s):

	273 end = start + limit

	274 if len(s) > end-4 and '\\' in s[end-4:end]:

	275 end -= 4 - s[end-4:end].find('\\') # just before the backslash

	276 while s[end-1] == '\\':

	277 end -= 1

	278 if end == start:

	279 # must have been a long line of backslashes

	280 end = start + limit - (limit % 2) - 4

	281 break

	282 chunks.append(s[start:end])

	283 start = end

	284 return '""'.join(chunks)

	285

	286 def encode_pyunicode_string(s):

	287 """Create Py_UNICODE[] representation of a given unicode string.

	288 """

	289 s = map(ord, s) + [0]

	290

	291 if sys.maxunicode >= 0x10000: # Wide build or Py3.3

	292 utf16, utf32 = [], s

	293 for code_point in s:

	294 if code_point >= 0x10000: # outside of BMP

	295 high, low = divmod(code_point - 0x10000, 1024)

	296 utf16.append(high + 0xD800)

	297 utf16.append(low + 0xDC00)

	298 else:

	299 utf16.append(code_point)

	300 else:

	301 utf16, utf32 = s, []

	302 for code_unit in s:

	303 if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] < = 0xDBFF:

	304 high, low = utf32[-1], code_unit

	305 utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000

	306 else:

	307 utf32.append(code_unit)

	308

	309 if utf16 == utf32:

	310 utf16 = []

	311 return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))

OLD	NEW

« no previous file with comments | « third_party/cython/src/Cython/Compiler/Scanning.py ('k') | third_party/cython/src/Cython/Compiler/Symtab.py » ('j') | no next file with comments »