| Index: third_party/cython/src/Cython/Compiler/StringEncoding.py
|
| diff --git a/third_party/cython/src/Cython/Compiler/StringEncoding.py b/third_party/cython/src/Cython/Compiler/StringEncoding.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..4d84afa209fc06133a70769e51b48bc3d669a9c6
|
| --- /dev/null
|
| +++ b/third_party/cython/src/Cython/Compiler/StringEncoding.py
|
| @@ -0,0 +1,311 @@
|
| +#
|
| +# Cython -- encoding related tools
|
| +#
|
| +
|
| +import re
|
| +import sys
|
| +
|
| +if sys.version_info[0] >= 3:
|
| + _unicode, _str, _bytes = str, str, bytes
|
| + IS_PYTHON3 = True
|
| +else:
|
| + _unicode, _str, _bytes = unicode, str, str
|
| + IS_PYTHON3 = False
|
| +
|
| +empty_bytes = _bytes()
|
| +empty_unicode = _unicode()
|
| +
|
| +join_bytes = empty_bytes.join
|
| +
|
| +class UnicodeLiteralBuilder(object):
|
| + """Assemble a unicode string.
|
| + """
|
| + def __init__(self):
|
| + self.chars = []
|
| +
|
| + def append(self, characters):
|
| + if isinstance(characters, _bytes):
|
| + # this came from a Py2 string literal in the parser code
|
| + characters = characters.decode("ASCII")
|
| + assert isinstance(characters, _unicode), str(type(characters))
|
| + self.chars.append(characters)
|
| +
|
| + if sys.maxunicode == 65535:
|
| + def append_charval(self, char_number):
|
| + if char_number > 65535:
|
| + # wide Unicode character on narrow platform => replace
|
| + # by surrogate pair
|
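| +                # lead surrogate = 0xD800 + top 10 bits of the 20-bit offset,
|
| +                # trail surrogate = 0xDC00 + low 10 bits
|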
| +                char_number -= 0x10000
|
| +                self.chars.append( unichr((char_number // 1024) + 0xD800) )
|
| +                self.chars.append( unichr((char_number % 1024) + 0xDC00) )
|
| +            else:
|
| +                self.chars.append( unichr(char_number) )
|
| + else:
|
| + def append_charval(self, char_number):
|
| + self.chars.append( unichr(char_number) )
|
| +
|
| + def append_uescape(self, char_number, escape_string):
|
| + self.append_charval(char_number)
|
| +
|
| + def getstring(self):
|
| + return EncodedString(u''.join(self.chars))
|
| +
|
| + def getstrings(self):
|
| + return (None, self.getstring())
|
| +
|
| +
|
| +class BytesLiteralBuilder(object):
|
| + """Assemble a byte string or char value.
|
| + """
|
| + def __init__(self, target_encoding):
|
| + self.chars = []
|
| + self.target_encoding = target_encoding
|
| +
|
| + def append(self, characters):
|
| + if isinstance(characters, _unicode):
|
| + characters = characters.encode(self.target_encoding)
|
| + assert isinstance(characters, _bytes), str(type(characters))
|
| + self.chars.append(characters)
|
| +
|
| + def append_charval(self, char_number):
|
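| +        # ISO-8859-1 maps code points 0..255 one-to-one onto byte values,
|
| +        # so this only works for character values up to 255
|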
| + self.chars.append( unichr(char_number).encode('ISO-8859-1') )
|
| +
|
| + def append_uescape(self, char_number, escape_string):
|
| + self.append(escape_string)
|
| +
|
| + def getstring(self):
|
| + # this *must* return a byte string!
|
| + s = BytesLiteral(join_bytes(self.chars))
|
| + s.encoding = self.target_encoding
|
| + return s
|
| +
|
| + def getchar(self):
|
| + # this *must* return a byte string!
|
| + return self.getstring()
|
| +
|
| + def getstrings(self):
|
| + return (self.getstring(), None)
|
| +
|
| +class StrLiteralBuilder(object):
|
| + """Assemble both a bytes and a unicode representation of a string.
|
| + """
|
| + def __init__(self, target_encoding):
|
| + self._bytes = BytesLiteralBuilder(target_encoding)
|
| + self._unicode = UnicodeLiteralBuilder()
|
| +
|
| + def append(self, characters):
|
| + self._bytes.append(characters)
|
| + self._unicode.append(characters)
|
| +
|
| + def append_charval(self, char_number):
|
| + self._bytes.append_charval(char_number)
|
| + self._unicode.append_charval(char_number)
|
| +
|
| + def append_uescape(self, char_number, escape_string):
|
| + self._bytes.append(escape_string)
|
| + self._unicode.append_charval(char_number)
|
| +
|
| + def getstrings(self):
|
| + return (self._bytes.getstring(), self._unicode.getstring())
|
| +
|
| +
|
| +class EncodedString(_unicode):
|
| + # unicode string subclass to keep track of the original encoding.
|
| + # 'encoding' is None for unicode strings and the source encoding
|
| + # otherwise
|
| + encoding = None
|
| +
|
| + def __deepcopy__(self, memo):
|
| + return self
|
| +
|
| + def byteencode(self):
|
| + assert self.encoding is not None
|
| + return self.encode(self.encoding)
|
| +
|
| + def utf8encode(self):
|
| + assert self.encoding is None
|
| + return self.encode("UTF-8")
|
| +
|
| + @property
|
| + def is_unicode(self):
|
| + return self.encoding is None
|
| +
|
| + def contains_surrogates(self):
|
| + return string_contains_surrogates(self)
|
| +
|
| +
|
| +def string_contains_surrogates(ustring):
|
| + """
|
| + Check if the unicode string contains surrogate code points
|
| + on a CPython platform with wide (UCS-4) or narrow (UTF-16)
|
| + Unicode, i.e. characters that would be spelled as two
|
| + separate code units on a narrow platform.
|
| + """
|
| + for c in map(ord, ustring):
|
| + if c > 65535: # can only happen on wide platforms
|
| + return True
|
| + if 0xD800 <= c <= 0xDFFF:
|
| + return True
|
| + return False
|
| +
|
| +
|
| +class BytesLiteral(_bytes):
|
| + # bytes subclass that is compatible with EncodedString
|
| + encoding = None
|
| +
|
| + def __deepcopy__(self, memo):
|
| + return self
|
| +
|
| + def byteencode(self):
|
| + if IS_PYTHON3:
|
| + return _bytes(self)
|
| + else:
|
| + # fake-recode the string to make it a plain bytes object
|
| + return self.decode('ISO-8859-1').encode('ISO-8859-1')
|
| +
|
| + def utf8encode(self):
|
| + assert False, "this is not a unicode string: %r" % self
|
| +
|
| + def __str__(self):
|
| + """Fake-decode the byte string to unicode to support %
|
| + formatting of unicode strings.
|
| + """
|
| + return self.decode('ISO-8859-1')
|
| +
|
| + is_unicode = False
|
| +
|
| +
|
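| +# lookup function mapping the simple backslash escapes to the character they
|
| +# denote; the dict's .get returns None for any other escape sequence
|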
| +char_from_escape_sequence = {
|
| + r'\a' : u'\a',
|
| + r'\b' : u'\b',
|
| + r'\f' : u'\f',
|
| + r'\n' : u'\n',
|
| + r'\r' : u'\r',
|
| + r'\t' : u'\t',
|
| + r'\v' : u'\v',
|
| + }.get
|
| +
|
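| +# characters that must be escaped in C string literals: the backslash, '??'
|
| +# (which could start an accidental C trigraph), the double quote and all
|
| +# control characters
|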
| +_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
|
| +
|
| +
|
| +def _to_escape_sequence(s):
|
| + if s in '\n\r\t':
|
| + return repr(s)[1:-1]
|
| + elif s == '"':
|
| + return r'\"'
|
| + elif s == '\\':
|
| + return r'\\'
|
| + else:
|
| + # within a character sequence, oct passes much better than hex
|
| + return ''.join(['\\%03o' % ord(c) for c in s])
|
| +
|
| +
|
| +def _build_specials_replacer():
|
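| +    # build a single regex matching any special sequence, plus a table that
|
| +    # maps each matched byte string to its C escape sequence
|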
| + subexps = []
|
| + replacements = {}
|
| + for special in _c_special:
|
| + regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
|
| + subexps.append(regexp)
|
| + replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
|
| + sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
|
| + def replace_specials(m):
|
| + return replacements[m.group(1)]
|
| + def replace(s):
|
| + return sub(replace_specials, s)
|
| + return replace
|
| +
|
| +_replace_specials = _build_specials_replacer()
|
| +
|
| +
|
| +def escape_char(c):
|
| + if IS_PYTHON3:
|
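| +        # on Python 3 the argument is a single byte; a Latin-1 decode yields
|
| +        # the one-character string with the same ordinal
|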
| + c = c.decode('ISO-8859-1')
|
| + if c in '\n\r\t\\':
|
| + return repr(c)[1:-1]
|
| + elif c == "'":
|
| + return "\\'"
|
| + n = ord(c)
|
| + if n < 32 or n > 127:
|
| + # hex works well for characters
|
| + return "\\x%02X" % n
|
| + else:
|
| + return c
|
| +
|
| +def escape_byte_string(s):
|
| + """Escape a byte string so that it can be written into C code.
|
| + Note that this returns a Unicode string instead which, when
|
| + encoded as ISO-8859-1, will result in the correct byte sequence
|
| + being written.
|
| + """
|
| + s = _replace_specials(s)
|
| + try:
|
| + return s.decode("ASCII") # trial decoding: plain ASCII => done
|
| + except UnicodeDecodeError:
|
| + pass
|
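| +    # non-ASCII input: rewrite every byte >= 128 as an octal escape so that
|
| +    # the returned text is plain ASCII
|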
| + if IS_PYTHON3:
|
| + s_new = bytearray()
|
| + append, extend = s_new.append, s_new.extend
|
| + for b in s:
|
| + if b >= 128:
|
| + extend(('\\%3o' % b).encode('ASCII'))
|
| + else:
|
| + append(b)
|
| + return s_new.decode('ISO-8859-1')
|
| + else:
|
| + l = []
|
| + append = l.append
|
| + for c in s:
|
| + o = ord(c)
|
| + if o >= 128:
|
| + append('\\%3o' % o)
|
| + else:
|
| + append(c)
|
| + return join_bytes(l).decode('ISO-8859-1')
|
| +
|
| +def split_string_literal(s, limit=2000):
|
| + # MSVC can't handle long string literals.
|
| + if len(s) < limit:
|
| + return s
|
| + else:
|
| + start = 0
|
| + chunks = []
|
| + while start < len(s):
|
| + end = start + limit
|
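| +            # don't split inside an escape sequence (a backslash followed by
|
| +            # up to three octal digits) or directly after a backslash
|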
| + if len(s) > end-4 and '\\' in s[end-4:end]:
|
| + end -= 4 - s[end-4:end].find('\\') # just before the backslash
|
| + while s[end-1] == '\\':
|
| + end -= 1
|
| + if end == start:
|
| + # must have been a long line of backslashes
|
| + end = start + limit - (limit % 2) - 4
|
| + break
|
| + chunks.append(s[start:end])
|
| + start = end
|
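| +        # adjacent string literals are concatenated by the C compiler, so the
|
| +        # inserted "" splits the literal without changing its value
|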
| + return '""'.join(chunks)
|
| +
|
| +def encode_pyunicode_string(s):
|
| + """Create Py_UNICODE[] representation of a given unicode string.
|
| + """
|
| + s = map(ord, s) + [0]
|
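| +    # character ordinals plus a terminating NUL; on a narrow build, non-BMP
|
| +    # characters appear as two separate UTF-16 code units
|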
| +
|
| + if sys.maxunicode >= 0x10000: # Wide build or Py3.3
|
| + utf16, utf32 = [], s
|
| + for code_point in s:
|
| + if code_point >= 0x10000: # outside of BMP
|
| + high, low = divmod(code_point - 0x10000, 1024)
|
| + utf16.append(high + 0xD800)
|
| + utf16.append(low + 0xDC00)
|
| + else:
|
| + utf16.append(code_point)
|
| + else:
|
| + utf16, utf32 = s, []
|
| + for code_unit in s:
|
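| +            # a trail surrogate directly after a lead surrogate: combine the
|
| +            # pair into the code point it encodes
|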
| + if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
|
| + high, low = utf32[-1], code_unit
|
| + utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
|
| + else:
|
| + utf32.append(code_unit)
|
| +
|
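| +    # if both forms are identical, return only the UTF-32 form and leave the
|
| +    # UTF-16 string empty
|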
| + if utf16 == utf32:
|
| + utf16 = []
|
| + return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))
|
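
For reference, here is a minimal usage sketch (not part of the patch) showing how
the literal builders and escaping helpers above fit together. It assumes Python 2,
where unichr and unicode are builtins, and that the module is importable as
Cython.Compiler.StringEncoding:

    from Cython.Compiler.StringEncoding import (
        StrLiteralBuilder, escape_byte_string, split_string_literal)

    # assemble the bytes and unicode forms of a parsed string literal
    builder = StrLiteralBuilder(target_encoding='utf-8')
    builder.append(u'abc')
    builder.append_charval(0xE9)   # e.g. from an '\xe9' escape in the source
    bytes_value, unicode_value = builder.getstrings()
    # bytes_value is a BytesLiteral ('abc\xe9', encoding 'utf-8'),
    # unicode_value an EncodedString (u'abc\xe9', encoding None)

    # escape the raw bytes for emission inside a C string literal and split
    # the result so that MSVC's string literal length limit is respected
    c_text = split_string_literal(escape_byte_string(bytes_value))
    # c_text == u'abc\\351'   (the 0xE9 byte written as an octal escape)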