OLD | NEW |
(Empty) | |
| 1 # |
| 2 # Cython -- encoding related tools |
| 3 # |
| 4 |
| 5 import re |
| 6 import sys |
| 7 |
# Resolve the text/bytes type names once, so that the rest of the module
# does not have to care about the major version of the running Python.
IS_PYTHON3 = sys.version_info[0] >= 3
if IS_PYTHON3:
    _unicode = _str = str
    _bytes = bytes
else:
    _unicode = unicode
    _str = _bytes = str

# Empty instances of the byte string and text string types.
empty_bytes = _bytes()
empty_unicode = _unicode()

# Bound join() of the empty byte string: concatenates byte strings.
join_bytes = empty_bytes.join
| 19 |
class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.

    Pieces are collected in ``self.chars`` and joined by ``getstring()``.
    """
    # unichr() does not exist on Python 3, where chr() already accepts
    # every Unicode code point; pick the right one once.
    _chr = staticmethod(chr if sys.version_info[0] >= 3 else unichr)

    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Append a piece of text.

        Byte strings are decoded as ASCII first; anything that is not a
        unicode string afterwards trips the assertion.
        """
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with the given code point number."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                high, low = divmod(char_number - 0x10000, 1024)
                self.chars.append(self._chr(high + 0xD800))
                self.chars.append(self._chr(low + 0xDC00))
            else:
                self.chars.append(self._chr(char_number))
    else:
        def append_charval(self, char_number):
            """Append the character with the given code point number."""
            self.chars.append(self._chr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape; only its code point is used here."""
        self.append_charval(char_number)

    def getstring(self):
        """Return the collected pieces joined into an EncodedString."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return a (bytes, unicode) pair; the bytes slot is always None."""
        return (None, self.getstring())
| 55 |
| 56 |
class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.

    Pieces are collected as byte strings in ``self.chars``; text pieces
    are encoded with ``target_encoding`` as they are appended.
    """
    # unichr() does not exist on Python 3, where chr() already accepts
    # every Unicode code point; pick the right one once.
    _chr = staticmethod(chr if sys.version_info[0] >= 3 else unichr)

    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a piece, encoding unicode input in the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append the single byte with the given value.

        The number is mapped through ISO-8859-1, so values that do not
        fit into one byte raise UnicodeEncodeError.
        """
        self.chars.append(self._chr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape verbatim; the code point is ignored."""
        self.append(escape_string)

    def getstring(self):
        """Return the joined pieces as a BytesLiteral tagged with the
        target encoding.
        """
        # this *must* return a byte string!
        s = BytesLiteral(join_bytes(self.chars))
        s.encoding = self.target_encoding
        return s

    def getchar(self):
        """Return the collected value; same result as getstring()."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a (bytes, unicode) pair; the unicode slot is always None."""
        return (self.getstring(), None)
| 88 |
class StrLiteralBuilder(object):
    """Build the bytes and the unicode form of a string side by side.

    Every call is forwarded to an internal BytesLiteralBuilder and an
    internal UnicodeLiteralBuilder, in that order.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append the same piece to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Append the same character value to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape: the bytes form keeps the escape text,
        the unicode form gets the character itself.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the pair (bytes string, unicode string)."""
        byte_string = self._bytes.getstring()
        unicode_string = self._unicode.getstring()
        return (byte_string, unicode_string)
| 110 |
| 111 |
class EncodedString(_unicode):
    """Unicode string subclass that keeps track of the original encoding.

    'encoding' is None for unicode strings and the source encoding
    otherwise.
    """
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    def byteencode(self):
        """Encode the string using its recorded (non-None) encoding."""
        source_encoding = self.encoding
        assert source_encoding is not None
        return self.encode(source_encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid when no encoding is recorded."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """Delegate to string_contains_surrogates() for this string."""
        return string_contains_surrogates(self)
| 135 |
| 136 |
def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds any character that a narrow
    (UTF-16) CPython build would spell as two code units, or any lone
    surrogate code point.

    Returns True for code points above 0xFFFF (only seen on wide builds)
    and for code points in the surrogate range 0xD800-0xDFFF, False if
    the string contains neither.
    """
    return any(
        code > 65535 or 0xD800 <= code <= 0xDFFF
        for code in map(ord, ustring)
    )
| 150 |
| 151 |
class BytesLiteral(_bytes):
    """Bytes subclass that is compatible with EncodedString."""
    encoding = None
    is_unicode = False

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    def byteencode(self):
        """Return the content as a plain byte string object."""
        if not IS_PYTHON3:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')
        return _bytes(self)

    def utf8encode(self):
        """Always fails: a byte string cannot be UTF-8 encoded."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string (as ISO-8859-1) so that it can be
        used in % formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')
| 176 |
| 177 |
# Lookup from the source spelling of a simple escape sequence to the
# character it stands for; unknown spellings give None.
char_from_escape_sequence = dict(zip(
    (r'\a', r'\b', r'\f', r'\n', r'\r', r'\t', r'\v'),
    (u'\a', u'\b', u'\f', u'\n', u'\r', u'\t', u'\v'),
)).get

# Substrings that get replaced when writing a string into C code:
# backslash, '??', double quote, and every control character below 32.
_c_special = ('\\', '??', '"') + tuple([chr(code) for code in range(32)])
| 189 |
| 190 |
| 191 def _to_escape_sequence(s): |
| 192 if s in '\n\r\t': |
| 193 return repr(s)[1:-1] |
| 194 elif s == '"': |
| 195 return r'\"' |
| 196 elif s == '\\': |
| 197 return r'\\' |
| 198 else: |
| 199 # within a character sequence, oct passes much better than hex |
| 200 return ''.join(['\\%03o' % ord(c) for c in s]) |
| 201 |
| 202 |
def _build_specials_replacer():
    """Build and return a function that escapes all entries of
    _c_special in a byte string.

    A single byte regular expression with one alternative per special
    is compiled; each match is replaced by its ASCII-encoded
    _to_escape_sequence() spelling.
    """
    alternatives = []
    escaped_form = {}
    for special in _c_special:
        # one single-character class per character of the special
        char_classes = ['[%s]' % ch.replace('\\', '\\\\') for ch in special]
        alternatives.append(''.join(char_classes))
        escaped = _to_escape_sequence(special)
        escaped_form[special.encode('ASCII')] = escaped.encode('ASCII')

    pattern = '(%s)' % '|'.join(alternatives)
    substitute = re.compile(pattern.encode('ASCII')).sub

    def replace_specials(m):
        return escaped_form[m.group(1)]

    def replace(s):
        return substitute(replace_specials, s)

    return replace

_replace_specials = _build_specials_replacer()
| 218 |
| 219 |
def escape_char(c):
    """Return the escaped spelling of a single character 'c'.

    On Python 3 the argument is first decoded as ISO-8859-1. Newline,
    carriage return, tab and backslash use their repr() spelling, the
    single quote is backslash-escaped, character values below 32 or
    above 127 become two-digit hex escapes, and anything else is
    returned as is.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    if 32 <= code <= 127:
        return c
    # hex works well for characters
    return "\\x%02X" % code
| 233 |
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.

    The result is a Unicode string, not a byte string: encoding it as
    ISO-8859-1 yields the byte sequence to write. Specials are replaced
    first; bytes of 128 and above are then spelled as octal escapes.
    """
    s = _replace_specials(s)
    try:
        # trial decoding: plain ASCII => done
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass

    if not IS_PYTHON3:
        # iterating a Py2 byte string yields one-character strings
        pieces = []
        for char in s:
            code = ord(char)
            if code < 128:
                pieces.append(char)
            else:
                pieces.append('\\%3o' % code)
        return join_bytes(pieces).decode('ISO-8859-1')

    # iterating a Py3 bytes object yields integers
    escaped = bytearray()
    for byte in s:
        if byte < 128:
            escaped.append(byte)
        else:
            escaped.extend(('\\%3o' % byte).encode('ASCII'))
    return escaped.decode('ISO-8859-1')
| 264 |
def split_string_literal(s, limit=2000):
    """Break a long string literal into chunks joined by '""'.

    Strings shorter than 'limit' are returned unchanged. Otherwise the
    string is cut into pieces of at most 'limit' characters; a cut is
    moved to the left so that it never falls behind a backslash found
    in the last four characters of a piece.
    """
    # MSVC can't handle long string literals.
    total = len(s)
    if total < limit:
        return s

    pieces = []
    pos = 0
    while pos < total:
        cut = pos + limit
        tail = s[cut-4:cut]
        if total > cut - 4 and '\\' in tail:
            # just before the backslash
            cut -= 4 - tail.find('\\')
            while s[cut-1] == '\\':
                cut -= 1
                if cut == pos:
                    # must have been a long line of backslashes
                    cut = pos + limit - (limit % 2) - 4
                    break
        pieces.append(s[pos:cut])
        pos = cut
    return '""'.join(pieces)
| 285 |
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair (utf16, utf32) of strings, each a comma separated
    list of decimal code unit values terminated by a 0. The utf16 part
    is the empty string when both representations are identical.
    """
    # map() returns an iterator on Python 3, so build a real list before
    # appending the terminating 0.
    s = [ord(ch) for ch in s] + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        utf16, utf32 = s, []
        for code_unit in s:
            if (0xDC00 <= code_unit <= 0xDFFF and utf32
                    and 0xD800 <= utf32[-1] <= 0xDBFF):
                # low surrogate following a high one: merge the pair
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    # u"%d" yields text on Python 2 and 3 alike; the 'unicode' builtin
    # does not exist on Python 3.
    return (",".join([u"%d" % unit for unit in utf16]),
            ",".join([u"%d" % unit for unit in utf32]))
OLD | NEW |