OLD | NEW |
(Empty) | |
| 1 # copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. |
| 2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr |
| 3 # |
| 4 # This file is part of logilab-common. |
| 5 # |
| 6 # logilab-common is free software: you can redistribute it and/or modify it unde
r |
| 7 # the terms of the GNU Lesser General Public License as published by the Free |
| 8 # Software Foundation, either version 2.1 of the License, or (at your option) an
y |
| 9 # later version. |
| 10 # |
| 11 # logilab-common is distributed in the hope that it will be useful, but WITHOUT |
| 12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 13 # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more |
| 14 # details. |
| 15 # |
| 16 # You should have received a copy of the GNU Lesser General Public License along |
| 17 # with logilab-common. If not, see <http://www.gnu.org/licenses/>. |
| 18 """Some text manipulation utility functions. |
| 19 |
| 20 |
| 21 :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ |
| 22 unquote, colorize_ansi |
| 23 :group text manipulation: searchall, splitstrip |
| 24 :sort: text formatting, text manipulation |
| 25 |
| 26 :type ANSI_STYLES: dict(str) |
| 27 :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code |
| 28 |
| 29 :type ANSI_COLORS: dict(str) |
| 30 :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code |
| 31 |
| 32 :type ANSI_PREFIX: str |
| 33 :var ANSI_PREFIX: |
| 34 ANSI terminal code notifying the start of an ANSI escape sequence |
| 35 |
| 36 :type ANSI_END: str |
| 37 :var ANSI_END: |
| 38 ANSI terminal code notifying the end of an ANSI escape sequence |
| 39 |
| 40 :type ANSI_RESET: str |
| 41 :var ANSI_RESET: |
| 42 ANSI terminal code resetting format defined by a previous ANSI escape sequence |
| 43 """ |
| 44 __docformat__ = "restructuredtext en" |
| 45 |
| 46 import sys |
| 47 import re |
| 48 import os.path as osp |
| 49 from warnings import warn |
| 50 from unicodedata import normalize as _uninormalize |
| 51 try: |
| 52 from os import linesep |
| 53 except ImportError: |
| 54 linesep = '\n' # gae |
| 55 |
| 56 from logilab.common.deprecation import deprecated |
| 57 |
# Characters that `unormalize` transliterates by table lookup rather than by
# Unicode decomposition; each value is the ASCII text emitted for its key.
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',      # U+00A1 inverted exclamation mark
    u'\u0142': u'l',    # U+0142 latin small letter l with stroke
    u'\u2044': u'/',    # U+2044 fraction slash
    u'\xc6': u'AE',     # U+00C6 latin capital letter ae
    u'\xa9': u'(c)',    # U+00A9 copyright sign
    u'\xab': u'"',      # U+00AB left-pointing double angle quotation mark
    u'\xe6': u'ae',     # U+00E6 latin small letter ae
    u'\xae': u'(r)',    # U+00AE registered sign
    u'\u0153': u'oe',   # U+0153 latin small ligature oe
    u'\u0152': u'OE',   # U+0152 latin capital ligature oe
    u'\xd8': u'O',      # U+00D8 latin capital letter o with stroke
    u'\xf8': u'o',      # U+00F8 latin small letter o with stroke
    u'\xbb': u'"',      # U+00BB right-pointing double angle quotation mark
    u'\xdf': u'ss',     # U+00DF latin small letter sharp s
}
| 74 |
def unormalize(ustring, ignorenonascii=None, substitute=None):
    """Transliterate a unicode string to ASCII, one character at a time.

    Each character is first looked up in `MANUAL_UNICODE_MAP`. When it is not
    listed there, it is decomposed with the NFKD normal form (compatibility
    decomposition) and only the first character of the decomposition is
    kept. If that character is still outside the ASCII range, `substitute`
    is emitted in its place.

    :type ustring: unicode
    :param ustring: the string to convert

    :type ignorenonascii: bool or None
    :param ignorenonascii:
      deprecated: any value other than None triggers a `DeprecationWarning`,
      and a true value is equivalent to ``substitute=''``

    :type substitute: str
    :param substitute: replacement character to use if decomposition fails

    :raise ValueError:
      if a character has no ASCII equivalent and `substitute` is None

    :rtype: unicode
    :return: the transliterated string

    :see: Another project about ASCII transliterations of Unicode text
          http://pypi.python.org/pypi/Unidecode
    """
    if ignorenonascii is not None:
        # backward compatibility: ignorenonascii used to be a boolean flag
        warn("ignorenonascii is deprecated, use substitute named parameter instead",
             DeprecationWarning, stacklevel=2)
        if ignorenonascii:
            substitute = ''
    chunks = []
    for char in ustring[:]:
        if char in MANUAL_UNICODE_MAP:
            # table entries are emitted as is, they are not range-checked
            chunks.append(MANUAL_UNICODE_MAP[char])
            continue
        base = _uninormalize('NFKD', char)[0]
        if ord(base) < 2 ** 7:
            chunks.append(base)
        elif substitute is None:
            raise ValueError("can't deal with non-ascii based characters")
        else:
            chunks.append(substitute)
    return u''.join(chunks)
| 107 |
def unquote(string):
    """Remove optional quotes (simple or double) from the string.

    The leading and the trailing quote are removed independently of each
    other: they do not have to be of the same kind, and either one may be
    missing.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # The string may be empty by now (input made of a single quote
    # character), in which case there is no last character to look at.
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
| 124 |
| 125 |
# Paragraph separator: two consecutive line feeds, each one optionally
# preceded by a carriage return.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace. Raw string: '\s' is not a valid escape sequence in
# a plain string literal and triggers a warning on recent interpreters.
_NORM_SPACES_RGX = re.compile(r'\s+')
| 128 |
def normalize_text(text, line_len=80, indent='', rest=False):
    """Re-wrap a text made of several paragraphs.

    The text is cut into paragraphs on blank lines, every paragraph is
    normalized on its own, and the paragraphs are joined back with a line
    holding only the indentation string. The indentation string may be used
    to insert a comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are processed with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    normalizer = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [normalizer(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)
| 158 |
| 159 |
def normalize_paragraph(text, line_len=80, indent=''):
    """Re-wrap a single paragraph.

    Every run of whitespace (line breaks included) is collapsed to one
    space, then the text is cut into lines with `splittext`. The indentation
    string is prepended to each line and counts towards `line_len`; it may
    be used to insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
| 188 |
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """Re-wrap a ReST paragraph while keeping its own line breaks.

    Unlike `normalize_paragraph`, input lines are never merged: each one is
    stripped, has its whitespace runs collapsed to a single space, and is
    split with `splittext` only when it is longer than the available width.
    Lines that end up empty are dropped. The indentation string is prepended
    to each output line and counts towards `line_len`.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    width = line_len - len(indent)
    wrapped = []
    for raw_line in text.splitlines():
        pending = _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        while len(pending) > width:
            # too long, emit the head and keep working on what is left
            head, tail = splittext(pending, width)
            wrapped.append(indent + head)
            pending = tail + ' ' if tail else ''
        if pending:
            wrapped.append(indent + pending.strip())
    return linesep.join(wrapped)
| 228 |
| 229 |
def splittext(text, line_len):
    """Split the given text on a space according to the given max line size.

    The cut is made on the last space found at an index between 1 and
    `line_len`. When there is none (the first word alone is longer than
    `line_len`), the cut is made on the first space found from `line_len`
    onwards, or nothing is cut if the text holds no further space.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len: maximum size of the first line; a negative value is
      handled like 0

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text (stripped) which has to be reported on another
        line, '' when there is nothing left
    """
    if len(text) <= line_len:
        return text, ''
    # A negative width would otherwise be used as a negative slice index and
    # the whole text would come back as "rest", so callers looping until the
    # rest is empty would never terminate.
    line_len = max(line_len, 0)
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
| 247 |
| 248 |
def splitstrip(string, sep=','):
    """Return a list of stripped strings by splitting the string given as
    argument on `sep` (',' by default). Empty strings are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list of str or unicode
    :return: the non-empty fields, stripped, in their original order
    """
    # strip each field once, then drop the ones that turned out empty
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]
| 269 |
# Former name of `splitstrip`, kept as an alias wrapped by `deprecated`
# (presumably warning with the given message on each call -- confirm against
# logilab.common.deprecation).
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
| 271 |
| 272 |
def split_url_or_path(url_or_path):
    """Split a string holding either an url of the form <scheme>://<path>
    or a local file system path into its leading part and its latest
    component.

    Trailing separators are ignored ('/' for an url, `os.path.sep` for a
    path).

    :type url_or_path: str or unicode
    :param url_or_path: the url or path to split

    :rtype: list or tuple
    :return:
      for an url, the list returned by ``rsplit('/', 1)``; for a path, the
      (head, tail) tuple returned by `os.path.split`
    """
    if '://' not in url_or_path:
        trimmed_path = url_or_path.rstrip(osp.sep)
        return osp.split(trimmed_path)
    trimmed_url = url_or_path.rstrip('/')
    return trimmed_url.rsplit('/', 1)
| 280 |
| 281 |
def text_to_dict(text):
    """Parse a multi-line text containing simple 'key=value' lines and return
    a dict of {'key': 'value'}. When the same key is encountered multiple
    times, its value is turned into a list containing all values.

    Keys and values are stripped; only the first '=' of a line separates the
    key from the value. Blank lines and lines starting with '#' are ignored.

    >>> d = text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    >>> d == {'single': '3', 'multiple': ['1', '2']}
    True

    :type text: str or unicode
    :param text: the text to parse, may be empty or None

    :raise ValueError: if a significant line holds no '=' character

    :rtype: dict
    :return: the parsed mapping
    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, equal, value = line.partition('=')
        if not equal:
            # same exception type as the tuple unpacking error this used to
            # fail with, but naming the offending line
            raise ValueError("invalid line %r: expected 'key=value'" % line)
        key, value = key.strip(), value.strip()
        if key not in res:
            res[key] = value
        elif isinstance(res[key], list):
            res[key].append(value)
        else:
            # second occurrence of the key: switch to a list of values
            res[key] = [res[key], value]
    return res
| 309 |
| 310 |
# Separators between values: any run of whitespace and/or commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# A number: optional minus sign followed either by digits, a dot and optional
# digits, or by decimal digits with an optional '0' / '0x' prefix (the
# letters a-f are not accepted as digits).
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# A unit: one or more ASCII letters.
__UNITS_URE = r'[a-zA-Z]+'
# A number captured as "value", optionally followed by its "unit".
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))
| 316 |
# Multipliers turning a size expressed in the given unit into bytes.
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# Multipliers turning a duration expressed in the given unit into seconds.
TIME_UNITS = {
    "ms": 0.001,  # a millisecond is a thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
| 332 |
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90).

    Everything matched by `blank_reg` is removed first, then every value
    found by `value_reg` is converted with `inter`, multiplied by the factor
    of its unit (if it has one) and the results are summed up. Units are
    looked up in lower case.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value (the results are multiplied by
      the unit factors and summed); default to `final`

    :type final: type
    :param final: used to convert the sum of all values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its optional unit

    :raise KeyError: if a unit is not a key of `units`

    :return: the sum of all parsed values, converted with `final`
    """
    if inter is None:
        inter = final
    # honour the caller's pattern rather than the module-level default
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))
| 371 |
| 372 |
# Any flavour of line break: CR LF, a run of CR, or a lone LF.
_LINE_RGX = re.compile('\r\n|\r+|\n')

def pretty_match(match, string, underline_char='^'):
    """Return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line breaks of `string` are first replaced by `linesep`, then a line of
    `underline_char` is inserted right below the line holding the start of
    the match.

    :type match: regular expression match object
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    match_start, match_end = match.start(), match.end()
    text = _LINE_RGX.sub(linesep, string)
    sep_len = len(linesep)
    # locate the beginning of the line on which the match starts
    prev_break = text.rfind(linesep, 0, match_start)
    if prev_break == -1:
        parts = []
        line_start = 0
    else:
        parts = [text[:prev_break]]
        line_start = prev_break + sep_len
    underline = (' ' * (match_start - line_start)
                 + underline_char * (match_end - match_start))
    # locate the end of the line on which the match ends
    next_break = text.find(linesep, match_end)
    if next_break == -1:
        parts.extend([text[line_start:], underline])
    else:
        parts.extend([text[line_start:next_break], underline,
                      text[next_break + sep_len:]])
    return linesep.join(parts).rstrip()
| 426 |
| 427 |
| 428 # Ansi colorization ########################################################### |
| 429 |
# Pieces of an ANSI escape sequence: <prefix><code>[;<code>...]<end>
ANSI_PREFIX = '\033['
ANSI_END = 'm'
# Sequence resetting any format set by a previous escape sequence.
ANSI_RESET = '\033[0m'

# Style identifier -> ANSI terminal code.
ANSI_STYLES = {
    'reset': '0',
    'bold': '1',
    'italic': '3',
    'underline': '4',
    'blink': '5',
    'inverse': '7',
    'strike': '9',
}

# Color identifier -> ANSI terminal code.
ANSI_COLORS = {
    'reset': '0',
    'black': '30',
    'red': '31',
    'green': '32',
    'yellow': '33',
    'blue': '34',
    'magenta': '35',
    'cyan': '36',
    'white': '37',
}
| 453 |
def _get_ansi_code(color=None, style=None):
    """Build the ANSI escape code selecting the given color and style.

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor a
      style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            # numeric color: "38;5;<n>" selects it from the 256 colors palette
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
| 486 |
def colorize_ansi(msg, color=None, style=None):
    """Wrap a message between an ANSI escape code and the ANSI reset code.

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or `msg` itself when there is nothing to
      apply
    """
    if color is not None or style is not None:
        escape_code = _get_ansi_code(color, style)
        # the code is empty when color and style are both empty values:
        # leave the message untouched in that case
        if escape_code:
            return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg
| 515 |
# Default colors used by `diff_colorize_ansi` for each kind of diff line.
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}

def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """Write the lines of a diff to `out`, colorizing them on the way.

    File header lines ('--- ' / '+++ ') get the 'separator' color, other
    lines starting with '-' the 'remove' color and other lines starting
    with '+' the 'add' color. Any other line, empty ones included, is
    written unchanged. Nothing is appended to the lines.

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff

    :type out: file-like object
    :param out: object whose `write` method receives each line, default to
      `sys.stdout`

    :type style: dict
    :param style:
      mapping of 'separator', 'remove' and 'add' to a color identifier
      accepted by `colorize_ansi`, default to `DIFF_STYLE`
    """
    for line in lines:
        # startswith (rather than indexing) also copes with empty lines
        if line.startswith(('--- ', '+++ ')):
            out.write(colorize_ansi(line, style['separator']))
        elif line.startswith('-'):
            out.write(colorize_ansi(line, style['remove']))
        elif line.startswith('+'):
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)
| 532 |
OLD | NEW |