third_party/logilab/common/textutils.py - Issue 10447014: Add pylint to depot_tools.

Side by Side Diff: third_party/logilab/common/textutils.py

Issue 10447014: Add pylint to depot_tools. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/depot_tools

Patch Set: Fix unittests. Created 8 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 # copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.

	2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr

	3 #

	4 # This file is part of logilab-common.

	5 #

	6 # logilab-common is free software: you can redistribute it and/or modify it under

	7 # the terms of the GNU Lesser General Public License as published by the Free

	8 # Software Foundation, either version 2.1 of the License, or (at your option) any

	9 # later version.

	10 #

	11 # logilab-common is distributed in the hope that it will be useful, but WITHOUT

	12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS

	13 # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more

	14 # details.

	15 #

	16 # You should have received a copy of the GNU Lesser General Public License along

	17 # with logilab-common. If not, see <http://www.gnu.org/licenses/>.

	18 """Some text manipulation utility functions.

	19

	20

	21 :group text formatting: normalize_text, normalize_paragraph, pretty_match,\

	22 unquote, colorize_ansi

	23 :group text manipulation: searchall, splitstrip

	24 :sort: text formatting, text manipulation

	25

	26 :type ANSI_STYLES: dict(str)

	27 :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code

	28

	29 :type ANSI_COLORS: dict(str)

	30 :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code

	31

	32 :type ANSI_PREFIX: str

	33 :var ANSI_PREFIX:

	34 ANSI terminal code notifying the start of an ANSI escape sequence

	35

	36 :type ANSI_END: str

	37 :var ANSI_END:

	38 ANSI terminal code notifying the end of an ANSI escape sequence

	39

	40 :type ANSI_RESET: str

	41 :var ANSI_RESET:

	42 ANSI terminal code resetting format defined by a previous ANSI escape sequence

	43 """

	44 __docformat__ = "restructuredtext en"

	45

	46 import sys

	47 import re

	48 import os.path as osp

	49 from warnings import warn

	50 from unicodedata import normalize as _uninormalize

	51 try:

	52 from os import linesep

	53 except ImportError:

	54 linesep = '\n' # gae

	55

	56 from logilab.common.deprecation import deprecated

	57

	58 MANUAL_UNICODE_MAP = {

	59 u'\xa1': u'!', # INVERTED EXCLAMATION MARK

	60 u'\u0142': u'l', # LATIN SMALL LETTER L WITH STROKE

	61 u'\u2044': u'/', # FRACTION SLASH

	62 u'\xc6': u'AE', # LATIN CAPITAL LETTER AE

	63 u'\xa9': u'(c)', # COPYRIGHT SIGN

	64 u'\xab': u'"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK

	65 u'\xe6': u'ae', # LATIN SMALL LETTER AE

	66 u'\xae': u'(r)', # REGISTERED SIGN

	67 u'\u0153': u'oe', # LATIN SMALL LIGATURE OE

	68 u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE

	69 u'\xd8': u'O', # LATIN CAPITAL LETTER O WITH STROKE

	70 u'\xf8': u'o', # LATIN SMALL LETTER O WITH STROKE

	71 u'\xbb': u'"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK

	72 u'\xdf': u'ss', # LATIN SMALL LETTER SHARP S

	73 }

	74

	75 def unormalize(ustring, ignorenonascii=None, substitute=None):

	76 """replace diacritical characters with their corresponding ascii characters

	77

	78 Convert the unicode string to its long normalized form (unicode character

	79 will be transform into several characters) and keep the first one only.

	80 The normal form KD (NFKD) will apply the compatibility decomposition, i.e.

	81 replace all compatibility characters with their equivalents.

	82

	83 :type substitute: str

	84 :param substitute: replacement character to use if decomposition fails

	85

	86 :see: Another project about ASCII transliterations of Unicode text

	87 http://pypi.python.org/pypi/Unidecode

	88 """

	89 # backward compatibility, ignorenonascii was a boolean

	90 if ignorenonascii is not None:

	91 warn("ignorenonascii is deprecated, use substitute named parameter inste ad",

	92 DeprecationWarning, stacklevel=2)

	93 if ignorenonascii:

	94 substitute = ''

	95 res = []

	96 for letter in ustring[:]:

	97 try:

	98 replacement = MANUAL_UNICODE_MAP[letter]

	99 except KeyError:

	100 replacement = _uninormalize('NFKD', letter)[0]

	101 if ord(replacement) >= 2 ** 7:

	102 if substitute is None:

	103 raise ValueError("can't deal with non-ascii based characters ")

	104 replacement = substitute

	105 res.append(replacement)

	106 return u''.join(res)

	107

	108 def unquote(string):

	109 """remove optional quotes (simple or double) from the string

	110

	111 :type string: str or unicode

	112 :param string: an optionally quoted string

	113

	114 :rtype: str or unicode

	115 :return: the unquoted string (or the input string if it wasn't quoted)

	116 """

	117 if not string:

	118 return string

	119 if string[0] in '"\'':

	120 string = string[1:]

	121 if string[-1] in '"\'':

	122 string = string[:-1]

	123 return string

	124

	125

	126 _BLANKLINES_RGX = re.compile('\r?\n\r?\n')

	127 _NORM_SPACES_RGX = re.compile('\s+')

	128

	129 def normalize_text(text, line_len=80, indent='', rest=False):

	130 """normalize a text to display it with a maximum line size and

	131 optionally arbitrary indentation. Line jumps are normalized but blank

	132 lines are kept. The indentation string may be used to insert a

	133 comment (#) or a quoting (>) mark for instance.

	134

	135 :type text: str or unicode

	136 :param text: the input text to normalize

	137

	138 :type line_len: int

	139 :param line_len: expected maximum line's length, default to 80

	140

	141 :type indent: str or unicode

	142 :param indent: optional string to use as indentation

	143

	144 :rtype: str or unicode

	145 :return:

	146 the input text normalized to fit on lines with a maximized size

	147 inferior to `line_len`, and optionally prefixed by an

	148 indentation string

	149 """

	150 if rest:

	151 normp = normalize_rest_paragraph

	152 else:

	153 normp = normalize_paragraph

	154 result = []

	155 for text in _BLANKLINES_RGX.split(text):

	156 result.append(normp(text, line_len, indent))

	157 return ('%s%s%s' % (linesep, indent, linesep)).join(result)

	158

	159

	160 def normalize_paragraph(text, line_len=80, indent=''):

	161 """normalize a text to display it with a maximum line size and

	162 optionally arbitrary indentation. Line jumps are normalized. The

	163 indentation string may be used top insert a comment mark for

	164 instance.

	165

	166 :type text: str or unicode

	167 :param text: the input text to normalize

	168

	169 :type line_len: int

	170 :param line_len: expected maximum line's length, default to 80

	171

	172 :type indent: str or unicode

	173 :param indent: optional string to use as indentation

	174

	175 :rtype: str or unicode

	176 :return:

	177 the input text normalized to fit on lines with a maximized size

	178 inferior to `line_len`, and optionally prefixed by an

	179 indentation string

	180 """

	181 text = _NORM_SPACES_RGX.sub(' ', text)

	182 line_len = line_len - len(indent)

	183 lines = []

	184 while text:

	185 aline, text = splittext(text.strip(), line_len)

	186 lines.append(indent + aline)

	187 return linesep.join(lines)

	188

	189 def normalize_rest_paragraph(text, line_len=80, indent=''):

	190 """normalize a ReST text to display it with a maximum line size and

	191 optionally arbitrary indentation. Line jumps are normalized. The

	192 indentation string may be used top insert a comment mark for

	193 instance.

	194

	195 :type text: str or unicode

	196 :param text: the input text to normalize

	197

	198 :type line_len: int

	199 :param line_len: expected maximum line's length, default to 80

	200

	201 :type indent: str or unicode

	202 :param indent: optional string to use as indentation

	203

	204 :rtype: str or unicode

	205 :return:

	206 the input text normalized to fit on lines with a maximized size

	207 inferior to `line_len`, and optionally prefixed by an

	208 indentation string

	209 """

	210 toreport = ''

	211 lines = []

	212 line_len = line_len - len(indent)

	213 for line in text.splitlines():

	214 line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())

	215 toreport = ''

	216 while len(line) > line_len:

	217 # too long line, need split

	218 line, toreport = splittext(line, line_len)

	219 lines.append(indent + line)

	220 if toreport:

	221 line = toreport + ' '

	222 toreport = ''

	223 else:

	224 line = ''

	225 if line:

	226 lines.append(indent + line.strip())

	227 return linesep.join(lines)

	228

	229

	230 def splittext(text, line_len):

	231 """split the given text on space according to the given max line size

	232

	233 return a 2-uple:

	234 * a line <= line_len if possible

	235 * the rest of the text which has to be reported on another line

	236 """

	237 if len(text) <= line_len:

	238 return text, ''

	239 pos = min(len(text)-1, line_len)

	240 while pos > 0 and text[pos] != ' ':

	241 pos -= 1

	242 if pos == 0:

	243 pos = min(len(text), line_len)

	244 while len(text) > pos and text[pos] != ' ':

	245 pos += 1

	246 return text[:pos], text[pos+1:].strip()

	247

	248

	249 def splitstrip(string, sep=','):

	250 """return a list of stripped string by splitting the string given as

	251 argument on `sep` (',' by default). Empty string are discarded.

	252

	253 >>> splitstrip('a, b, c , 4,,')

	254 ['a', 'b', 'c', '4']

	255 >>> splitstrip('a')

	256 ['a']

	257 >>>

	258

	259 :type string: str or unicode

	260 :param string: a csv line

	261

	262 :type sep: str or unicode

	263 :param sep: field separator, default to the comma (',')

	264

	265 :rtype: str or unicode

	266 :return: the unquoted string (or the input string if it wasn't quoted)

	267 """

	268 return [word.strip() for word in string.split(sep) if word.strip()]

	269

# Old name of splitstrip, kept as an alias built with the `deprecated`
# helper imported from logilab.common.deprecation and the message below.
# NOTE(review): presumably the wrapper emits a deprecation warning on each
# call before delegating to splitstrip -- confirm against that module.
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)

	271

	272

	273 def split_url_or_path(url_or_path):

	274 """return the latest component of a string containing either an url of the

	275 form <scheme>://<path> or a local file system path

	276 """

	277 if '://' in url_or_path:

	278 return url_or_path.rstrip('/').rsplit('/', 1)

	279 return osp.split(url_or_path.rstrip(osp.sep))

	280

	281

	282 def text_to_dict(text):

	283 """parse multilines text containing simple 'key=value' lines and return a

	284 dict of {'key': 'value'}. When the same key is encountered multiple time,

	285 value is turned into a list containing all values.

	286

	287 >>> text_to_dict('''multiple=1

	288 ... multiple= 2

	289 ... single =3

	290 ... ''')

	291 {'single': '3', 'multiple': ['1', '2']}

	292

	293 """

	294 res = {}

	295 if not text:

	296 return res

	297 for line in text.splitlines():

	298 line = line.strip()

	299 if line and not line.startswith('#'):

	300 key, value = [w.strip() for w in line.split('=', 1)]

	301 if key in res:

	302 try:

	303 res[key].append(value)

	304 except AttributeError:

	305 res[key] = [res[key], value]

	306 else:

	307 res[key] = value

	308 return res

	309

	310

	311 _BLANK_URE = r'(\s\|,)+'

	312 _BLANK_RE = re.compile(_BLANK_URE)

	313 __VALUE_URE = r'-?(([0-9]+\.[0-9]*)\|((0x?)?[0-9]+))'

	314 __UNITS_URE = r'[a-zA-Z]+'

	315 _VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))

	316

	317 BYTE_UNITS = {

	318 "b": 1,

	319 "kb": 1024,

	320 "mb": 1024 ** 2,

	321 "gb": 1024 ** 3,

	322 "tb": 1024 ** 4,

	323 }

	324

	325 TIME_UNITS = {

	326 "ms": 0.0001,

	327 "s": 1,

	328 "min": 60,

	329 "h": 60 * 60,

	330 "d": 60 * 60 *24,

	331 }

	332

	333 def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,

	334 value_reg=_VALUE_RE):

	335 """Parse the string applying the units defined in units

	336 (e.g.: "1.5m",{'m',60} -> 80).

	337

	338 :type string: str or unicode

	339 :param string: the string to parse

	340

	341 :type units: dict (or any object with __getitem__ using basestring key)

	342 :param units: a dict mapping a unit string repr to its value

	343

	344 :type inter: type

	345 :param inter: used to parse every intermediate value (need __sum__)

	346

	347 :type blank_reg: regexp

	348 :param blank_reg: should match every blank char to ignore.

	349

	350 :type value_reg: regexp with "value" and optional "unit" group

	351 :param value_reg: match a value and it's unit into the

	352 """

	353 if inter is None:

	354 inter = final

	355 string = _BLANK_RE.sub('', string)

	356 values = []

	357 for match in value_reg.finditer(string):

	358 dic = match.groupdict()

	359 #import sys

	360 #print >> sys.stderr, dic

	361 lit, unit = dic["value"], dic.get("unit")

	362 value = inter(lit)

	363 if unit is not None:

	364 try:

	365 value *= units[unit.lower()]

	366 except KeyError:

	367 raise KeyError('invalid unit %s. valid units are %s' %

	368 (unit, units.keys()))

	369 values.append(value)

	370 return final(sum(values))

	371

	372

	373 _LINE_RGX = re.compile('\r\n\|\r+\|\n')

	374

	375 def pretty_match(match, string, underline_char='^'):

	376 """return a string with the match location underlined:

	377

	378 >>> import re

	379 >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))

	380 il mange du bacon

	381 ^^^^^

	382 >>>

	383

	384 :type match: _sre.SRE_match

	385 :param match: object returned by re.match, re.search or re.finditer

	386

	387 :type string: str or unicode

	388 :param string:

	389 the string on which the regular expression has been applied to

	390 obtain the `match` object

	391

	392 :type underline_char: str or unicode

	393 :param underline_char:

	394 character to use to underline the matched section, default to the

	395 carret '^'

	396

	397 :rtype: str or unicode

	398 :return:

	399 the original string with an inserted line to underline the match

	400 location

	401 """

	402 start = match.start()

	403 end = match.end()

	404 string = _LINE_RGX.sub(linesep, string)

	405 start_line_pos = string.rfind(linesep, 0, start)

	406 if start_line_pos == -1:

	407 start_line_pos = 0

	408 result = []

	409 else:

	410 result = [string[:start_line_pos]]

	411 start_line_pos += len(linesep)

	412 offset = start - start_line_pos

	413 underline = ' ' * offset + underline_char * (end - start)

	414 end_line_pos = string.find(linesep, end)

	415 if end_line_pos == -1:

	416 string = string[start_line_pos:]

	417 result.append(string)

	418 result.append(underline)

	419 else:

	420 end = string[end_line_pos + len(linesep):]

	421 string = string[start_line_pos:end_line_pos]

	422 result.append(string)

	423 result.append(underline)

	424 result.append(end)

	425 return linesep.join(result).rstrip()

	426

	427

	428 # Ansi colorization ###########################################################

	429

	430 ANSI_PREFIX = '\033['

	431 ANSI_END = 'm'

	432 ANSI_RESET = '\033[0m'

	433 ANSI_STYLES = {

	434 'reset': "0",

	435 'bold': "1",

	436 'italic': "3",

	437 'underline': "4",

	438 'blink': "5",

	439 'inverse': "7",

	440 'strike': "9",

	441 }

	442 ANSI_COLORS = {

	443 'reset': "0",

	444 'black': "30",

	445 'red': "31",

	446 'green': "32",

	447 'yellow': "33",

	448 'blue': "34",

	449 'magenta': "35",

	450 'cyan': "36",

	451 'white': "37",

	452 }

	453

	454 def _get_ansi_code(color=None, style=None):

	455 """return ansi escape code corresponding to color and style

	456

	457 :type color: str or None

	458 :param color:

	459 the color name (see `ANSI_COLORS` for available values)

	460 or the color number when 256 colors are available

	461

	462 :type style: str or None

	463 :param style:

	464 style string (see `ANSI_COLORS` for available values). To get

	465 several style effects at the same time, use a coma as separator.

	466

	467 :raise KeyError: if an unexistent color or style identifier is given

	468

	469 :rtype: str

	470 :return: the built escape code

	471 """

	472 ansi_code = []

	473 if style:

	474 style_attrs = splitstrip(style)

	475 for effect in style_attrs:

	476 ansi_code.append(ANSI_STYLES[effect])

	477 if color:

	478 if color.isdigit():

	479 ansi_code.extend(['38', '5'])

	480 ansi_code.append(color)

	481 else:

	482 ansi_code.append(ANSI_COLORS[color])

	483 if ansi_code:

	484 return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END

	485 return ''

	486

	487 def colorize_ansi(msg, color=None, style=None):

	488 """colorize message by wrapping it with ansi escape codes

	489

	490 :type msg: str or unicode

	491 :param msg: the message string to colorize

	492

	493 :type color: str or None

	494 :param color:

	495 the color identifier (see `ANSI_COLORS` for available values)

	496

	497 :type style: str or None

	498 :param style:

	499 style string (see `ANSI_COLORS` for available values). To get

	500 several style effects at the same time, use a coma as separator.

	501

	502 :raise KeyError: if an unexistent color or style identifier is given

	503

	504 :rtype: str or unicode

	505 :return: the ansi escaped string

	506 """

	507 # If both color and style are not defined, then leave the text as is

	508 if color is None and style is None:

	509 return msg

	510 escape_code = _get_ansi_code(color, style)

	511 # If invalid (or unknown) color, don't wrap msg with ansi codes

	512 if escape_code:

	513 return '%s%s%s' % (escape_code, msg, ANSI_RESET)

	514 return msg

	515

	516 DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}

	517

	518 def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):

	519 for line in lines:

	520 if line[:4] in ('--- ', '+++ '):

	521 out.write(colorize_ansi(line, style['separator']))

	522 elif line[0] == '-':

	523 out.write(colorize_ansi(line, style['remove']))

	524 elif line[0] == '+':

	525 out.write(colorize_ansi(line, style['add']))

	526 elif line[:4] == '--- ':

	527 out.write(colorize_ansi(line, style['separator']))

	528 elif line[:4] == '+++ ':

	529 out.write(colorize_ansi(line, style['separator']))

	530 else:

	531 out.write(line)

	532

OLD	NEW

« no previous file with comments | « third_party/logilab/common/testlib.py ('k') | third_party/logilab/common/tree.py » ('j') | no next file with comments »