| Index: third_party/pylint/checkers/similar.py
|
| diff --git a/third_party/pylint/checkers/similar.py b/third_party/pylint/checkers/similar.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1e38ed61e80aa910c8051685cbd3c0c78d9dad93
|
| --- /dev/null
|
| +++ b/third_party/pylint/checkers/similar.py
|
| @@ -0,0 +1,334 @@
|
| +# pylint: disable=W0622
|
| +# Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE).
|
| +# http://www.logilab.fr/ -- mailto:contact@logilab.fr
|
| +#
|
| +# This program is free software; you can redistribute it and/or modify it under
|
| +# the terms of the GNU General Public License as published by the Free Software
|
| +# Foundation; either version 2 of the License, or (at your option) any later
|
| +# version.
|
| +#
|
| +# This program is distributed in the hope that it will be useful, but WITHOUT
|
| +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
| +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
|
| +#
|
| +# You should have received a copy of the GNU General Public License along with
|
| +# this program; if not, write to the Free Software Foundation, Inc.,
|
| +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
| +"""a similarities / code duplication command line tool and pylint checker
|
| +"""
|
| +from __future__ import generators
|
| +
|
| +import sys
|
| +from itertools import izip
|
| +
|
| +from logilab.common.ureports import Table
|
| +
|
| +from pylint.interfaces import IRawChecker
|
| +from pylint.checkers import BaseChecker, table_lines_from_stats
|
| +
|
| +
|
class Similar:
    """finds copy-pasted lines of code in a project"""

    def __init__(self, min_lines=4, ignore_comments=False,
                 ignore_docstrings=False):
        # minimal number of successive identical stripped lines to report
        self.min_lines = min_lines
        # when true, "#" comments are stripped before comparing lines
        self.ignore_comments = ignore_comments
        # when true, docstring lines are blanked before comparing lines
        self.ignore_docstrings = ignore_docstrings
        # one LineSet per stream given to append_stream()
        self.linesets = []

    def append_stream(self, streamid, stream):
        """append a file to search for similarities"""
        stream.seek(0) # XXX may be removed with astng > 0.23
        self.linesets.append(LineSet(streamid,
                                     stream.readlines(),
                                     self.ignore_comments,
                                     self.ignore_docstrings))

    def run(self):
        """start looking for similarities and display results on stdout"""
        self._display_sims(self._compute_sims())

    def _compute_sims(self):
        """compute similarities in appended files"""
        # maps a similarity length to a list of sets of (lineset, start
        # index) couples; overlapping matches are merged into a single set
        no_duplicates = {}
        for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
            duplicate = no_duplicates.setdefault(num, [])
            for couples in duplicate:
                # extend an existing group as soon as one end belongs to it
                if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
                    couples.add( (lineset1, idx1) )
                    couples.add( (lineset2, idx2) )
                    break
            else:
                # no existing group matched: start a new one with this couple
                duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) )
        sims = []
        for num, ensembles in no_duplicates.iteritems():
            for couples in ensembles:
                sims.append( (num, couples) )
        # report the longest similarities first
        sims.sort()
        sims.reverse()
        return sims

    def _display_sims(self, sims):
        """display computed similarities on stdout"""
        nb_lignes_dupliquees = 0
        for num, couples in sims:
            print
            print num, "similar lines in", len(couples), "files"
            couples = sorted(couples)
            for lineset, idx in couples:
                print "==%s:%s" % (lineset.name, idx)
            # pylint: disable=W0631
            # NOTE: lineset and idx deliberately leak out of the loop above:
            # the duplicated block is printed once, from the last couple
            for line in lineset._real_lines[idx:idx+num]:
                print "  ", line,
            nb_lignes_dupliquees += num * (len(couples)-1)
        nb_total_lignes = sum([len(lineset) for lineset in self.linesets])
        print "TOTAL lines=%s duplicates=%s percent=%.2f" \
            % (nb_total_lignes, nb_lignes_dupliquees,
               nb_lignes_dupliquees*100. / nb_total_lignes)

    def _find_common(self, lineset1, lineset2):
        """find similarities in the two given linesets"""
        lines1 = lineset1.enumerate_stripped
        lines2 = lineset2.enumerate_stripped
        find = lineset2.find
        index1 = 0
        min_lines = self.min_lines
        while index1 < len(lineset1):
            # how many lines to jump over once this start line is processed
            skip = 1
            num = 0
            # try every position in lineset2 where the current line appears
            for index2 in find( lineset1[index1] ):
                non_blank = 0
                for num, ((_, line1), (_, line2)) in enumerate(
                    izip(lines1(index1), lines2(index2))):
                    if line1 != line2:
                        # similarity ends here; report it if long enough
                        if non_blank > min_lines:
                            yield num, lineset1, index1, lineset2, index2
                            skip = max(skip, num)
                        break
                    if line1:
                        non_blank += 1
                else:
                    # we may have reach the end
                    num += 1
                    if non_blank > min_lines:
                        yield num, lineset1, index1, lineset2, index2
                        skip = max(skip, num)
            index1 += skip

    def _iter_sims(self):
        """iterate on similarities among all files, by making a cartesian
        product
        """
        for idx, lineset in enumerate(self.linesets[:-1]):
            for lineset2 in self.linesets[idx+1:]:
                for sim in self._find_common(lineset, lineset2):
                    yield sim
|
| +
|
def stripped_lines(lines, ignore_comments, ignore_docstrings):
    """return the list of lines with surrounding whitespace removed,
    optionally blanking out docstring lines and cutting "#" comments
    """
    result = []
    # quote style ('\"\"\"' or \"'''\") while inside a docstring, else None
    in_docstring = None
    for raw in lines:
        text = raw.strip()
        if ignore_docstrings:
            if in_docstring is None and (text.startswith('"""')
                                         or text.startswith("'''")):
                in_docstring = text[:3]
                text = text[3:]
            if in_docstring is not None:
                if text.endswith(in_docstring):
                    in_docstring = None
                text = ''
        if ignore_comments:
            # XXX should use regex in checkers/format to avoid cutting
            # at a "#" in a string
            text = text.split('#', 1)[0].strip()
        result.append(text)
    return result
|
| +
|
class LineSet:
    """Holds and indexes all the lines of a single source file"""

    def __init__(self, name, lines, ignore_comments=False,
                 ignore_docstrings=False):
        # identifier of the stream these lines come from (e.g. a file name)
        self.name = name
        # raw lines, kept for reporting
        self._real_lines = lines
        # normalized lines, used for comparison
        self._stripped_lines = stripped_lines(lines, ignore_comments,
                                              ignore_docstrings)
        # maps a stripped line to every position where it occurs
        self._index = self._mk_index()

    def __str__(self):
        return '<Lineset for %s>' % self.name

    def __len__(self):
        return len(self._real_lines)

    def __getitem__(self, index):
        return self._stripped_lines[index]

    def __lt__(self, other):
        # order linesets by their stream name
        return self.name < other.name

    def __hash__(self):
        # identity hash: each lineset is a distinct key in sets/dicts
        return id(self)

    def enumerate_stripped(self, start_at=0):
        """return an iterator on stripped lines, starting from a given index
        if specified, else 0
        """
        stripped = self._stripped_lines
        if start_at:
            stripped = stripped[start_at:]
        for offset, line in enumerate(stripped):
            yield start_at + offset, line

    def find(self, stripped_line):
        """return positions of the given stripped line in this set"""
        try:
            return self._index[stripped_line]
        except KeyError:
            return ()

    def _mk_index(self):
        """create the index for this set"""
        index = {}
        for position, text in enumerate(self._stripped_lines):
            if not text:
                continue  # blank lines never take part in similarities
            index.setdefault(text, []).append(position)
        return index
|
| +
|
| +
|
# message table for this checker: R0801 takes (number of files, joined
# "==name:index" locations plus the duplicated lines) as format arguments
MSGS = {'R0801': ('Similar lines in %s files\n%s',
                  'Indicates that a set of similar lines has been detected \
                  among multiple file. This usually means that the code should \
                  be refactored to avoid this duplication.')}
|
| +
|
def report_similarities(sect, stats, old_stats):
    """append a table comparing current and previous duplication stats to
    the given report section
    """
    rows = ['', 'now', 'previous', 'difference']
    rows.extend(table_lines_from_stats(stats, old_stats,
                                       ('nb_duplicated_lines',
                                        'percent_duplicated_lines')))
    sect.append(Table(children=rows, cols=4, rheaders=1, cheaders=1))
|
| +
|
| +
|
# wrapper to get a pylint checker from the similar class
class SimilarChecker(BaseChecker, Similar):
    """checks for similarities and duplicated code. This computation may be
    memory / CPU intensive, so you should disable it if you experiment some
    problems.
    """

    __implements__ = (IRawChecker,)
    # configuration section name
    name = 'similarities'
    # messages
    msgs = MSGS
    # configuration options
    # for available dict keys/values see the optik parser 'add_option' method
    options = (('min-similarity-lines',
                {'default' : 4, 'type' : "int", 'metavar' : '<int>',
                 'help' : 'Minimum lines number of a similarity.'}),
               ('ignore-comments',
                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                 'help': 'Ignore comments when computing similarities.'}
                ),
               ('ignore-docstrings',
                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                 'help': 'Ignore docstrings when computing similarities.'}
                ),
               )
    # reports
    reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message

    def __init__(self, linter=None):
        BaseChecker.__init__(self, linter)
        # initial values mirror the option defaults above; set_option()
        # overwrites them once the configuration is actually read
        Similar.__init__(self, min_lines=4,
                         ignore_comments=True, ignore_docstrings=True)
        self.stats = None

    def set_option(self, optname, value, action=None, optdict=None):
        """method called to set an option (registered in the options list)

        overridden to report options setting to Similar
        """
        BaseChecker.set_option(self, optname, value, action, optdict)
        # keep the Similar engine attributes in sync with the configuration
        if optname == 'min-similarity-lines':
            self.min_lines = self.config.min_similarity_lines
        elif optname == 'ignore-comments':
            self.ignore_comments = self.config.ignore_comments
        elif optname == 'ignore-docstrings':
            self.ignore_docstrings = self.config.ignore_docstrings

    def open(self):
        """init the checkers: reset linesets and statistics information"""
        self.linesets = []
        self.stats = self.linter.add_stats(nb_duplicated_lines=0,
                                           percent_duplicated_lines=0)

    def process_module(self, node):
        """process a module

        the module's content is accessible via the stream object

        stream must implement the readlines method
        """
        self.append_stream(self.linter.current_name, node.file_stream)

    def close(self):
        """compute and display similarities on closing (i.e. end of parsing)"""
        total = sum([len(lineset) for lineset in self.linesets])
        duplicated = 0
        stats = self.stats
        for num, couples in self._compute_sims():
            msg = []
            for lineset, idx in couples:
                msg.append("==%s:%s" % (lineset.name, idx))
            msg.sort()
            # pylint: disable=W0631
            # NOTE: lineset and idx deliberately leak out of the loop above:
            # the duplicated block itself is appended once, from the last couple
            for line in lineset._real_lines[idx:idx+num]:
                msg.append(line.rstrip())
            self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
            duplicated += num * (len(couples) - 1)
        stats['nb_duplicated_lines'] = duplicated
        # "total and ..." guards against division by zero when no line was seen
        stats['percent_duplicated_lines'] = total and duplicated * 100. / total
|
| +
|
| +
|
def register(linter):
    """required method to auto register this checker"""
    checker = SimilarChecker(linter)
    linter.register_checker(checker)
|
| +
|
def usage(status=0):
    """display command line usage information, then exit with the given
    status code (0 by default)
    """
    print "finds copy pasted blocks in a set of files"
    print
    print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \
[-i|--ignore-comments] file1...'
    sys.exit(status)
|
| +
|
def run(argv=None):
    """standalone command line access point

    parse argv (sys.argv[1:] by default) for -d/--duplicates, -h/--help and
    -i/--ignore-comments, then compute and print similarities among the
    remaining file arguments
    """
    if argv is None:
        argv = sys.argv[1:]
    from getopt import getopt
    # BUGFIX: '-d' takes a value, so it needs a trailing ':' in the short
    # options spec ('hdi' made "-d 4" unparsable while '--duplicates=4' worked)
    s_opts = 'hd:i'
    l_opts = ('help', 'duplicates=', 'ignore-comments')
    min_lines = 4
    ignore_comments = False
    opts, args = getopt(argv, s_opts, l_opts)
    for opt, val in opts:
        if opt in ('-d', '--duplicates'):
            min_lines = int(val)
        elif opt in ('-h', '--help'):
            usage()
        elif opt in ('-i', '--ignore-comments'):
            ignore_comments = True
    if not args:
        usage(1)
    sim = Similar(min_lines, ignore_comments)
    for filename in args:
        # BUGFIX: close each file once read; append_stream() consumes the
        # stream immediately (readlines) and does not keep it open
        stream = open(filename)
        try:
            sim.append_stream(filename, stream)
        finally:
            stream.close()
    sim.run()
|
| +
|
# standalone script entry point
if __name__ == '__main__':
    run()
|
|
|