Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(7)

Side by Side Diff: third_party/pylint/checkers/similar.py

Issue 10447014: Add pylint to depot_tools. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/depot_tools
Patch Set: Fix unittests. Created 8 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 # pylint: disable=W0622
2 # Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE).
3 # http://www.logilab.fr/ -- mailto:contact@logilab.fr
4 #
5 # This program is free software; you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation; either version 2 of the License, or (at your option) any later
8 # version.
9 #
10 # This program is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
13 #
14 # You should have received a copy of the GNU General Public License along with
15 # this program; if not, write to the Free Software Foundation, Inc.,
16 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 """a similarities / code duplication command line tool and pylint checker
18 """
19 from __future__ import generators
20
21 import sys
22 from itertools import izip
23
24 from logilab.common.ureports import Table
25
26 from pylint.interfaces import IRawChecker
27 from pylint.checkers import BaseChecker, table_lines_from_stats
28
29
class Similar:
    """finds copy-pasted lines of code in a project"""

    def __init__(self, min_lines=4, ignore_comments=False,
                 ignore_docstrings=False):
        # minimum number of successive identical non-blank stripped lines
        # for a chunk to be reported as a similarity
        self.min_lines = min_lines
        self.ignore_comments = ignore_comments
        self.ignore_docstrings = ignore_docstrings
        # list of LineSet instances, one per appended stream
        self.linesets = []

    def append_stream(self, streamid, stream):
        """append a file to search for similarities"""
        stream.seek(0) # XXX may be removed with astng > 0.23
        self.linesets.append(LineSet(streamid,
                                     stream.readlines(),
                                     self.ignore_comments,
                                     self.ignore_docstrings))

    def run(self):
        """start looking for similarities and display results on stdout"""
        self._display_sims(self._compute_sims())

    def _compute_sims(self):
        """compute similarities in appended files"""
        # maps a duplicate length to a list of sets of (lineset, start index)
        # couples, merging reports that share a common location
        no_duplicates = {}
        for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
            duplicate = no_duplicates.setdefault(num, [])
            for couples in duplicate:
                # merge with an existing group when either end of the pair is
                # already part of it
                if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
                    couples.add( (lineset1, idx1) )
                    couples.add( (lineset2, idx2) )
                    break
            else:
                # pair not seen yet: open a new group
                duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) )
        sims = []
        for num, ensembles in no_duplicates.iteritems():
            for couples in ensembles:
                sims.append( (num, couples) )
        # report the longest duplications first
        sims.sort()
        sims.reverse()
        return sims

    def _display_sims(self, sims):
        """display computed similarities on stdout"""
        nb_lignes_dupliquees = 0
        for num, couples in sims:
            print
            print num, "similar lines in", len(couples), "files"
            couples = sorted(couples)
            for lineset, idx in couples:
                print "==%s:%s" % (lineset.name, idx)
            # pylint: disable=W0631
            # lineset/idx deliberately leak from the loop above: the duplicated
            # lines are quoted from the last listed location
            for line in lineset._real_lines[idx:idx+num]:
                print "   ", line,
            nb_lignes_dupliquees += num * (len(couples)-1)
        nb_total_lignes = sum([len(lineset) for lineset in self.linesets])
        print "TOTAL lines=%s duplicates=%s percent=%.2f" \
            % (nb_total_lignes, nb_lignes_dupliquees,
               nb_lignes_dupliquees*100. / nb_total_lignes)

    def _find_common(self, lineset1, lineset2):
        """find similarities in the two given linesets"""
        lines1 = lineset1.enumerate_stripped
        lines2 = lineset2.enumerate_stripped
        find = lineset2.find
        index1 = 0
        min_lines = self.min_lines
        # slide a window over lineset1; every position in lineset2 holding the
        # same stripped line is extended line by line until the first mismatch
        while index1 < len(lineset1):
            skip = 1
            num = 0
            for index2 in find( lineset1[index1] ):
                # count only non-blank matching lines towards min_lines
                non_blank = 0
                for num, ((_, line1), (_, line2)) in enumerate(
                    izip(lines1(index1), lines2(index2))):
                    if line1 != line2:
                        if non_blank > min_lines:
                            yield num, lineset1, index1, lineset2, index2
                            skip = max(skip, num)
                        break
                    if line1:
                        non_blank += 1
                else:
                    # we may have reach the end
                    num += 1
                    if non_blank > min_lines:
                        yield num, lineset1, index1, lineset2, index2
                        skip = max(skip, num)
            # jump past the longest chunk reported from this position
            index1 += skip

    def _iter_sims(self):
        """iterate on similarities among all files, by making a cartesian
        product
        """
        for idx, lineset in enumerate(self.linesets[:-1]):
            for lineset2 in self.linesets[idx+1:]:
                for sim in self._find_common(lineset, lineset2):
                    yield sim
127
def stripped_lines(lines, ignore_comments, ignore_docstrings):
    """Return the given lines stripped of surrounding whitespace, and
    optionally blanked of comments and (heuristically detected) docstrings,
    so that they can be compared for similarity.
    """
    normalized = []
    # when not None, holds the quote style ('\"\"\"' or \"'''\") of the
    # docstring currently being skipped
    active_quote = None
    for raw in lines:
        text = raw.strip()
        if ignore_docstrings:
            opens_docstring = (active_quote is None and
                               (text.startswith('"""') or
                                text.startswith("'''")))
            if opens_docstring:
                active_quote = text[:3]
                text = text[3:]
            if active_quote is not None:
                if text.endswith(active_quote):
                    active_quote = None
                text = ''
        if ignore_comments:
            # XXX should use regex in checkers/format to avoid cutting
            # at a "#" in a string
            text = text.split('#', 1)[0].strip()
        normalized.append(text)
    return normalized
148
class LineSet:
    """Holds and indexes all the lines of a single source file"""

    def __init__(self, name, lines, ignore_comments=False,
                 ignore_docstrings=False):
        # keep both the raw lines (for display) and the normalized ones
        # (for comparison)
        self.name = name
        self._real_lines = lines
        self._stripped_lines = stripped_lines(lines, ignore_comments,
                                              ignore_docstrings)
        self._index = self._mk_index()

    def __str__(self):
        return '<Lineset for %s>' % self.name

    def __len__(self):
        return len(self._real_lines)

    def __getitem__(self, index):
        return self._stripped_lines[index]

    def __lt__(self, other):
        return self.name < other.name

    def __hash__(self):
        # identity hash: two linesets are interchangeable only if they are
        # the same object
        return id(self)

    def enumerate_stripped(self, start_at=0):
        """yield (index, stripped line) pairs, beginning at index start_at
        (0 by default)
        """
        if start_at:
            selection = self._stripped_lines[start_at:]
        else:
            selection = self._stripped_lines
        position = start_at
        for stripped in selection:
            yield position, stripped
            position += 1

    def find(self, stripped_line):
        """return positions of the given stripped line in this set"""
        return self._index.get(stripped_line, ())

    def _mk_index(self):
        """map every non-empty stripped line to the list of its positions"""
        mapping = {}
        for position, stripped in enumerate(self._stripped_lines):
            if stripped:
                mapping.setdefault(stripped, []).append(position)
        return mapping
199
200
# R0801 message emitted by SimilarChecker.close(); the format arguments are
# the number of files involved and the formatted locations + duplicated lines
MSGS = {'R0801': ('Similar lines in %s files\n%s',
                  'Indicates that a set of similar lines has been detected \
among multiple file. This usually means that the code should \
be refactored to avoid this duplication.')}
205
def report_similarities(sect, stats, old_stats):
    """make a layout with some stats about duplication"""
    # 4-column table: metric name, current value, previous value, difference
    cells = ['', 'now', 'previous', 'difference']
    cells.extend(table_lines_from_stats(stats, old_stats,
                                        ('nb_duplicated_lines',
                                         'percent_duplicated_lines')))
    sect.append(Table(children=cells, cols=4, rheaders=1, cheaders=1))
213
214
215 # wrapper to get a pylint checker from the similar class
216 class SimilarChecker(BaseChecker, Similar):
217 """checks for similarities and duplicated code. This computation may be
218 memory / CPU intensive, so you should disable it if you experiment some
219 problems.
220 """
221
222 __implements__ = (IRawChecker,)
223 # configuration section name
224 name = 'similarities'
225 # messages
226 msgs = MSGS
227 # configuration options
228 # for available dict keys/values see the optik parser 'add_option' method
229 options = (('min-similarity-lines',
230 {'default' : 4, 'type' : "int", 'metavar' : '<int>',
231 'help' : 'Minimum lines number of a similarity.'}),
232 ('ignore-comments',
233 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
234 'help': 'Ignore comments when computing similarities.'}
235 ),
236 ('ignore-docstrings',
237 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
238 'help': 'Ignore docstrings when computing similarities.'}
239 ),
240 )
241 # reports
242 reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message
243
244 def __init__(self, linter=None):
245 BaseChecker.__init__(self, linter)
246 Similar.__init__(self, min_lines=4,
247 ignore_comments=True, ignore_docstrings=True)
248 self.stats = None
249
250 def set_option(self, optname, value, action=None, optdict=None):
251 """method called to set an option (registered in the options list)
252
253 overridden to report options setting to Similar
254 """
255 BaseChecker.set_option(self, optname, value, action, optdict)
256 if optname == 'min-similarity-lines':
257 self.min_lines = self.config.min_similarity_lines
258 elif optname == 'ignore-comments':
259 self.ignore_comments = self.config.ignore_comments
260 elif optname == 'ignore-docstrings':
261 self.ignore_docstrings = self.config.ignore_docstrings
262
263 def open(self):
264 """init the checkers: reset linesets and statistics information"""
265 self.linesets = []
266 self.stats = self.linter.add_stats(nb_duplicated_lines=0,
267 percent_duplicated_lines=0)
268
269 def process_module(self, node):
270 """process a module
271
272 the module's content is accessible via the stream object
273
274 stream must implement the readlines method
275 """
276 self.append_stream(self.linter.current_name, node.file_stream)
277
278 def close(self):
279 """compute and display similarities on closing (i.e. end of parsing)"""
280 total = sum([len(lineset) for lineset in self.linesets])
281 duplicated = 0
282 stats = self.stats
283 for num, couples in self._compute_sims():
284 msg = []
285 for lineset, idx in couples:
286 msg.append("==%s:%s" % (lineset.name, idx))
287 msg.sort()
288 # pylint: disable=W0631
289 for line in lineset._real_lines[idx:idx+num]:
290 msg.append(line.rstrip())
291 self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
292 duplicated += num * (len(couples) - 1)
293 stats['nb_duplicated_lines'] = duplicated
294 stats['percent_duplicated_lines'] = total and duplicated * 100. / total
295
296
def register(linter):
    """required method to auto register this checker """
    checker = SimilarChecker(linter)
    linter.register_checker(checker)
300
def usage(status=0):
    """display command line usage information

    status -- exit code passed to sys.exit (0 by default); this function
    never returns
    """
    print "finds copy pasted blocks in a set of files"
    print
    print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \
[-i|--ignore-comments] file1...'
    sys.exit(status)
308
def run(argv=None):
    """standalone command line access point

    argv -- list of command line arguments, defaults to sys.argv[1:];
    exits via usage() when no file argument is given
    """
    if argv is None:
        argv = sys.argv[1:]
    from getopt import getopt
    # 'd' needs a trailing ':' so the short form '-d <n>' actually receives
    # its value (previously only '--duplicates=<n>' worked and '-d' crashed
    # on int(''))
    s_opts = 'hd:i'
    l_opts = ('help', 'duplicates=', 'ignore-comments')
    min_lines = 4
    ignore_comments = False
    opts, args = getopt(argv, s_opts, l_opts)
    for opt, val in opts:
        if opt in ('-d', '--duplicates'):
            min_lines = int(val)
        elif opt in ('-h', '--help'):
            usage()
        elif opt in ('-i', '--ignore-comments'):
            ignore_comments = True
    if not args:
        usage(1)
    sim = Similar(min_lines, ignore_comments)
    for filename in args:
        # append_stream reads the whole file eagerly, so the handle can and
        # should be closed right away (it was previously leaked)
        stream = open(filename)
        try:
            sim.append_stream(filename, stream)
        finally:
            stream.close()
    sim.run()
332
if __name__ == '__main__':
    # standalone command line invocation (the `symilar` tool)
    run()
OLDNEW
« no previous file with comments | « third_party/pylint/checkers/raw_metrics.py ('k') | third_party/pylint/checkers/string_format.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698