Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(39)

Side by Side Diff: chrome/common/extensions/docs/server2/link_error_detector.py

Issue 68873003: Docserver: Serve docs out of src/ not src/chrome/common/extensions. This allows (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: . Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 # Copyright 2013 The Chromium Authors. All rights reserved. 1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 from collections import defaultdict, deque, namedtuple 5 from collections import defaultdict, deque, namedtuple
6 from HTMLParser import HTMLParser, HTMLParseError 6 from HTMLParser import HTMLParser, HTMLParseError
7 from itertools import groupby 7 from itertools import groupby
8 from operator import itemgetter 8 from operator import itemgetter
9 import posixpath 9 import posixpath
10 from urlparse import urlsplit 10 from urlparse import urlsplit
11 11
12 from file_system_util import CreateURLsFromPaths 12 from file_system_util import CreateURLsFromPaths
13 import svn_constants 13
14 14
# The result of rendering one URL: |status| is the HTTP status of the render,
# |links| the hrefs of all non-anchor links found on the page, |anchors| the
# set of anchor targets (id/name attributes) the page defines, and
# |anchor_refs| the links that contain a '#' fragment component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
16 16
17
17 def _SplitAnchor(url): 18 def _SplitAnchor(url):
18 components = urlsplit(url) 19 components = urlsplit(url)
19 return components.path, components.fragment 20 return components.path, components.fragment
20 21
22
21 def _Process(path, renderer): 23 def _Process(path, renderer):
22 '''Render the page at |path| using a |renderer| and process the contents of 24 '''Render the page at |path| using a |renderer| and process the contents of
23 that page. Returns a |Page| namedtuple with fields for the http status code 25 that page. Returns a |Page| namedtuple with fields for the http status code
24 of the page render, the href of all the links that occurred on the page, all 26 of the page render, the href of all the links that occurred on the page, all
25 of the anchors on the page (ids and names), and all links that contain an 27 of the anchors on the page (ids and names), and all links that contain an
26 anchor component. 28 anchor component.
27 29
28 If a non-html page is properly rendered, a |Page| with status code 200 and 30 If a non-html page is properly rendered, a |Page| with status code 200 and
29 all other fields empty is returned. 31 all other fields empty is returned.
30 ''' 32 '''
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
71 else: 73 else:
72 link = posixpath.normpath('%s/%s' % (base, link)) 74 link = posixpath.normpath('%s/%s' % (base, link))
73 75
74 if '#' in link: 76 if '#' in link:
75 anchor_refs.append(link) 77 anchor_refs.append(link)
76 else: 78 else:
77 edges.append(link) 79 edges.append(link)
78 80
79 return Page(200, edges, anchors, anchor_refs) 81 return Page(200, edges, anchors, anchor_refs)
80 82
83
81 class _ContentParser(HTMLParser): 84 class _ContentParser(HTMLParser):
82 '''Parse an html file pulling out all links and anchor_refs, where an 85 '''Parse an html file pulling out all links and anchor_refs, where an
83 anchor_ref is a link that contains an anchor. 86 anchor_ref is a link that contains an anchor.
84 ''' 87 '''
85 88
86 def __init__(self): 89 def __init__(self):
87 HTMLParser.__init__(self) 90 HTMLParser.__init__(self)
88 self.links = [] 91 self.links = []
89 self.anchors = set() 92 self.anchors = set()
90 93
91 def handle_starttag(self, tag, raw_attrs): 94 def handle_starttag(self, tag, raw_attrs):
92 attrs = dict(raw_attrs) 95 attrs = dict(raw_attrs)
93 96
94 if tag == 'a': 97 if tag == 'a':
95 # Handle special cases for href's that: start with a space, contain 98 # Handle special cases for href's that: start with a space, contain
96 # just a '.' (period), contain python templating code, are an absolute 99 # just a '.' (period), contain python templating code, are an absolute
97 # url, are a zip file, or execute javascript on the page. 100 # url, are a zip file, or execute javascript on the page.
98 href = attrs.get('href', '').strip() 101 href = attrs.get('href', '').strip()
99 if href and not href == '.' and not '{{' in href: 102 if href and not href == '.' and not '{{' in href:
100 if not urlsplit(href).scheme in ('http', 'https'): 103 if not urlsplit(href).scheme in ('http', 'https'):
101 if not href.endswith('.zip') and not 'javascript:' in href: 104 if not href.endswith('.zip') and not 'javascript:' in href:
102 self.links.append(href) 105 self.links.append(href)
103 106
104 if attrs.get('id'): 107 if attrs.get('id'):
105 self.anchors.add(attrs['id']) 108 self.anchors.add(attrs['id'])
106 if attrs.get('name'): 109 if attrs.get('name'):
107 self.anchors.add(attrs['name']) 110 self.anchors.add(attrs['name'])
108 111
112
109 class LinkErrorDetector(object): 113 class LinkErrorDetector(object):
110 '''Finds link errors on the doc server. This includes broken links, those with 114 '''Finds link errors on the doc server. This includes broken links, those with
111 a target page that 404s or contain an anchor that doesn't exist, or pages that 115 a target page that 404s or contain an anchor that doesn't exist, or pages that
112 have no links to them. 116 have no links to them.
113 ''' 117 '''
114 118
115 def __init__(self, file_system, renderer, public_path, root_pages): 119 def __init__(self, file_system, renderer, public_path, root_pages):
116 '''Creates a new broken link detector. |renderer| is a callable that takes 120 '''Creates a new broken link detector. |renderer| is a callable that takes
117 a path and returns a full html page. |public_path| is the path to public 121 a path and returns a full html page. |public_path| is the path to public
118 template files. All URLs in |root_pages| are used as the starting nodes for 122 template files. All URLs in |root_pages| are used as the starting nodes for
(...skipping 11 matching lines...) Expand all
130 'extensions/private_apis.html')) 134 'extensions/private_apis.html'))
131 self._redirection_whitelist = frozenset(('extensions/', 'apps/')) 135 self._redirection_whitelist = frozenset(('extensions/', 'apps/'))
132 136
133 self._RenderAllPages() 137 self._RenderAllPages()
134 138
135 def _RenderAllPages(self): 139 def _RenderAllPages(self):
136 '''Traverses the public templates directory rendering each URL and 140 '''Traverses the public templates directory rendering each URL and
137 processing the resultant html to pull out all links and anchors. 141 processing the resultant html to pull out all links and anchors.
138 ''' 142 '''
139 top_level_directories = ( 143 top_level_directories = (
140 (svn_constants.PUBLIC_TEMPLATE_PATH, ''), 144 ('docs/templates/public', ''),
141 (svn_constants.STATIC_PATH, 'static/'), 145 ('docs/static', 'static/'),
142 (svn_constants.EXAMPLES_PATH, 'extensions/examples/'), 146 ('docs/examples', 'extensions/examples/'),
143 ) 147 )
144 148
145 for dirpath, urlprefix in top_level_directories: 149 for dirpath, urlprefix in top_level_directories:
146 files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix) 150 files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
147 for url, path in files: 151 for url, path in files:
148 self._pages[url] = _Process(url, self._renderer) 152 self._pages[url] = _Process(url, self._renderer)
149 153
150 if self._pages[url].status != 200: 154 if self._pages[url].status != 200:
151 print(url, ', a url derived from the path', dirpath + 155 print(url, ', a url derived from the path', dirpath +
152 ', resulted in a', self._pages[url].status) 156 ', resulted in a', self._pages[url].status)
(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after
262 for link in target_page.links: 266 for link in target_page.links:
263 if link not in found: 267 if link not in found:
264 found.add(link) 268 found.add(link)
265 pages_to_check.append(link) 269 pages_to_check.append(link)
266 270
267 all_urls = set( 271 all_urls = set(
268 [url for url, page in self._pages.iteritems() if page.status == 200]) 272 [url for url, page in self._pages.iteritems() if page.status == 200])
269 273
270 return [url for url in all_urls - found if url.endswith('.html')] 274 return [url for url in all_urls - found if url.endswith('.html')]
271 275
276
def StringifyBrokenLinks(broken_links):
  '''Formats |broken_links| into a readable multi-line string (the original
  docstring said "Prints out", but this function returns the string).

  Each entry of |broken_links| is indexable, with index 1 the linking page,
  index 2 the link target and index 3 an error description. Entries are
  grouped by target; a group of more than 50 non-anchor errors is compressed
  into a single summary line.
  '''
  def fixed_width(string, width):
    # Left-justify |string| in a field |width| characters wide (no-op when
    # |string| is already at least that wide).
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  # Named distinctly from the loop variable below; the original bound this to
  # |target| and then shadowed it inside the for statement.
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(links)
    # Compress messages
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698