| OLD | NEW |
| 1 # Copyright 2013 The Chromium Authors. All rights reserved. | 1 # Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 from collections import defaultdict, deque, namedtuple | 5 from collections import defaultdict, deque, namedtuple |
| 6 from HTMLParser import HTMLParser, HTMLParseError | 6 from HTMLParser import HTMLParser, HTMLParseError |
| 7 from itertools import groupby | 7 from itertools import groupby |
| 8 from operator import itemgetter | 8 from operator import itemgetter |
| 9 import posixpath | 9 import posixpath |
| 10 from urlparse import urlsplit | 10 from urlparse import urlsplit |
| 11 | 11 |
| 12 from file_system_util import CreateURLsFromPaths | 12 from file_system_util import CreateURLsFromPaths |
| 13 import svn_constants | 13 |
| 14 | 14 |
| 15 Page = namedtuple('Page', 'status, links, anchors, anchor_refs') | 15 Page = namedtuple('Page', 'status, links, anchors, anchor_refs') |
| 16 | 16 |
| 17 |
| 17 def _SplitAnchor(url): | 18 def _SplitAnchor(url): |
| 18 components = urlsplit(url) | 19 components = urlsplit(url) |
| 19 return components.path, components.fragment | 20 return components.path, components.fragment |
| 20 | 21 |
| 22 |
| 21 def _Process(path, renderer): | 23 def _Process(path, renderer): |
| 22 '''Render the page at |path| using a |renderer| and process the contents of | 24 '''Render the page at |path| using a |renderer| and process the contents of |
| 23 that page. Returns a |Page| namedtuple with fields for the http status code | 25 that page. Returns a |Page| namedtuple with fields for the http status code |
| 24 of the page render, the href of all the links that occurred on the page, all | 26 of the page render, the href of all the links that occurred on the page, all |
| 25 of the anchors on the page (ids and names), and all links that contain an | 27 of the anchors on the page (ids and names), and all links that contain an |
| 26 anchor component. | 28 anchor component. |
| 27 | 29 |
| 28 If a non-html page is properly rendered, a |Page| with status code 200 and | 30 If a non-html page is properly rendered, a |Page| with status code 200 and |
| 29 all other fields empty is returned. | 31 all other fields empty is returned. |
| 30 ''' | 32 ''' |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 71 else: | 73 else: |
| 72 link = posixpath.normpath('%s/%s' % (base, link)) | 74 link = posixpath.normpath('%s/%s' % (base, link)) |
| 73 | 75 |
| 74 if '#' in link: | 76 if '#' in link: |
| 75 anchor_refs.append(link) | 77 anchor_refs.append(link) |
| 76 else: | 78 else: |
| 77 edges.append(link) | 79 edges.append(link) |
| 78 | 80 |
| 79 return Page(200, edges, anchors, anchor_refs) | 81 return Page(200, edges, anchors, anchor_refs) |
| 80 | 82 |
| 83 |
| 81 class _ContentParser(HTMLParser): | 84 class _ContentParser(HTMLParser): |
| 82 '''Parse an html file pulling out all links and anchor_refs, where an | 85 '''Parse an html file pulling out all links and anchor_refs, where an |
| 83 anchor_ref is a link that contains an anchor. | 86 anchor_ref is a link that contains an anchor. |
| 84 ''' | 87 ''' |
| 85 | 88 |
| 86 def __init__(self): | 89 def __init__(self): |
| 87 HTMLParser.__init__(self) | 90 HTMLParser.__init__(self) |
| 88 self.links = [] | 91 self.links = [] |
| 89 self.anchors = set() | 92 self.anchors = set() |
| 90 | 93 |
| 91 def handle_starttag(self, tag, raw_attrs): | 94 def handle_starttag(self, tag, raw_attrs): |
| 92 attrs = dict(raw_attrs) | 95 attrs = dict(raw_attrs) |
| 93 | 96 |
| 94 if tag == 'a': | 97 if tag == 'a': |
| 95 # Handle special cases for href's that: start with a space, contain | 98 # Handle special cases for href's that: start with a space, contain |
| 96 # just a '.' (period), contain python templating code, are an absolute | 99 # just a '.' (period), contain python templating code, are an absolute |
| 97 # url, are a zip file, or execute javascript on the page. | 100 # url, are a zip file, or execute javascript on the page. |
| 98 href = attrs.get('href', '').strip() | 101 href = attrs.get('href', '').strip() |
| 99 if href and not href == '.' and not '{{' in href: | 102 if href and not href == '.' and not '{{' in href: |
| 100 if not urlsplit(href).scheme in ('http', 'https'): | 103 if not urlsplit(href).scheme in ('http', 'https'): |
| 101 if not href.endswith('.zip') and not 'javascript:' in href: | 104 if not href.endswith('.zip') and not 'javascript:' in href: |
| 102 self.links.append(href) | 105 self.links.append(href) |
| 103 | 106 |
| 104 if attrs.get('id'): | 107 if attrs.get('id'): |
| 105 self.anchors.add(attrs['id']) | 108 self.anchors.add(attrs['id']) |
| 106 if attrs.get('name'): | 109 if attrs.get('name'): |
| 107 self.anchors.add(attrs['name']) | 110 self.anchors.add(attrs['name']) |
| 108 | 111 |
| 112 |
| 109 class LinkErrorDetector(object): | 113 class LinkErrorDetector(object): |
| 110 '''Finds link errors on the doc server. This includes broken links, those with | 114 '''Finds link errors on the doc server. This includes broken links, those with |
| 111 a target page that 404s or contain an anchor that doesn't exist, or pages that | 115 a target page that 404s or contain an anchor that doesn't exist, or pages that |
| 112 have no links to them. | 116 have no links to them. |
| 113 ''' | 117 ''' |
| 114 | 118 |
| 115 def __init__(self, file_system, renderer, public_path, root_pages): | 119 def __init__(self, file_system, renderer, public_path, root_pages): |
| 116 '''Creates a new broken link detector. |renderer| is a callable that takes | 120 '''Creates a new broken link detector. |renderer| is a callable that takes |
| 117 a path and returns a full html page. |public_path| is the path to public | 121 a path and returns a full html page. |public_path| is the path to public |
| 118 template files. All URLs in |root_pages| are used as the starting nodes for | 122 template files. All URLs in |root_pages| are used as the starting nodes for |
| (...skipping 11 matching lines...) Expand all Loading... |
| 130 'extensions/private_apis.html')) | 134 'extensions/private_apis.html')) |
| 131 self._redirection_whitelist = frozenset(('extensions/', 'apps/')) | 135 self._redirection_whitelist = frozenset(('extensions/', 'apps/')) |
| 132 | 136 |
| 133 self._RenderAllPages() | 137 self._RenderAllPages() |
| 134 | 138 |
| 135 def _RenderAllPages(self): | 139 def _RenderAllPages(self): |
| 136 '''Traverses the public templates directory rendering each URL and | 140 '''Traverses the public templates directory rendering each URL and |
| 137 processing the resultant html to pull out all links and anchors. | 141 processing the resultant html to pull out all links and anchors. |
| 138 ''' | 142 ''' |
| 139 top_level_directories = ( | 143 top_level_directories = ( |
| 140 (svn_constants.PUBLIC_TEMPLATE_PATH, ''), | 144 ('docs/templates/public', ''), |
| 141 (svn_constants.STATIC_PATH, 'static/'), | 145 ('docs/static', 'static/'), |
| 142 (svn_constants.EXAMPLES_PATH, 'extensions/examples/'), | 146 ('docs/examples', 'extensions/examples/'), |
| 143 ) | 147 ) |
| 144 | 148 |
| 145 for dirpath, urlprefix in top_level_directories: | 149 for dirpath, urlprefix in top_level_directories: |
| 146 files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix) | 150 files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix) |
| 147 for url, path in files: | 151 for url, path in files: |
| 148 self._pages[url] = _Process(url, self._renderer) | 152 self._pages[url] = _Process(url, self._renderer) |
| 149 | 153 |
| 150 if self._pages[url].status != 200: | 154 if self._pages[url].status != 200: |
| 151 print(url, ', a url derived from the path', dirpath + | 155 print(url, ', a url derived from the path', dirpath + |
| 152 ', resulted in a', self._pages[url].status) | 156 ', resulted in a', self._pages[url].status) |
| (...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 262 for link in target_page.links: | 266 for link in target_page.links: |
| 263 if link not in found: | 267 if link not in found: |
| 264 found.add(link) | 268 found.add(link) |
| 265 pages_to_check.append(link) | 269 pages_to_check.append(link) |
| 266 | 270 |
| 267 all_urls = set( | 271 all_urls = set( |
| 268 [url for url, page in self._pages.iteritems() if page.status == 200]) | 272 [url for url, page in self._pages.iteritems() if page.status == 200]) |
| 269 | 273 |
| 270 return [url for url in all_urls - found if url.endswith('.html')] | 274 return [url for url in all_urls - found if url.endswith('.html')] |
| 271 | 275 |
| 276 |
def StringifyBrokenLinks(broken_links):
  '''Formats |broken_links| as a column-aligned, more readable string.

  Every entry of |broken_links| is indexed as a sequence: item 1 fills the
  first column, item 2 (the link target) fills the second column and item 3 is
  appended as trailing text. Entries are sorted and grouped by target. A group
  of more than 50 entries whose target does not start with '#' is compressed
  into a single "Found N broken links (...)" line showing only its first entry.

  Returns the lines joined by newlines, or an empty string when |broken_links|
  is empty.
  '''
  # Materialize once: the input is walked several times below, which would
  # silently exhaust a generator.
  broken_links = list(broken_links)
  if not broken_links:
    # max() below raises ValueError on an empty sequence.
    return ''

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)

  def pretty_print(link, col_offset=0):
    # |col_offset| shrinks the first column so a line that carries a prefix
    # still lines up with the others. ljust never truncates, so a width that
    # is too small (or negative) simply leaves the string unpadded.
    return "%s -> %s %s" % (
        link[1].ljust(first_col_width - col_offset),
        link[2].ljust(second_col_width),
        link[3])

  output = []
  # groupby only merges adjacent items, hence the sort on the same key.
  for _, group in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(group)
    # Compress messages
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      output.extend(pretty_print(link) for link in links)

  return '\n'.join(output)
| OLD | NEW |