chrome/common/extensions/docs/server2/link_error_detector.py - Issue 68873003: Docserver: Serve docs out of src/ not src/chrome/common/extensions. This allows

Side by Side Diff: chrome/common/extensions/docs/server2/link_error_detector.py

Issue 68873003: Docserver: Serve docs out of src/ not src/chrome/common/extensions. This allows (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: . Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« chrome/common/extensions/docs/server2/extensions_paths.py ('K') | « chrome/common/extensions/docs/server2/intro_data_source.py ('k') | chrome/common/extensions/docs/server2/local_file_system.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2013 The Chromium Authors. All rights reserved.	1 # Copyright 2013 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 from collections import defaultdict, deque, namedtuple	5 from collections import defaultdict, deque, namedtuple

6 from HTMLParser import HTMLParser, HTMLParseError	6 from HTMLParser import HTMLParser, HTMLParseError

7 from itertools import groupby	7 from itertools import groupby

8 from operator import itemgetter	8 from operator import itemgetter

9 import posixpath	9 import posixpath

10 from urlparse import urlsplit	10 from urlparse import urlsplit

11	11

12 from file_system_util import CreateURLsFromPaths	12 from file_system_util import CreateURLsFromPaths

13 import svn_constants	13

14	14

15 Page = namedtuple('Page', 'status, links, anchors, anchor_refs')	15 Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

16	16

	17

17 def _SplitAnchor(url):	18 def _SplitAnchor(url):

18 components = urlsplit(url)	19 components = urlsplit(url)

19 return components.path, components.fragment	20 return components.path, components.fragment

20	21

	22

21 def _Process(path, renderer):	23 def _Process(path, renderer):

22 '''Render the page at |path| using a |renderer| and process the contents of	24 '''Render the page at |path| using a |renderer| and process the contents of

23 that page. Returns a |Page| namedtuple with fields for the http status code	25 that page. Returns a |Page| namedtuple with fields for the http status code

24 of the page render, the href of all the links that occurred on the page, all	26 of the page render, the href of all the links that occurred on the page, all

25 of the anchors on the page (ids and names), and all links that contain an	27 of the anchors on the page (ids and names), and all links that contain an

26 anchor component.	28 anchor component.

27	29

28 If a non-html page is properly rendered, a |Page| with status code 200 and	30 If a non-html page is properly rendered, a |Page| with status code 200 and

29 all other fields empty is returned.	31 all other fields empty is returned.

30 '''	32 '''

(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
71 else:	73 else:

72 link = posixpath.normpath('%s/%s' % (base, link))	74 link = posixpath.normpath('%s/%s' % (base, link))

73	75

74 if '#' in link:	76 if '#' in link:

75 anchor_refs.append(link)	77 anchor_refs.append(link)

76 else:	78 else:

77 edges.append(link)	79 edges.append(link)

78	80

79 return Page(200, edges, anchors, anchor_refs)	81 return Page(200, edges, anchors, anchor_refs)

80	82

	83

81 class _ContentParser(HTMLParser):	84 class _ContentParser(HTMLParser):

82 '''Parse an html file pulling out all links and anchor_refs, where an	85 '''Parse an html file pulling out all links and anchor_refs, where an

83 anchor_ref is a link that contains an anchor.	86 anchor_ref is a link that contains an anchor.

84 '''	87 '''

85	88

86 def __init__(self):	89 def __init__(self):

87 HTMLParser.__init__(self)	90 HTMLParser.__init__(self)

88 self.links = []	91 self.links = []

89 self.anchors = set()	92 self.anchors = set()

90	93

91 def handle_starttag(self, tag, raw_attrs):	94 def handle_starttag(self, tag, raw_attrs):

92 attrs = dict(raw_attrs)	95 attrs = dict(raw_attrs)

93	96

94 if tag == 'a':	97 if tag == 'a':

95 # Handle special cases for href's that: start with a space, contain	98 # Handle special cases for href's that: start with a space, contain

96 # just a '.' (period), contain python templating code, are an absolute	99 # just a '.' (period), contain python templating code, are an absolute

97 # url, are a zip file, or execute javascript on the page.	100 # url, are a zip file, or execute javascript on the page.

98 href = attrs.get('href', '').strip()	101 href = attrs.get('href', '').strip()

99 if href and not href == '.' and not '{{' in href:	102 if href and not href == '.' and not '{{' in href:

100 if not urlsplit(href).scheme in ('http', 'https'):	103 if not urlsplit(href).scheme in ('http', 'https'):

101 if not href.endswith('.zip') and not 'javascript:' in href:	104 if not href.endswith('.zip') and not 'javascript:' in href:

102 self.links.append(href)	105 self.links.append(href)

103	106

104 if attrs.get('id'):	107 if attrs.get('id'):

105 self.anchors.add(attrs['id'])	108 self.anchors.add(attrs['id'])

106 if attrs.get('name'):	109 if attrs.get('name'):

107 self.anchors.add(attrs['name'])	110 self.anchors.add(attrs['name'])

108	111

	112

109 class LinkErrorDetector(object):	113 class LinkErrorDetector(object):

110 '''Finds link errors on the doc server. This includes broken links, those with	114 '''Finds link errors on the doc server. This includes broken links, those with

111 a target page that 404s or contain an anchor that doesn't exist, or pages that	115 a target page that 404s or contain an anchor that doesn't exist, or pages that

112 have no links to them.	116 have no links to them.

113 '''	117 '''

114	118

115 def __init__(self, file_system, renderer, public_path, root_pages):	119 def __init__(self, file_system, renderer, public_path, root_pages):

116 '''Creates a new broken link detector. |renderer| is a callable that takes	120 '''Creates a new broken link detector. |renderer| is a callable that takes

117 a path and returns a full html page. |public_path| is the path to public	121 a path and returns a full html page. |public_path| is the path to public

118 template files. All URLs in |root_pages| are used as the starting nodes for	122 template files. All URLs in |root_pages| are used as the starting nodes for

(...skipping 11 matching lines...) Expand all Loading...
130 'extensions/private_apis.html'))	134 'extensions/private_apis.html'))

131 self._redirection_whitelist = frozenset(('extensions/', 'apps/'))	135 self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

132	136

133 self._RenderAllPages()	137 self._RenderAllPages()

134	138

135 def _RenderAllPages(self):	139 def _RenderAllPages(self):

136 '''Traverses the public templates directory rendering each URL and	140 '''Traverses the public templates directory rendering each URL and

137 processing the resultant html to pull out all links and anchors.	141 processing the resultant html to pull out all links and anchors.

138 '''	142 '''

139 top_level_directories = (	143 top_level_directories = (

140 (svn_constants.PUBLIC_TEMPLATE_PATH, ''),	144 ('docs/templates/public', ''),

141 (svn_constants.STATIC_PATH, 'static/'),	145 ('docs/static', 'static/'),

142 (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),	146 ('docs/examples', 'extensions/examples/'),

143 )	147 )

144	148

145 for dirpath, urlprefix in top_level_directories:	149 for dirpath, urlprefix in top_level_directories:

146 files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)	150 files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)

147 for url, path in files:	151 for url, path in files:

148 self._pages[url] = _Process(url, self._renderer)	152 self._pages[url] = _Process(url, self._renderer)

149	153

150 if self._pages[url].status != 200:	154 if self._pages[url].status != 200:

151 print(url, ', a url derived from the path', dirpath +	155 print(url, ', a url derived from the path', dirpath +

152 ', resulted in a', self._pages[url].status)	156 ', resulted in a', self._pages[url].status)

(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
262 for link in target_page.links:	266 for link in target_page.links:

263 if link not in found:	267 if link not in found:

264 found.add(link)	268 found.add(link)

265 pages_to_check.append(link)	269 pages_to_check.append(link)

266	270

267 all_urls = set(	271 all_urls = set(

268 [url for url, page in self._pages.iteritems() if page.status == 200])	272 [url for url, page in self._pages.iteritems() if page.status == 200])

269	273

270 return [url for url in all_urls - found if url.endswith('.html')]	274 return [url for url in all_urls - found if url.endswith('.html')]

271	275

	276

272 def StringifyBrokenLinks(broken_links):	277 def StringifyBrokenLinks(broken_links):

273 '''Prints out broken links in a more readable format.	278 '''Prints out broken links in a more readable format.

274 '''	279 '''

275 def fixed_width(string, width):	280 def fixed_width(string, width):

276 return "%s%s" % (string, (width - len(string)) * ' ')	281 return "%s%s" % (string, (width - len(string)) * ' ')

277	282

278 first_col_width = max(len(link[1]) for link in broken_links)	283 first_col_width = max(len(link[1]) for link in broken_links)

279 second_col_width = max(len(link[2]) for link in broken_links)	284 second_col_width = max(len(link[2]) for link in broken_links)

280 target = itemgetter(2)	285 target = itemgetter(2)

281 output = []	286 output = []

282	287

283 def pretty_print(link, col_offset=0):	288 def pretty_print(link, col_offset=0):

284 return "%s -> %s %s" % (	289 return "%s -> %s %s" % (

285 fixed_width(link[1], first_col_width - col_offset),	290 fixed_width(link[1], first_col_width - col_offset),

286 fixed_width(link[2], second_col_width),	291 fixed_width(link[2], second_col_width),

287 link[3])	292 link[3])

288	293

289 for target, links in groupby(sorted(broken_links, key=target), target):	294 for target, links in groupby(sorted(broken_links, key=target), target):

290 links = list(links)	295 links = list(links)

291 # Compress messages	296 # Compress messages

292 if len(links) > 50 and not links[0][2].startswith('#'):	297 if len(links) > 50 and not links[0][2].startswith('#'):

293 message = "Found %d broken links (" % len(links)	298 message = "Found %d broken links (" % len(links)

294 output.append("%s%s)" % (message, pretty_print(links[0], len(message))))	299 output.append("%s%s)" % (message, pretty_print(links[0], len(message))))

295 else:	300 else:

296 for link in links:	301 for link in links:

297 output.append(pretty_print(link))	302 output.append(pretty_print(link))

298	303

299 return '\n'.join(output)	304 return '\n'.join(output)

OLD	NEW