OLD | NEW |
---|---|
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import re | 5 import logging |
| 6 |
| 7 from HTMLParser import HTMLParser |
| 8 |
6 from path_utils import FormatKey | 9 from path_utils import FormatKey |
7 from third_party.handlebar import Handlebar | 10 from third_party.handlebar import Handlebar |
8 | 11 |
| 12 class _IntroParser(HTMLParser): |
| 13 """ An HTML parser which will parse table of contents and page title info out |
| 14 of an intro. |
| 15 """ |
| 16 def __init__(self): |
| 17 HTMLParser.__init__(self) |
| 18 self.toc = [] |
| 19 self.page_title = None |
| 20 self._recent_tag = None |
| 21 self._current_heading = {} |
| 22 |
| 23 def handle_starttag(self, tag, attrs): |
| 24 id_ = '' |
| 25 if tag not in ['h1', 'h2', 'h3']: |
| 26 self.handle_data(self.get_starttag_text()) |
| 27 return |
| 28 self._recent_tag = tag |
| 29 for attr in attrs: |
| 30 if attr[0] == 'id': |
| 31 id_ = attr[1] |
| 32 if tag == 'h2': |
| 33 self._current_heading = { 'link': id_, 'subheadings': [], 'title': '' } |
| 34 self.toc.append(self._current_heading) |
| 35 elif tag == 'h3': |
| 36 self._current_heading = { 'link': id_, 'title': '' } |
| 37 self.toc[-1]['subheadings'].append(self._current_heading) |
| 38 |
| 39 def handle_endtag(self, tag): |
| 40 if tag in ['h1', 'h2', 'h3']: |
| 41 self._recent_tag = None |
| 42 self.handle_data('<' + tag + '/>')
not at google - send to devlin
2012/07/23 23:30:42
I think we should just strip out the tags, like, i
cduvall
2012/07/23 23:58:34
Done.
|
| 43 |
| 44 def handle_data(self, data): |
| 45 if self._recent_tag is None: |
| 46 return |
| 47 if self._recent_tag == 'h1': |
| 48 if self.page_title is None: |
| 49 self.page_title = data |
| 50 else: |
| 51 self.page_title += data
not at google - send to devlin
2012/07/23 23:30:42
so if there are multiple <h1> tags it concats them
cduvall
2012/07/23 23:58:34
Done.
|
| 52 elif self._recent_tag in ['h2', 'h3']: |
| 53 self._current_heading['title'] += data |
| 54 |
9 class IntroDataSource(object): | 55 class IntroDataSource(object): |
10 """This class fetches the intros for a given API. From this intro, a table | 56 """This class fetches the intros for a given API. From this intro, a table |
11 of contents dictionary is created, which contains the headings in the intro. | 57 of contents dictionary is created, which contains the headings in the intro. |
12 """ | 58 """ |
13 def __init__(self, cache_builder, base_paths): | 59 def __init__(self, cache_builder, base_paths): |
14 self._cache = cache_builder.build(self._MakeIntroDict) | 60 self._cache = cache_builder.build(self._MakeIntroDict) |
15 self._base_paths = base_paths | 61 self._base_paths = base_paths |
16 | 62 |
17 def _MakeIntroDict(self, intro): | 63 def _MakeIntroDict(self, intro): |
18 h1s = re.findall('<h1.*>(.+)</h1>', intro) | 64 try: |
19 if len(h1s) > 0: | 65 parser = _IntroParser() |
20 page_title = h1s[0] | 66 parser.feed(intro) |
21 else: | 67 return { |
22 page_title = '' | 68 'intro': Handlebar(intro), |
23 headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro) | 69 'toc': parser.toc, |
24 toc = [] | 70 'title': parser.page_title |
25 for heading in headings: | 71 } |
26 level, link, title = heading | 72 except Exception as e: |
27 if level == '2': | 73 logging.info(e) |
28 toc.append({ 'link': link, 'title': title, 'subheadings': [] }) | |
29 else: | |
30 toc[-1]['subheadings'].append({ 'link': link, 'title': title }) | |
31 return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title } | |
32 | 74 |
33 def __getitem__(self, key): | 75 def __getitem__(self, key): |
34 return self.get(key) | 76 return self.get(key) |
35 | 77 |
36 def get(self, key): | 78 def get(self, key): |
37 real_path = FormatKey(key) | 79 real_path = FormatKey(key) |
38 for base_path in self._base_paths: | 80 for base_path in self._base_paths: |
39 try: | 81 try: |
40 return self._cache.GetFromFile(base_path + '/' + real_path) | 82 return self._cache.GetFromFile(base_path + '/' + real_path) |
41 except Exception: | 83 except Exception: |
42 pass | 84 pass |
43 return None | 85 return None |
OLD | NEW |