OLD | NEW |
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import re | 5 import logging |
| 6 |
| 7 from HTMLParser import HTMLParser |
| 8 |
6 from path_utils import FormatKey | 9 from path_utils import FormatKey |
7 from third_party.handlebar import Handlebar | 10 from third_party.handlebar import Handlebar |
8 | 11 |
| 12 class _IntroParser(HTMLParser): |
| 13 """ An HTML parser which will parse table of contents and page title info out |
| 14 of an intro. |
| 15 """ |
| 16 def __init__(self): |
| 17 HTMLParser.__init__(self) |
| 18 self.toc = [] |
| 19 self.page_title = None |
| 20 self._recent_tag = None |
| 21 self._current_heading = {} |
| 22 |
| 23 def handle_starttag(self, tag, attrs): |
| 24 id_ = '' |
| 25 if tag not in ['h1', 'h2', 'h3']: |
| 26 return |
| 27 if tag != 'h1' or self.page_title is None: |
| 28 self._recent_tag = tag |
| 29 for attr in attrs: |
| 30 if attr[0] == 'id': |
| 31 id_ = attr[1] |
| 32 if tag == 'h2': |
| 33 self._current_heading = { 'link': id_, 'subheadings': [], 'title': '' } |
| 34 self.toc.append(self._current_heading) |
| 35 elif tag == 'h3': |
| 36 self._current_heading = { 'link': id_, 'title': '' } |
| 37 self.toc[-1]['subheadings'].append(self._current_heading) |
| 38 |
| 39 def handle_endtag(self, tag): |
| 40 if tag in ['h1', 'h2', 'h3']: |
| 41 self._recent_tag = None |
| 42 |
| 43 def handle_data(self, data): |
| 44 if self._recent_tag is None: |
| 45 return |
| 46 if self._recent_tag == 'h1': |
| 47 if self.page_title is None: |
| 48 self.page_title = data |
| 49 else: |
| 50 self.page_title += data |
| 51 elif self._recent_tag in ['h2', 'h3']: |
| 52 self._current_heading['title'] += data |
| 53 |
class IntroDataSource(object):
  """This class fetches the intros for a given API. From this intro, a table
  of contents dictionary is created, which contains the headings in the intro.
  """
  def __init__(self, cache_builder, base_paths):
    # The cache runs _MakeIntroDict over the contents of each file it loads.
    self._cache = cache_builder.build(self._MakeIntroDict)
    self._base_paths = base_paths

  def _MakeIntroDict(self, intro):
    """Builds the dictionary for an intro: the intro itself as a Handlebar
    template under 'intro', its table of contents under 'toc' and its page
    title under 'title'.

    Returns None if the intro could not be processed; the failure is logged
    with its traceback rather than raised.
    """
    try:
      parser = _IntroParser()
      parser.feed(intro)
      return {
        'intro': Handlebar(intro),
        'toc': parser.toc,
        'title': parser.page_title
      }
    except Exception:
      # Best effort: a broken intro must not take the caller down, but record
      # the full traceback so that the failure can be diagnosed.
      logging.exception('Failed to build the intro dictionary')
      return None

  def __getitem__(self, key):
    return self.get(key)

  def get(self, key):
    """Returns the intro dictionary for |key| from the first base path that
    has it, or None if no base path does.
    """
    real_path = FormatKey(key)
    for base_path in self._base_paths:
      try:
        return self._cache.GetFromFile(base_path + '/' + real_path)
      except Exception:
        # Deliberately best effort: a failure under this base path just means
        # the next one is tried.
        pass
    return None
OLD | NEW |