chrome/common/extensions/docs/server2/intro_data_source.py - Issue 10810047: Extensions Docs Server: HTML parser in IDS

Side by Side Diff: chrome/common/extensions/docs/server2/intro_data_source.py

Issue 10810047: Extensions Docs Server: HTML parser in IDS (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import re	5 import logging

	6

	7 from HTMLParser import HTMLParser

	8

6 from path_utils import FormatKey	9 from path_utils import FormatKey

7 from third_party.handlebar import Handlebar	10 from third_party.handlebar import Handlebar

8	11

	12 class _IntroParser(HTMLParser):

	13 """ An HTML parser which will parse table of contents and page title info out

	14 of an intro.

	15 """

	16 def __init__(self):

	17 HTMLParser.__init__(self)

	18 self.toc = []

	19 self.page_title = None

	20 self._recent_tag = None

	21 self._current_heading = {}

	22

	23 def handle_starttag(self, tag, attrs):

	24 id_ = ''

	25 if tag not in ['h1', 'h2', 'h3']:

	26 self.handle_data(self.get_starttag_text())

	27 return

	28 self._recent_tag = tag

	29 for attr in attrs:

	30 if attr[0] == 'id':

	31 id_ = attr[1]

	32 if tag == 'h2':

	33 self._current_heading = { 'link': id_, 'subheadings': [], 'title': '' }

	34 self.toc.append(self._current_heading)

	35 elif tag == 'h3':

	36 self._current_heading = { 'link': id_, 'title': '' }

	37 self.toc[-1]['subheadings'].append(self._current_heading)

	38

	39 def handle_endtag(self, tag):

	40 if tag in ['h1', 'h2', 'h3']:

	41 self._recent_tag = None

	42 self.handle_data('<' + tag + '/>')
	not at google - send to devlin 2012/07/23 23:30:42: I think we should just strip out the tags. For example, if the title was <h1>this is <b>very</b> important</h1>, then we'd want the title to be <title>this is very important</title> -- putting HTML in there would look quite strange. OTOH, having HTML in the TOC would be nice; it's just simpler if we forget about it.
	cduvall 2012/07/23 23:58:34: Done. (Reply to the comment kalman wrote on 2012/07/23 23:30:42, quoted in full above.)
	43

	44 def handle_data(self, data):

	45 if self._recent_tag is None:

	46 return

	47 if self._recent_tag == 'h1':

	48 if self.page_title is None:

	49 self.page_title = data

	50 else:

	51 self.page_title += data
	not at google - send to devlin 2012/07/23 23:30:42: So if there are multiple <h1> tags, it concatenates them together? I think that would end up looking strange; just use the first one.
	cduvall 2012/07/23 23:58:34: Done. (Reply to the comment kalman wrote on 2012/07/23 23:30:42, quoted in full above.)
	52 elif self._recent_tag in ['h2', 'h3']:

	53 self._current_heading['title'] += data

	54

9 class IntroDataSource(object):	55 class IntroDataSource(object):

10 """This class fetches the intros for a given API. From this intro, a table	56 """This class fetches the intros for a given API. From this intro, a table

11 of contents dictionary is created, which contains the headings in the intro.	57 of contents dictionary is created, which contains the headings in the intro.

12 """	58 """

13 def __init__(self, cache_builder, base_paths):	59 def __init__(self, cache_builder, base_paths):

14 self._cache = cache_builder.build(self._MakeIntroDict)	60 self._cache = cache_builder.build(self._MakeIntroDict)

15 self._base_paths = base_paths	61 self._base_paths = base_paths

16	62

17 def _MakeIntroDict(self, intro):	63 def _MakeIntroDict(self, intro):

18 h1s = re.findall('<h1.*>(.+)</h1>', intro)	64 try:

19 if len(h1s) > 0:	65 parser = _IntroParser()

20 page_title = h1s[0]	66 parser.feed(intro)

21 else:	67 return {

22 page_title = ''	68 'intro': Handlebar(intro),

23 headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro)	69 'toc': parser.toc,

24 toc = []	70 'title': parser.page_title

25 for heading in headings:	71 }

26 level, link, title = heading	72 except Exception as e:

27 if level == '2':	73 logging.info(e)

28 toc.append({ 'link': link, 'title': title, 'subheadings': [] })

29 else:

30 toc[-1]['subheadings'].append({ 'link': link, 'title': title })

31 return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title }

32	74

33 def __getitem__(self, key):	75 def __getitem__(self, key):

34 return self.get(key)	76 return self.get(key)

35	77

36 def get(self, key):	78 def get(self, key):

37 real_path = FormatKey(key)	79 real_path = FormatKey(key)

38 for base_path in self._base_paths:	80 for base_path in self._base_paths:

39 try:	81 try:

40 return self._cache.GetFromFile(base_path + '/' + real_path)	82 return self._cache.GetFromFile(base_path + '/' + real_path)

41 except Exception:	83 except Exception:

42 pass	84 pass

43 return None	85 return None

OLD	NEW

« no previous file with comments | « no previous file | chrome/common/extensions/docs/server2/templates/private/table_of_contents.html » ('j') | no next file with comments »