chrome/common/extensions/docs/server2/intro_data_source.py - Issue 10810047: Extensions Docs Server: HTML parser in IDS

Unified Diff: chrome/common/extensions/docs/server2/intro_data_source.py

Issue 10810047: Extensions Docs Server: HTML parser in IDS (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: chrome/common/extensions/docs/server2/intro_data_source.py

diff --git a/chrome/common/extensions/docs/server2/intro_data_source.py b/chrome/common/extensions/docs/server2/intro_data_source.py

index 1f42eec755152b71a995d32bf7f572abda963f82..115dd459acf79b0ec4127a19f0b49e31574948fc 100644

--- a/chrome/common/extensions/docs/server2/intro_data_source.py

+++ b/chrome/common/extensions/docs/server2/intro_data_source.py

@@ -2,10 +2,56 @@

# Use of this source code is governed by a BSD-style license that can be

# found in the LICENSE file.

-import re

+import logging

+from HTMLParser import HTMLParser

from path_utils import FormatKey

from third_party.handlebar import Handlebar

+class _IntroParser(HTMLParser):

+ """ An HTML parser which will parse table of contents and page title info out

+ of an intro.

+ """

+ def __init__(self):

+ HTMLParser.__init__(self)

+ self.toc = []

+ self.page_title = None

+ self._recent_tag = None

+ self._current_heading = {}

+ def handle_starttag(self, tag, attrs):

+ id_ = ''

+ if tag not in ['h1', 'h2', 'h3']:

+ self.handle_data(self.get_starttag_text())

+ return

+ self._recent_tag = tag

+ for attr in attrs:

+ if attr[0] == 'id':

+ id_ = attr[1]

+ if tag == 'h2':

+ self._current_heading = { 'link': id_, 'subheadings': [], 'title': '' }

+ self.toc.append(self._current_heading)

+ elif tag == 'h3':

+ self._current_heading = { 'link': id_, 'title': '' }

+ self.toc[-1]['subheadings'].append(self._current_heading)

+ def handle_endtag(self, tag):

+ if tag in ['h1', 'h2', 'h3']:

+ self._recent_tag = None

+ self.handle_data('<' + tag + '/>')

not at google - send to devlin 2012/07/23 23:30:42 I think we should just strip out the tags, like, i…

cduvall 2012/07/23 23:58:34 Done.

+ def handle_data(self, data):

+ if self._recent_tag is None:

+ return

+ if self._recent_tag == 'h1':

+ if self.page_title is None:

+ self.page_title = data

+ else:

+ self.page_title += data

not at google - send to devlin 2012/07/23 23:30:42 so if there are multiple <h1> tags it concats them…

cduvall 2012/07/23 23:58:34 Done.

+ elif self._recent_tag in ['h2', 'h3']:

+ self._current_heading['title'] += data

class IntroDataSource(object):

"""This class fetches the intros for a given API. From this intro, a table

of contents dictionary is created, which contains the headings in the intro.

@@ -15,20 +61,16 @@ class IntroDataSource(object):

self._base_paths = base_paths

def _MakeIntroDict(self, intro):

- h1s = re.findall('<h1.*>(.+)</h1>', intro)

- if len(h1s) > 0:

- page_title = h1s[0]

- else:

- page_title = ''

- headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro)

- toc = []

- for heading in headings:

- level, link, title = heading

- if level == '2':

- toc.append({ 'link': link, 'title': title, 'subheadings': [] })

- else:

- toc[-1]['subheadings'].append({ 'link': link, 'title': title })

- return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title }

+ try:

+ parser = _IntroParser()

+ parser.feed(intro)

+ return {

+ 'intro': Handlebar(intro),

+ 'toc': parser.toc,

+ 'title': parser.page_title

+ }

+ except Exception as e:

+ logging.info(e)

def __getitem__(self, key):

return self.get(key)

« no previous file with comments | « no previous file | chrome/common/extensions/docs/server2/templates/private/table_of_contents.html » ('j') | no next file with comments »