Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4109)

Unified Diff: chrome/common/extensions/docs/server2/intro_data_source.py

Issue 10810047: Extensions Docs Server: HTML parser in IDS (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | chrome/common/extensions/docs/server2/templates/private/table_of_contents.html » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: chrome/common/extensions/docs/server2/intro_data_source.py
diff --git a/chrome/common/extensions/docs/server2/intro_data_source.py b/chrome/common/extensions/docs/server2/intro_data_source.py
index 1f42eec755152b71a995d32bf7f572abda963f82..115dd459acf79b0ec4127a19f0b49e31574948fc 100644
--- a/chrome/common/extensions/docs/server2/intro_data_source.py
+++ b/chrome/common/extensions/docs/server2/intro_data_source.py
@@ -2,10 +2,56 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
-import re
+import logging
+
+from HTMLParser import HTMLParser
+
from path_utils import FormatKey
from third_party.handlebar import Handlebar
+class _IntroParser(HTMLParser):
+ """ An HTML parser which will parse table of contents and page title info out
+ of an intro.
+ """
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.toc = []
+ self.page_title = None
+ self._recent_tag = None
+ self._current_heading = {}
+
+ def handle_starttag(self, tag, attrs):
+ id_ = ''
+ if tag not in ['h1', 'h2', 'h3']:
+ self.handle_data(self.get_starttag_text())
+ return
+ self._recent_tag = tag
+ for attr in attrs:
+ if attr[0] == 'id':
+ id_ = attr[1]
+ if tag == 'h2':
+ self._current_heading = { 'link': id_, 'subheadings': [], 'title': '' }
+ self.toc.append(self._current_heading)
+ elif tag == 'h3':
+ self._current_heading = { 'link': id_, 'title': '' }
+ self.toc[-1]['subheadings'].append(self._current_heading)
+
+ def handle_endtag(self, tag):
+ if tag in ['h1', 'h2', 'h3']:
+ self._recent_tag = None
+ self.handle_data('<' + tag + '/>')
not at google - send to devlin (2012/07/23 23:30:42): I think we should just strip out the tags, like, i… [comment truncated in collapsed view]
cduvall (2012/07/23 23:58:34): Done.
+
+ def handle_data(self, data):
+ if self._recent_tag is None:
+ return
+ if self._recent_tag == 'h1':
+ if self.page_title is None:
+ self.page_title = data
+ else:
+ self.page_title += data
not at google - send to devlin (2012/07/23 23:30:42): so if there are multiple <h1> tags it concats them… [comment truncated in collapsed view]
cduvall (2012/07/23 23:58:34): Done.
+ elif self._recent_tag in ['h2', 'h3']:
+ self._current_heading['title'] += data
+
class IntroDataSource(object):
"""This class fetches the intros for a given API. From this intro, a table
of contents dictionary is created, which contains the headings in the intro.
@@ -15,20 +61,16 @@ class IntroDataSource(object):
self._base_paths = base_paths
def _MakeIntroDict(self, intro):
- h1s = re.findall('<h1.*>(.+)</h1>', intro)
- if len(h1s) > 0:
- page_title = h1s[0]
- else:
- page_title = ''
- headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro)
- toc = []
- for heading in headings:
- level, link, title = heading
- if level == '2':
- toc.append({ 'link': link, 'title': title, 'subheadings': [] })
- else:
- toc[-1]['subheadings'].append({ 'link': link, 'title': title })
- return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title }
+ try:
+ parser = _IntroParser()
+ parser.feed(intro)
+ return {
+ 'intro': Handlebar(intro),
+ 'toc': parser.toc,
+ 'title': parser.page_title
+ }
+ except Exception as e:
+ logging.info(e)
def __getitem__(self, key):
return self.get(key)
« no previous file with comments | « no previous file | chrome/common/extensions/docs/server2/templates/private/table_of_contents.html » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698