Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(5554)

Unified Diff: chrome/common/extensions/docs/server2/intro_data_source.py

Issue 10810047: Extensions Docs Server: HTML parser in IDS (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: chrome/common/extensions/docs/server2/intro_data_source.py
diff --git a/chrome/common/extensions/docs/server2/intro_data_source.py b/chrome/common/extensions/docs/server2/intro_data_source.py
index 1f42eec755152b71a995d32bf7f572abda963f82..68bddeca7d49c16e58e98797e5a05d34437cb37f 100644
--- a/chrome/common/extensions/docs/server2/intro_data_source.py
+++ b/chrome/common/extensions/docs/server2/intro_data_source.py
@@ -2,10 +2,46 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
-import re
+from HTMLParser import HTMLParser
+
from path_utils import FormatKey
from third_party.handlebar import Handlebar
+class _IntroParser(HTMLParser):
+ """ An HTML parser which will parse table of contents and page title info out
+ of an intro.
+ """
+ def init(self):
+ """ This method is needed because HTMLParser is an old style class that does
+ not inherit from |object|, so the super constructor cannot be called in
+ |__init__|.
+ """
not at google - send to devlin 2012/07/23 12:47:36 I think the pattern is like def __init__(self):
cduvall 2012/07/23 20:24:21 Done.
+ self.toc = []
+ self.page_title = ''
not at google - send to devlin 2012/07/23 12:47:36 None not empty string? We may want to test for the
cduvall 2012/07/23 20:24:21 I looked all throughout the internet for this, and
+ self._recent_tag = None
+ self._current = {}
not at google - send to devlin 2012/07/23 12:47:36 current_what?
cduvall 2012/07/23 20:24:21 Done.
+
+ def handle_starttag(self, tag, attrs):
+ id_ = ''
+ self._recent_tag = tag
+ for attr in attrs:
+ if attr[0] == 'id':
+ id_ = attr[1]
+ if tag == 'h2':
+ self._current = { 'link': id_, 'subheadings': [] }
+ self.toc.append(self._current)
+ elif tag == 'h3':
+ self._current = { 'link': id_ }
+ self.toc[-1]['subheadings'].append(self._current)
+
+ def handle_data(self, data):
+ if data.isspace():
+ return
not at google - send to devlin 2012/07/23 12:47:36 why is this needed?
cduvall 2012/07/23 20:24:21 Not needed anymore.
+ if self._recent_tag == 'h1':
+ self.page_title = data
+ elif self._recent_tag in ['h2', 'h3']:
+ self._current['title'] = data
not at google - send to devlin 2012/07/23 12:47:36 Note that this won't handle cases like <h2>This h
cduvall 2012/07/23 20:24:21 I ended up not using a stack, but the new version
not at google - send to devlin 2012/07/23 23:30:41 sgtm. I realise I started micro-managing you a bit
cduvall 2012/07/23 23:58:33 Np, I would much rather learn to do it the right w
+
class IntroDataSource(object):
"""This class fetches the intros for a given API. From this intro, a table
of contents dictionary is created, which contains the headings in the intro.
@@ -15,20 +51,14 @@ class IntroDataSource(object):
self._base_paths = base_paths
def _MakeIntroDict(self, intro):
- h1s = re.findall('<h1.*>(.+)</h1>', intro)
- if len(h1s) > 0:
- page_title = h1s[0]
- else:
- page_title = ''
- headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro)
- toc = []
- for heading in headings:
- level, link, title = heading
- if level == '2':
- toc.append({ 'link': link, 'title': title, 'subheadings': [] })
- else:
- toc[-1]['subheadings'].append({ 'link': link, 'title': title })
- return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title }
+ parser = _IntroParser()
+ parser.init()
+ parser.feed(intro)
+ return {
+ 'intro': Handlebar(intro),
+ 'toc': parser.toc,
+ 'title': parser.page_title
+ }
def __getitem__(self, key):
return self.get(key)
@@ -38,6 +68,6 @@ class IntroDataSource(object):
for base_path in self._base_paths:
try:
return self._cache.GetFromFile(base_path + '/' + real_path)
- except Exception:
+ except Exception as e:
pass
return None
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698