Index: app.py |
diff --git a/app.py b/app.py |
index 544f92098a3996a3a8f16dc7a214a6bd3aa27aaf..140f9438a96db6603de87a61b240eca92eff64ad 100644 |
--- a/app.py |
+++ b/app.py |
@@ -24,7 +24,7 @@ from webapp2_extras import jinja2 |
# pylint: disable=F0401 |
from jinja2 import Environment, FileSystemLoader |
-from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup |
+from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag |
# Current application name. |
@@ -53,19 +53,30 @@ def bootstrap(): |
console_template = fh.read() |
-def get_pagedata_from_cache(localpath): |
- memcache_data = memcache.get(localpath) |
- if not memcache_data: |
- return None |
- logging.debug('content for %s found in memcache' % localpath) |
- return json.loads(memcache_data) |
+########## |
+# Page class definition and related functions. |
+########## |
+class Page(db.Model): |
+ fetch_timestamp = db.DateTimeProperty(required=True) |
+ localpath = db.StringProperty(required=True) |
+ content = db.TextProperty() |
+ title = db.StringProperty() |
+ offsite_base = db.StringProperty() |
+ body_class = db.StringProperty() |
+ remoteurl = db.TextProperty() |
+ # Data updated separately, after creation. |
+ content_blob = blobstore.BlobReferenceProperty() |
-def put_pagedata_into_cache(localpath, page_data): |
- memcache_data = json.dumps(page_data) |
- if not memcache.set(key=localpath, value=memcache_data, time=2*60): |
- logging.error('put_pagedata_into_cache(\'%s\'): memcache.set() failed' % ( |
- localpath)) |
+def get_or_create_page(localpath, remoteurl, maxage): |
+ return Page.get_or_insert( |
+ key_name=localpath, |
+ localpath=localpath, |
+ remoteurl=remoteurl, |
+ maxage=maxage, |
+ fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
+ content=None, |
+ content_blob=None) |
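Page.get_or_insert() is transactional, so concurrent fetch tasks asking for the same localpath share one entity instead of creating duplicates, and the backdated fetch_timestamp (now minus 24 hours) guarantees a brand-new Page looks stale to any reasonable maxage and gets fetched on the first pass. A minimal usage sketch; the localpath and remoteurl values here are illustrative only:

  page = get_or_create_page('chromium/console',
                            'http://build.chromium.org/p/chromium/console',
                            maxage=30)
  # A second call returns the same datastore entity, not a duplicate.
  same = get_or_create_page('chromium/console',
                            'http://build.chromium.org/p/chromium/console',
                            maxage=30)
  assert page.key() == same.key()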
def get_and_cache_pagedata(localpath): |
@@ -86,7 +97,7 @@ def get_and_cache_pagedata(localpath): |
Here we assume localpath is already unquoted. |
""" |
- page_data = get_pagedata_from_cache(localpath) |
+ page_data = get_data_from_cache(localpath) |
if page_data and not page_data.get('content_blob'): |
return page_data |
page = Page.all().filter('localpath =', localpath).get() |
@@ -104,15 +115,149 @@ def get_and_cache_pagedata(localpath): |
logging.debug('content for %s found in blobstore' % localpath) |
blob_reader = blobstore.BlobReader(page.content_blob) |
page_data['content_blob'] = True |
- put_pagedata_into_cache(localpath, page_data) |
+ put_data_into_cache(localpath, page_data) |
page_data['content'] = blob_reader.read().decode('utf-8', 'replace') |
else: |
logging.debug('content for %s found in datastore' % localpath) |
page_data['content'] = page.content |
- put_pagedata_into_cache(localpath, page_data) |
+ put_data_into_cache(localpath, page_data) |
return page_data |
+def save_page(page, localpath, fetch_timestamp, page_data): |
+ body_class = page_data.get('body_class', '') |
+ content = page_data.get('content') |
+ offsite_base = page_data.get('offsite_base', '') |
+ title = page_data.get('title', '') |
+ |
+ content_blob_key = None |
+ try: |
+ content = content.decode('utf-8', 'replace') |
+ except UnicodeEncodeError: |
+ logging.debug('save_page: content was already in unicode') |
+ logging.debug('save_page: content size is %d' % len(content)) |
+ if len(content.encode('utf-8')) >= 10**6: |
+ logging.debug('save_page: saving to blob') |
+ content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
+ content = None |
+ def tx_page(page_key): |
+ page = Page.get(page_key) |
+ # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
+ # 'fetch_timestamp' member (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ if page.fetch_timestamp > fetch_timestamp: |
+ return |
+ page.content = content |
+ page.content_blob = content_blob_key |
+ page.fetch_timestamp = fetch_timestamp |
+ # title, offsite_base, body_class can all be empty strings for some |
+ # content. Where that's true, they're not used for displaying a console- |
+ # like resource, and the content alone is returned to the web user. |
+ page.title = title |
+ page.offsite_base = offsite_base |
+ page.body_class = body_class |
+ # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
+ # (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ page.put() |
+ db.run_in_transaction(tx_page, page.key()) |
+ page_data = { |
+ 'body_class': body_class, |
+ 'content': content, |
+ 'offsite_base': offsite_base, |
+ 'title': title, |
+ } |
+ if content_blob_key: |
+ page_data['content_blob'] = True |
+ put_data_into_cache(localpath, page_data) |
+ logging.info("Saved and cached page with localpath %s" % localpath) |
cmp 2012/12/27 01:19:22: use single quotes everywhere instead of double quotes
agable 2012/12/28 23:56:08: Done.
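For context on the 10**6 cutoff in save_page above: a datastore entity tops out around 1 MB, so content whose UTF-8 encoding reaches that size is written to blobstore via write_blob() and the Page keeps only the blob key; the cached page_data then carries a content_blob flag instead of the text, which is why get_and_cache_pagedata treats such a cache hit as incomplete. A sketch of the routing rule, using a hypothetical needs_blobstore() helper that mirrors the check above:

  def needs_blobstore(content):
    # Same test as save_page: datastore entities cap out around 1 MB.
    return len(content.encode('utf-8')) >= 10**6

  assert needs_blobstore(u'\u2603' * 400000)  # U+2603 is 3 bytes in UTF-8
  assert not needs_blobstore(u'small console page')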
+ |
+ |
+########## |
+# Row class definition and related functions. |
+########## |
+class Row(db.Model): |
+ fetch_timestamp = db.DateTimeProperty(required=True) |
+ rev_number = db.StringProperty(required=True) |
+ localpath = db.StringProperty(required=True) |
+ revision = db.TextProperty() |
+ name = db.TextProperty() |
+ status = db.TextProperty() |
+ comment = db.TextProperty() |
+ details = db.TextProperty() |
+ |
+ |
+def get_or_create_row(localpath, revision): |
+ return Row.get_or_insert( |
+ key_name=localpath, |
+ rev_number=revision, |
+ localpath=localpath, |
+ fetch_timestamp=datetime.datetime.now()) |
+ |
+ |
+def get_and_cache_rowdata(localpath): |
+ """Returns a row_data dict. |
+ |
+ get_and_cache_rowdata takes a localpath which is used to fetch data from the |
+ cache. If the data is present, then we have all of the data we need and we |
+ return early. |
+ |
+ Otherwise, we need to fetch the row object and set up the row data. |
+ |
+ Here we assume localpath is already unquoted. |
+ """ |
+ row_data = get_data_from_cache(localpath) |
+ if row_data: |
+ return row_data |
+ row = Row.all().filter('localpath =', localpath).get() |
+ if not row: |
+ logging.error('get_and_cache_rowdata(\'%s\'): no matching localpath in ' |
+ 'datastore' % localpath) |
+ return {} |
+ row_data = {} |
+ row_data['rev'] = row.revision |
+ row_data['name'] = row.name |
+ row_data['status'] = row.status |
+ row_data['comment'] = row.comment |
+ row_data['details'] = row.details |
+ row_data['rev_number'] = row.rev_number |
+ logging.debug('content for %s found in datastore' % localpath) |
+ put_data_into_cache(localpath, row_data) |
+ return row_data |
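Row reads follow the same two-tier path as pages, minus the blobstore case (a single console row never approaches the entity size limit), and an empty dict signals that the revision has not been parsed yet. A usage sketch; the master name and revision are made up:

  row_data = get_and_cache_rowdata('chromium.main/console/177000')
  if not row_data:
    logging.debug('revision 177000 not parsed yet; skipping')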
+ |
+ |
+def save_row(row_data, localpath, timestamp): |
+ rev_number = row_data['rev_number'] |
+ row = get_or_create_row(localpath, rev_number) |
+ row_key = row.key() |
+ def tx_row(row_key): |
+ row = Row.get(row_key) |
+ # E1103:959,7:save_row.tx_row: Instance of 'list' has no |
+ # 'fetch_timestamp' member (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ # if row.fetch_timestamp > timestamp: |
+ # return |
+ row.fetch_timestamp = timestamp |
+ row.revision = row_data['rev'] |
+ row.name = row_data['name'] |
+ row.status = row_data['status'] |
+ row.comment = row_data['comment'] |
+ row.details = row_data['details'] |
+ # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member |
+ # (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ row.put() |
+ db.run_in_transaction(tx_row, row_key) |
+ prev_rev = memcache.get(key='latest_rev') |
+ # Compare numerically: rev_number is a string, and lexicographic |
+ # comparison would rank '99999' above '100000'. |
+ if not prev_rev or int(rev_number) > int(prev_rev): |
+ memcache.set(key='latest_rev', value=rev_number) |
+ put_data_into_cache(localpath, row_data) |
+ logging.info("Saved and cached row with localpath %s" % localpath) |
cmp 2012/12/27 01:19:22: use single quotes instead of double quotes
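Note the key scheme: parse_master now hands save_row a localpath that already ends in the revision number (localpath + '/' + rev_number, as in the parse_master hunk below), so get_or_create_row can use localpath alone as key_name where the old code concatenated revision and localpath by hand. A sketch with made-up values:

  import datetime

  # Illustrative only; parse_master builds these fields from console HTML.
  row_data = {
    'rev_number': '177000',
    'rev': u'<td class="DevRev">...</td>',
    'name': u'<td class="DevName">...</td>',
    'status': u'[<table>...</table>]',
    'comment': u'<td class="DevComment">...</td>',
    'details': '',
  }
  save_row(row_data, 'chromium.main/console/177000', datetime.datetime.now())
  # The resulting Row's key_name is the full localpath, revision included.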
+ |
+ |
+########## |
+# ConsoleData class definition and related functions. |
+########## |
class ConsoleData(object): |
def __init__(self): |
self.row_orderedkeys = [] |
@@ -178,6 +323,23 @@ class ConsoleData(object): |
self.category_data[self.lastMasterSeen][category] = builder_status |
self.category_count += 1 |
+ def AddRow(self, row): |
+ revision = row['rev_number'] |
+ self.SawRevision(revision) |
+ revlink = BeautifulSoup(row['rev']).td.a['href'] |
+ self.SetLink(revlink) |
+ name = BeautifulSoup(row['name']).td.contents |
+ self.SetName(self.ContentsToHtml(name)) |
+ status = BeautifulSoup(row['status']).findAll('table') |
+ for i, stat in enumerate(status): |
+ self.SetStatus(self.category_order[self.lastMasterSeen][i], |
+ unicode(stat)) |
+ comment = BeautifulSoup(row['comment']).td.contents |
+ self.SetComment(self.ContentsToHtml(comment)) |
+ if row['details']: |
+ details = BeautifulSoup(row['details']).td.contents |
+ self.SetDetail(self.ContentsToHtml(details)) |
+ |
def ParseRow(self, row): |
cells = row.findAll('td', recursive=False) |
# Figure out which row this is. |
@@ -208,72 +370,61 @@ class ConsoleData(object): |
# the earliest revisions, set them to ''. |
-# W0613:169,39:console_merger: Unused argument 'remoteurl' |
-# W0613:169,19:console_merger: Unused argument 'unquoted_localpath' |
-# pylint: disable=W0613 |
-def console_merger(unquoted_localpath, remote_url, page_data=None, |
- masters_to_merge=None): |
- page_data = page_data or {} |
- |
+########## |
+# Heavy-lifting functions that do most of the console processing. |
+# AKA postfetch and postsave functions/handlers. |
+########## |
+def console_merger(localpath, remoteurl, page_data, |
+ masters_to_merge=None, num_rows_to_merge=25): |
masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE |
mergedconsole = ConsoleData() |
- merged_page = None |
- merged_tag = None |
+ surroundings = get_and_cache_pagedata('surroundings') |
+ merged_page = BeautifulSoup(surroundings['content']) |
+ merged_tag = merged_page.find('table', 'ConsoleData') |
+ latest_rev = memcache.get(key='latest_rev') |
+ if not latest_rev: |
+ logging.error('console_merger(\'%s\', \'%s\', \'%s\'): cannot get latest ' |
+ 'revision number.' % ( |
+ localpath, remoteurl, page_data)) |
+ return |
+ latest_rev = int(latest_rev) |
fetch_timestamp = datetime.datetime.now() |
for master in masters_to_merge: |
- page_data = get_and_cache_pagedata('%s/console' % master) |
- master_content = page_data['content'] |
- if master_content is None: |
+ # Fetch the summary one-box-per-builder for the master. |
+ # If we don't get it, something is wrong; skip the master entirely. |
+ master_summary = get_and_cache_pagedata('%s/console/summary' % master) |
+ if not master_summary['content']: |
continue |
- master_content = master_content.encode('ascii', 'replace') |
cmp 2012/12/27 01:19:22: I hope that this line being removed and not shuffl…
- this_page = BeautifulSoup(master_content) |
- this_tag = this_page.find('table', {'class': 'ConsoleData'}) |
- # The first console is special, we reuse all of the console page. |
- if not merged_page: |
- merged_page = this_page |
- merged_tag = this_tag |
mergedconsole.SawMaster(master) |
- |
- # Parse each of the rows. |
- CATEGORY_ROW = 0 |
- trs = this_tag.findAll('tr', recursive=False) |
- |
- # Get the list of categories in |master|. |
- category_tds = trs[CATEGORY_ROW].findAll('td', recursive=False)[2:] |
- third_cell = category_tds[0] |
- third_cell_class = third_cell.attrs[0][1] |
- categories = [] |
- if third_cell_class.startswith('DevStatus '): |
- BUILDER_STATUS_ROW = 2 |
- FIRST_CL_ROW = 3 |
- for index, category_td in enumerate(category_tds): |
- categories.append(category_td.contents[0].strip()) |
+ # Get the categories for this master. If the master doesn't have any |
+ # categories, just use the default empty-string category. |
+ category_list = [] |
+ master_categories = get_and_cache_pagedata('%s/console/categories' % master) |
+ if not master_categories['content']: |
+ category_list.append('') |
else: |
- # There's no categories + spacing row, the first row will be the builder |
- # status row. |
- categories.append('') |
- BUILDER_STATUS_ROW = 0 |
- FIRST_CL_ROW = 1 |
- |
- # For each category in |master|, add the category plus its |builder_status|. |
- builder_tds = trs[BUILDER_STATUS_ROW].findAll('td', recursive=False)[2:] |
- for index, category in enumerate(categories): |
- builder_status = builder_tds[index].findAll('table', recursive=False)[0] |
- mergedconsole.AddCategory(category=category, |
- builder_status=builder_status) |
- |
- # For each of the remaining rows, add them to the console data. |
- for console_index in range(FIRST_CL_ROW, len(trs)): |
- console_row = trs[console_index] |
- mergedconsole.ParseRow(console_row) |
- # Add GC memory profiling. |
- # import gc |
- # gc.set_debug(gc.DEBUG_LEAK) |
- # logging.debug(gc.garbage) |
- # del gc.garbage[:] |
- mergedconsole.Finish() |
+ category_row = BeautifulSoup(master_categories['content']) |
+ category_list = map(lambda x: x.text, |
+ category_row.findAll('td', 'DevStatus')) |
+ # Get the corresponding summary box(es). |
+ summary_row = BeautifulSoup(master_summary['content']) |
+ summary_list = summary_row.findAll('table') |
+ for category, summary in zip(category_list, summary_list): |
+ mergedconsole.AddCategory(category, summary) |
+ |
+ # Fetch all of the rows that we need. |
+ rows_fetched = 0 |
+ current_rev = latest_rev |
+ while rows_fetched < num_rows_to_merge and current_rev >= 0: |
+ row_data = get_and_cache_rowdata('%s/console/%s' % (master, current_rev)) |
+ if not row_data: |
+ current_rev -= 1 |
+ continue |
+ mergedconsole.AddRow(row_data) |
+ current_rev -= 1 |
+ rows_fetched += 1 |
# Convert the merged content into console content. |
+ mergedconsole.Finish() |
template_environment = Environment() |
template_environment.loader = FileSystemLoader('.') |
def notstarted(builder_status): |
@@ -316,7 +467,8 @@ def console_merger(unquoted_localpath, remote_url, page_data=None, |
# Update the merged console page. |
merged_page = get_or_create_page('chromium/console', None, maxage=30) |
- logging.debug('console_merger: saving merged console') |
+ logging.info('console_merger: saving merged console') |
+ page_data = get_and_cache_pagedata('chromium/console') |
page_data['title'] = 'BuildBot: Chromium' |
page_data['offsite_base'] = 'http://build.chromium.org/p/chromium' |
page_data['body_class'] = 'interface' |
@@ -325,10 +477,13 @@ def console_merger(unquoted_localpath, remote_url, page_data=None, |
return |
-def console_handler(_unquoted_localpath, remoteurl, page_data=None): |
+def console_handler(unquoted_localpath, remoteurl, page_data=None): |
page_data = page_data or {} |
content = page_data.get('content') |
if not content: |
+ logging.error('console_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' |
+ 'from local path' % ( |
+ unquoted_localpath, remoteurl, page_data)) |
return page_data |
# Decode content from utf-8 to unicode, replacing bad characters. |
@@ -460,51 +615,14 @@ def console_handler(_unquoted_localpath, remoteurl, page_data=None): |
return page_data |
- |
-def get_or_create_row(localpath, revision): |
- return Row.get_or_insert( |
- key_name=revision + ' '+ localpath, |
- rev_number=revision, |
- localpath=localpath, |
- fetch_timestamp=datetime.datetime.now()) |
- |
- |
-def save_row(row_data, localpath, timestamp): |
- rev_number = row_data['rev_number'] |
- row = get_or_create_row(localpath, rev_number) |
- row_key = row.key() |
- def tx_row(row_key): |
- row = Row.get(row_key) |
- # E1103:959,7:save_row.tx_row: Instance of 'list' has no |
- # 'fetch_timestamp' member (but some types could not be inferred) |
- # pylint: disable=E1103 |
- # if row.fetch_timestamp > timestamp: |
- # return |
- row.fetch_timestamp = timestamp |
- row.revision = row_data['rev'] |
- row.name = row_data['name'] |
- row.status = row_data['status'] |
- row.comment = row_data['comment'] |
- row.details = row_data['details'] |
- # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member |
- # (but some types could not be inferred) |
- # pylint: disable=E1103 |
- row.put() |
- db.run_in_transaction(tx_row, row_key) |
- memcache_data = json.dumps(row_data) |
- # A row should never be large enough to hit the blobstore, so we |
- # explicitly don't handle rows larger than 10^6 bytes. |
- if not memcache.set(key=str(row_key), value=memcache_data, time=2*60): |
- logging.error('save_row(\'%s\'): memcache.set() failed' % (row_key)) |
- |
- |
+# W0613:600,28:parse_master: Unused argument 'remoteurl' |
+# pylint: disable=W0613 |
def parse_master(localpath, remoteurl, page_data=None): |
"""Part of the new pipeline to store individual rows rather than |
whole pages of html. Parses the master data into a set of rows, |
and writes them out to the datastore in an easily retrievable format. |
- Returns the same page_data as it was passed, so as to not interrupt |
- the current pipeline. This may change when we switch over. |
+ Doesn't modify page_data dict. |
""" |
ts = datetime.datetime.now() |
page_data = page_data or {} |
@@ -516,17 +634,19 @@ def parse_master(localpath, remoteurl, page_data=None): |
# Split page into surroundings (announce, legend, footer) and data (rows). |
surroundings = BeautifulSoup(content) |
data = surroundings.find('table', 'ConsoleData') |
- data.extract() |
+ new_data = Tag(surroundings, 'table', [('class', 'ConsoleData'), |
+ ('width', '96%')]) |
+ data.replaceWith(new_data) |
- surroundings_page = get_or_create_page(localpath + '/surroundings', |
+ surroundings_page = get_or_create_page('surroundings', |
None, maxage=30) |
surroundings_data = {} |
- surroundings_data['title'] = 'Surroundings for ' + localpath |
+ surroundings_data['title'] = 'Surroundings' |
surroundings_data['content'] = unicode(surroundings) |
- save_page(surroundings_page, localpath + '/surroundings', ts, |
+ save_page(surroundings_page, 'surroundings', ts, |
surroundings_data) |
- rows = data.tbody.findAll('tr', recursive=False) |
+ rows = data.findAll('tr', recursive=False) |
# The first table row can be special: the list of categories. |
categories = None |
# If the first row contains a DevStatus cell... |
@@ -567,11 +687,11 @@ def parse_master(localpath, remoteurl, page_data=None): |
curr_row['rev'] = unicode(row.find('td', 'DevRev')) |
curr_row['rev_number'] = unicode(row.find('td', 'DevRev').a.string) |
curr_row['name'] = unicode(row.find('td', 'DevName')) |
- curr_row['status'] = unicode(row.findAll('td', 'DevStatus')) |
+ curr_row['status'] = unicode(row.findAll('table')) |
else: |
if 'details' not in curr_row: |
curr_row['details'] = '' |
- save_row(curr_row, localpath, ts) |
+ save_row(curr_row, localpath + '/' + curr_row['rev_number'], ts) |
curr_row = {} |
return page_data |
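The Tag/replaceWith step above hollows out the console table while keeping the page skeleton intact: the extracted data keeps the rows for per-revision parsing, and the skeleton gets an empty placeholder table for console_merger to refill. A standalone BeautifulSoup 3 sketch of that step on toy HTML:

  from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag

  soup = BeautifulSoup('<body><p>legend</p>'
                       '<table class="ConsoleData"><tr><td>r1</td></tr>'
                       '</table></body>')
  data = soup.find('table', 'ConsoleData')
  placeholder = Tag(soup, 'table', [('class', 'ConsoleData'),
                                    ('width', '96%')])
  data.replaceWith(placeholder)
  # soup now holds the empty placeholder; data still holds the rows.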
@@ -598,6 +718,124 @@ def one_box_handler(unquoted_localpath, remoteurl, page_data=None): |
return page_data |
+########## |
+# Utility functions for blobstore and memcache. |
+########## |
+def get_data_from_cache(localpath): |
+ memcache_data = memcache.get(localpath) |
+ if not memcache_data: |
+ return None |
+ logging.debug('content for %s found in memcache' % localpath) |
+ return json.loads(memcache_data) |
+ |
+ |
+def put_data_into_cache(localpath, data): |
+ memcache_data = json.dumps(data) |
+ if not memcache.set(key=localpath, value=memcache_data, time=2*60): |
+ logging.error('put_data_into_cache(\'%s\'): memcache.set() failed' % ( |
+ localpath)) |
+ |
+ |
+def write_blob(data, mime_type): |
+ """Saves a Unicode string as a new blob, returns the blob's key.""" |
+ file_name = files.blobstore.create(mime_type=mime_type) |
+ data = data.encode('utf-8') |
+ with files.open(file_name, 'a') as blob_file: |
+ blob_file.write(data) |
+ files.finalize(file_name) |
+ return files.blobstore.get_blob_key(file_name) |
+ |
+ |
+def path_to_mime_type(path): |
+ return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
+ |
+ |
+EXT_TO_MIME = { |
+ '.css': 'text/css', |
+ '.js': 'text/javascript', |
+ '.json': 'application/json', |
+ '.html': 'text/html', |
+} |
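path_to_mime_type keys off the file extension and falls back to text/html, which also covers console paths with no extension at all:

  assert path_to_mime_type('chromium.main/console') == 'text/html'
  assert path_to_mime_type('static/default.css') == 'text/css'
  assert path_to_mime_type('static/merger.js') == 'text/javascript'
  assert path_to_mime_type('data/results.json') == 'application/json'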
+ |
+ |
+########## |
+# Functions for actually fetching original pages. |
+########## |
+def fetch_pages(): |
+ """Starts a background fetch operation for pages that need it.""" |
+ logging.debug('fetch_pages()') |
+ for url in URLS: |
+ deferred.defer(fetch_page, **url) |
+ |
+ |
+def nonfatal_fetch_url(url, *args, **kwargs): |
+ # Temporary workaround to disable AppEngine global cache of these pages. |
+ if '?' in url: |
+ url += '&' + str(random.random()) |
+ else: |
+ url += '?' + str(random.random()) |
+ |
+ try: |
+ return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
+ except urlfetch.DownloadError: |
+ logging.warn('urlfetch failed: %s' % url, exc_info=1) |
+ return None |
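The random query parameter is a cache-buster so the urlfetch layer never serves a stale copy of an unchanged URL, and DownloadError (which includes deadline hits) is logged and swallowed so one unreachable master only skips that master rather than failing the whole deferred task. For example (URL illustrative):

  # http://master/console       ->  http://master/console?0.7134...
  # http://master/console?a=b   ->  http://master/console?a=b&0.2218...
  response = nonfatal_fetch_url('http://build.chromium.org/p/chromium/console')
  if response is None:
    logging.warn('master unreachable; retrying on the next fetch_pages() run')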
+ |
+ |
+def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
+ fetch_url=nonfatal_fetch_url): |
+ """Fetches data about a set of pages.""" |
+ if not isinstance(localpath, str): |
+ logging.error('fetch_page: localpath is %r, expected a string' % |
+ (localpath,)) |
+ return |
+ unquoted_localpath = urllib.unquote(localpath) |
+ logging.debug('fetch_page("%s", "%s", "%s")' % ( |
+ unquoted_localpath, remoteurl, maxage)) |
+ page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
+ |
+ # Check if our copy of the page is younger than maxage. If it is, we'll |
+ # skip the fetch. |
+ oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
+ seconds=maxage) |
+ if (page.fetch_timestamp and |
+ page.fetch_timestamp > oldest_acceptable_timestamp): |
+ logging.debug('fetch_page: too recent, skipping') |
+ return |
+ |
+ # Perform the actual page fetch. |
+ fetch_timestamp = datetime.datetime.now() |
+ response = fetch_url(remoteurl) |
+ if not response: |
+ logging.warning('fetch_page: got empty response') |
+ return |
+ if response.status_code != 200: |
+ logging.warning('fetch_page: got non-empty response but code ' |
+ '%d' % response.status_code) |
+ return |
+ |
+ # We have actual content. If there's one or more handlers, call them. |
+ page_data = {} |
+ page_data['content'] = response.content |
+ if postfetch: |
+ if not isinstance(postfetch, list): |
+ postfetch = [postfetch] |
+ for handler in postfetch: |
+ logging.debug('fetch_page: calling postfetch handler ' |
+ '%s' % handler.__name__) |
+ page_data = handler(unquoted_localpath, remoteurl, page_data) |
+ |
+ # Save the returned content into the DB and caching layers. |
+ logging.debug('fetch_page: saving page') |
+ save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
+ if postsave: |
+ if not isinstance(postsave, list): |
+ postsave = [postsave] |
+ for handler in postsave: |
+ logging.debug('fetch_page: calling postsave handler ' |
+ '%s' % handler.__name__) |
+ handler(unquoted_localpath, remoteurl, page_data) |
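Everything above is driven by the URLS table below: fetch_pages() defers one fetch_page() per entry, each postfetch handler may rewrite page_data before it is saved, and postsave handlers run on the final data. A hypothetical entry showing how the handlers could be wired together (the exact wiring is an assumption; the real entries are in the full URLS list):

  {
    'localpath': 'chromium.main/console',
    'remoteurl': 'http://build.chromium.org/p/chromium/console',
    'maxage': 2*60,  # 2 mins
    'postfetch': [console_handler, parse_master],
    'postsave': [console_merger],
  },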
+ |
# List of URLs to fetch. |
URLS = [ |
@@ -921,183 +1159,3 @@ URLS = [ |
'maxage': 2*60, # 2 mins |
}, |
] |
- |
- |
-def nonfatal_fetch_url(url, *args, **kwargs): |
- # Temporary workaround to disable AppEngine global cache of these pages. |
- if '?' in url: |
- url += '&' + str(random.random()) |
- else: |
- url += '?' + str(random.random()) |
- |
- try: |
- return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
- except urlfetch.DownloadError: |
- logging.warn('urlfetch failed: %s' % url, exc_info=1) |
- return None |
- |
- |
-class Row(db.Model): |
- fetch_timestamp = db.DateTimeProperty(required=True) |
- rev_number = db.StringProperty(required=True) |
- localpath = db.StringProperty(required=True) |
- revision = db.TextProperty() |
- name = db.TextProperty() |
- status = db.TextProperty() |
- comment = db.TextProperty() |
- details = db.TextProperty() |
- |
- |
-class Page(db.Model): |
- fetch_timestamp = db.DateTimeProperty(required=True) |
- localpath = db.StringProperty(required=True) |
- content = db.TextProperty() |
- title = db.StringProperty() |
- offsite_base = db.StringProperty() |
- body_class = db.StringProperty() |
- remoteurl = db.TextProperty() |
- # Data updated separately, after creation. |
- content_blob = blobstore.BlobReferenceProperty() |
cmp 2012/12/27 01:19:22: lg
- |
- |
-def write_blob(data, mime_type): |
- """Saves a Unicode string as a new blob, returns the blob's key.""" |
- file_name = files.blobstore.create(mime_type=mime_type) |
- data = data.encode('utf-8') |
- with files.open(file_name, 'a') as blob_file: |
- blob_file.write(data) |
- files.finalize(file_name) |
- return files.blobstore.get_blob_key(file_name) |
- |
- |
-def save_page(page, localpath, fetch_timestamp, page_data): |
- body_class = page_data.get('body_class', '') |
- content = page_data.get('content') |
- offsite_base = page_data.get('offsite_base', '') |
- title = page_data.get('title', '') |
- |
- content_blob_key = None |
- try: |
- content = content.decode('utf-8', 'replace') |
- except UnicodeEncodeError: |
- logging.debug('save_page: content was already in unicode') |
- logging.debug('save_page: content size is %d' % len(content)) |
- if len(content.encode('utf-8')) >= 10**6: |
- logging.debug('save_page: saving to blob') |
- content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
- content = None |
- def tx_page(page_key): |
- page = Page.get(page_key) |
- # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
- # 'fetch_timestamp' member (but some types could not be inferred) |
- # pylint: disable=E1103 |
- if page.fetch_timestamp > fetch_timestamp: |
- return |
- page.content = content |
- page.content_blob = content_blob_key |
- page.fetch_timestamp = fetch_timestamp |
- # title, offsite_base, body_class can all be empty strings for some |
- # content. Where that's true, they're not used for displaying a console- |
- # like resource, and the content alone is returned to the web user. |
- page.title = title |
- page.offsite_base = offsite_base |
- page.body_class = body_class |
- # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
- # (but some types could not be inferred) |
- # pylint: disable=E1103 |
- page.put() |
- db.run_in_transaction(tx_page, page.key()) |
- page_data = { |
- 'body_class': body_class, |
- 'content': content, |
- 'offsite_base': offsite_base, |
- 'title': title, |
- } |
- if content_blob_key: |
- page_data['content_blob'] = True |
- put_pagedata_into_cache(localpath, page_data) |
- |
- |
-def get_or_create_page(localpath, remoteurl, maxage): |
- return Page.get_or_insert( |
- key_name=localpath, |
- localpath=localpath, |
- remoteurl=remoteurl, |
- maxage=maxage, |
- fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
- content=None, |
- content_blob=None) |
cmp 2012/12/27 01:19:22: lg
- |
- |
-def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
- fetch_url=nonfatal_fetch_url): |
- """Fetches data about a set of pages.""" |
- if type(localpath) != type(''): |
- logging.error('fetch_page: localpath is %r, expected a string' % ( |
- repr(localpath))) |
- return |
- unquoted_localpath = urllib.unquote(localpath) |
- logging.debug('fetch_page("%s", "%s", "%s")' % ( |
- unquoted_localpath, remoteurl, maxage)) |
- page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
- |
- # Check if our copy of the page is younger than maxage. If it is, we'll |
- # skip the fetch. |
- oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
- seconds=maxage) |
- if (page.fetch_timestamp and |
- page.fetch_timestamp > oldest_acceptable_timestamp): |
- logging.debug('fetch_page: too recent, skipping') |
- return |
- |
- # Perform the actual page fetch. |
- fetch_timestamp = datetime.datetime.now() |
- response = fetch_url(remoteurl) |
- if not response: |
- logging.warning('fetch_page: got empty response') |
- return |
- if response.status_code != 200: |
- logging.warning('fetch_page: got non-empty response but code ' |
- '%d' % response.status_code) |
- return |
- |
- # We have actual content. If there's one or more handlers, call them. |
- page_data = {} |
- page_data['content'] = response.content |
- if postfetch: |
- if not isinstance(postfetch, list): |
- postfetch = [postfetch] |
- for handler in postfetch: |
- logging.debug('fetch_page: calling postfetch handler ' |
- '%s' % handler.__name__) |
- page_data = handler(unquoted_localpath, remoteurl, page_data) |
- |
- # Save the returned content into the DB and caching layers. |
- logging.debug('fetch_page: saving page') |
- save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
- if postsave: |
- if not isinstance(postsave, list): |
- postsave = [postsave] |
- for handler in postsave: |
- logging.debug('fetch_page: calling postsave handler ' |
- '%s' % handler.__name__) |
- handler(unquoted_localpath, remoteurl, page_data) |
- |
- |
-EXT_TO_MIME = { |
- '.css': 'text/css', |
- '.js': 'text/javascript', |
- '.json': 'application/json', |
- '.html': 'text/html', |
-} |
- |
- |
-def path_to_mime_type(path): |
- return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
- |
- |
-def fetch_pages(): |
- """Starts a background fetch operation for pages that need it.""" |
- logging.debug('fetch_pages()') |
- for url in URLS: |
- deferred.defer(fetch_page, **url) |