Chromium Code Reviews

Unified Diff: app.py

Issue 11535002: chromium-build app now renders console from stored rows. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/chromium-build
Patch Set: Minor fixes found during further development Created 8 years ago
Index: app.py
diff --git a/app.py b/app.py
index 544f92098a3996a3a8f16dc7a214a6bd3aa27aaf..140f9438a96db6603de87a61b240eca92eff64ad 100644
--- a/app.py
+++ b/app.py
@@ -24,7 +24,7 @@ from webapp2_extras import jinja2
# pylint: disable=F0401
from jinja2 import Environment, FileSystemLoader
-from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup
+from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag
# Current application name.
@@ -53,19 +53,30 @@ def bootstrap():
console_template = fh.read()
-def get_pagedata_from_cache(localpath):
- memcache_data = memcache.get(localpath)
- if not memcache_data:
- return None
- logging.debug('content for %s found in memcache' % localpath)
- return json.loads(memcache_data)
+##########
+# Page class definition and related functions.
+##########
+class Page(db.Model):
+ fetch_timestamp = db.DateTimeProperty(required=True)
+ localpath = db.StringProperty(required=True)
+ content = db.TextProperty()
+ title = db.StringProperty()
+ offsite_base = db.StringProperty()
+ body_class = db.StringProperty()
+ remoteurl = db.TextProperty()
+ # Data updated separately, after creation.
+ content_blob = blobstore.BlobReferenceProperty()
-def put_pagedata_into_cache(localpath, page_data):
- memcache_data = json.dumps(page_data)
- if not memcache.set(key=localpath, value=memcache_data, time=2*60):
- logging.error('put_pagedata_into_cache(\'%s\'): memcache.set() failed' % (
- localpath))
+def get_or_create_page(localpath, remoteurl, maxage):
+ return Page.get_or_insert(
+ key_name=localpath,
+ localpath=localpath,
+ remoteurl=remoteurl,
+ maxage=maxage,
+ fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24),
+ content=None,
+ content_blob=None)
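A note on the seeded timestamp, with a hedged usage sketch (the localpath and remoteurl values here are invented): backdating fetch_timestamp by 24 hours guarantees that the freshness check in fetch_page() sees a brand-new Page as stale, so the first fetch always hits the network.

    page = get_or_create_page(
        'chromium/console',                              # hypothetical localpath
        'http://build.chromium.org/p/chromium/console',  # hypothetical remoteurl
        maxage=30)
    # page.fetch_timestamp is ~24h in the past, so fetch_page()'s check
    # (page.fetch_timestamp > now - maxage seconds) fails and a real
    # fetch is performed immediately.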
def get_and_cache_pagedata(localpath):
@@ -86,7 +97,7 @@ def get_and_cache_pagedata(localpath):
Here we assume localpath is already unquoted.
"""
- page_data = get_pagedata_from_cache(localpath)
+ page_data = get_data_from_cache(localpath)
if page_data and not page_data.get('content_blob'):
return page_data
page = Page.all().filter('localpath =', localpath).get()
@@ -104,15 +115,149 @@ def get_and_cache_pagedata(localpath):
logging.debug('content for %s found in blobstore' % localpath)
blob_reader = blobstore.BlobReader(page.content_blob)
page_data['content_blob'] = True
- put_pagedata_into_cache(localpath, page_data)
+ put_data_into_cache(localpath, page_data)
page_data['content'] = blob_reader.read().decode('utf-8', 'replace')
else:
logging.debug('content for %s found in datastore' % localpath)
page_data['content'] = page.content
- put_pagedata_into_cache(localpath, page_data)
+ put_data_into_cache(localpath, page_data)
return page_data
+def save_page(page, localpath, fetch_timestamp, page_data):
+ body_class = page_data.get('body_class', '')
+ content = page_data.get('content')
+ offsite_base = page_data.get('offsite_base', '')
+ title = page_data.get('title', '')
+
+ content_blob_key = None
+ try:
+ content = content.decode('utf-8', 'replace')
+ except UnicodeEncodeError:
+ logging.debug('save_page: content was already in unicode')
+ logging.debug('save_page: content size is %d' % len(content))
+ if len(content.encode('utf-8')) >= 10**6:
+ logging.debug('save_page: saving to blob')
+ content_blob_key = write_blob(content, path_to_mime_type(localpath))
+ content = None
+ def tx_page(page_key):
+ page = Page.get(page_key)
+ # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no
+ # 'fetch_timestamp' member (but some types could not be inferred)
+ # pylint: disable=E1103
+ if page.fetch_timestamp > fetch_timestamp:
+ return
+ page.content = content
+ page.content_blob = content_blob_key
+ page.fetch_timestamp = fetch_timestamp
+ # title, offsite_base, body_class can all be empty strings for some
+ # content. Where that's true, they're not used for displaying a console-
+ # like resource, and the content alone is returned to the web user.
+ page.title = title
+ page.offsite_base = offsite_base
+ page.body_class = body_class
+ # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member
+ # (but some types could not be inferred)
+ # pylint: disable=E1103
+ page.put()
+ db.run_in_transaction(tx_page, page.key())
+ page_data = {
+ 'body_class': body_class,
+ 'content': content,
+ 'offsite_base': offsite_base,
+ 'title': title,
+ }
+ if content_blob_key:
+ page_data['content_blob'] = True
+ put_data_into_cache(localpath, page_data)
+ logging.info("Saved and cached page with localpath %s" % localpath)
cmp 2012/12/27 01:19:22 use single quotes everywhere instead of double quotes
agable 2012/12/28 23:56:08 Done.
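The 10**6 cutoff in save_page() tracks App Engine's 1 MB datastore entity limit. A minimal sketch of the oversized-content path, using only names from this patch (the sample content and path are made up):

    content = u'x' * (2 * 10**6)  # pretend this console page exceeds 1 MB
    if len(content.encode('utf-8')) >= 10**6:
        # Divert the body to blobstore; the Page entity keeps only the key,
        # and the cached page_data carries content_blob=True instead.
        content_blob_key = write_blob(content,
                                      path_to_mime_type('chromium/console'))
        content = None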
+
+
+##########
+# Row class definition and related functions.
+##########
+class Row(db.Model):
+ fetch_timestamp = db.DateTimeProperty(required=True)
+ rev_number = db.StringProperty(required=True)
+ localpath = db.StringProperty(required=True)
+ revision = db.TextProperty()
+ name = db.TextProperty()
+ status = db.TextProperty()
+ comment = db.TextProperty()
+ details = db.TextProperty()
+
+
+def get_or_create_row(localpath, revision):
+ return Row.get_or_insert(
+ key_name=localpath,
+ rev_number=revision,
+ localpath=localpath,
+ fetch_timestamp=datetime.datetime.now())
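Unlike the old get_or_create_row() removed further down (which keyed rows on revision + ' ' + localpath), this version keys on localpath alone; parse_master() compensates by folding the revision number into the path it hands to save_row(). A sketch of the resulting key shape (the revision number is invented):

    # parse_master() calls save_row(curr_row, localpath + '/' + rev_number, ts),
    # so each Row entity gets a per-revision key_name such as:
    row = get_or_create_row('chromium/console/173712', '173712')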
+
+
+def get_and_cache_rowdata(localpath):
+ """Returns a row_data dict.
+
+ get_and_cache_rowdata takes a localpath which is used to fetch data from the
+ cache. If the data is present, then we have all of the data we need and we
+ return early.
+
+ Otherwise, we need to fetch the row object and set up the row data.
+
+ Here we assume localpath is already unquoted.
+ """
+ row_data = get_data_from_cache(localpath)
+ if row_data:
+ return row_data
+ row = Row.all().filter('localpath =', localpath).get()
+ if not row:
+ logging.error('get_and_cache_rowdata(\'%s\'): no matching localpath in '
+ 'datastore' % localpath)
+ return {}
+ row_data = {}
+ row_data['rev'] = row.revision
+ row_data['name'] = row.name
+ row_data['status'] = row.status
+ row_data['comment'] = row.comment
+ row_data['details'] = row.details
+ row_data['rev_number'] = row.rev_number
+ logging.debug('content for %s found in datastore' % localpath)
+ put_data_into_cache(localpath, row_data)
+ return row_data
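A hedged usage sketch (the path is hypothetical): the returned dict carries the six fields cached above, or is empty when no Row matches the localpath.

    row_data = get_and_cache_rowdata('chromium/console/173712')
    if row_data:
        # 'rev', 'name', 'status', 'comment', 'details' hold raw HTML
        # fragments; 'rev_number' is a plain string.
        print row_data['rev_number'], row_data['name']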
+
+
+def save_row(row_data, localpath, timestamp):
+ rev_number = row_data['rev_number']
+ row = get_or_create_row(localpath, rev_number)
+ row_key = row.key()
+ def tx_row(row_key):
+ row = Row.get(row_key)
+ # E1103:959,7:save_row.tx_row: Instance of 'list' has no
+ # 'fetch_timestamp' member (but some types could not be inferred)
+ # pylint: disable=E1103
+ # if row.fetch_timestamp > timestamp:
+ # return
+ row.fetch_timestamp = timestamp
+ row.revision = row_data['rev']
+ row.name = row_data['name']
+ row.status = row_data['status']
+ row.comment = row_data['comment']
+ row.details = row_data['details']
+ # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member
+ # (but some types could not be inferred)
+ # pylint: disable=E1103
+ row.put()
+ db.run_in_transaction(tx_row, row_key)
+ prev_rev = memcache.get(key='latest_rev')
+ if (rev_number > prev_rev):
+ memcache.set(key='latest_rev', value=rev_number)
+ put_data_into_cache(localpath, row_data)
+ logging.info("Saved and cached row with localpath %s" % localpath)
cmp 2012/12/27 01:19:22 use single quotes instead of double quotes
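save_row() and console_merger() coordinate through a single memcache key; a minimal sketch of that handshake (the revision value is invented):

    # Writer side (save_row): advance the high-water mark.
    memcache.set(key='latest_rev', value='173712')
    # Reader side (console_merger): start its walk-down from there.
    latest_rev = int(memcache.get(key='latest_rev'))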
+
+
+##########
+# ConsoleData class definition and related functions.
+##########
class ConsoleData(object):
def __init__(self):
self.row_orderedkeys = []
@@ -178,6 +323,23 @@ class ConsoleData(object):
self.category_data[self.lastMasterSeen][category] = builder_status
self.category_count += 1
+ def AddRow(self, row):
+ revision = row['rev_number']
+ self.SawRevision(revision)
+ revlink = BeautifulSoup(row['rev']).td.a['href']
+ self.SetLink(revlink)
+ name = BeautifulSoup(row['name']).td.contents
+ self.SetName(self.ContentsToHtml(name))
+ status = BeautifulSoup(row['status']).findAll('table')
+ for i, stat in enumerate(status):
+ self.SetStatus(self.category_order[self.lastMasterSeen][i],
+ unicode(stat))
+ comment = BeautifulSoup(row['comment']).td.contents
+ self.SetComment(self.ContentsToHtml(comment))
+ if row['details']:
+ details = BeautifulSoup(row['details']).td.contents
+ self.SetDetail(self.ContentsToHtml(details))
+
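AddRow() assumes each field of row is an HTML fragment of the shape stored by parse_master() below; a hypothetical example of the expected input:

    row = {
        'rev_number': '173712',
        'rev': u'<td class="DevRev"><a href="http://crrev.example/173712">'
               u'173712</a></td>',
        'name': u'<td class="DevName">someone@chromium.org</td>',
        'status': u'[<table class="DevStatus"><tr><td>ok</td></tr></table>]',
        'comment': u'<td class="DevComment">Fix console merging.</td>',
        'details': u'',
    }
    # e.g. BeautifulSoup(row['rev']).td.a['href'] recovers the revision link.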
def ParseRow(self, row):
cells = row.findAll('td', recursive=False)
# Figure out which row this is.
@@ -208,72 +370,61 @@ class ConsoleData(object):
# the earliest revisions, set them to ''.
-# W0613:169,39:console_merger: Unused argument 'remoteurl'
-# W0613:169,19:console_merger: Unused argument 'unquoted_localpath'
-# pylint: disable=W0613
-def console_merger(unquoted_localpath, remote_url, page_data=None,
- masters_to_merge=None):
- page_data = page_data or {}
-
+##########
+# Heavy-lifting functions that do most of the console processing.
+# AKA postfetch and postsave functions/handlers.
+##########
+def console_merger(localpath, remoteurl, page_data,
+ masters_to_merge=None, num_rows_to_merge=25):
masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE
mergedconsole = ConsoleData()
- merged_page = None
- merged_tag = None
+ surroundings = get_and_cache_pagedata('surroundings')
+ merged_page = BeautifulSoup(surroundings['content'])
+ merged_tag = merged_page.find('table', 'ConsoleData')
+ latest_rev = int(memcache.get(key='latest_rev'))
+ if not latest_rev:
+ logging.error('console_merger(\'%s\', \'%s\', \'%s\'): cannot get latest '
+ 'revision number.' % (
+ localpath, remoteurl, page_data))
+ return
fetch_timestamp = datetime.datetime.now()
for master in masters_to_merge:
- page_data = get_and_cache_pagedata('%s/console' % master)
- master_content = page_data['content']
- if master_content is None:
+ # Fetch the summary one-box-per-builder for the master.
+ # If we don't get it, something is wrong, skip the master entirely.
+ master_summary = get_and_cache_pagedata('%s/console/summary' % master)
+ if not master_summary['content']:
continue
- master_content = master_content.encode('ascii', 'replace')
cmp 2012/12/27 01:19:22 I hope that this line being removed and not shuffled …
- this_page = BeautifulSoup(master_content)
- this_tag = this_page.find('table', {'class': 'ConsoleData'})
- # The first console is special, we reuse all of the console page.
- if not merged_page:
- merged_page = this_page
- merged_tag = this_tag
mergedconsole.SawMaster(master)
-
- # Parse each of the rows.
- CATEGORY_ROW = 0
- trs = this_tag.findAll('tr', recursive=False)
-
- # Get the list of categories in |master|.
- category_tds = trs[CATEGORY_ROW].findAll('td', recursive=False)[2:]
- third_cell = category_tds[0]
- third_cell_class = third_cell.attrs[0][1]
- categories = []
- if third_cell_class.startswith('DevStatus '):
- BUILDER_STATUS_ROW = 2
- FIRST_CL_ROW = 3
- for index, category_td in enumerate(category_tds):
- categories.append(category_td.contents[0].strip())
+ # Get the categories for this builder. If the builder doesn't have any
+ # categories, just use the default empty-string category.
+ category_list = []
+ master_categories = get_and_cache_pagedata('%s/console/categories' % master)
+ if not master_categories['content']:
+ category_list.append('')
else:
- # There's no categories + spacing row, the first row will be the builder
- # status row.
- categories.append('')
- BUILDER_STATUS_ROW = 0
- FIRST_CL_ROW = 1
-
- # For each category in |master|, add the category plus its |builder_status|.
- builder_tds = trs[BUILDER_STATUS_ROW].findAll('td', recursive=False)[2:]
- for index, category in enumerate(categories):
- builder_status = builder_tds[index].findAll('table', recursive=False)[0]
- mergedconsole.AddCategory(category=category,
- builder_status=builder_status)
-
- # For each of the remaining rows, add them to the console data.
- for console_index in range(FIRST_CL_ROW, len(trs)):
- console_row = trs[console_index]
- mergedconsole.ParseRow(console_row)
- # Add GC memory profiling.
- # import gc
- # gc.set_debug(gc.DEBUG_LEAK)
- # logging.debug(gc.garbage)
- # del gc.garbage[:]
- mergedconsole.Finish()
+ category_row = BeautifulSoup(master_categories['content'])
+ category_list = map(lambda x: x.text,
+ category_row.findAll('td', 'DevStatus'))
+ # Get the corresponding summary box(es).
+ summary_row = BeautifulSoup(master_summary['content'])
+ summary_list = summary_row.findAll('table')
+ for category, summary in zip(category_list, summary_list):
+ mergedconsole.AddCategory(category, summary)
+
+ # Fetch all of the rows that we need.
+ rows_fetched = 0
+ current_rev = latest_rev
+ while rows_fetched < num_rows_to_merge and current_rev >= 0:
+ row_data = get_and_cache_rowdata('%s/console/%s' % (master, current_rev))
+ if not row_data:
+ current_rev -= 1
+ continue
+ mergedconsole.AddRow(row_data)
+ current_rev -= 1
+ rows_fetched += 1
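The walk-down loop tolerates gaps in the revision sequence. A self-contained sketch of the same selection logic, with made-up numbers:

    def pick_rows(latest_rev, stored_revs, num_rows_to_merge):
        # Mirrors the loop above: step down from latest_rev, skipping
        # revisions with no stored row, until enough rows are gathered.
        picked, rev = [], latest_rev
        while len(picked) < num_rows_to_merge and rev >= 0:
            if rev in stored_revs:
                picked.append(rev)
            rev -= 1
        return picked

    pick_rows(100, set([100, 98, 97]), 3)  # -> [100, 98, 97]; 99 is skipped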
# Convert the merged content into console content.
+ mergedconsole.Finish()
template_environment = Environment()
template_environment.loader = FileSystemLoader('.')
def notstarted(builder_status):
@@ -316,7 +467,8 @@ def console_merger(unquoted_localpath, remote_url, page_data=None,
# Update the merged console page.
merged_page = get_or_create_page('chromium/console', None, maxage=30)
- logging.debug('console_merger: saving merged console')
+ logging.info('console_merger: saving merged console')
+ page_data = get_and_cache_pagedata('chromium/console')
page_data['title'] = 'BuildBot: Chromium'
page_data['offsite_base'] = 'http://build.chromium.org/p/chromium'
page_data['body_class'] = 'interface'
@@ -325,10 +477,13 @@ def console_merger(unquoted_localpath, remote_url, page_data=None,
return
-def console_handler(_unquoted_localpath, remoteurl, page_data=None):
+def console_handler(unquoted_localpath, remoteurl, page_data=None):
page_data = page_data or {}
content = page_data.get('content')
if not content:
+ logging.error('console_handler(\'%s\', \'%s\', \'%s\'): cannot get site '
+ 'from local path' % (
+ unquoted_localpath, remoteurl, page_data))
return page_data
# Decode content from utf-8 to unicode, replacing bad characters.
@@ -460,51 +615,14 @@ def console_handler(_unquoted_localpath, remoteurl, page_data=None):
return page_data
-
-def get_or_create_row(localpath, revision):
- return Row.get_or_insert(
- key_name=revision + ' '+ localpath,
- rev_number=revision,
- localpath=localpath,
- fetch_timestamp=datetime.datetime.now())
-
-
-def save_row(row_data, localpath, timestamp):
- rev_number = row_data['rev_number']
- row = get_or_create_row(localpath, rev_number)
- row_key = row.key()
- def tx_row(row_key):
- row = Row.get(row_key)
- # E1103:959,7:save_row.tx_row: Instance of 'list' has no
- # 'fetch_timestamp' member (but some types could not be inferred)
- # pylint: disable=E1103
- # if row.fetch_timestamp > timestamp:
- # return
- row.fetch_timestamp = timestamp
- row.revision = row_data['rev']
- row.name = row_data['name']
- row.status = row_data['status']
- row.comment = row_data['comment']
- row.details = row_data['details']
- # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member
- # (but some types could not be inferred)
- # pylint: disable=E1103
- row.put()
- db.run_in_transaction(tx_row, row_key)
- memcache_data = json.dumps(row_data)
- # A row should never be large enough to hit the blobstore, so we
- # explicitly don't handle rows larger than 10^6 bytes.
- if not memcache.set(key=str(row_key), value=memcache_data, time=2*60):
- logging.error('save_row(\'%s\'): memcache.set() failed' % (row_key))
-
-
+# W0613:600,28:parse_master: Unused argument 'remoteurl'
+# pylint: disable=W0613
def parse_master(localpath, remoteurl, page_data=None):
"""Part of the new pipeline to store individual rows rather than
whole pages of html. Parses the master data into a set of rows,
and writes them out to the datastore in an easily retrievable format.
- Returns the same page_data as it was passed, so as to not interrupt
- the current pipeline. This may change when we switch over.
+ Doesn't modify page_data dict.
"""
ts = datetime.datetime.now()
page_data = page_data or {}
@@ -516,17 +634,19 @@ def parse_master(localpath, remoteurl, page_data=None):
# Split page into surroundings (announce, legend, footer) and data (rows).
surroundings = BeautifulSoup(content)
data = surroundings.find('table', 'ConsoleData')
- data.extract()
+ new_data = Tag(surroundings, 'table', [('class', 'ConsoleData'),
+ ('width', '96%')])
+ data.replaceWith(new_data)
- surroundings_page = get_or_create_page(localpath + '/surroundings',
+ surroundings_page = get_or_create_page('surroundings',
None, maxage=30)
surroundings_data = {}
- surroundings_data['title'] = 'Surroundings for ' + localpath
+ surroundings_data['title'] = 'Surroundings'
surroundings_data['content'] = unicode(surroundings)
- save_page(surroundings_page, localpath + '/surroundings', ts,
+ save_page(surroundings_page, 'surroundings', ts,
surroundings_data)
- rows = data.tbody.findAll('tr', recursive=False)
+ rows = data.findAll('tr', recursive=False)
# The first table row can be special: the list of categories.
categories = None
# If the first row contains a DevStatus cell...
@@ -567,11 +687,11 @@ def parse_master(localpath, remoteurl, page_data=None):
curr_row['rev'] = unicode(row.find('td', 'DevRev'))
curr_row['rev_number'] = unicode(row.find('td', 'DevRev').a.string)
curr_row['name'] = unicode(row.find('td', 'DevName'))
- curr_row['status'] = unicode(row.findAll('td', 'DevStatus'))
+ curr_row['status'] = unicode(row.findAll('table'))
else:
if 'details' not in curr_row:
curr_row['details'] = ''
- save_row(curr_row, localpath, ts)
+ save_row(curr_row, localpath + '/' + curr_row['rev_number'], ts)
curr_row = {}
return page_data
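After parse_master() runs, a console is reassembled from stored pieces rather than one monolithic page. A hedged sketch of the read side (the row path is invented):

    surroundings = get_and_cache_pagedata('surroundings')  # announce/legend/footer
    row_data = get_and_cache_rowdata('chromium/console/173712')  # one CL row
    # console_merger() above stitches these back into 'chromium/console'.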
@@ -598,6 +718,124 @@ def one_box_handler(unquoted_localpath, remoteurl, page_data=None):
return page_data
+##########
+# Utility functions for blobstore and memcache.
+##########
+def get_data_from_cache(localpath):
+ memcache_data = memcache.get(localpath)
+ if not memcache_data:
+ return None
+ logging.debug('content for %s found in memcache' % localpath)
+ return json.loads(memcache_data)
+
+
+def put_data_into_cache(localpath, data):
+ memcache_data = json.dumps(data)
+ if not memcache.set(key=localpath, value=memcache_data, time=2*60):
+ logging.error('put_data_into_cache(\'%s\'): memcache.set() failed' % (
+ localpath))
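Both cache helpers round-trip JSON through memcache with a two-minute TTL (time=2*60); a minimal sketch (the key and payload are made up):

    put_data_into_cache('chromium/console', {'title': u'BuildBot: Chromium'})
    cached = get_data_from_cache('chromium/console')  # a hit for ~2 minutes
    # Once the TTL lapses, callers fall back to datastore/blobstore.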
+
+
+def write_blob(data, mime_type):
+ """Saves a Unicode string as a new blob, returns the blob's key."""
+ file_name = files.blobstore.create(mime_type=mime_type)
+ data = data.encode('utf-8')
+ with files.open(file_name, 'a') as blob_file:
+ blob_file.write(data)
+ files.finalize(file_name)
+ return files.blobstore.get_blob_key(file_name)
+
+
+def path_to_mime_type(path):
+ return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html')
+
+
+EXT_TO_MIME = {
+ '.css': 'text/css',
+ '.js': 'text/javascript',
+ '.json': 'application/json',
+ '.html': 'text/html',
+}
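path_to_mime_type() keys off the file extension and defaults to text/html, which also covers extension-less console paths (example paths are illustrative):

    path_to_mime_type('static/base.css')   # -> 'text/css'
    path_to_mime_type('chromium/console')  # -> 'text/html' (no extension)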
+
+
+##########
+# Functions for actually fetching original pages.
+##########
+def fetch_pages():
+ """Starts a background fetch operation for pages that need it."""
+ logging.debug('fetch_pages()')
+ for url in URLS:
+ deferred.defer(fetch_page, **url)
+
+
+def nonfatal_fetch_url(url, *args, **kwargs):
+ # Temporary workaround to disable AppEngine global cache of these pages.
+ if '?' in url:
+ url += '&' + str(random.random())
+ else:
+ url += '?' + str(random.random())
+
+ try:
+ return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs)
+ except urlfetch.DownloadError:
+ logging.warn('urlfetch failed: %s' % url, exc_info=1)
+ return None
+
+
+def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None,
+ fetch_url=nonfatal_fetch_url):
+ """Fetches data about a set of pages."""
+ if type(localpath) != type(''):
+ logging.error('fetch_page: localpath is %r, expected a string' % (
+ repr(localpath)))
+ return
+ unquoted_localpath = urllib.unquote(localpath)
+ logging.debug('fetch_page("%s", "%s", "%s")' % (
+ unquoted_localpath, remoteurl, maxage))
+ page = get_or_create_page(unquoted_localpath, remoteurl, maxage)
+
+ # Check if our copy of the page is younger than maxage. If it is, we'll
+ # skip the fetch.
+ oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta(
+ seconds=maxage)
+ if (page.fetch_timestamp and
+ page.fetch_timestamp > oldest_acceptable_timestamp):
+ logging.debug('fetch_page: too recent, skipping')
+ return
+
+ # Perform the actual page fetch.
+ fetch_timestamp = datetime.datetime.now()
+ response = fetch_url(remoteurl)
+ if not response:
+ logging.warning('fetch_page: got empty response')
+ return
+ if response.status_code != 200:
+ logging.warning('fetch_page: got non-empty response but code '
+ '%d' % response.status_code)
+ return
+
+ # We have actual content. If there's one or more handlers, call them.
+ page_data = {}
+ page_data['content'] = response.content
+ if postfetch:
+ if not isinstance(postfetch, list):
+ postfetch = [postfetch]
+ for handler in postfetch:
+ logging.debug('fetch_page: calling postfetch handler '
+ '%s' % handler.__name__)
+ page_data = handler(unquoted_localpath, remoteurl, page_data)
+
+ # Save the returned content into the DB and caching layers.
+ logging.debug('fetch_page: saving page')
+ save_page(page, unquoted_localpath, fetch_timestamp, page_data)
+ if postsave:
+ if not isinstance(postsave, list):
+ postsave = [postsave]
+ for handler in postsave:
+ logging.debug('fetch_page: calling postsave handler '
+ '%s' % handler.__name__)
+ handler(unquoted_localpath, remoteurl, page_data)
+
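fetch_pages() expands each URLS entry into keyword arguments for fetch_page() via deferred.defer, so an entry's keys mirror fetch_page's signature. A hypothetical entry wiring up the handlers from this patch (localpath and remoteurl are invented; the real entries follow below):

    {
        'localpath': 'chromium.main/console',
        'remoteurl': 'http://build.chromium.org/p/chromium/console',
        'maxage': 30,  # seconds
        'postfetch': [console_handler, parse_master],
        'postsave': console_merger,
    },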
# List of URLs to fetch.
URLS = [
@@ -921,183 +1159,3 @@ URLS = [
'maxage': 2*60, # 2 mins
},
]
-
-
-def nonfatal_fetch_url(url, *args, **kwargs):
- # Temporary workaround to disable AppEngine global cache of these pages.
- if '?' in url:
- url += '&' + str(random.random())
- else:
- url += '?' + str(random.random())
-
- try:
- return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs)
- except urlfetch.DownloadError:
- logging.warn('urlfetch failed: %s' % url, exc_info=1)
- return None
-
-
-class Row(db.Model):
- fetch_timestamp = db.DateTimeProperty(required=True)
- rev_number = db.StringProperty(required=True)
- localpath = db.StringProperty(required=True)
- revision = db.TextProperty()
- name = db.TextProperty()
- status = db.TextProperty()
- comment = db.TextProperty()
- details = db.TextProperty()
-
-
-class Page(db.Model):
- fetch_timestamp = db.DateTimeProperty(required=True)
- localpath = db.StringProperty(required=True)
- content = db.TextProperty()
- title = db.StringProperty()
- offsite_base = db.StringProperty()
- body_class = db.StringProperty()
- remoteurl = db.TextProperty()
- # Data updated separately, after creation.
- content_blob = blobstore.BlobReferenceProperty()
cmp 2012/12/27 01:19:22 lg
-
-
-def write_blob(data, mime_type):
- """Saves a Unicode string as a new blob, returns the blob's key."""
- file_name = files.blobstore.create(mime_type=mime_type)
- data = data.encode('utf-8')
- with files.open(file_name, 'a') as blob_file:
- blob_file.write(data)
- files.finalize(file_name)
- return files.blobstore.get_blob_key(file_name)
-
-
-def save_page(page, localpath, fetch_timestamp, page_data):
- body_class = page_data.get('body_class', '')
- content = page_data.get('content')
- offsite_base = page_data.get('offsite_base', '')
- title = page_data.get('title', '')
-
- content_blob_key = None
- try:
- content = content.decode('utf-8', 'replace')
- except UnicodeEncodeError:
- logging.debug('save_page: content was already in unicode')
- logging.debug('save_page: content size is %d' % len(content))
- if len(content.encode('utf-8')) >= 10**6:
- logging.debug('save_page: saving to blob')
- content_blob_key = write_blob(content, path_to_mime_type(localpath))
- content = None
- def tx_page(page_key):
- page = Page.get(page_key)
- # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no
- # 'fetch_timestamp' member (but some types could not be inferred)
- # pylint: disable=E1103
- if page.fetch_timestamp > fetch_timestamp:
- return
- page.content = content
- page.content_blob = content_blob_key
- page.fetch_timestamp = fetch_timestamp
- # title, offsite_base, body_class can all be empty strings for some
- # content. Where that's true, they're not used for displaying a console-
- # like resource, and the content alone is returned to the web user.
- page.title = title
- page.offsite_base = offsite_base
- page.body_class = body_class
- # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member
- # (but some types could not be inferred)
- # pylint: disable=E1103
- page.put()
- db.run_in_transaction(tx_page, page.key())
- page_data = {
- 'body_class': body_class,
- 'content': content,
- 'offsite_base': offsite_base,
- 'title': title,
- }
- if content_blob_key:
- page_data['content_blob'] = True
- put_pagedata_into_cache(localpath, page_data)
-
-
-def get_or_create_page(localpath, remoteurl, maxage):
- return Page.get_or_insert(
- key_name=localpath,
- localpath=localpath,
- remoteurl=remoteurl,
- maxage=maxage,
- fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24),
- content=None,
- content_blob=None)
cmp 2012/12/27 01:19:22 lg
-
-
-def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None,
- fetch_url=nonfatal_fetch_url):
- """Fetches data about a set of pages."""
- if type(localpath) != type(''):
- logging.error('fetch_page: localpath is %r, expected a string' % (
- repr(localpath)))
- return
- unquoted_localpath = urllib.unquote(localpath)
- logging.debug('fetch_page("%s", "%s", "%s")' % (
- unquoted_localpath, remoteurl, maxage))
- page = get_or_create_page(unquoted_localpath, remoteurl, maxage)
-
- # Check if our copy of the page is younger than maxage. If it is, we'll
- # skip the fetch.
- oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta(
- seconds=maxage)
- if (page.fetch_timestamp and
- page.fetch_timestamp > oldest_acceptable_timestamp):
- logging.debug('fetch_page: too recent, skipping')
- return
-
- # Perform the actual page fetch.
- fetch_timestamp = datetime.datetime.now()
- response = fetch_url(remoteurl)
- if not response:
- logging.warning('fetch_page: got empty response')
- return
- if response.status_code != 200:
- logging.warning('fetch_page: got non-empty response but code '
- '%d' % response.status_code)
- return
-
- # We have actual content. If there's one or more handlers, call them.
- page_data = {}
- page_data['content'] = response.content
- if postfetch:
- if not isinstance(postfetch, list):
- postfetch = [postfetch]
- for handler in postfetch:
- logging.debug('fetch_page: calling postfetch handler '
- '%s' % handler.__name__)
- page_data = handler(unquoted_localpath, remoteurl, page_data)
-
- # Save the returned content into the DB and caching layers.
- logging.debug('fetch_page: saving page')
- save_page(page, unquoted_localpath, fetch_timestamp, page_data)
- if postsave:
- if not isinstance(postsave, list):
- postsave = [postsave]
- for handler in postsave:
- logging.debug('fetch_page: calling postsave handler '
- '%s' % handler.__name__)
- handler(unquoted_localpath, remoteurl, page_data)
-
-
-EXT_TO_MIME = {
- '.css': 'text/css',
- '.js': 'text/javascript',
- '.json': 'application/json',
- '.html': 'text/html',
-}
-
-
-def path_to_mime_type(path):
- return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html')
-
-
-def fetch_pages():
- """Starts a background fetch operation for pages that need it."""
- logging.debug('fetch_pages()')
- for url in URLS:
- deferred.defer(fetch_page, **url)