Index: app.py |
diff --git a/app.py b/app.py |
index 544f92098a3996a3a8f16dc7a214a6bd3aa27aaf..140f9438a96db6603de87a61b240eca92eff64ad 100644 |
--- a/app.py |
+++ b/app.py |
@@ -24,7 +24,7 @@ from webapp2_extras import jinja2 |
# pylint: disable=F0401 |
from jinja2 import Environment, FileSystemLoader |
-from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup |
+from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag |
# Current application name. |
@@ -53,19 +53,30 @@ def bootstrap(): |
console_template = fh.read() |
-def get_pagedata_from_cache(localpath): |
- memcache_data = memcache.get(localpath) |
- if not memcache_data: |
- return None |
- logging.debug('content for %s found in memcache' % localpath) |
- return json.loads(memcache_data) |
+########## |
+# Page class definition and related functions. |
+########## |
+class Page(db.Model): |
+ fetch_timestamp = db.DateTimeProperty(required=True) |
+ localpath = db.StringProperty(required=True) |
+ content = db.TextProperty() |
+ title = db.StringProperty() |
+ offsite_base = db.StringProperty() |
+ body_class = db.StringProperty() |
+ remoteurl = db.TextProperty() |
+ # Data updated separately, after creation. |
+ content_blob = blobstore.BlobReferenceProperty() |
-def put_pagedata_into_cache(localpath, page_data): |
- memcache_data = json.dumps(page_data) |
- if not memcache.set(key=localpath, value=memcache_data, time=2*60): |
- logging.error('put_pagedata_into_cache(\'%s\'): memcache.set() failed' % ( |
- localpath)) |
+def get_or_create_page(localpath, remoteurl, maxage): |
+ return Page.get_or_insert( |
+ key_name=localpath, |
+ localpath=localpath, |
+ remoteurl=remoteurl, |
+ maxage=maxage, |
+ fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
+ content=None, |
+ content_blob=None) |
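Page.get_or_insert() is transactional, so concurrent fetch tasks asking for the same localpath share one entity instead of creating duplicates, and the backdated fetch_timestamp (now minus 24 hours) guarantees a brand-new Page looks stale to any reasonable maxage and gets fetched on the first pass. A minimal usage sketch; the localpath and remoteurl values here are illustrative only:

  page = get_or_create_page('chromium/console',
                            'http://build.chromium.org/p/chromium/console',
                            maxage=30)
  # A second call returns the same datastore entity, not a duplicate.
  same = get_or_create_page('chromium/console',
                            'http://build.chromium.org/p/chromium/console',
                            maxage=30)
  assert page.key() == same.key()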
def get_and_cache_pagedata(localpath): |
@@ -86,7 +97,7 @@ def get_and_cache_pagedata(localpath): |
Here we assume localpath is already unquoted. |
""" |
- page_data = get_pagedata_from_cache(localpath) |
+ page_data = get_data_from_cache(localpath) |
if page_data and not page_data.get('content_blob'): |
return page_data |
page = Page.all().filter('localpath =', localpath).get() |
@@ -104,15 +115,149 @@ def get_and_cache_pagedata(localpath): |
logging.debug('content for %s found in blobstore' % localpath) |
blob_reader = blobstore.BlobReader(page.content_blob) |
page_data['content_blob'] = True |
- put_pagedata_into_cache(localpath, page_data) |
+ put_data_into_cache(localpath, page_data) |
page_data['content'] = blob_reader.read().decode('utf-8', 'replace') |
else: |
logging.debug('content for %s found in datastore' % localpath) |
page_data['content'] = page.content |
- put_pagedata_into_cache(localpath, page_data) |
+ put_data_into_cache(localpath, page_data) |
return page_data |
+def save_page(page, localpath, fetch_timestamp, page_data): |
+ body_class = page_data.get('body_class', '') |
+ content = page_data.get('content') |
+ offsite_base = page_data.get('offsite_base', '') |
+ title = page_data.get('title', '') |
+ |
+ content_blob_key = None |
+ try: |
+ content = content.decode('utf-8', 'replace') |
+ except UnicodeEncodeError: |
+ logging.debug('save_page: content was already in unicode') |
+ logging.debug('save_page: content size is %d' % len(content)) |
+ if len(content.encode('utf-8')) >= 10**6: |
+ logging.debug('save_page: saving to blob') |
+ content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
+ content = None |
+ def tx_page(page_key): |
+ page = Page.get(page_key) |
+ # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
+ # 'fetch_timestamp' member (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ if page.fetch_timestamp > fetch_timestamp: |
+ return |
+ page.content = content |
+ page.content_blob = content_blob_key |
+ page.fetch_timestamp = fetch_timestamp |
+ # title, offsite_base, body_class can all be empty strings for some |
+ # content. Where that's true, they're not used for displaying a console- |
+ # like resource, and the content alone is returned to the web user. |
+ page.title = title |
+ page.offsite_base = offsite_base |
+ page.body_class = body_class |
+ # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
+ # (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ page.put() |
+ db.run_in_transaction(tx_page, page.key()) |
+ page_data = { |
+ 'body_class': body_class, |
+ 'content': content, |
+ 'offsite_base': offsite_base, |
+ 'title': title, |
+ } |
+ if content_blob_key: |
+ page_data['content_blob'] = True |
+ put_data_into_cache(localpath, page_data) |
+ logging.info("Saved and cached page with localpath %s" % localpath) |
cmp 2012/12/27 01:19:22: use single quotes everywhere instead of double quotes
agable 2012/12/28 23:56:08: Done.
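For context on the 10**6 cutoff in save_page above: a datastore entity tops out around 1 MB, so content whose UTF-8 encoding reaches that size is written to blobstore via write_blob() and the Page keeps only the blob key; the cached page_data then carries a content_blob flag instead of the text, which is why get_and_cache_pagedata treats such a cache hit as incomplete. A sketch of the routing rule, using a hypothetical needs_blobstore() helper that mirrors the check above:

  def needs_blobstore(content):
    # Same test as save_page: datastore entities cap out around 1 MB.
    return len(content.encode('utf-8')) >= 10**6

  assert needs_blobstore(u'\u2603' * 400000)  # U+2603 is 3 bytes in UTF-8
  assert not needs_blobstore(u'small console page')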
+ |
+ |
+########## |
+# Row class definition and related functions. |
+########## |
+class Row(db.Model): |
+ fetch_timestamp = db.DateTimeProperty(required=True) |
+ rev_number = db.StringProperty(required=True) |
+ localpath = db.StringProperty(required=True) |
+ revision = db.TextProperty() |
+ name = db.TextProperty() |
+ status = db.TextProperty() |
+ comment = db.TextProperty() |
+ details = db.TextProperty() |
+ |
+ |
+def get_or_create_row(localpath, revision): |
+ return Row.get_or_insert( |
+ key_name=localpath, |
+ rev_number=revision, |
+ localpath=localpath, |
+ fetch_timestamp=datetime.datetime.now()) |
+ |
+ |
+def get_and_cache_rowdata(localpath): |
+ """Returns a row_data dict. |
+ |
+ get_and_cache_rowdata takes a localpath which is used to fetch data from the |
+ cache. If the data is present, then we have all of the data we need and we |
+ return early. |
+ |
+ Otherwise, we need to fetch the row object and set up the row data. |
+ |
+ Here we assume localpath is already unquoted. |
+ """ |
+ row_data = get_data_from_cache(localpath) |
+ if row_data: |
+ return row_data |
+ row = Row.all().filter('localpath =', localpath).get() |
+ if not row: |
+ logging.error('get_and_cache_rowdata(\'%s\'): no matching localpath in ' |
+ 'datastore' % localpath) |
+ return {} |
+ row_data = {} |
+ row_data['rev'] = row.revision |
+ row_data['name'] = row.name |
+ row_data['status'] = row.status |
+ row_data['comment'] = row.comment |
+ row_data['details'] = row.details |
+ row_data['rev_number'] = row.rev_number |
+ logging.debug('content for %s found in datastore' % localpath) |
+ put_data_into_cache(localpath, row_data) |
+ return row_data |
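Row reads follow the same two-tier path as pages, minus the blobstore case (a single console row never approaches the entity size limit), and an empty dict signals that the revision has not been parsed yet. A usage sketch; the master name and revision are made up:

  row_data = get_and_cache_rowdata('chromium.main/console/177000')
  if not row_data:
    logging.debug('revision 177000 not parsed yet; skipping')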
+ |
+ |
+def save_row(row_data, localpath, timestamp): |
+ rev_number = row_data['rev_number'] |
+ row = get_or_create_row(localpath, rev_number) |
+ row_key = row.key() |
+ def tx_row(row_key): |
+ row = Row.get(row_key) |
+ # E1103:959,7:save_row.tx_row: Instance of 'list' has no |
+ # 'fetch_timestamp' member (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ # if row.fetch_timestamp > timestamp: |
+ # return |
+ row.fetch_timestamp = timestamp |
+ row.revision = row_data['rev'] |
+ row.name = row_data['name'] |
+ row.status = row_data['status'] |
+ row.comment = row_data['comment'] |
+ row.details = row_data['details'] |
+ # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member |
+ # (but some types could not be inferred) |
+ # pylint: disable=E1103 |
+ row.put() |
+ db.run_in_transaction(tx_row, row_key) |
+ prev_rev = memcache.get(key='latest_rev') |
+ # Compare numerically: rev_number is a string, and lexicographic |
+ # comparison would rank '99999' above '100000'. |
+ if not prev_rev or int(rev_number) > int(prev_rev): |
+ memcache.set(key='latest_rev', value=rev_number) |
+ put_data_into_cache(localpath, row_data) |
+ logging.info("Saved and cached row with localpath %s" % localpath) |
cmp 2012/12/27 01:19:22: use single quotes instead of double quotes
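Note the key scheme: parse_master now hands save_row a localpath that already ends in the revision number (localpath + '/' + rev_number, as in the parse_master hunk below), so get_or_create_row can use localpath alone as key_name where the old code concatenated revision and localpath by hand. A sketch with made-up values:

  import datetime

  # Illustrative only; parse_master builds these fields from console HTML.
  row_data = {
    'rev_number': '177000',
    'rev': u'<td class="DevRev">...</td>',
    'name': u'<td class="DevName">...</td>',
    'status': u'[<table>...</table>]',
    'comment': u'<td class="DevComment">...</td>',
    'details': '',
  }
  save_row(row_data, 'chromium.main/console/177000', datetime.datetime.now())
  # The resulting Row's key_name is the full localpath, revision included.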
+ |
+ |
+########## |
+# ConsoleData class definition and related functions. |
+########## |
class ConsoleData(object): |
def __init__(self): |
self.row_orderedkeys = [] |
@@ -178,6 +323,23 @@ class ConsoleData(object): |
self.category_data[self.lastMasterSeen][category] = builder_status |
self.category_count += 1 |
+ def AddRow(self, row): |
+ revision = row['rev_number'] |
+ self.SawRevision(revision) |
+ revlink = BeautifulSoup(row['rev']).td.a['href'] |
+ self.SetLink(revlink) |
+ name = BeautifulSoup(row['name']).td.contents |
+ self.SetName(self.ContentsToHtml(name)) |
+ status = BeautifulSoup(row['status']).findAll('table') |
+ for i, stat in enumerate(status): |
+ self.SetStatus(self.category_order[self.lastMasterSeen][i], |
+ unicode(stat)) |
+ comment = BeautifulSoup(row['comment']).td.contents |
+ self.SetComment(self.ContentsToHtml(comment)) |
+ if row['details']: |
+ details = BeautifulSoup(row['details']).td.contents |
+ self.SetDetail(self.ContentsToHtml(details)) |
+ |
def ParseRow(self, row): |
cells = row.findAll('td', recursive=False) |
# Figure out which row this is. |
@@ -208,72 +370,61 @@ class ConsoleData(object): |
# the earliest revisions, set them to ''. |
-# W0613:169,39:console_merger: Unused argument 'remoteurl' |
-# W0613:169,19:console_merger: Unused argument 'unquoted_localpath' |
-# pylint: disable=W0613 |
-def console_merger(unquoted_localpath, remote_url, page_data=None, |
- masters_to_merge=None): |
- page_data = page_data or {} |
- |
+########## |
+# Heavy-lifting functions that do most of the console processing. |
+# AKA postfetch and postsave functions/handlers. |
+########## |
+def console_merger(localpath, remoteurl, page_data, |
+ masters_to_merge=None, num_rows_to_merge=25): |
masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE |
mergedconsole = ConsoleData() |
- merged_page = None |
- merged_tag = None |
+ surroundings = get_and_cache_pagedata('surroundings') |
+ merged_page = BeautifulSoup(surroundings['content']) |
+ merged_tag = merged_page.find('table', 'ConsoleData') |
+ latest_rev = memcache.get(key='latest_rev') |
+ if not latest_rev: |
+ logging.error('console_merger(\'%s\', \'%s\', \'%s\'): cannot get latest ' |
+ 'revision number.' % ( |
+ localpath, remoteurl, page_data)) |
+ return |
+ latest_rev = int(latest_rev) |
fetch_timestamp = datetime.datetime.now() |
for master in masters_to_merge: |
- page_data = get_and_cache_pagedata('%s/console' % master) |
- master_content = page_data['content'] |
- if master_content is None: |
+ # Fetch the summary one-box-per-builder for the master. |
+ # If we don't get it, something is wrong; skip the master entirely. |
+ master_summary = get_and_cache_pagedata('%s/console/summary' % master) |
+ if not master_summary['content']: |
continue |
- master_content = master_content.encode('ascii', 'replace') |
cmp 2012/12/27 01:19:22: I hope that this line being removed and not shuffl…
- this_page = BeautifulSoup(master_content) |
- this_tag = this_page.find('table', {'class': 'ConsoleData'}) |
- # The first console is special, we reuse all of the console page. |
- if not merged_page: |
- merged_page = this_page |
- merged_tag = this_tag |
mergedconsole.SawMaster(master) |
- |
- # Parse each of the rows. |
- CATEGORY_ROW = 0 |
- trs = this_tag.findAll('tr', recursive=False) |
- |
- # Get the list of categories in |master|. |
- category_tds = trs[CATEGORY_ROW].findAll('td', recursive=False)[2:] |
- third_cell = category_tds[0] |
- third_cell_class = third_cell.attrs[0][1] |
- categories = [] |
- if third_cell_class.startswith('DevStatus '): |
- BUILDER_STATUS_ROW = 2 |
- FIRST_CL_ROW = 3 |
- for index, category_td in enumerate(category_tds): |
- categories.append(category_td.contents[0].strip()) |
+ # Get the categories for this master. If the master doesn't have any |
+ # categories, just use the default empty-string category. |
+ category_list = [] |
+ master_categories = get_and_cache_pagedata('%s/console/categories' % master) |
+ if not master_categories['content']: |
+ category_list.append('') |
else: |
- # There's no categories + spacing row, the first row will be the builder |
- # status row. |
- categories.append('') |
- BUILDER_STATUS_ROW = 0 |
- FIRST_CL_ROW = 1 |
- |
- # For each category in |master|, add the category plus its |builder_status|. |
- builder_tds = trs[BUILDER_STATUS_ROW].findAll('td', recursive=False)[2:] |
- for index, category in enumerate(categories): |
- builder_status = builder_tds[index].findAll('table', recursive=False)[0] |
- mergedconsole.AddCategory(category=category, |
- builder_status=builder_status) |
- |
- # For each of the remaining rows, add them to the console data. |
- for console_index in range(FIRST_CL_ROW, len(trs)): |
- console_row = trs[console_index] |
- mergedconsole.ParseRow(console_row) |
- # Add GC memory profiling. |
- # import gc |
- # gc.set_debug(gc.DEBUG_LEAK) |
- # logging.debug(gc.garbage) |
- # del gc.garbage[:] |
- mergedconsole.Finish() |
+ category_row = BeautifulSoup(master_categories['content']) |
+ category_list = map(lambda x: x.text, |
+ category_row.findAll('td', 'DevStatus')) |
+ # Get the corresponding summary box(es). |
+ summary_row = BeautifulSoup(master_summary['content']) |
+ summary_list = summary_row.findAll('table') |
+ for category, summary in zip(category_list, summary_list): |
+ mergedconsole.AddCategory(category, summary) |
+ |
+ # Fetch all of the rows that we need. |
+ rows_fetched = 0 |
+ current_rev = latest_rev |
+ while rows_fetched < num_rows_to_merge and current_rev >= 0: |
+ row_data = get_and_cache_rowdata('%s/console/%s' % (master, current_rev)) |
+ if not row_data: |
+ current_rev -= 1 |
+ continue |
+ mergedconsole.AddRow(row_data) |
+ current_rev -= 1 |
+ rows_fetched += 1 |
# Convert the merged content into console content. |
+ mergedconsole.Finish() |
template_environment = Environment() |
template_environment.loader = FileSystemLoader('.') |
def notstarted(builder_status): |
@@ -316,7 +467,8 @@ def console_merger(unquoted_localpath, remote_url, page_data=None, |
# Update the merged console page. |
merged_page = get_or_create_page('chromium/console', None, maxage=30) |
- logging.debug('console_merger: saving merged console') |
+ logging.info('console_merger: saving merged console') |
+ page_data = get_and_cache_pagedata('chromium/console') |
page_data['title'] = 'BuildBot: Chromium' |
page_data['offsite_base'] = 'http://build.chromium.org/p/chromium' |
page_data['body_class'] = 'interface' |
@@ -325,10 +477,13 @@ def console_merger(unquoted_localpath, remote_url, page_data=None, |
return |
-def console_handler(_unquoted_localpath, remoteurl, page_data=None): |
+def console_handler(unquoted_localpath, remoteurl, page_data=None): |
page_data = page_data or {} |
content = page_data.get('content') |
if not content: |
+ logging.error('console_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' |
+ 'from local path' % ( |
+ unquoted_localpath, remoteurl, page_data)) |
return page_data |
# Decode content from utf-8 to unicode, replacing bad characters. |
@@ -460,51 +615,14 @@ def console_handler(_unquoted_localpath, remoteurl, page_data=None): |
return page_data |
- |
-def get_or_create_row(localpath, revision): |
- return Row.get_or_insert( |
- key_name=revision + ' '+ localpath, |
- rev_number=revision, |
- localpath=localpath, |
- fetch_timestamp=datetime.datetime.now()) |
- |
- |
-def save_row(row_data, localpath, timestamp): |
- rev_number = row_data['rev_number'] |
- row = get_or_create_row(localpath, rev_number) |
- row_key = row.key() |
- def tx_row(row_key): |
- row = Row.get(row_key) |
- # E1103:959,7:save_row.tx_row: Instance of 'list' has no |
- # 'fetch_timestamp' member (but some types could not be inferred) |
- # pylint: disable=E1103 |
- # if row.fetch_timestamp > timestamp: |
- # return |
- row.fetch_timestamp = timestamp |
- row.revision = row_data['rev'] |
- row.name = row_data['name'] |
- row.status = row_data['status'] |
- row.comment = row_data['comment'] |
- row.details = row_data['details'] |
- # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member |
- # (but some types could not be inferred) |
- # pylint: disable=E1103 |
- row.put() |
- db.run_in_transaction(tx_row, row_key) |
- memcache_data = json.dumps(row_data) |
- # A row should never be large enough to hit the blobstore, so we |
- # explicitly don't handle rows larger than 10^6 bytes. |
- if not memcache.set(key=str(row_key), value=memcache_data, time=2*60): |
- logging.error('save_row(\'%s\'): memcache.set() failed' % (row_key)) |
- |
- |
+# W0613:600,28:parse_master: Unused argument 'remoteurl' |
+# pylint: disable=W0613 |
def parse_master(localpath, remoteurl, page_data=None): |
"""Part of the new pipeline to store individual rows rather than |
whole pages of html. Parses the master data into a set of rows, |
and writes them out to the datastore in an easily retrievable format. |
- Returns the same page_data as it was passed, so as to not interrupt |
- the current pipeline. This may change when we switch over. |
+ Doesn't modify page_data dict. |
""" |
ts = datetime.datetime.now() |
page_data = page_data or {} |
@@ -516,17 +634,19 @@ def parse_master(localpath, remoteurl, page_data=None): |
# Split page into surroundings (announce, legend, footer) and data (rows). |
surroundings = BeautifulSoup(content) |
data = surroundings.find('table', 'ConsoleData') |
- data.extract() |
+ new_data = Tag(surroundings, 'table', [('class', 'ConsoleData'), |
+ ('width', '96%')]) |
+ data.replaceWith(new_data) |
- surroundings_page = get_or_create_page(localpath + '/surroundings', |
+ surroundings_page = get_or_create_page('surroundings', |
None, maxage=30) |
surroundings_data = {} |
- surroundings_data['title'] = 'Surroundings for ' + localpath |
+ surroundings_data['title'] = 'Surroundings' |
surroundings_data['content'] = unicode(surroundings) |
- save_page(surroundings_page, localpath + '/surroundings', ts, |
+ save_page(surroundings_page, 'surroundings', ts, |
surroundings_data) |
- rows = data.tbody.findAll('tr', recursive=False) |
+ rows = data.findAll('tr', recursive=False) |
# The first table row can be special: the list of categories. |
categories = None |
# If the first row contains a DevStatus cell... |
@@ -567,11 +687,11 @@ def parse_master(localpath, remoteurl, page_data=None): |
curr_row['rev'] = unicode(row.find('td', 'DevRev')) |
curr_row['rev_number'] = unicode(row.find('td', 'DevRev').a.string) |
curr_row['name'] = unicode(row.find('td', 'DevName')) |
- curr_row['status'] = unicode(row.findAll('td', 'DevStatus')) |
+ curr_row['status'] = unicode(row.findAll('table')) |
else: |
if 'details' not in curr_row: |
curr_row['details'] = '' |
- save_row(curr_row, localpath, ts) |
+ save_row(curr_row, localpath + '/' + curr_row['rev_number'], ts) |
curr_row = {} |
return page_data |
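The Tag/replaceWith step above hollows out the console table while keeping the page skeleton intact: the extracted data keeps the rows for per-revision parsing, and the skeleton gets an empty placeholder table for console_merger to refill. A standalone BeautifulSoup 3 sketch of that step on toy HTML:

  from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag

  soup = BeautifulSoup('<body><p>legend</p>'
                       '<table class="ConsoleData"><tr><td>r1</td></tr>'
                       '</table></body>')
  data = soup.find('table', 'ConsoleData')
  placeholder = Tag(soup, 'table', [('class', 'ConsoleData'),
                                    ('width', '96%')])
  data.replaceWith(placeholder)
  # soup now holds the empty placeholder; data still holds the rows.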
@@ -598,6 +718,124 @@ def one_box_handler(unquoted_localpath, remoteurl, page_data=None): |
return page_data |
+########## |
+# Utility functions for blobstore and memcache. |
+########## |
+def get_data_from_cache(localpath): |
+ memcache_data = memcache.get(localpath) |
+ if not memcache_data: |
+ return None |
+ logging.debug('content for %s found in memcache' % localpath) |
+ return json.loads(memcache_data) |
+ |
+ |
+def put_data_into_cache(localpath, data): |
+ memcache_data = json.dumps(data) |
+ if not memcache.set(key=localpath, value=memcache_data, time=2*60): |
+ logging.error('put_data_into_cache(\'%s\'): memcache.set() failed' % ( |
+ localpath)) |
+ |
+ |
+def write_blob(data, mime_type): |
+ """Saves a Unicode string as a new blob, returns the blob's key.""" |
+ file_name = files.blobstore.create(mime_type=mime_type) |
+ data = data.encode('utf-8') |
+ with files.open(file_name, 'a') as blob_file: |
+ blob_file.write(data) |
+ files.finalize(file_name) |
+ return files.blobstore.get_blob_key(file_name) |
+ |
+ |
+def path_to_mime_type(path): |
+ return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
+ |
+ |
+EXT_TO_MIME = { |
+ '.css': 'text/css', |
+ '.js': 'text/javascript', |
+ '.json': 'application/json', |
+ '.html': 'text/html', |
+} |
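path_to_mime_type keys off the file extension and falls back to text/html, which also covers console paths with no extension at all:

  assert path_to_mime_type('chromium.main/console') == 'text/html'
  assert path_to_mime_type('static/default.css') == 'text/css'
  assert path_to_mime_type('static/merger.js') == 'text/javascript'
  assert path_to_mime_type('data/results.json') == 'application/json'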
+ |
+ |
+########## |
+# Functions for actually fetching original pages. |
+########## |
+def fetch_pages(): |
+ """Starts a background fetch operation for pages that need it.""" |
+ logging.debug('fetch_pages()') |
+ for url in URLS: |
+ deferred.defer(fetch_page, **url) |
+ |
+ |
+def nonfatal_fetch_url(url, *args, **kwargs): |
+ # Temporary workaround to disable AppEngine global cache of these pages. |
+ if '?' in url: |
+ url += '&' + str(random.random()) |
+ else: |
+ url += '?' + str(random.random()) |
+ |
+ try: |
+ return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
+ except urlfetch.DownloadError: |
+ logging.warn('urlfetch failed: %s' % url, exc_info=1) |
+ return None |
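The random query parameter is a cache-buster so the urlfetch layer never serves a stale copy of an unchanged URL, and DownloadError (which includes deadline hits) is logged and swallowed so one unreachable master only skips that master rather than failing the whole deferred task. For example (URL illustrative):

  # http://master/console       ->  http://master/console?0.7134...
  # http://master/console?a=b   ->  http://master/console?a=b&0.2218...
  response = nonfatal_fetch_url('http://build.chromium.org/p/chromium/console')
  if response is None:
    logging.warn('master unreachable; retrying on the next fetch_pages() run')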
+ |
+ |
+def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
+ fetch_url=nonfatal_fetch_url): |
+ """Fetches data about a set of pages.""" |
+ if not isinstance(localpath, str): |
+ logging.error('fetch_page: localpath is %r, expected a string' % |
+ (localpath,)) |
+ return |
+ unquoted_localpath = urllib.unquote(localpath) |
+ logging.debug('fetch_page("%s", "%s", "%s")' % ( |
+ unquoted_localpath, remoteurl, maxage)) |
+ page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
+ |
+ # Check if our copy of the page is younger than maxage. If it is, we'll |
+ # skip the fetch. |
+ oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
+ seconds=maxage) |
+ if (page.fetch_timestamp and |
+ page.fetch_timestamp > oldest_acceptable_timestamp): |
+ logging.debug('fetch_page: too recent, skipping') |
+ return |
+ |
+ # Perform the actual page fetch. |
+ fetch_timestamp = datetime.datetime.now() |
+ response = fetch_url(remoteurl) |
+ if not response: |
+ logging.warning('fetch_page: got empty response') |
+ return |
+ if response.status_code != 200: |
+ logging.warning('fetch_page: got non-empty response but code ' |
+ '%d' % response.status_code) |
+ return |
+ |
+ # We have actual content. If there's one or more handlers, call them. |
+ page_data = {} |
+ page_data['content'] = response.content |
+ if postfetch: |
+ if not isinstance(postfetch, list): |
+ postfetch = [postfetch] |
+ for handler in postfetch: |
+ logging.debug('fetch_page: calling postfetch handler ' |
+ '%s' % handler.__name__) |
+ page_data = handler(unquoted_localpath, remoteurl, page_data) |
+ |
+ # Save the returned content into the DB and caching layers. |
+ logging.debug('fetch_page: saving page') |
+ save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
+ if postsave: |
+ if not isinstance(postsave, list): |
+ postsave = [postsave] |
+ for handler in postsave: |
+ logging.debug('fetch_page: calling postsave handler ' |
+ '%s' % handler.__name__) |
+ handler(unquoted_localpath, remoteurl, page_data) |
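Everything above is driven by the URLS table below: fetch_pages() defers one fetch_page() per entry, each postfetch handler may rewrite page_data before it is saved, and postsave handlers run on the final data. A hypothetical entry showing how the handlers could be wired together (the exact wiring is an assumption; the real entries are in the full URLS list):

  {
    'localpath': 'chromium.main/console',
    'remoteurl': 'http://build.chromium.org/p/chromium/console',
    'maxage': 2*60,  # 2 mins
    'postfetch': [console_handler, parse_master],
    'postsave': [console_merger],
  },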
+ |
# List of URLs to fetch. |
URLS = [ |
@@ -921,183 +1159,3 @@ URLS = [ |
'maxage': 2*60, # 2 mins |
}, |
] |
- |
- |
-def nonfatal_fetch_url(url, *args, **kwargs): |
- # Temporary workaround to disable AppEngine global cache of these pages. |
- if '?' in url: |
- url += '&' + str(random.random()) |
- else: |
- url += '?' + str(random.random()) |
- |
- try: |
- return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
- except urlfetch.DownloadError: |
- logging.warn('urlfetch failed: %s' % url, exc_info=1) |
- return None |
- |
- |
-class Row(db.Model): |
- fetch_timestamp = db.DateTimeProperty(required=True) |
- rev_number = db.StringProperty(required=True) |
- localpath = db.StringProperty(required=True) |
- revision = db.TextProperty() |
- name = db.TextProperty() |
- status = db.TextProperty() |
- comment = db.TextProperty() |
- details = db.TextProperty() |
- |
- |
-class Page(db.Model): |
- fetch_timestamp = db.DateTimeProperty(required=True) |
- localpath = db.StringProperty(required=True) |
- content = db.TextProperty() |
- title = db.StringProperty() |
- offsite_base = db.StringProperty() |
- body_class = db.StringProperty() |
- remoteurl = db.TextProperty() |
- # Data updated separately, after creation. |
- content_blob = blobstore.BlobReferenceProperty() |
cmp 2012/12/27 01:19:22: lg
- |
- |
-def write_blob(data, mime_type): |
- """Saves a Unicode string as a new blob, returns the blob's key.""" |
- file_name = files.blobstore.create(mime_type=mime_type) |
- data = data.encode('utf-8') |
- with files.open(file_name, 'a') as blob_file: |
- blob_file.write(data) |
- files.finalize(file_name) |
- return files.blobstore.get_blob_key(file_name) |
- |
- |
-def save_page(page, localpath, fetch_timestamp, page_data): |
- body_class = page_data.get('body_class', '') |
- content = page_data.get('content') |
- offsite_base = page_data.get('offsite_base', '') |
- title = page_data.get('title', '') |
- |
- content_blob_key = None |
- try: |
- content = content.decode('utf-8', 'replace') |
- except UnicodeEncodeError: |
- logging.debug('save_page: content was already in unicode') |
- logging.debug('save_page: content size is %d' % len(content)) |
- if len(content.encode('utf-8')) >= 10**6: |
- logging.debug('save_page: saving to blob') |
- content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
- content = None |
- def tx_page(page_key): |
- page = Page.get(page_key) |
- # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
- # 'fetch_timestamp' member (but some types could not be inferred) |
- # pylint: disable=E1103 |
- if page.fetch_timestamp > fetch_timestamp: |
- return |
- page.content = content |
- page.content_blob = content_blob_key |
- page.fetch_timestamp = fetch_timestamp |
- # title, offsite_base, body_class can all be empty strings for some |
- # content. Where that's true, they're not used for displaying a console- |
- # like resource, and the content alone is returned to the web user. |
- page.title = title |
- page.offsite_base = offsite_base |
- page.body_class = body_class |
- # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
- # (but some types could not be inferred) |
- # pylint: disable=E1103 |
- page.put() |
- db.run_in_transaction(tx_page, page.key()) |
- page_data = { |
- 'body_class': body_class, |
- 'content': content, |
- 'offsite_base': offsite_base, |
- 'title': title, |
- } |
- if content_blob_key: |
- page_data['content_blob'] = True |
- put_pagedata_into_cache(localpath, page_data) |
- |
- |
-def get_or_create_page(localpath, remoteurl, maxage): |
- return Page.get_or_insert( |
- key_name=localpath, |
- localpath=localpath, |
- remoteurl=remoteurl, |
- maxage=maxage, |
- fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
- content=None, |
- content_blob=None) |
cmp 2012/12/27 01:19:22: lg
- |
- |
-def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
- fetch_url=nonfatal_fetch_url): |
- """Fetches data about a set of pages.""" |
- if type(localpath) != type(''): |
- logging.error('fetch_page: localpath is %r, expected a string' % ( |
- repr(localpath))) |
- return |
- unquoted_localpath = urllib.unquote(localpath) |
- logging.debug('fetch_page("%s", "%s", "%s")' % ( |
- unquoted_localpath, remoteurl, maxage)) |
- page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
- |
- # Check if our copy of the page is younger than maxage. If it is, we'll |
- # skip the fetch. |
- oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
- seconds=maxage) |
- if (page.fetch_timestamp and |
- page.fetch_timestamp > oldest_acceptable_timestamp): |
- logging.debug('fetch_page: too recent, skipping') |
- return |
- |
- # Perform the actual page fetch. |
- fetch_timestamp = datetime.datetime.now() |
- response = fetch_url(remoteurl) |
- if not response: |
- logging.warning('fetch_page: got empty response') |
- return |
- if response.status_code != 200: |
- logging.warning('fetch_page: got non-empty response but code ' |
- '%d' % response.status_code) |
- return |
- |
- # We have actual content. If there's one or more handlers, call them. |
- page_data = {} |
- page_data['content'] = response.content |
- if postfetch: |
- if not isinstance(postfetch, list): |
- postfetch = [postfetch] |
- for handler in postfetch: |
- logging.debug('fetch_page: calling postfetch handler ' |
- '%s' % handler.__name__) |
- page_data = handler(unquoted_localpath, remoteurl, page_data) |
- |
- # Save the returned content into the DB and caching layers. |
- logging.debug('fetch_page: saving page') |
- save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
- if postsave: |
- if not isinstance(postsave, list): |
- postsave = [postsave] |
- for handler in postsave: |
- logging.debug('fetch_page: calling postsave handler ' |
- '%s' % handler.__name__) |
- handler(unquoted_localpath, remoteurl, page_data) |
- |
- |
-EXT_TO_MIME = { |
- '.css': 'text/css', |
- '.js': 'text/javascript', |
- '.json': 'application/json', |
- '.html': 'text/html', |
-} |
- |
- |
-def path_to_mime_type(path): |
- return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
- |
- |
-def fetch_pages(): |
- """Starts a background fetch operation for pages that need it.""" |
- logging.debug('fetch_pages()') |
- for url in URLS: |
- deferred.defer(fetch_page, **url) |