OLD | NEW |
---|---|
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 from __future__ import with_statement | 5 from __future__ import with_statement |
6 | 6 |
7 import datetime | 7 import datetime |
8 import json | |
8 import logging | 9 import logging |
9 import os | 10 import os |
10 import random | 11 import random |
11 import re | 12 import re |
12 import string | 13 import string |
13 import urllib | 14 import urllib |
14 | 15 |
15 from google.appengine.api import files, memcache, urlfetch | 16 from google.appengine.api import files, memcache, urlfetch |
16 from google.appengine.api.app_identity import get_application_id | 17 from google.appengine.api.app_identity import get_application_id |
17 from google.appengine.ext import blobstore, db, deferred | 18 from google.appengine.ext import blobstore, db, deferred |
(...skipping 12 matching lines...) | |
30 APP_NAME = get_application_id() | 31 APP_NAME = get_application_id() |
31 | 32 |
32 # Deadline for fetching URLs (in seconds). | 33 # Deadline for fetching URLs (in seconds). |
33 URLFETCH_DEADLINE = 60*5 # 5 mins | 34 URLFETCH_DEADLINE = 60*5 # 5 mins |
34 | 35 |
35 | 36 |
36 # Perform initial bootstrap for this module. | 37 # Perform initial bootstrap for this module. |
37 console_template = '' | 38 console_template = '' |
38 def bootstrap(): | 39 def bootstrap(): |
39 global console_template | 40 global console_template |
40 with open('templates/console.html', 'r') as fh: | 41 with open('templates/merger.html', 'r') as fh: |
41 console_template = fh.read() | 42 console_template = fh.read() |
42 | 43 |
43 | 44 |
44 # Assumes localpath is already unquoted. | 45 def get_pagedata_from_cache(localpath): |
45 def get_and_cache_page(localpath): | 46 memcache_data = memcache.get(localpath) |
46 # E1101: 29,12:get_and_cache_page: Module 'google.appengine.api.memcache' has | 47 if not memcache_data: |
47 # no 'get' member | 48 return None |
48 # pylint: disable=E1101 | 49 logging.debug('content for %s found in memcache' % localpath) |
49 content = memcache.get(localpath) | 50 return json.loads(memcache_data) |
50 if content is not None: | |
51 logging.debug('content for %s found in memcache' % localpath) | |
52 return content | |
53 | 51 |
52 | |
53 def put_pagedata_into_cache(localpath, page_data): | |
54 memcache_data = json.dumps(page_data) | |
55 if not memcache.set(key=localpath, value=memcache_data, time=2*60): | |
56 logging.error('put_pagedata_into_cache(\'%s\'): memcache.set() failed' % ( | |
57 localpath)) | |
58 | |
59 | |
60 def get_and_cache_pagedata(localpath): | |
61 """Return a page_data dict, optionally caching and looking up a blob. | |
M-A Ruel 2012/05/29 20:03:49: Returns
cmp 2012/05/29 20:20:28: Done.
| |
62 | |
63 get_and_cache_pagedata takes a localpath which is used to fetch data | |
64 from the cache. If the data is present and there's no content blob, | |
65 then we have all of the data we need to return a page view to the user | |
66 and we return early. | |
67 | |
68 Otherwise, we need to fetch the page object and set up the page data | |
69 for the page view. If the page has a blob associated with it, then we | |
70 mark the page data as having a blob and cache it as-is without the blob. | |
71 If there's no blob, we associate the content with the page data and | |
72 cache that. This is so the next time get_and_cache_pagedata is called | |
73 for either case, we'll get the same behavior (a page-lookup for blobful | |
74 content and a page cache hit for blobless content). | |
75 | |
76 Here we assume localpath is already unquoted. | |
77 """ | |
78 page_data = get_pagedata_from_cache(localpath) | |
79 if page_data and not page_data.get('content_blob'): | |
80 return page_data | |
54 page = Page.all().filter('localpath =', localpath).get() | 81 page = Page.all().filter('localpath =', localpath).get() |
55 if not page: | 82 if not page: |
56 logging.error('get_and_cache_page(\'%s\'): no matching localpath in ' | 83 logging.error('get_and_cache_pagedata(\'%s\'): no matching localpath in ' |
57 'datastore' % localpath) | 84 'datastore' % localpath) |
58 return None | 85 return {'content': None} |
59 if page.content_blob is not None: | 86 page_data = { |
87 'body_class': page.body_class, | |
88 'offsite_base': page.offsite_base, | |
89 'title': page.title, | |
90 } | |
91 if page.content_blob: | |
60 # Get the blob. | 92 # Get the blob. |
93 logging.debug('content for %s found in blobstore' % localpath) | |
61 blob_reader = blobstore.BlobReader(page.content_blob) | 94 blob_reader = blobstore.BlobReader(page.content_blob) |
62 content = blob_reader.read().decode('utf-8', 'replace') | 95 page_data['content_blob'] = True |
63 logging.debug('content for %s found in blobstore' % localpath) | 96 put_pagedata_into_cache(localpath, page_data) |
97 page_data['content'] = blob_reader.read().decode('utf-8', 'replace') | |
64 else: | 98 else: |
65 logging.debug('content for %s found in datastore' % localpath) | 99 logging.debug('content for %s found in datastore' % localpath) |
66 content = page.content | 100 page_data['content'] = page.content |
67 # E1101: 39,11:get_and_cache_page: Module 'google.appengine.api.memcache' | 101 put_pagedata_into_cache(localpath, page_data) |
68 # has no 'set' member | 102 return page_data |
69 # pylint: disable=E1101 | |
70 if not memcache.set(key=localpath, value=content, time=2*60): | |
71 logging.error('get_and_cache_page(\'%s\'): memcache.set() failed' % | |
72 localpath) | |
73 return content | |
74 | 103 |
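For readers following the new cache contract, a minimal standalone sketch of the lookup flow above (a plain dict stands in for memcache, and the `page` dict is a hypothetical stand-in for the datastore entity):

```python
cache = {}  # stands in for memcache; the real code stores JSON strings

def lookup(localpath, page):
    data = cache.get(localpath)
    if data and not data.get('content_blob'):
        return data                       # blobless content: pure cache hit
    data = {'title': page['title']}
    if page.get('blob'):
        data['content_blob'] = True       # cache the marker, never the blob
        cache[localpath] = dict(data)
        data['content'] = page['blob']    # blobful content: re-read each call
    else:
        data['content'] = page['content']
        cache[localpath] = dict(data)     # blobless content: cached with body
    return data
```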
75 | 104 |
76 class ConsoleData(object): | 105 class ConsoleData(object): |
77 def __init__(self): | 106 def __init__(self): |
78 self.row_orderedkeys = [] | 107 self.row_orderedkeys = [] |
79 self.row_data = {} | 108 self.row_data = {} |
80 | 109 |
81 # Retain order of observed masters. | 110 # Retain order of observed masters. |
82 self.masters = [] | 111 self.masters = [] |
83 | 112 |
(...skipping 80 matching lines...) | |
164 def Finish(self): | 193 def Finish(self): |
165 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) | 194 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) |
166 # TODO(cmp): Look for row/master/categories that are unset. If they are | 195 # TODO(cmp): Look for row/master/categories that are unset. If they are |
167 # at the latest revisions, leave them unset. If they are at | 196 # at the latest revisions, leave them unset. If they are at |
168 # the earliest revisions, set them to ''. | 197 # the earliest revisions, set them to ''. |
169 | 198 |
170 | 199 |
171 # W0613:169,39:console_merger: Unused argument 'remoteurl' | 200 # W0613:169,39:console_merger: Unused argument 'remoteurl' |
172 # W0613:169,19:console_merger: Unused argument 'unquoted_localpath' | 201 # W0613:169,19:console_merger: Unused argument 'unquoted_localpath' |
173 # pylint: disable=W0613 | 202 # pylint: disable=W0613 |
174 def console_merger(unquoted_localpath, remote_url, content=None): | 203 def console_merger(unquoted_localpath, remote_url, page_data=None): |
175 if content is None: | 204 page_data = page_data or {} |
176 return None | |
177 | 205 |
178 masters = [ | 206 masters = [ |
179 'chromium.main', | 207 'chromium.main', |
180 'chromium.chromiumos', | 208 'chromium.chromiumos', |
181 'chromium.chrome', | 209 'chromium.chrome', |
182 'chromium.memory', | 210 'chromium.memory', |
183 ] | 211 ] |
184 mergedconsole = ConsoleData() | 212 mergedconsole = ConsoleData() |
185 merged_page = None | 213 merged_page = None |
186 merged_tag = None | 214 merged_tag = None |
187 fetch_timestamp = datetime.datetime.now() | 215 fetch_timestamp = datetime.datetime.now() |
188 for master in masters: | 216 for master in masters: |
189 master_content = get_and_cache_page('%s/console' % master) | 217 page_data = get_and_cache_pagedata('%s/console' % master) |
218 master_content = page_data['content'] | |
190 if master_content is None: | 219 if master_content is None: |
191 continue | 220 continue |
192 master_content = master_content.encode('ascii', 'replace') | 221 master_content = master_content.encode('ascii', 'replace') |
193 this_page = BeautifulSoup(master_content) | 222 this_page = BeautifulSoup(master_content) |
194 this_tag = this_page.find('table', {'class': 'ConsoleData'}) | 223 this_tag = this_page.find('table', {'class': 'ConsoleData'}) |
195 # The first console is special, we reuse all of the console page. | 224 # The first console is special, we reuse all of the console page. |
196 if not merged_page: | 225 if not merged_page: |
197 merged_page = this_page | 226 merged_page = this_page |
198 merged_tag = this_tag | 227 merged_tag = this_tag |
199 mergedconsole.SawMaster(master) | 228 mergedconsole.SawMaster(master) |
(...skipping 74 matching lines...) | |
274 merged_content = re.sub( | 303 merged_content = re.sub( |
275 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) | 304 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) |
276 merged_content = re.sub( | 305 merged_content = re.sub( |
277 r'\<iframe\>\</iframe\>', | 306 r'\<iframe\>\</iframe\>', |
278 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', | 307 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', |
279 merged_content) | 308 merged_content) |
280 | 309 |
281 # Update the merged console page. | 310 # Update the merged console page. |
282 merged_page = get_or_create_page('chromium/console', None, maxage=30) | 311 merged_page = get_or_create_page('chromium/console', None, maxage=30) |
283 logging.debug('console_merger: saving merged console') | 312 logging.debug('console_merger: saving merged console') |
284 save_page(merged_page, 'chromium/console', merged_content, | 313 page_data['title'] = 'BuildBot: Chromium' |
285 fetch_timestamp) | 314 page_data['offsite_base'] = 'http://build.chromium.org/p/chromium' |
286 return merged_content | 315 page_data['body_class'] = 'interface' |
316 page_data['content'] = merged_content | |
317 save_page(merged_page, 'chromium/console', fetch_timestamp, page_data) | |
318 return | |
287 | 319 |
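As context for the merging loop above, a minimal BeautifulSoup 3 sketch of the table-extraction step (the HTML snippet is a made-up stand-in for a real console page):

```python
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, as used here

html = ("<html><body><table class='ConsoleData'>"
        "<tr><td>r1000</td></tr></table></body></html>")
page = BeautifulSoup(html)
tag = page.find('table', {'class': 'ConsoleData'})
print tag.findAll('tr')  # the rows console_merger folds together per master
```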
288 | 320 |
289 # W0613:284,20:console_handler: Unused argument 'unquoted_localpath' | 321 def console_handler(_unquoted_localpath, remoteurl, page_data=None): |
290 # pylint: disable=W0613 | 322 page_data = page_data or {} |
291 def console_handler(unquoted_localpath, remoteurl, content=None): | 323 content = page_data.get('content') |
292 if content is None: | 324 if not content: |
293 return None | 325 return page_data |
294 # TODO(cmp): Fix the LKGR link. | |
295 | 326 |
296 # Decode content from utf-8 to unicode, replacing bad characters. | 327 # Decode content from utf-8 to unicode, replacing bad characters. |
297 content = content.decode('utf-8', 'replace') | 328 content = content.decode('utf-8', 'replace') |
298 | 329 |
299 # Scrub in sheriff file content to console. | 330 # Scrub in sheriff file content to console. |
300 sheriff_files = [ | 331 sheriff_files = [ |
301 'sheriff', | 332 'sheriff', |
302 'sheriff_android', | 333 'sheriff_android', |
303 'sheriff_cr_cros_gardeners', | 334 'sheriff_cr_cros_gardeners', |
304 'sheriff_cros_mtv', | 335 'sheriff_cros_mtv', |
305 'sheriff_cros_nonmtv', | 336 'sheriff_cros_nonmtv', |
306 'sheriff_gpu', | 337 'sheriff_gpu', |
307 'sheriff_memory', | 338 'sheriff_memory', |
308 'sheriff_nacl', | 339 'sheriff_nacl', |
309 'sheriff_perf', | 340 'sheriff_perf', |
310 'sheriff_webkit', | 341 'sheriff_webkit', |
311 ] | 342 ] |
312 for sheriff_file in sheriff_files: | 343 for sheriff_file in sheriff_files: |
313 sheriff_content = get_and_cache_page('chromium/%s.js' % sheriff_file) | 344 sheriff_page_data = get_and_cache_pagedata('chromium/%s.js' % sheriff_file) |
345 sheriff_content = sheriff_page_data['content'] | |
314 console_re = (r'<script src=\'http://chromium-build.appspot.com/' | 346 console_re = (r'<script src=\'http://chromium-build.appspot.com/' |
315 'p/chromium/%s.js\'></script>') | 347 'p/chromium/%s.js\'></script>') |
316 content = re.sub(console_re % sheriff_file, | 348 content = re.sub(console_re % sheriff_file, |
317 '<script>%s</script>' % sheriff_content, content) | 349 '<script>%s</script>' % sheriff_content, content) |
318 | 350 |
319 # Replace showBuildBox with direct links. | 351 # Replace showBuildBox with direct links. |
320 content = re.sub(r'<a href=\'#\' onclick=\'showBuildBox\(\"./(.+)\", event\);' | 352 content = re.sub(r'<a href=\'#\' onclick=\'showBuildBox\(\"./(.+)\", event\);' |
321 ' return false;\'', | 353 ' return false;\'', |
322 r"<a href='\1'", content) | 354 r"<a href='\1'", content) |
323 | 355 |
(...skipping 43 matching lines...) | |
367 content = string.replace(content, | 399 content = string.replace(content, |
368 "'/json/builders/Linux%20x64/builds/-1?as_text=1';", | 400 "'/json/builders/Linux%20x64/builds/-1?as_text=1';", |
369 "'/json/builders/Linux%20x64/builds/-1/as_text=1.json';") | 401 "'/json/builders/Linux%20x64/builds/-1/as_text=1.json';") |
370 | 402 |
371 # Fix up a reference to http chromium-build in BarUrl(). | 403 # Fix up a reference to http chromium-build in BarUrl(). |
372 content = string.replace(content, | 404 content = string.replace(content, |
373 "return 'http://chromium-build.appspot.com/p/'", | 405 "return 'http://chromium-build.appspot.com/p/'", |
374 "return 'https://chromium-build.appspot.com/p/'") | 406 "return 'https://chromium-build.appspot.com/p/'") |
375 | 407 |
376 # Encode content from unicode to utf-8. | 408 # Encode content from unicode to utf-8. |
377 content = content.encode('utf-8') | 409 page_data['content'] = content.encode('utf-8') |
378 return content | 410 |
411 # Last tweaks to HTML, plus extracting metadata about the page itself. | |
412 page_data['offsite_base'] = remoteurl + '/../' | |
413 | |
414 # Extract the title from the page. | |
415 md = re.search( | |
416 r'^.*<title>([^\<]+)</title>', | |
417 page_data['content'], | |
418 re.MULTILINE|re.DOTALL) | |
419 if not md: | |
420 raise Exception('failed to locate title in page') | |
421 page_data['title'] = md.group(1) | |
422 | |
423 # Remove the leading text up to the end of the opening body tag. While | |
424 # there, extract the body_class from the page. | |
425 md = re.search( | |
426 r'^.*<body class="(\w+)\">(.*)$', | |
427 page_data['content'], | |
428 re.MULTILINE|re.DOTALL) | |
429 if not md: | |
430 raise Exception('failed to locate leading text up to body tag') | |
431 page_data['body_class'] = md.group(1) | |
432 page_data['content'] = md.group(2) | |
433 | |
434 # Remove the leading div and hr tags. | |
435 md = re.search( | |
436 r'^.*?<hr/>(.*)$', | |
437 page_data['content'], | |
438 re.MULTILINE|re.DOTALL) | |
439 if not md: | |
440 raise Exception('failed to locate leading div and hr tags') | |
441 page_data['content'] = md.group(1) | |
442 | |
443 # Strip the trailing body and html tags. | |
444 md = re.search( | |
445 r'^(.*)</body>.*$', | |
446 page_data['content'], | |
447 re.MULTILINE|re.DOTALL) | |
448 if not md: | |
449 raise Exception('failed to locate trailing body and html tags') | |
450 page_data['content'] = md.group(1) | |
451 | |
452 return page_data | |
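The metadata-extraction regexes above can be exercised in isolation; a small sketch against a made-up page, using the patterns exactly as written:

```python
import re

html = ('<html><head><title>BuildBot: Chromium</title></head>'
        '<body class="interface"><div></div><hr/>console rows</body></html>')
md = re.search(r'^.*<title>([^\<]+)</title>', html, re.MULTILINE | re.DOTALL)
print md.group(1)  # BuildBot: Chromium
md = re.search(r'^.*<body class="(\w+)\">(.*)$', html,
               re.MULTILINE | re.DOTALL)
print md.group(1)  # interface
```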
379 | 453 |
380 | 454 |
381 def one_box_handler(unquoted_localpath, remoteurl, content=None): | 455 def one_box_handler(unquoted_localpath, remoteurl, page_data=None): |
456 page_data = page_data or {} | |
457 content = page_data.get('content') | |
382 if content is None: | 458 if content is None: |
383 return None | 459 return page_data |
384 # Get the site name from the local path. | 460 # Get the site name from the local path. |
385 md = re.match('^([^\/]+)/.*$', unquoted_localpath) | 461 md = re.match('^([^\/]+)/.*$', unquoted_localpath) |
386 if not md: | 462 if not md: |
387 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' | 463 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' |
388 'from local path' % (unquoted_localpath, remoteurl, content)) | 464 'from local path' % ( |
389 return content | 465 unquoted_localpath, remoteurl, page_data)) |
466 return page_data | |
390 site = md.group(1) | 467 site = md.group(1) |
391 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site | 468 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site |
392 content = re.sub(r'waterfall', new_waterfall_url, content) | 469 page_data['content'] = re.sub( |
393 return content | 470 r'waterfall', |
471 new_waterfall_url, | |
472 page_data['content']) | |
473 return page_data | |
394 | 474 |
395 | 475 |
396 | 476 |
397 # List of URLs to fetch. | 477 # List of URLs to fetch. |
398 URLS = [ | 478 URLS = [ |
399 # Console URLs. | 479 # Console URLs. |
400 { | 480 { |
401 'remoteurl': 'http://build.chromium.org/p/chromium/console', | 481 'remoteurl': 'http://build.chromium.org/p/chromium/console', |
402 'localpath': 'chromium.main/console', | 482 'localpath': 'chromium.main/console', |
403 'postfetch': console_handler, | 483 'postfetch': console_handler, |
(...skipping 233 matching lines...) | |
637 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) | 717 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
638 except urlfetch.DownloadError: | 718 except urlfetch.DownloadError: |
639 logging.warn('urlfetch failed: %s' % url, exc_info=1) | 719 logging.warn('urlfetch failed: %s' % url, exc_info=1) |
640 return None | 720 return None |
641 | 721 |
642 | 722 |
643 class Page(db.Model): | 723 class Page(db.Model): |
644 fetch_timestamp = db.DateTimeProperty(required=True) | 724 fetch_timestamp = db.DateTimeProperty(required=True) |
645 localpath = db.StringProperty(required=True) | 725 localpath = db.StringProperty(required=True) |
646 content = db.TextProperty() | 726 content = db.TextProperty() |
727 title = db.StringProperty() | |
728 offsite_base = db.StringProperty() | |
729 body_class = db.StringProperty() | |
647 remoteurl = db.TextProperty() | 730 remoteurl = db.TextProperty() |
648 # Data updated separately, after creation. | 731 # Data updated separately, after creation. |
649 content_blob = blobstore.BlobReferenceProperty() | 732 content_blob = blobstore.BlobReferenceProperty() |
650 | 733 |
651 | 734 |
652 def write_blob(data, mime_type): | 735 def write_blob(data, mime_type): |
653 """Saves a Unicode string as a new blob, returns the blob's key.""" | 736 """Saves a Unicode string as a new blob, returns the blob's key.""" |
654 file_name = files.blobstore.create(mime_type=mime_type) | 737 file_name = files.blobstore.create(mime_type=mime_type) |
655 data = data.encode('utf-8') | 738 data = data.encode('utf-8') |
656 with files.open(file_name, 'a') as blob_file: | 739 with files.open(file_name, 'a') as blob_file: |
657 blob_file.write(data) | 740 blob_file.write(data) |
658 files.finalize(file_name) | 741 files.finalize(file_name) |
659 return files.blobstore.get_blob_key(file_name) | 742 return files.blobstore.get_blob_key(file_name) |
660 | 743 |
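A hypothetical use of write_blob inside the App Engine runtime, mirroring the size check save_page performs below (the 1 MB threshold tracks the datastore entity limit this code works around):

```python
content = u'<html>' + u'x' * (1024 * 1024) + u'</html>'  # made-up big page
if len(content.encode('utf-8')) >= 1024*1024:
    # Too big for a datastore TextProperty; keep only the blob key.
    content_blob_key = write_blob(content,
                                  path_to_mime_type('chromium/console'))
    content = None
```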
661 | 744 |
662 def save_page(page, localpath, content, fetch_timestamp): | 745 def save_page(page, localpath, fetch_timestamp, page_data): |
746 body_class = page_data.get('body_class', '') | |
747 content = page_data.get('content') | |
748 offsite_base = page_data.get('offsite_base', '') | |
749 title = page_data.get('title', '') | |
750 | |
663 content_blob_key = None | 751 content_blob_key = None |
664 try: | 752 try: |
665 content = content.decode('utf-8', 'replace') | 753 content = content.decode('utf-8', 'replace') |
666 except UnicodeEncodeError: | 754 except UnicodeEncodeError: |
667 logging.debug('save_page: content was already in unicode') | 755 logging.debug('save_page: content was already in unicode') |
668 logging.debug('save_page: content size is %d' % len(content)) | 756 logging.debug('save_page: content size is %d' % len(content)) |
669 if len(content.encode('utf-8')) >= 1024*1024: | 757 if len(content.encode('utf-8')) >= 1024*1024: |
670 logging.debug('save_page: saving to blob') | 758 logging.debug('save_page: saving to blob') |
671 content_blob_key = write_blob(content, path_to_mime_type(localpath)) | 759 content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
672 content = None | 760 content = None |
673 def tx_page(page_key): | 761 def tx_page(page_key): |
674 page = Page.get(page_key) | 762 page = Page.get(page_key) |
675 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no | 763 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
676 # 'fetch_timestamp' member (but some types could not be inferred) | 764 # 'fetch_timestamp' member (but some types could not be inferred) |
677 # pylint: disable=E1103 | 765 # pylint: disable=E1103 |
678 if page.fetch_timestamp > fetch_timestamp: | 766 if page.fetch_timestamp > fetch_timestamp: |
679 return | 767 return |
680 page.content = content | 768 page.content = content |
681 page.content_blob = content_blob_key | 769 page.content_blob = content_blob_key |
682 page.fetch_timestamp = fetch_timestamp | 770 page.fetch_timestamp = fetch_timestamp |
771 # title, offsite_base, body_class can all be empty strings for some | |
772 # content. Where that's true, they're not used for displaying a console- | |
773 # like resource, and the content alone is returned to the web user. | |
774 page.title = title | |
775 page.offsite_base = offsite_base | |
776 page.body_class = body_class | |
683 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member | 777 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
684 # (but some types could not be inferred) | 778 # (but some types could not be inferred) |
685 # pylint: disable=E1103 | 779 # pylint: disable=E1103 |
686 page.put() | 780 page.put() |
687 db.run_in_transaction(tx_page, page.key()) | 781 db.run_in_transaction(tx_page, page.key()) |
688 # E1101:232,11:fetch_page.tx_page: Module 'google.appengine.api.memcache' | 782 page_data = { |
M-A Ruel 2012/05/29 20:03:49: I agree the function shouldn't be named as_dict()
cmp 2012/05/29 20:20:28: My preference given we're both on the fence is to
| |
689 # has no 'set' member | 783 'body_class': body_class, |
690 # pylint: disable=E1101 | 784 'content': content, |
691 if page.content_blob is None: | 785 'offsite_base': offsite_base, |
692 if memcache.set(key=localpath, value=page.content, time=60): | 786 'title': title, |
693 logging.debug('tx_page(page key="%s"): memcache.set() succeeded' % | 787 } |
694 page.key()) | 788 if content_blob_key: |
695 else: | 789 page_data['content_blob'] = True |
696 logging.error('tx_page(page key="%s"): memcache.set() failed' % | 790 put_pagedata_into_cache(localpath, page_data) |
697 page.key()) | |
698 | 791 |
699 | 792 |
700 def get_or_create_page(localpath, remoteurl, maxage): | 793 def get_or_create_page(localpath, remoteurl, maxage): |
701 return Page.get_or_insert( | 794 return Page.get_or_insert( |
702 key_name=localpath, | 795 key_name=localpath, |
703 localpath=localpath, | 796 localpath=localpath, |
704 remoteurl=remoteurl, | 797 remoteurl=remoteurl, |
705 maxage=maxage, | 798 maxage=maxage, |
706 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), | 799 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
707 content=None, | 800 content=None, |
708 content_blob=None) | 801 content_blob=None) |
709 | 802 |
710 | 803 |
711 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None): | 804 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
805 fetch_url=nonfatal_fetch_url): | |
712 """Fetches data about a set of pages.""" | 806 """Fetches data about a set of pages.""" |
713 unquoted_localpath = urllib.unquote(localpath) | 807 unquoted_localpath = urllib.unquote(localpath) |
714 logging.debug('fetch_page("%s", "%s", "%s")' % ( | 808 logging.debug('fetch_page("%s", "%s", "%s")' % ( |
715 unquoted_localpath, remoteurl, maxage)) | 809 unquoted_localpath, remoteurl, maxage)) |
716 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) | 810 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
717 | 811 |
718 # Check if our copy of the page is younger than maxage. If it is, we'll | 812 # Check if our copy of the page is younger than maxage. If it is, we'll |
719 # skip the fetch. | 813 # skip the fetch. |
720 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( | 814 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
721 seconds=maxage) | 815 seconds=maxage) |
722 if (page.fetch_timestamp and | 816 if (page.fetch_timestamp and |
723 page.fetch_timestamp > oldest_acceptable_timestamp): | 817 page.fetch_timestamp > oldest_acceptable_timestamp): |
724 logging.debug('fetch_page: too recent, skipping') | 818 logging.debug('fetch_page: too recent, skipping') |
725 return | 819 return |
726 | 820 |
727 # Perform the actual page fetch. | 821 # Perform the actual page fetch. |
728 fetch_timestamp = datetime.datetime.now() | 822 fetch_timestamp = datetime.datetime.now() |
729 response = nonfatal_fetch_url(remoteurl) | 823 response = fetch_url(remoteurl) |
730 if not response: | 824 if not response: |
731 logging.warning('fetch_page: got empty response') | 825 logging.warning('fetch_page: got empty response') |
732 return | 826 return |
733 if response.status_code != 200: | 827 if response.status_code != 200: |
734 logging.warning('fetch_page: got non-empty response but code ' | 828 logging.warning('fetch_page: got non-empty response but code ' |
735 '%d' % response.status_code) | 829 '%d' % response.status_code) |
736 return | 830 return |
737 | 831 |
738 # We have actual content. If there's one or more handlers, call them. | 832 # We have actual content. If there's one or more handlers, call them. |
739 content = response.content | 833 page_data = {} |
834 page_data['content'] = response.content | |
740 if postfetch: | 835 if postfetch: |
741 if not isinstance(postfetch, list): | 836 if not isinstance(postfetch, list): |
742 postfetch = [postfetch] | 837 postfetch = [postfetch] |
743 for handler in postfetch: | 838 for handler in postfetch: |
744 logging.debug('fetch_page: calling postfetch handler ' | 839 logging.debug('fetch_page: calling postfetch handler ' |
745 '%s' % handler.__name__) | 840 '%s' % handler.__name__) |
746 content = handler(unquoted_localpath, remoteurl, content) | 841 page_data = handler(unquoted_localpath, remoteurl, page_data) |
747 | 842 |
748 # Save the returned content into the DB and caching layers. | 843 # Save the returned content into the DB and caching layers. |
749 logging.debug('fetch_page: saving page') | 844 logging.debug('fetch_page: saving page') |
750 save_page(page, unquoted_localpath, content, fetch_timestamp) | 845 save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
751 if postsave: | 846 if postsave: |
752 if not isinstance(postsave, list): | 847 if not isinstance(postsave, list): |
753 postsave = [postsave] | 848 postsave = [postsave] |
754 for handler in postsave: | 849 for handler in postsave: |
755 logging.debug('fetch_page: calling postsave handler ' | 850 logging.debug('fetch_page: calling postsave handler ' |
756 '%s' % handler.__name__) | 851 '%s' % handler.__name__) |
757 handler(unquoted_localpath, remoteurl, content) | 852 handler(unquoted_localpath, remoteurl, page_data) |
758 | 853 |
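Under the new contract, every postfetch/postsave handler accepts and returns a page_data dict rather than a bare content string. A hypothetical handler for illustration (strip_comments is not part of this change):

```python
import re

def strip_comments(unquoted_localpath, remoteurl, page_data=None):
    """Hypothetical postfetch handler: drop HTML comments from content."""
    page_data = page_data or {}
    content = page_data.get('content')
    if content is None:
        return page_data
    page_data['content'] = re.sub(r'(?s)<!--.*?-->', '', content)
    return page_data
```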
759 | 854 |
760 EXT_TO_MIME = { | 855 EXT_TO_MIME = { |
761 '.css': 'text/css', | 856 '.css': 'text/css', |
762 '.js': 'text/javascript', | 857 '.js': 'text/javascript', |
763 '.json': 'application/json', | 858 '.json': 'application/json', |
764 '.html': 'text/html', | 859 '.html': 'text/html', |
765 } | 860 } |
766 | 861 |
767 | 862 |
768 def path_to_mime_type(path): | 863 def path_to_mime_type(path): |
769 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') | 864 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
770 | 865 |
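path_to_mime_type keys off the file extension and falls back to text/html, e.g.:

```python
print path_to_mime_type('chromium/sheriff.js')    # 'text/javascript'
print path_to_mime_type('chromium.main/console')  # no extension: 'text/html'
```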
771 | 866 |
772 def fetch_pages(): | 867 def fetch_pages(): |
773 """Starts a background fetch operation for pages that need it.""" | 868 """Starts a background fetch operation for pages that need it.""" |
774 logging.debug('fetch_pages()') | 869 logging.debug('fetch_pages()') |
775 for url in URLS: | 870 for url in URLS: |
776 deferred.defer(fetch_page, **url) | 871 deferred.defer(fetch_page, **url) |
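Since each URLS entry is splatted into fetch_page, the first deferral above is equivalent to the call below (maxage values fall in the elided portion of the diff, so a placeholder is used):

```python
deferred.defer(
    fetch_page,
    remoteurl='http://build.chromium.org/p/chromium/console',
    localpath='chromium.main/console',
    postfetch=console_handler,
    maxage=0)  # placeholder; the real value sits in the elided URLS lines
```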