OLD | NEW |
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 from __future__ import with_statement | 5 from __future__ import with_statement |
6 | 6 |
7 import datetime | 7 import datetime |
| 8 import json |
8 import logging | 9 import logging |
9 import os | 10 import os |
10 import random | 11 import random |
11 import re | 12 import re |
12 import string | 13 import string |
13 import urllib | 14 import urllib |
14 | 15 |
15 from google.appengine.api import files, memcache, urlfetch | 16 from google.appengine.api import files, memcache, urlfetch |
16 from google.appengine.api.app_identity import get_application_id | 17 from google.appengine.api.app_identity import get_application_id |
17 from google.appengine.ext import blobstore, db, deferred | 18 from google.appengine.ext import blobstore, db, deferred |
(...skipping 12 matching lines...) |
30 APP_NAME = get_application_id() | 31 APP_NAME = get_application_id() |
31 | 32 |
32 # Deadline for fetching URLs (in seconds). | 33 # Deadline for fetching URLs (in seconds). |
33 URLFETCH_DEADLINE = 60*5 # 5 mins | 34 URLFETCH_DEADLINE = 60*5 # 5 mins |
34 | 35 |
35 | 36 |
36 # Perform initial bootstrap for this module. | 37 # Perform initial bootstrap for this module. |
37 console_template = '' | 38 console_template = '' |
38 def bootstrap(): | 39 def bootstrap(): |
39 global console_template | 40 global console_template |
40 with open('templates/console.html', 'r') as fh: | 41 with open('templates/merger.html', 'r') as fh: |
41 console_template = fh.read() | 42 console_template = fh.read() |
42 | 43 |
43 | 44 |
44 # Assumes localpath is already unquoted. | 45 def get_pagedata_from_cache(localpath): |
45 def get_and_cache_page(localpath): | 46 memcache_data = memcache.get(localpath) |
46 # E1101: 29,12:get_and_cache_page: Module 'google.appengine.api.memcache' has | 47 if not memcache_data: |
47 # no 'get' member | 48 return None |
48 # pylint: disable=E1101 | 49 logging.debug('content for %s found in memcache' % localpath) |
49 content = memcache.get(localpath) | 50 return json.loads(memcache_data) |
50 if content is not None: | |
51 logging.debug('content for %s found in memcache' % localpath) | |
52 return content | |
53 | 51 |
| 52 |
| 53 def put_pagedata_into_cache(localpath, page_data): |
| 54 memcache_data = json.dumps(page_data) |
| 55 if not memcache.set(key=localpath, value=memcache_data, time=2*60): |
| 56 logging.error('put_pagedata_into_cache(\'%s\'): memcache.set() failed' % ( |
| 57 localpath)) |
| 58 |
| 59 |
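A minimal sketch of the round-trip these two helpers implement, runnable under dev_appserver or the SDK testbed; the localpath and dict are illustrative:

    import json
    from google.appengine.api import memcache

    localpath = 'chromium.main/console'          # illustrative key
    page_data = {'title': 'BuildBot: Chromium',  # illustrative payload
                 'content': '<table>rows</table>'}

    # memcache stores strings, so the dict travels as JSON; time=2*60
    # bounds staleness to two minutes, matching put_pagedata_into_cache.
    memcache.set(key=localpath, value=json.dumps(page_data), time=2*60)
    cached = memcache.get(localpath)
    if cached:
        page_data = json.loads(cached)  # note: strings come back as unicode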
| 60 def get_and_cache_pagedata(localpath): |
| 61 """Returns a page_data dict, optionally caching and looking up a blob. |
| 62 |
| 63 get_and_cache_pagedata takes a localpath which is used to fetch data |
| 64 from the cache. If the data is present and there's no content blob, |
| 65 then we have all of the data we need to return a page view to the user |
| 66 and we return early. |
| 67 |
| 68 Otherwise, we need to fetch the page object and set up the page data |
| 69 for the page view. If the page has a blob associated with it, then we |
| 70 mark the page data as having a blob and cache it as-is without the blob. |
| 71 If there's no blob, we associate the content with the page data and |
| 72 cache that. This is so the next time get_and_cache_pagedata is called |
| 73 for either case, we'll get the same behavior (a page-lookup for blobful |
| 74 content and a page cache hit for blobless content). |
| 75 |
| 76 Here we assume localpath is already unquoted. |
| 77 """ |
| 78 page_data = get_pagedata_from_cache(localpath) |
| 79 if page_data and not page_data.get('content_blob'): |
| 80 return page_data |
54 page = Page.all().filter('localpath =', localpath).get() | 81 page = Page.all().filter('localpath =', localpath).get() |
55 if not page: | 82 if not page: |
56 logging.error('get_and_cache_page(\'%s\'): no matching localpath in ' | 83 logging.error('get_and_cache_pagedata(\'%s\'): no matching localpath in ' |
57 'datastore' % localpath) | 84 'datastore' % localpath) |
58 return None | 85 return {'content': None} |
59 if page.content_blob is not None: | 86 page_data = { |
| 87 'body_class': page.body_class, |
| 88 'offsite_base': page.offsite_base, |
| 89 'title': page.title, |
| 90 } |
| 91 if page.content_blob: |
60 # Get the blob. | 92 # Get the blob. |
| 93 logging.debug('content for %s found in blobstore' % localpath) |
61 blob_reader = blobstore.BlobReader(page.content_blob) | 94 blob_reader = blobstore.BlobReader(page.content_blob) |
62 content = blob_reader.read().decode('utf-8', 'replace') | 95 page_data['content_blob'] = True |
63 logging.debug('content for %s found in blobstore' % localpath) | 96 put_pagedata_into_cache(localpath, page_data) |
| 97 page_data['content'] = blob_reader.read().decode('utf-8', 'replace') |
64 else: | 98 else: |
65 logging.debug('content for %s found in datastore' % localpath) | 99 logging.debug('content for %s found in datastore' % localpath) |
66 content = page.content | 100 page_data['content'] = page.content |
67 # E1101: 39,11:get_and_cache_page: Module 'google.appengine.api.memcache' | 101 put_pagedata_into_cache(localpath, page_data) |
68 # has no 'set' member | 102 return page_data |
69 # pylint: disable=E1101 | |
70 if not memcache.set(key=localpath, value=content, time=2*60): | |
71 logging.error('get_and_cache_page(\'%s\'): memcache.set() failed' % | |
72 localpath) | |
73 return content | |
74 | 103 |
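The contract this gives callers, sketched under the assumption that both kinds of page already exist in the datastore (paths are illustrative):

    # Blobless page: the first call does the datastore query and caches the
    # whole dict; later calls are pure memcache hits.
    data = get_and_cache_pagedata('chromium/sheriff.js')
    body = data['content']  # None if the localpath has no Page

    # Blobful page: the cached dict carries content_blob=True and omits
    # 'content', so every call re-does the Page lookup plus a BlobReader
    # read; only the returned dict has 'content' populated.
    data = get_and_cache_pagedata('chromium.main/console')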
75 | 104 |
76 class ConsoleData(object): | 105 class ConsoleData(object): |
77 def __init__(self): | 106 def __init__(self): |
78 self.row_orderedkeys = [] | 107 self.row_orderedkeys = [] |
79 self.row_data = {} | 108 self.row_data = {} |
80 | 109 |
81 # Retain order of observed masters. | 110 # Retain order of observed masters. |
82 self.masters = [] | 111 self.masters = [] |
83 | 112 |
(...skipping 80 matching lines...) |
164 def Finish(self): | 193 def Finish(self): |
165 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) | 194 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) |
166 # TODO(cmp): Look for row/master/categories that are unset. If they are | 195 # TODO(cmp): Look for row/master/categories that are unset. If they are |
167 # at the latest revisions, leave them unset. If they are at | 196 # at the latest revisions, leave them unset. If they are at |
168 # the earliest revisions, set them to ''. | 197 # the earliest revisions, set them to ''. |
169 | 198 |
170 | 199 |
171 # W0613:169,39:console_merger: Unused argument 'remoteurl' | 200 # W0613:169,39:console_merger: Unused argument 'remoteurl' |
172 # W0613:169,19:console_merger: Unused argument 'unquoted_localpath' | 201 # W0613:169,19:console_merger: Unused argument 'unquoted_localpath' |
173 # pylint: disable=W0613 | 202 # pylint: disable=W0613 |
174 def console_merger(unquoted_localpath, remote_url, content=None): | 203 def console_merger(unquoted_localpath, remote_url, page_data=None): |
175 if content is None: | 204 page_data = page_data or {} |
176 return None | |
177 | 205 |
178 masters = [ | 206 masters = [ |
179 'chromium.main', | 207 'chromium.main', |
180 'chromium.chromiumos', | 208 'chromium.chromiumos', |
181 'chromium.chrome', | 209 'chromium.chrome', |
182 'chromium.memory', | 210 'chromium.memory', |
183 ] | 211 ] |
184 mergedconsole = ConsoleData() | 212 mergedconsole = ConsoleData() |
185 merged_page = None | 213 merged_page = None |
186 merged_tag = None | 214 merged_tag = None |
187 fetch_timestamp = datetime.datetime.now() | 215 fetch_timestamp = datetime.datetime.now() |
188 for master in masters: | 216 for master in masters: |
189 master_content = get_and_cache_page('%s/console' % master) | 217 page_data = get_and_cache_pagedata('%s/console' % master) |
| 218 master_content = page_data['content'] |
190 if master_content is None: | 219 if master_content is None: |
191 continue | 220 continue |
192 master_content = master_content.encode('ascii', 'replace') | 221 master_content = master_content.encode('ascii', 'replace') |
193 this_page = BeautifulSoup(master_content) | 222 this_page = BeautifulSoup(master_content) |
194 this_tag = this_page.find('table', {'class': 'ConsoleData'}) | 223 this_tag = this_page.find('table', {'class': 'ConsoleData'}) |
195 # The first console is special; we reuse all of the console page. | 224 # The first console is special; we reuse all of the console page. |
196 if not merged_page: | 225 if not merged_page: |
197 merged_page = this_page | 226 merged_page = this_page |
198 merged_tag = this_tag | 227 merged_tag = this_tag |
199 mergedconsole.SawMaster(master) | 228 mergedconsole.SawMaster(master) |
(...skipping 74 matching lines...) |
274 merged_content = re.sub( | 303 merged_content = re.sub( |
275 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) | 304 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) |
276 merged_content = re.sub( | 305 merged_content = re.sub( |
277 r'\<iframe\>\</iframe\>', | 306 r'\<iframe\>\</iframe\>', |
278 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', | 307 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', |
279 merged_content) | 308 merged_content) |
280 | 309 |
281 # Update the merged console page. | 310 # Update the merged console page. |
282 merged_page = get_or_create_page('chromium/console', None, maxage=30) | 311 merged_page = get_or_create_page('chromium/console', None, maxage=30) |
283 logging.debug('console_merger: saving merged console') | 312 logging.debug('console_merger: saving merged console') |
284 save_page(merged_page, 'chromium/console', merged_content, | 313 page_data['title'] = 'BuildBot: Chromium' |
285 fetch_timestamp) | 314 page_data['offsite_base'] = 'http://build.chromium.org/p/chromium' |
286 return merged_content | 315 page_data['body_class'] = 'interface' |
| 316 page_data['content'] = merged_content |
| 317 save_page(merged_page, 'chromium/console', fetch_timestamp, page_data) |
| 318 return |
287 | 319 |
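The elided merge loop splices each subsequent master's rows into the first page's ConsoleData table; a minimal sketch of that idea, assuming BeautifulSoup 3, and not the elided code itself:

    from BeautifulSoup import BeautifulSoup

    first = BeautifulSoup(
        '<table class="ConsoleData"><tr><td>rev 100</td></tr></table>')
    other = BeautifulSoup(
        '<table class="ConsoleData"><tr><td>rev 101</td></tr></table>')

    merged_tag = first.find('table', {'class': 'ConsoleData'})
    for row in other.find('table', {'class': 'ConsoleData'}).findAll('tr'):
        merged_tag.append(row)          # append the row to the first table
    merged_content = unicode(first)     # two <tr>s under one ConsoleData table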
288 | 320 |
289 # W0613:284,20:console_handler: Unused argument 'unquoted_localpath' | 321 def console_handler(_unquoted_localpath, remoteurl, page_data=None): |
290 # pylint: disable=W0613 | 322 page_data = page_data or {} |
291 def console_handler(unquoted_localpath, remoteurl, content=None): | 323 content = page_data.get('content') |
292 if content is None: | 324 if not content: |
293 return None | 325 return page_data |
294 # TODO(cmp): Fix the LKGR link. | |
295 | 326 |
296 # Decode content from utf-8 to unicode, replacing bad characters. | 327 # Decode content from utf-8 to unicode, replacing bad characters. |
297 content = content.decode('utf-8', 'replace') | 328 content = content.decode('utf-8', 'replace') |
298 | 329 |
299 # Scrub in sheriff file content to console. | 330 # Scrub in sheriff file content to console. |
300 sheriff_files = [ | 331 sheriff_files = [ |
301 'sheriff', | 332 'sheriff', |
302 'sheriff_android', | 333 'sheriff_android', |
303 'sheriff_cr_cros_gardeners', | 334 'sheriff_cr_cros_gardeners', |
304 'sheriff_cros_mtv', | 335 'sheriff_cros_mtv', |
305 'sheriff_cros_nonmtv', | 336 'sheriff_cros_nonmtv', |
306 'sheriff_gpu', | 337 'sheriff_gpu', |
307 'sheriff_memory', | 338 'sheriff_memory', |
308 'sheriff_nacl', | 339 'sheriff_nacl', |
309 'sheriff_perf', | 340 'sheriff_perf', |
310 'sheriff_webkit', | 341 'sheriff_webkit', |
311 ] | 342 ] |
312 for sheriff_file in sheriff_files: | 343 for sheriff_file in sheriff_files: |
313 sheriff_content = get_and_cache_page('chromium/%s.js' % sheriff_file) | 344 sheriff_page_data = get_and_cache_pagedata('chromium/%s.js' % sheriff_file) |
| 345 sheriff_content = sheriff_page_data['content'] |
314 console_re = (r'<script src=\'http://chromium-build.appspot.com/' | 346 console_re = (r'<script src=\'http://chromium-build.appspot.com/' |
315 'p/chromium/%s.js\'></script>') | 347 'p/chromium/%s.js\'></script>') |
316 content = re.sub(console_re % sheriff_file, | 348 content = re.sub(console_re % sheriff_file, |
317 '<script>%s</script>' % sheriff_content, content) | 349 '<script>%s</script>' % sheriff_content, content) |
318 | 350 |
319 # Replace showBuildBox with direct links. | 351 # Replace showBuildBox with direct links. |
320 content = re.sub(r'<a href=\'#\' onclick=\'showBuildBox\(\"./(.+)\", event\);' | 352 content = re.sub(r'<a href=\'#\' onclick=\'showBuildBox\(\"./(.+)\", event\);' |
321 ' return false;\'', | 353 ' return false;\'', |
322 r"<a href='\1'", content) | 354 r"<a href='\1'", content) |
323 | 355 |
(...skipping 43 matching lines...) |
367 content = string.replace(content, | 399 content = string.replace(content, |
368 "'/json/builders/Linux%20x64/builds/-1?as_text=1';", | 400 "'/json/builders/Linux%20x64/builds/-1?as_text=1';", |
369 "'/json/builders/Linux%20x64/builds/-1/as_text=1.json';") | 401 "'/json/builders/Linux%20x64/builds/-1/as_text=1.json';") |
370 | 402 |
371 # Fix up a reference to http chromium-build in BarUrl(). | 403 # Fix up a reference to http chromium-build in BarUrl(). |
372 content = string.replace(content, | 404 content = string.replace(content, |
373 "return 'http://chromium-build.appspot.com/p/'", | 405 "return 'http://chromium-build.appspot.com/p/'", |
374 "return 'https://chromium-build.appspot.com/p/'") | 406 "return 'https://chromium-build.appspot.com/p/'") |
375 | 407 |
376 # Encode content from unicode to utf-8. | 408 # Encode content from unicode to utf-8. |
377 content = content.encode('utf-8') | 409 page_data['content'] = content.encode('utf-8') |
378 return content | 410 |
| 411 # Last tweaks to HTML, plus extracting metadata about the page itself. |
| 412 page_data['offsite_base'] = remoteurl + '/../' |
| 413 |
| 414 # Extract the title from the page. |
| 415 md = re.search( |
| 416 r'^.*<title>([^\<]+)</title>', |
| 417 page_data['content'], |
| 418 re.MULTILINE|re.DOTALL) |
| 419 if not md: |
| 420 raise Exception('failed to locate title in page') |
| 421 page_data['title'] = md.group(1) |
| 422 |
| 423 # Remove the leading text up to the end of the opening body tag. While |
| 424 # there, extract the body_class from the page. |
| 425 md = re.search( |
| 426 r'^.*<body class="(\w+)\">(.*)$', |
| 427 page_data['content'], |
| 428 re.MULTILINE|re.DOTALL) |
| 429 if not md: |
| 430 raise Exception('failed to locate leading text up to body tag') |
| 431 page_data['body_class'] = md.group(1) |
| 432 page_data['content'] = md.group(2) |
| 433 |
| 434 # Remove the leading div and hr tags. |
| 435 md = re.search( |
| 436 r'^.*?<hr/>(.*)$', |
| 437 page_data['content'], |
| 438 re.MULTILINE|re.DOTALL) |
| 439 if not md: |
| 440 raise Exception('failed to locate leading div and hr tags') |
| 441 page_data['content'] = md.group(1) |
| 442 |
| 443 # Strip the trailing body and html tags. |
| 444 md = re.search( |
| 445 r'^(.*)</body>.*$', |
| 446 page_data['content'], |
| 447 re.MULTILINE|re.DOTALL) |
| 448 if not md: |
| 449 raise Exception('failed to locate trailing body and html tags') |
| 450 page_data['content'] = md.group(1) |
| 451 |
| 452 return page_data |
379 | 453 |
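The extraction steps above can be exercised in isolation; a toy document, assuming the upstream console keeps this overall shape:

    import re

    html = ('<html><head><title>BuildBot: Chromium</title></head>'
            '<body class="interface"><div class="header">nav</div><hr/>'
            '<table>console rows</table></body></html>')
    flags = re.MULTILINE | re.DOTALL

    title = re.search(r'^.*<title>([^\<]+)</title>', html, flags).group(1)
    body_class, rest = re.search(
        r'^.*<body class="(\w+)\">(.*)$', html, flags).groups()
    content = re.search(r'^.*?<hr/>(.*)$', rest, flags).group(1)
    content = re.search(r'^(.*)</body>.*$', content, flags).group(1)
    # title == 'BuildBot: Chromium', body_class == 'interface',
    # content == '<table>console rows</table>'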
380 | 454 |
381 def one_box_handler(unquoted_localpath, remoteurl, content=None): | 455 def one_box_handler(unquoted_localpath, remoteurl, page_data=None): |
| 456 page_data = page_data or {} |
| 457 content = page_data.get('content') |
382 if content is None: | 458 if content is None: |
383 return None | 459 return page_data |
384 # Get the site name from the local path. | 460 # Get the site name from the local path. |
385 md = re.match('^([^\/]+)/.*$', unquoted_localpath) | 461 md = re.match('^([^\/]+)/.*$', unquoted_localpath) |
386 if not md: | 462 if not md: |
387 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' | 463 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' |
388 'from local path' % (unquoted_localpath, remoteurl, content)) | 464 'from local path' % ( |
389 return content | 465 unquoted_localpath, remoteurl, page_data)) |
| 466 return page_data |
390 site = md.group(1) | 467 site = md.group(1) |
391 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site | 468 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site |
392 content = re.sub(r'waterfall', new_waterfall_url, content) | 469 page_data['content'] = re.sub( |
393 return content | 470 r'waterfall', |
| 471 new_waterfall_url, |
| 472 page_data['content']) |
| 473 return page_data |
394 | 474 |
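What the rewrite does to a one-box page, assuming a localpath like 'chromium/horizontal_one_box_per_builder'; note that the bare r'waterfall' pattern rewrites every occurrence of the word, link text included:

    import re

    content = '<a href="waterfall">waterfall view</a>'  # illustrative snippet
    site = re.match(r'^([^\/]+)/.*$',
                    'chromium/horizontal_one_box_per_builder').group(1)
    new_url = 'http://build.chromium.org/p/%s/waterfall' % site
    content = re.sub(r'waterfall', new_url, content)
    # '<a href="http://build.chromium.org/p/chromium/waterfall">
    #  http://build.chromium.org/p/chromium/waterfall view</a>'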
395 | 475 |
396 | 476 |
397 # List of URLs to fetch. | 477 # List of URLs to fetch. |
398 URLS = [ | 478 URLS = [ |
399 # Console URLs. | 479 # Console URLs. |
400 { | 480 { |
401 'remoteurl': 'http://build.chromium.org/p/chromium/console', | 481 'remoteurl': 'http://build.chromium.org/p/chromium/console', |
402 'localpath': 'chromium.main/console', | 482 'localpath': 'chromium.main/console', |
403 'postfetch': console_handler, | 483 'postfetch': console_handler, |
(...skipping 233 matching lines...) |
637 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) | 717 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
638 except urlfetch.DownloadError: | 718 except urlfetch.DownloadError: |
639 logging.warn('urlfetch failed: %s' % url, exc_info=1) | 719 logging.warn('urlfetch failed: %s' % url, exc_info=1) |
640 return None | 720 return None |
641 | 721 |
642 | 722 |
643 class Page(db.Model): | 723 class Page(db.Model): |
644 fetch_timestamp = db.DateTimeProperty(required=True) | 724 fetch_timestamp = db.DateTimeProperty(required=True) |
645 localpath = db.StringProperty(required=True) | 725 localpath = db.StringProperty(required=True) |
646 content = db.TextProperty() | 726 content = db.TextProperty() |
| 727 title = db.StringProperty() |
| 728 offsite_base = db.StringProperty() |
| 729 body_class = db.StringProperty() |
647 remoteurl = db.TextProperty() | 730 remoteurl = db.TextProperty() |
648 # Data updated separately, after creation. | 731 # Data updated separately, after creation. |
649 content_blob = blobstore.BlobReferenceProperty() | 732 content_blob = blobstore.BlobReferenceProperty() |
650 | 733 |
651 | 734 |
652 def write_blob(data, mime_type): | 735 def write_blob(data, mime_type): |
653 """Saves a Unicode string as a new blob, returns the blob's key.""" | 736 """Saves a Unicode string as a new blob, returns the blob's key.""" |
654 file_name = files.blobstore.create(mime_type=mime_type) | 737 file_name = files.blobstore.create(mime_type=mime_type) |
655 data = data.encode('utf-8') | 738 data = data.encode('utf-8') |
656 with files.open(file_name, 'a') as blob_file: | 739 with files.open(file_name, 'a') as blob_file: |
657 blob_file.write(data) | 740 blob_file.write(data) |
658 files.finalize(file_name) | 741 files.finalize(file_name) |
659 return files.blobstore.get_blob_key(file_name) | 742 return files.blobstore.get_blob_key(file_name) |
660 | 743 |
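A usage sketch for write_blob; the Files API it wraps was the blobstore write path of this era (later deprecated in favor of Cloud Storage). The oversized string is illustrative:

    from google.appengine.ext import blobstore

    content = u'<html>' + u'x' * (1024*1024) + u'</html>'  # pretend console page
    blob_key = write_blob(content, 'text/html')

    # Read it back the way get_and_cache_pagedata does:
    text = blobstore.BlobReader(blob_key).read().decode('utf-8', 'replace')

The 1024*1024 threshold in save_page below mirrors the datastore's 1 MB entity limit: anything at or over it spills to blobstore instead of Page.content.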
661 | 744 |
662 def save_page(page, localpath, content, fetch_timestamp): | 745 def save_page(page, localpath, fetch_timestamp, page_data): |
| 746 body_class = page_data.get('body_class', '') |
| 747 content = page_data.get('content') |
| 748 offsite_base = page_data.get('offsite_base', '') |
| 749 title = page_data.get('title', '') |
| 750 |
663 content_blob_key = None | 751 content_blob_key = None |
664 try: | 752 try: |
665 content = content.decode('utf-8', 'replace') | 753 content = content.decode('utf-8', 'replace') |
666 except UnicodeEncodeError: | 754 except UnicodeEncodeError: |
667 logging.debug('save_page: content was already in unicode') | 755 logging.debug('save_page: content was already in unicode') |
668 logging.debug('save_page: content size is %d' % len(content)) | 756 logging.debug('save_page: content size is %d' % len(content)) |
669 if len(content.encode('utf-8')) >= 1024*1024: | 757 if len(content.encode('utf-8')) >= 1024*1024: |
670 logging.debug('save_page: saving to blob') | 758 logging.debug('save_page: saving to blob') |
671 content_blob_key = write_blob(content, path_to_mime_type(localpath)) | 759 content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
672 content = None | 760 content = None |
673 def tx_page(page_key): | 761 def tx_page(page_key): |
674 page = Page.get(page_key) | 762 page = Page.get(page_key) |
675 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no | 763 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
676 # 'fetch_timestamp' member (but some types could not be inferred) | 764 # 'fetch_timestamp' member (but some types could not be inferred) |
677 # pylint: disable=E1103 | 765 # pylint: disable=E1103 |
678 if page.fetch_timestamp > fetch_timestamp: | 766 if page.fetch_timestamp > fetch_timestamp: |
679 return | 767 return |
680 page.content = content | 768 page.content = content |
681 page.content_blob = content_blob_key | 769 page.content_blob = content_blob_key |
682 page.fetch_timestamp = fetch_timestamp | 770 page.fetch_timestamp = fetch_timestamp |
| 771 # title, offsite_base, body_class can all be empty strings for some |
| 772 # content. Where that's true, they're not used for displaying a console- |
| 773 # like resource, and the content alone is returned to the web user. |
| 774 page.title = title |
| 775 page.offsite_base = offsite_base |
| 776 page.body_class = body_class |
683 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member | 777 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
684 # (but some types could not be inferred) | 778 # (but some types could not be inferred) |
685 # pylint: disable=E1103 | 779 # pylint: disable=E1103 |
686 page.put() | 780 page.put() |
687 db.run_in_transaction(tx_page, page.key()) | 781 db.run_in_transaction(tx_page, page.key()) |
688 # E1101:232,11:fetch_page.tx_page: Module 'google.appengine.api.memcache' | 782 page_data = { |
689 # has no 'set' member | 783 'body_class': body_class, |
690 # pylint: disable=E1101 | 784 'content': content, |
691 if page.content_blob is None: | 785 'offsite_base': offsite_base, |
692 if memcache.set(key=localpath, value=page.content, time=60): | 786 'title': title, |
693 logging.debug('tx_page(page key="%s"): memcache.set() succeeded' % | 787 } |
694 page.key()) | 788 if content_blob_key: |
695 else: | 789 page_data['content_blob'] = True |
696 logging.error('tx_page(page key="%s"): memcache.set() failed' % | 790 put_pagedata_into_cache(localpath, page_data) |
697 page.key()) | |
698 | 791 |
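A call sketch mirroring console_merger above; the values are illustrative. The timestamp guard inside tx_page makes racing saves resolve to the newest fetch:

    import datetime

    page = get_or_create_page('chromium/console', None, maxage=30)
    save_page(page, 'chromium/console', datetime.datetime.now(), {
        'title': 'BuildBot: Chromium',
        'offsite_base': 'http://build.chromium.org/p/chromium',
        'body_class': 'interface',
        'content': '<table>rows</table>',
    })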
699 | 792 |
700 def get_or_create_page(localpath, remoteurl, maxage): | 793 def get_or_create_page(localpath, remoteurl, maxage): |
701 return Page.get_or_insert( | 794 return Page.get_or_insert( |
702 key_name=localpath, | 795 key_name=localpath, |
703 localpath=localpath, | 796 localpath=localpath, |
704 remoteurl=remoteurl, | 797 remoteurl=remoteurl, |
705 maxage=maxage, | 798 maxage=maxage, |
706 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), | 799 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
707 content=None, | 800 content=None, |
708 content_blob=None) | 801 content_blob=None) |
709 | 802 |
710 | 803 |
711 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None): | 804 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
| 805 fetch_url=nonfatal_fetch_url): |
712 """Fetches data about a set of pages.""" | 806 """Fetches data about a set of pages.""" |
713 unquoted_localpath = urllib.unquote(localpath) | 807 unquoted_localpath = urllib.unquote(localpath) |
714 logging.debug('fetch_page("%s", "%s", "%s")' % ( | 808 logging.debug('fetch_page("%s", "%s", "%s")' % ( |
715 unquoted_localpath, remoteurl, maxage)) | 809 unquoted_localpath, remoteurl, maxage)) |
716 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) | 810 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
717 | 811 |
718 # Check if our copy of the page is younger than maxage. If it is, we'll | 812 # Check if our copy of the page is younger than maxage. If it is, we'll |
719 # skip the fetch. | 813 # skip the fetch. |
720 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( | 814 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
721 seconds=maxage) | 815 seconds=maxage) |
722 if (page.fetch_timestamp and | 816 if (page.fetch_timestamp and |
723 page.fetch_timestamp > oldest_acceptable_timestamp): | 817 page.fetch_timestamp > oldest_acceptable_timestamp): |
724 logging.debug('fetch_page: too recent, skipping') | 818 logging.debug('fetch_page: too recent, skipping') |
725 return | 819 return |
726 | 820 |
727 # Perform the actual page fetch. | 821 # Perform the actual page fetch. |
728 fetch_timestamp = datetime.datetime.now() | 822 fetch_timestamp = datetime.datetime.now() |
729 response = nonfatal_fetch_url(remoteurl) | 823 response = fetch_url(remoteurl) |
730 if not response: | 824 if not response: |
731 logging.warning('fetch_page: got empty response') | 825 logging.warning('fetch_page: got empty response') |
732 return | 826 return |
733 if response.status_code != 200: | 827 if response.status_code != 200: |
734 logging.warning('fetch_page: got non-empty response but code ' | 828 logging.warning('fetch_page: got non-empty response but code ' |
735 '%d' % response.status_code) | 829 '%d' % response.status_code) |
736 return | 830 return |
737 | 831 |
738 # We have actual content. If there's one or more handlers, call them. | 832 # We have actual content. If there's one or more handlers, call them. |
739 content = response.content | 833 page_data = {} |
| 834 page_data['content'] = response.content |
740 if postfetch: | 835 if postfetch: |
741 if not isinstance(postfetch, list): | 836 if not isinstance(postfetch, list): |
742 postfetch = [postfetch] | 837 postfetch = [postfetch] |
743 for handler in postfetch: | 838 for handler in postfetch: |
744 logging.debug('fetch_page: calling postfetch handler ' | 839 logging.debug('fetch_page: calling postfetch handler ' |
745 '%s' % handler.__name__) | 840 '%s' % handler.__name__) |
746 content = handler(unquoted_localpath, remoteurl, content) | 841 page_data = handler(unquoted_localpath, remoteurl, page_data) |
747 | 842 |
748 # Save the returned content into the DB and caching layers. | 843 # Save the returned content into the DB and caching layers. |
749 logging.debug('fetch_page: saving page') | 844 logging.debug('fetch_page: saving page') |
750 save_page(page, unquoted_localpath, content, fetch_timestamp) | 845 save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
751 if postsave: | 846 if postsave: |
752 if not isinstance(postsave, list): | 847 if not isinstance(postsave, list): |
753 postsave = [postsave] | 848 postsave = [postsave] |
754 for handler in postsave: | 849 for handler in postsave: |
755 logging.debug('fetch_page: calling postsave handler ' | 850 logging.debug('fetch_page: calling postsave handler ' |
756 '%s' % handler.__name__) | 851 '%s' % handler.__name__) |
757 handler(unquoted_localpath, remoteurl, content) | 852 handler(unquoted_localpath, remoteurl, page_data) |
758 | 853 |
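The new fetch_url keyword makes the network call injectable; a test-double sketch for SDK-testbed-style unit tests, where FakeResponse and fake_fetch are hypothetical:

    class FakeResponse(object):
        status_code = 200
        content = '<html>stub</html>'

    def fake_fetch(url, *args, **kwargs):
        return FakeResponse()

    # Handlers and save_page still run; the pipeline never touches the network.
    fetch_page('chromium/stub', 'http://example.com/stub', maxage=0,
               postfetch=one_box_handler, fetch_url=fake_fetch)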
759 | 854 |
760 EXT_TO_MIME = { | 855 EXT_TO_MIME = { |
761 '.css': 'text/css', | 856 '.css': 'text/css', |
762 '.js': 'text/javascript', | 857 '.js': 'text/javascript', |
763 '.json': 'application/json', | 858 '.json': 'application/json', |
764 '.html': 'text/html', | 859 '.html': 'text/html', |
765 } | 860 } |
766 | 861 |
767 | 862 |
768 def path_to_mime_type(path): | 863 def path_to_mime_type(path): |
769 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') | 864 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
770 | 865 |
771 | 866 |
772 def fetch_pages(): | 867 def fetch_pages(): |
773 """Starts a background fetch operation for pages that need it.""" | 868 """Starts a background fetch operation for pages that need it.""" |
774 logging.debug('fetch_pages()') | 869 logging.debug('fetch_pages()') |
775 for url in URLS: | 870 for url in URLS: |
776 deferred.defer(fetch_page, **url) | 871 deferred.defer(fetch_page, **url) |
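Each URLS entry expands into keyword arguments for one deferred task, so a slow or down master delays only its own fetch. One iteration of the loop is equivalent to the call below; the maxage value is illustrative, since the URLS table above is truncated:

    deferred.defer(fetch_page,
                   localpath='chromium.main/console',
                   remoteurl='http://build.chromium.org/p/chromium/console',
                   maxage=30,  # illustrative; not shown in the truncated URLS
                   postfetch=console_handler)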