OLD | NEW |
---|---|
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 from __future__ import with_statement | 5 from __future__ import with_statement |
6 | 6 |
7 import datetime | 7 import datetime |
8 import json | |
8 import logging | 9 import logging |
9 import os | 10 import os |
10 import random | 11 import random |
11 import re | 12 import re |
12 import string | 13 import string |
13 import urllib | 14 import urllib |
14 | 15 |
15 from google.appengine.api import files, memcache, urlfetch | 16 from google.appengine.api import files, memcache, urlfetch |
16 from google.appengine.api.app_identity import get_application_id | 17 from google.appengine.api.app_identity import get_application_id |
17 from google.appengine.ext import blobstore, db, deferred | 18 from google.appengine.ext import blobstore, db, deferred |
(...skipping 12 matching lines...) |
30 APP_NAME = get_application_id() | 31 APP_NAME = get_application_id() |
31 | 32 |
32 # Deadline for fetching URLs (in seconds). | 33 # Deadline for fetching URLs (in seconds). |
33 URLFETCH_DEADLINE = 60*5 # 5 mins | 34 URLFETCH_DEADLINE = 60*5 # 5 mins |
34 | 35 |
35 | 36 |
36 # Perform initial bootstrap for this module. | 37 # Perform initial bootstrap for this module. |
37 console_template = '' | 38 console_template = '' |
38 def bootstrap(): | 39 def bootstrap(): |
39 global console_template | 40 global console_template |
40 with open('templates/console.html', 'r') as fh: | 41 with open('templates/merger.html', 'r') as fh: |
41 console_template = fh.read() | 42 console_template = fh.read() |
42 | 43 |
43 | 44 |
45 def get_pagedata_from_cache(localpath): | |
46 memcache_data = memcache.get(localpath) | |
47 if not memcache_data: | |
48 return None | |
49 logging.debug('content for %s found in memcache' % localpath) | |
50 return json.loads(memcache_data) | |
51 | |
52 | |
53 def put_pagedata_into_cache(localpath, page_data): | |
54 memcache_data = json.dumps(page_data) | |
55 if not memcache.set(key=localpath, value=memcache_data, time=2*60): | |
56 logging.error('put_pagedata_into_cache(\'%s\'): memcache.set() failed' % ( | |
57 localpath)) | |
58 return True | |
M-A Ruel 2012/05/29 18:46:33: why return something at all?
cmp 2012/05/29 19:38:03: Done.
| |
59 | |
60 | |
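The two helpers above set the CL's core caching convention: the whole page_data dict is JSON-serialized into a single memcache entry with a two-minute TTL, so content and metadata travel together. A minimal standalone sketch of that pattern, with a plain dict standing in for App Engine's memcache (the stand-in ignores the TTL):

```python
import json
import logging

_cache = {}  # stand-in for google.appengine.api.memcache

def put_pagedata_into_cache(localpath, page_data):
    # One entry carries content plus metadata (title, body_class, ...).
    _cache[localpath] = json.dumps(page_data)

def get_pagedata_from_cache(localpath):
    memcache_data = _cache.get(localpath)
    if not memcache_data:
        return None
    logging.debug('content for %s found in cache', localpath)
    return json.loads(memcache_data)

put_pagedata_into_cache('chromium/console', {'title': 'BuildBot: Chromium'})
print(get_pagedata_from_cache('chromium/console')['title'])  # BuildBot: Chromium
```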
44 # Assumes localpath is already unquoted. | 61 # Assumes localpath is already unquoted. |
M-A Ruel 2012/05/29 18:46:33: This should go in a docstring to explain what the
cmp 2012/05/29 19:38:03: Done.
| |
45 def get_and_cache_page(localpath): | 62 def get_and_cache_pagedata(localpath): |
46 # E1101: 29,12:get_and_cache_page: Module 'google.appengine.api.memcache' has | 63 page_data = get_pagedata_from_cache(localpath) |
47 # no 'get' member | 64 if page_data and not page_data.get('content_blob'): |
48 # pylint: disable=E1101 | 65 return page_data |
49 content = memcache.get(localpath) | |
50 if content is not None: | |
51 logging.debug('content for %s found in memcache' % localpath) | |
52 return content | |
53 | |
54 page = Page.all().filter('localpath =', localpath).get() | 66 page = Page.all().filter('localpath =', localpath).get() |
55 if not page: | 67 if not page: |
56 logging.error('get_and_cache_page(\'%s\'): no matching localpath in ' | 68 logging.error('get_and_cache_pagedata(\'%s\'): no matching localpath in ' |
57 'datastore' % localpath) | 69 'datastore' % localpath) |
58 return None | 70 return {'content': None} |
59 if page.content_blob is not None: | 71 page_data = { |
72 'body_class': page.body_class, | |
73 'offsite_base': page.offsite_base, | |
74 'title': page.title, | |
75 } | |
76 if page.content_blob: | |
60 # Get the blob. | 77 # Get the blob. |
78 logging.debug('content for %s found in blobstore' % localpath) | |
61 blob_reader = blobstore.BlobReader(page.content_blob) | 79 blob_reader = blobstore.BlobReader(page.content_blob) |
62 content = blob_reader.read().decode('utf-8', 'replace') | 80 page_data['content_blob'] = True |
63 logging.debug('content for %s found in blobstore' % localpath) | 81 put_pagedata_into_cache(localpath, page_data) |
82 page_data['content'] = blob_reader.read().decode('utf-8', 'replace') | |
64 else: | 83 else: |
65 logging.debug('content for %s found in datastore' % localpath) | 84 logging.debug('content for %s found in datastore' % localpath) |
66 content = page.content | 85 page_data['content'] = page.content |
67 # E1101: 39,11:get_and_cache_page: Module 'google.appengine.api.memcache' | 86 put_pagedata_into_cache(localpath, page_data) |
68 # has no 'set' member | 87 return page_data |
69 # pylint: disable=E1101 | |
70 if not memcache.set(key=localpath, value=content, time=2*60): | |
71 logging.error('get_and_cache_page(\'%s\'): memcache.set() failed' % | |
72 localpath) | |
73 return content | |
74 | 88 |
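Note the asymmetry in the rewritten get_and_cache_pagedata: when the content lives in a blob, only the metadata plus a content_blob flag go into the cache, and the body is re-read from blobstore on every request (a cached entry carrying the flag deliberately falls through to the datastore/blobstore path). That keeps bodies over the roughly 1 MB memcache value limit out of the cache entirely. A dict-backed sketch of the lookup order (helper name hypothetical):

```python
def get_and_cache_pagedata_sketch(localpath, cache, pages, blobs):
    page_data = cache.get(localpath)
    if page_data and not page_data.get('content_blob'):
        return page_data                    # small page: served from cache
    page = pages.get(localpath)
    if page is None:
        return {'content': None}            # same sentinel as the CL
    page_data = {'title': page.get('title')}
    if page.get('content_blob'):
        cache[localpath] = dict(page_data, content_blob=True)  # metadata only
        page_data['content'] = blobs[page['content_blob']]     # body each time
    else:
        page_data['content'] = page.get('content')
        cache[localpath] = page_data
    return page_data

pages = {'chromium/console': {'title': 'BuildBot', 'content': '<table/>'}}
print(get_and_cache_pagedata_sketch('chromium/console', {}, pages, {}))
```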
75 | 89 |
76 class ConsoleData(object): | 90 class ConsoleData(object): |
77 def __init__(self): | 91 def __init__(self): |
78 self.row_orderedkeys = [] | 92 self.row_orderedkeys = [] |
79 self.row_data = {} | 93 self.row_data = {} |
80 | 94 |
81 # Retain order of observed masters. | 95 # Retain order of observed masters. |
82 self.masters = [] | 96 self.masters = [] |
83 | 97 |
(...skipping 80 matching lines...) |
164 def Finish(self): | 178 def Finish(self): |
165 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) | 179 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) |
166 # TODO(cmp): Look for row/master/categories that are unset. If they are | 180 # TODO(cmp): Look for row/master/categories that are unset. If they are |
167 # at the latest revisions, leave them unset. If they are at | 181 # at the latest revisions, leave them unset. If they are at |
168 # the earliest revisions, set them to ''. | 182 # the earliest revisions, set them to ''. |
169 | 183 |
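Finish() sorts the accumulated row keys numerically (they are revision numbers, to judge by key=int); a plain string sort would misplace shorter revisions:

```python
row_orderedkeys = ['98', '100', '99']
print(sorted(row_orderedkeys, key=int, reverse=True))  # ['100', '99', '98']
print(sorted(row_orderedkeys, reverse=True))           # ['99', '98', '100']
```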
170 | 184 |
171 # W0613:169,39:console_merger: Unused argument 'remoteurl' | 185 # W0613:169,39:console_merger: Unused argument 'remoteurl' |
172 # W0613:169,19:console_merger: Unused argument 'unquoted_localpath' | 186 # W0613:169,19:console_merger: Unused argument 'unquoted_localpath' |
173 # pylint: disable=W0613 | 187 # pylint: disable=W0613 |
174 def console_merger(unquoted_localpath, remote_url, content=None): | 188 def console_merger(unquoted_localpath, remote_url, page_data=None): |
175 if content is None: | 189 page_data = page_data or {} |
176 return None | |
177 | 190 |
178 masters = [ | 191 masters = [ |
179 'chromium.main', | 192 'chromium.main', |
180 'chromium.chromiumos', | 193 'chromium.chromiumos', |
181 'chromium.chrome', | 194 'chromium.chrome', |
182 'chromium.memory', | 195 'chromium.memory', |
183 ] | 196 ] |
184 mergedconsole = ConsoleData() | 197 mergedconsole = ConsoleData() |
185 merged_page = None | 198 merged_page = None |
186 merged_tag = None | 199 merged_tag = None |
187 fetch_timestamp = datetime.datetime.now() | 200 fetch_timestamp = datetime.datetime.now() |
188 for master in masters: | 201 for master in masters: |
189 master_content = get_and_cache_page('%s/console' % master) | 202 page_data = get_and_cache_pagedata('%s/console' % master) |
203 master_content = page_data['content'] | |
190 if master_content is None: | 204 if master_content is None: |
191 continue | 205 continue |
192 master_content = master_content.encode('ascii', 'replace') | 206 master_content = master_content.encode('ascii', 'replace') |
193 this_page = BeautifulSoup(master_content) | 207 this_page = BeautifulSoup(master_content) |
194 this_tag = this_page.find('table', {'class': 'ConsoleData'}) | 208 this_tag = this_page.find('table', {'class': 'ConsoleData'}) |
195 # The first console is special; we reuse all of the console page. | 209 # The first console is special; we reuse all of the console page. |
196 if not merged_page: | 210 if not merged_page: |
197 merged_page = this_page | 211 merged_page = this_page |
198 merged_tag = this_tag | 212 merged_tag = this_tag |
199 mergedconsole.SawMaster(master) | 213 mergedconsole.SawMaster(master) |
(...skipping 74 matching lines...) |
274 merged_content = re.sub( | 288 merged_content = re.sub( |
275 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) | 289 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) |
276 merged_content = re.sub( | 290 merged_content = re.sub( |
277 r'\<iframe\>\</iframe\>', | 291 r'\<iframe\>\</iframe\>', |
278 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', | 292 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', |
279 merged_content) | 293 merged_content) |
280 | 294 |
281 # Update the merged console page. | 295 # Update the merged console page. |
282 merged_page = get_or_create_page('chromium/console', None, maxage=30) | 296 merged_page = get_or_create_page('chromium/console', None, maxage=30) |
283 logging.debug('console_merger: saving merged console') | 297 logging.debug('console_merger: saving merged console') |
284 save_page(merged_page, 'chromium/console', merged_content, | 298 page_data['title'] = 'BuildBot: Chromium' |
285 fetch_timestamp) | 299 page_data['offsite_base'] = 'http://build.chromium.org/p/chromium' |
286 return merged_content | 300 page_data['body_class'] = 'interface' |
301 page_data['content'] = merged_content | |
302 save_page(merged_page, 'chromium/console', fetch_timestamp, page_data) | |
303 return | |
287 | 304 |
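console_merger relies on BeautifulSoup to locate each master's ConsoleData table and splice its rows into the first console page it saw. A tiny sketch of the locate step, assuming the 2012-era BeautifulSoup 3 this app uses (under today's bs4 the import would come from bs4 instead):

```python
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 import style

html = ("<html><body><table class='ConsoleData'>"
        "<tr><td>r12345</td></tr></table></body></html>")
page = BeautifulSoup(html)
tag = page.find('table', {'class': 'ConsoleData'})
print(tag.find('td').string)  # r12345
```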
288 | 305 |
289 # W0613:284,20:console_handler: Unused argument 'unquoted_localpath' | 306 # W0613:284,20:console_handler: Unused argument 'unquoted_localpath' |
290 # pylint: disable=W0613 | 307 # pylint: disable=W0613 |
291 def console_handler(unquoted_localpath, remoteurl, content=None): | 308 def console_handler(unquoted_localpath, remoteurl, page_data=None): |
M-A Ruel 2012/05/29 18:46:33: Use a leading underscore to silence the warning, e
cmp 2012/05/29 19:38:03: Done.
| |
292 if content is None: | 309 page_data = page_data or {} |
293 return None | 310 content = page_data.get('content') |
294 # TODO(cmp): Fix the LKGR link. | 311 if not content: |
312 return page_data | |
295 | 313 |
296 # Decode content from utf-8 to unicode, replacing bad characters. | 314 # Decode content from utf-8 to unicode, replacing bad characters. |
297 content = content.decode('utf-8', 'replace') | 315 content = content.decode('utf-8', 'replace') |
298 | 316 |
299 # Scrub in sheriff file content to console. | 317 # Scrub in sheriff file content to console. |
300 sheriff_files = [ | 318 sheriff_files = [ |
301 'sheriff', | 319 'sheriff', |
302 'sheriff_android', | 320 'sheriff_android', |
303 'sheriff_cr_cros_gardeners', | 321 'sheriff_cr_cros_gardeners', |
304 'sheriff_cros_mtv', | 322 'sheriff_cros_mtv', |
305 'sheriff_cros_nonmtv', | 323 'sheriff_cros_nonmtv', |
306 'sheriff_gpu', | 324 'sheriff_gpu', |
307 'sheriff_memory', | 325 'sheriff_memory', |
308 'sheriff_nacl', | 326 'sheriff_nacl', |
309 'sheriff_perf', | 327 'sheriff_perf', |
310 'sheriff_webkit', | 328 'sheriff_webkit', |
311 ] | 329 ] |
312 for sheriff_file in sheriff_files: | 330 for sheriff_file in sheriff_files: |
313 sheriff_content = get_and_cache_page('chromium/%s.js' % sheriff_file) | 331 sheriff_page_data = get_and_cache_pagedata('chromium/%s.js' % sheriff_file) |
332 sheriff_content = sheriff_page_data['content'] | |
314 console_re = (r'<script src=\'http://chromium-build.appspot.com/' | 333 console_re = (r'<script src=\'http://chromium-build.appspot.com/' |
315 'p/chromium/%s.js\'></script>') | 334 'p/chromium/%s.js\'></script>') |
316 content = re.sub(console_re % sheriff_file, | 335 content = re.sub(console_re % sheriff_file, |
317 '<script>%s</script>' % sheriff_content, content) | 336 '<script>%s</script>' % sheriff_content, content) |
318 | 337 |
319 # Replace showBuildBox with direct links. | 338 # Replace showBuildBox with direct links. |
320 content = re.sub(r'<a href=\'#\' onclick=\'showBuildBox\(\"./(.+)\", event\);' | 339 content = re.sub(r'<a href=\'#\' onclick=\'showBuildBox\(\"./(.+)\", event\);' |
321 ' return false;\'', | 340 ' return false;\'', |
322 r"<a href='\1'", content) | 341 r"<a href='\1'", content) |
323 | 342 |
(...skipping 43 matching lines...) |
367 content = string.replace(content, | 386 content = string.replace(content, |
368 "'/json/builders/Linux%20x64/builds/-1?as_text=1';", | 387 "'/json/builders/Linux%20x64/builds/-1?as_text=1';", |
369 "'/json/builders/Linux%20x64/builds/-1/as_text=1.json';") | 388 "'/json/builders/Linux%20x64/builds/-1/as_text=1.json';") |
370 | 389 |
371 # Fix up a reference to http chromium-build in BarUrl(). | 390 # Fix up a reference to http chromium-build in BarUrl(). |
372 content = string.replace(content, | 391 content = string.replace(content, |
373 "return 'http://chromium-build.appspot.com/p/'", | 392 "return 'http://chromium-build.appspot.com/p/'", |
374 "return 'https://chromium-build.appspot.com/p/'") | 393 "return 'https://chromium-build.appspot.com/p/'") |
375 | 394 |
376 # Encode content from unicode to utf-8. | 395 # Encode content from unicode to utf-8. |
377 content = content.encode('utf-8') | 396 page_data['content'] = content.encode('utf-8') |
378 return content | 397 |
398 # Last tweaks to HTML, plus extracting metadata about the page itself. | |
399 page_data['offsite_base'] = remoteurl + '/../' | |
400 | |
401 # Extract the title from the page. | |
402 md = re.search( | |
403 r'^.*<title>([^\<]+)</title>', | |
404 page_data['content'], | |
405 re.MULTILINE|re.DOTALL) | |
406 if md: | |
407 page_data['title'] = md.group(1) | |
408 | |
409 # Remove the leading text up to the end of the opening body tag. While | |
410 # there, extract the body_class from the page. | |
411 md = re.search( | |
412 r'^.*<body class="(\w+)\">(.*)$', | |
413 page_data['content'], | |
414 re.MULTILINE|re.DOTALL) | |
415 if md: | |
M-A Ruel 2012/05/29 18:46:33: Are these expected to fail in the normal case? Bec
cmp 2012/05/29 19:38:03: Done.
| |
416 page_data['body_class'] = md.group(1) | |
417 page_data['content'] = md.group(2) | |
418 | |
419 # Remove the leading div and hr tags. | |
420 md = re.search( | |
421 r'^.*?<hr/>(.*)$', | |
422 page_data['content'], | |
423 re.MULTILINE|re.DOTALL) | |
424 if md: | |
425 page_data['content'] = md.group(1) | |
426 | |
427 # Strip the trailing body and html tags. | |
428 md = re.search( | |
429 r'^(.*)</body>.*$', | |
430 page_data['content'], | |
431 re.MULTILINE|re.DOTALL) | |
432 if md: | |
433 page_data['content'] = md.group(1) | |
434 | |
435 return page_data | |
379 | 436 |
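The new metadata-extraction half of console_handler is a chain of anchored DOTALL regexes: pull the title, pull the body class while discarding everything before the body tag, then trim the leading hr and the trailing body/html tags. The first two steps, run standalone on a toy page with the regexes copied from the CL:

```python
import re

page_data = {'content': ('<html><head><title>BuildBot: Chromium</title></head>'
                         '<body class="interface"><hr/>console rows</body></html>')}

md = re.search(r'^.*<title>([^\<]+)</title>', page_data['content'],
               re.MULTILINE | re.DOTALL)
if md:
    page_data['title'] = md.group(1)

md = re.search(r'^.*<body class="(\w+)\">(.*)$', page_data['content'],
               re.MULTILINE | re.DOTALL)
if md:
    page_data['body_class'] = md.group(1)
    page_data['content'] = md.group(2)

print('%s / %s' % (page_data['title'], page_data['body_class']))
# BuildBot: Chromium / interface
```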
380 | 437 |
381 def one_box_handler(unquoted_localpath, remoteurl, content=None): | 438 def one_box_handler(unquoted_localpath, remoteurl, page_data=None): |
439 page_data = page_data or {} | |
440 content = page_data.get('content') | |
382 if content is None: | 441 if content is None: |
383 return None | 442 return page_data |
384 # Get the site name from the local path. | 443 # Get the site name from the local path. |
385 md = re.match('^([^\/]+)/.*$', unquoted_localpath) | 444 md = re.match('^([^\/]+)/.*$', unquoted_localpath) |
386 if not md: | 445 if not md: |
387 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' | 446 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' |
388 'from local path' % (unquoted_localpath, remoteurl, content)) | 447 'from local path' % ( |
389 return content | 448 unquoted_localpath, remoteurl, page_data)) |
449 return page_data | |
390 site = md.group(1) | 450 site = md.group(1) |
391 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site | 451 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site |
392 content = re.sub(r'waterfall', new_waterfall_url, content) | 452 page_data['content'] = re.sub( |
393 return content | 453 r'waterfall', |
454 new_waterfall_url, | |
455 page_data['content']) | |
456 return page_data | |
394 | 457 |
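The site extraction in one_box_handler is simply everything before the first slash of the unquoted local path:

```python
import re

# Path chosen for illustration; real paths come from the URLS table.
md = re.match(r'^([^\/]+)/.*$', 'chromium.main/one_box_per_builder')
print(md.group(1))  # chromium.main
```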
395 | 458 |
396 | 459 |
397 # List of URLs to fetch. | 460 # List of URLs to fetch. |
398 URLS = [ | 461 URLS = [ |
399 # Console URLs. | 462 # Console URLs. |
400 { | 463 { |
401 'remoteurl': 'http://build.chromium.org/p/chromium/console', | 464 'remoteurl': 'http://build.chromium.org/p/chromium/console', |
402 'localpath': 'chromium.main/console', | 465 'localpath': 'chromium.main/console', |
403 'postfetch': console_handler, | 466 'postfetch': console_handler, |
(...skipping 233 matching lines...) |
637 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) | 700 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
638 except urlfetch.DownloadError: | 701 except urlfetch.DownloadError: |
639 logging.warn('urlfetch failed: %s' % url, exc_info=1) | 702 logging.warn('urlfetch failed: %s' % url, exc_info=1) |
640 return None | 703 return None |
641 | 704 |
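nonfatal_fetch_url turns a transient DownloadError into a logged warning plus a None return, so one flaky master cannot kill the whole deferred task; fetch_page's "if not response" check is the matching caller-side half. For readers outside GAE, the same shape with the Python 2 stdlib (a hand-rolled analogue, not the CL's code):

```python
import logging
import urllib2  # Python 2 stdlib, matching this codebase

def nonfatal_fetch(url, deadline=300):
    try:
        return urllib2.urlopen(url, timeout=deadline).read()
    except urllib2.URLError:
        logging.warn('fetch failed: %s' % url, exc_info=1)
        return None
```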
642 | 705 |
643 class Page(db.Model): | 706 class Page(db.Model): |
644 fetch_timestamp = db.DateTimeProperty(required=True) | 707 fetch_timestamp = db.DateTimeProperty(required=True) |
645 localpath = db.StringProperty(required=True) | 708 localpath = db.StringProperty(required=True) |
646 content = db.TextProperty() | 709 content = db.TextProperty() |
710 title = db.StringProperty() | |
711 offsite_base = db.StringProperty() | |
712 body_class = db.StringProperty() | |
647 remoteurl = db.TextProperty() | 713 remoteurl = db.TextProperty() |
648 # Data updated separately, after creation. | 714 # Data updated separately, after creation. |
649 content_blob = blobstore.BlobReferenceProperty() | 715 content_blob = blobstore.BlobReferenceProperty() |
650 | 716 |
651 | 717 |
652 def write_blob(data, mime_type): | 718 def write_blob(data, mime_type): |
653 """Saves a Unicode string as a new blob, returns the blob's key.""" | 719 """Saves a Unicode string as a new blob, returns the blob's key.""" |
654 file_name = files.blobstore.create(mime_type=mime_type) | 720 file_name = files.blobstore.create(mime_type=mime_type) |
655 data = data.encode('utf-8') | 721 data = data.encode('utf-8') |
656 with files.open(file_name, 'a') as blob_file: | 722 with files.open(file_name, 'a') as blob_file: |
657 blob_file.write(data) | 723 blob_file.write(data) |
658 files.finalize(file_name) | 724 files.finalize(file_name) |
659 return files.blobstore.get_blob_key(file_name) | 725 return files.blobstore.get_blob_key(file_name) |
660 | 726 |
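write_blob is the standard Files API dance of its day: create a blobstore file, append, finalize, then resolve the file name to a blob key. The read side is blobstore.BlobReader, as in get_and_cache_pagedata. A sketch of the round trip; this runs only inside App Engine, and the Files API was later deprecated:

```python
from google.appengine.ext import blobstore

def read_blob(blob_key):
    # Mirrors the read path in get_and_cache_pagedata.
    return blobstore.BlobReader(blob_key).read().decode('utf-8', 'replace')

# blob_key = write_blob(u'<html>console rows</html>', 'text/html')
# assert read_blob(blob_key) == u'<html>console rows</html>'
```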
661 | 727 |
662 def save_page(page, localpath, content, fetch_timestamp): | 728 def save_page(page, localpath, fetch_timestamp, page_data): |
729 body_class = page_data.get('body_class', '') | |
730 content = page_data.get('content') | |
731 offsite_base = page_data.get('offsite_base', '') | |
732 title = page_data.get('title', '') | |
733 | |
663 content_blob_key = None | 734 content_blob_key = None |
664 try: | 735 try: |
665 content = content.decode('utf-8', 'replace') | 736 content = content.decode('utf-8', 'replace') |
666 except UnicodeEncodeError: | 737 except UnicodeEncodeError: |
667 logging.debug('save_page: content was already in unicode') | 738 logging.debug('save_page: content was already in unicode') |
668 logging.debug('save_page: content size is %d' % len(content)) | 739 logging.debug('save_page: content size is %d' % len(content)) |
669 if len(content.encode('utf-8')) >= 1024*1024: | 740 if len(content.encode('utf-8')) >= 1024*1024: |
670 logging.debug('save_page: saving to blob') | 741 logging.debug('save_page: saving to blob') |
671 content_blob_key = write_blob(content, path_to_mime_type(localpath)) | 742 content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
672 content = None | 743 content = None |
673 def tx_page(page_key): | 744 def tx_page(page_key): |
674 page = Page.get(page_key) | 745 page = Page.get(page_key) |
675 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no | 746 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
676 # 'fetch_timestamp' member (but some types could not be inferred) | 747 # 'fetch_timestamp' member (but some types could not be inferred) |
677 # pylint: disable=E1103 | 748 # pylint: disable=E1103 |
678 if page.fetch_timestamp > fetch_timestamp: | 749 if page.fetch_timestamp > fetch_timestamp: |
679 return | 750 return |
680 page.content = content | 751 page.content = content |
681 page.content_blob = content_blob_key | 752 page.content_blob = content_blob_key |
682 page.fetch_timestamp = fetch_timestamp | 753 page.fetch_timestamp = fetch_timestamp |
754 # title, offsite_base, body_class can all be empty strings for some | |
755 # content. Where that's true, they're not used for displaying a console- | |
756 # like resource, and the content alone is returned to the web user. | |
757 page.title = title | |
758 page.offsite_base = offsite_base | |
759 page.body_class = body_class | |
683 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member | 760 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
684 # (but some types could not be inferred) | 761 # (but some types could not be inferred) |
685 # pylint: disable=E1103 | 762 # pylint: disable=E1103 |
686 page.put() | 763 page.put() |
687 db.run_in_transaction(tx_page, page.key()) | 764 db.run_in_transaction(tx_page, page.key()) |
688 # E1101:232,11:fetch_page.tx_page: Module 'google.appengine.api.memcache' | 765 page_data = { |
M-A Ruel 2012/05/29 18:46:33: you could make a page.as_dict() member function, t
cmp 2012/05/29 19:38:03: page_data is not really a Page-as-dict. It's more
| |
689 # has no 'set' member | 766 'body_class': body_class, |
690 # pylint: disable=E1101 | 767 'content': content, |
691 if page.content_blob is None: | 768 'offsite_base': offsite_base, |
692 if memcache.set(key=localpath, value=page.content, time=60): | 769 'title': title, |
693 logging.debug('tx_page(page key="%s"): memcache.set() succeeded' % | 770 } |
694 page.key()) | 771 if content_blob_key: |
695 else: | 772 page_data['content_blob'] = True |
696 logging.error('tx_page(page key="%s"): memcache.set() failed' % | 773 put_pagedata_into_cache(localpath, page_data) |
697 page.key()) | |
698 | 774 |
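The transaction in save_page exists mainly for its timestamp guard: deferred tasks can complete out of order, and an older fetch must never clobber a newer page. The guard in isolation, with a dict standing in for the Page entity:

```python
import datetime

def tx_page_sketch(page, content, fetch_timestamp):
    if page['fetch_timestamp'] > fetch_timestamp:
        return False                  # newer data already stored; drop write
    page['content'] = content
    page['fetch_timestamp'] = fetch_timestamp
    return True

page = {'content': 'new',
        'fetch_timestamp': datetime.datetime(2012, 5, 29, 12, 0)}
stale_ts = datetime.datetime(2012, 5, 29, 11, 0)
print(tx_page_sketch(page, 'stale', stale_ts))  # False: stale write rejected
```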
699 | 775 |
700 def get_or_create_page(localpath, remoteurl, maxage): | 776 def get_or_create_page(localpath, remoteurl, maxage): |
701 return Page.get_or_insert( | 777 return Page.get_or_insert( |
702 key_name=localpath, | 778 key_name=localpath, |
703 localpath=localpath, | 779 localpath=localpath, |
704 remoteurl=remoteurl, | 780 remoteurl=remoteurl, |
705 maxage=maxage, | 781 maxage=maxage, |
706 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), | 782 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
707 content=None, | 783 content=None, |
708 content_blob=None) | 784 content_blob=None) |
709 | 785 |
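get_or_create_page backdates fetch_timestamp by 24 hours on first insert, which guarantees a brand-new Page always fails fetch_page's too-recent check and gets fetched immediately:

```python
import datetime

maxage = 30  # seconds, as used for chromium/console
fetch_timestamp = datetime.datetime.now() - datetime.timedelta(hours=24)
oldest_acceptable = datetime.datetime.now() - datetime.timedelta(seconds=maxage)
print(fetch_timestamp > oldest_acceptable)  # False -> not too recent -> fetch
```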
710 | 786 |
711 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None): | 787 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
788 fetch_url=nonfatal_fetch_url): | |
712 """Fetches data about a set of pages.""" | 789 """Fetches data about a set of pages.""" |
713 unquoted_localpath = urllib.unquote(localpath) | 790 unquoted_localpath = urllib.unquote(localpath) |
714 logging.debug('fetch_page("%s", "%s", "%s")' % ( | 791 logging.debug('fetch_page("%s", "%s", "%s")' % ( |
715 unquoted_localpath, remoteurl, maxage)) | 792 unquoted_localpath, remoteurl, maxage)) |
716 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) | 793 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
717 | 794 |
718 # Check if our copy of the page is younger than maxage. If it is, we'll | 795 # Check if our copy of the page is younger than maxage. If it is, we'll |
719 # skip the fetch. | 796 # skip the fetch. |
720 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( | 797 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
721 seconds=maxage) | 798 seconds=maxage) |
722 if (page.fetch_timestamp and | 799 if (page.fetch_timestamp and |
723 page.fetch_timestamp > oldest_acceptable_timestamp): | 800 page.fetch_timestamp > oldest_acceptable_timestamp): |
724 logging.debug('fetch_page: too recent, skipping') | 801 logging.debug('fetch_page: too recent, skipping') |
725 return | 802 return |
726 | 803 |
727 # Perform the actual page fetch. | 804 # Perform the actual page fetch. |
728 fetch_timestamp = datetime.datetime.now() | 805 fetch_timestamp = datetime.datetime.now() |
729 response = nonfatal_fetch_url(remoteurl) | 806 response = fetch_url(remoteurl) |
730 if not response: | 807 if not response: |
731 logging.warning('fetch_page: got empty response') | 808 logging.warning('fetch_page: got empty response') |
732 return | 809 return |
733 if response.status_code != 200: | 810 if response.status_code != 200: |
734 logging.warning('fetch_page: got non-empty response but code ' | 811 logging.warning('fetch_page: got non-empty response but code ' |
735 '%d' % response.status_code) | 812 '%d' % response.status_code) |
736 return | 813 return |
737 | 814 |
738 # We have actual content. If there's one or more handlers, call them. | 815 # We have actual content. If there's one or more handlers, call them. |
739 content = response.content | 816 page_data = {} |
817 page_data['content'] = response.content | |
740 if postfetch: | 818 if postfetch: |
741 if not isinstance(postfetch, list): | 819 if not isinstance(postfetch, list): |
742 postfetch = [postfetch] | 820 postfetch = [postfetch] |
743 for handler in postfetch: | 821 for handler in postfetch: |
744 logging.debug('fetch_page: calling postfetch handler ' | 822 logging.debug('fetch_page: calling postfetch handler ' |
745 '%s' % handler.__name__) | 823 '%s' % handler.__name__) |
746 content = handler(unquoted_localpath, remoteurl, content) | 824 page_data = handler(unquoted_localpath, remoteurl, page_data) |
747 | 825 |
748 # Save the returned content into the DB and caching layers. | 826 # Save the returned content into the DB and caching layers. |
749 logging.debug('fetch_page: saving page') | 827 logging.debug('fetch_page: saving page') |
750 save_page(page, unquoted_localpath, content, fetch_timestamp) | 828 save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
751 if postsave: | 829 if postsave: |
752 if not isinstance(postsave, list): | 830 if not isinstance(postsave, list): |
753 postsave = [postsave] | 831 postsave = [postsave] |
754 for handler in postsave: | 832 for handler in postsave: |
755 logging.debug('fetch_page: calling postsave handler ' | 833 logging.debug('fetch_page: calling postsave handler ' |
756 '%s' % handler.__name__) | 834 '%s' % handler.__name__) |
757 handler(unquoted_localpath, remoteurl, content) | 835 handler(unquoted_localpath, remoteurl, page_data) |
758 | 836 |
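With this CL the page_data dict becomes the contract for postfetch and postsave handlers: each takes (unquoted_localpath, remoteurl, page_data) and returns the dict that feeds the next handler. A toy chain showing the convention (handler names hypothetical):

```python
def set_title(localpath, remoteurl, page_data):
    page_data['title'] = 'BuildBot: Chromium'
    return page_data

def strip_content(localpath, remoteurl, page_data):
    page_data['content'] = page_data['content'].strip()
    return page_data

page_data = {'content': '  <table>rows</table>  '}
for handler in [set_title, strip_content]:
    page_data = handler('chromium/console', 'http://example.com', page_data)
print('%s: %r' % (page_data['title'], page_data['content']))
```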
759 | 837 |
760 EXT_TO_MIME = { | 838 EXT_TO_MIME = { |
761 '.css': 'text/css', | 839 '.css': 'text/css', |
762 '.js': 'text/javascript', | 840 '.js': 'text/javascript', |
763 '.json': 'application/json', | 841 '.json': 'application/json', |
764 '.html': 'text/html', | 842 '.html': 'text/html', |
765 } | 843 } |
766 | 844 |
767 | 845 |
768 def path_to_mime_type(path): | 846 def path_to_mime_type(path): |
769 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') | 847 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
770 | 848 |
771 | 849 |
772 def fetch_pages(): | 850 def fetch_pages(): |
773 """Starts a background fetch operation for pages that need it.""" | 851 """Starts a background fetch operation for pages that need it.""" |
774 logging.debug('fetch_pages()') | 852 logging.debug('fetch_pages()') |
775 for url in URLS: | 853 for url in URLS: |
776 deferred.defer(fetch_page, **url) | 854 deferred.defer(fetch_page, **url) |
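fetch_pages fans each URLS entry out as its own task-queue task; deferred.defer(fetch_page, **url) expands the dict into fetch_page's keyword arguments. The expansion, run locally without a task queue (maxage value illustrative):

```python
def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None):
    print('would fetch %s -> %s (maxage=%ds)' % (remoteurl, localpath, maxage))

URLS = [{
    'remoteurl': 'http://build.chromium.org/p/chromium/console',
    'localpath': 'chromium.main/console',
    'maxage': 30,
}]
for url in URLS:
    fetch_page(**url)  # deferred.defer(fetch_page, **url) queues this instead
```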