OLD | NEW |
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 from __future__ import with_statement | 5 from __future__ import with_statement |
6 | 6 |
7 import datetime | 7 import datetime |
8 import json | 8 import json |
9 import logging | 9 import logging |
10 import os | 10 import os |
11 import random | 11 import random |
12 import re | 12 import re |
13 import string | 13 import string |
14 import urllib | 14 import urllib |
15 | 15 |
16 from google.appengine.api import files, memcache, urlfetch | 16 from google.appengine.api import files, memcache, urlfetch |
17 from google.appengine.api.app_identity import get_application_id | 17 from google.appengine.api.app_identity import get_application_id |
18 from google.appengine.ext import blobstore, db, deferred | 18 from google.appengine.ext import blobstore, db, deferred |
19 # F0401: 16,0: Unable to import 'webapp2_extras' | 19 # F0401: 16,0: Unable to import 'webapp2_extras' |
20 # W0611: 16,0: Unused import jinja2 | 20 # W0611: 16,0: Unused import jinja2 |
21 # pylint: disable=F0401, W0611 | 21 # pylint: disable=F0401, W0611 |
22 from webapp2_extras import jinja2 | 22 from webapp2_extras import jinja2 |
23 # F0401:22,0: Unable to import 'jinja2' | 23 # F0401:22,0: Unable to import 'jinja2' |
24 # pylint: disable=F0401 | 24 # pylint: disable=F0401 |
25 from jinja2 import Environment, FileSystemLoader | 25 from jinja2 import Environment, FileSystemLoader |
26 | 26 |
27 from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup | 27 from third_party.BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag |
28 | 28 |
29 | 29 |
30 # Current application name. | 30 # Current application name. |
31 APP_NAME = get_application_id() | 31 APP_NAME = get_application_id() |
32 | 32 |
33 # Deadline for fetching URLs (in seconds). | 33 # Deadline for fetching URLs (in seconds). |
34 URLFETCH_DEADLINE = 60*5 # 5 mins | 34 URLFETCH_DEADLINE = 60*5 # 5 mins |
35 | 35 |
36 # Default masters to merge together. | 36 # Default masters to merge together. |
37 DEFAULT_MASTERS_TO_MERGE = [ | 37 DEFAULT_MASTERS_TO_MERGE = [ |
38 'chromium.main', | 38 'chromium.main', |
39 'chromium.win', | 39 'chromium.win', |
40 'chromium.mac', | 40 'chromium.mac', |
41 'chromium.linux', | 41 'chromium.linux', |
42 'chromium.chromiumos', | 42 'chromium.chromiumos', |
43 'chromium.chrome', | 43 'chromium.chrome', |
44 'chromium.memory', | 44 'chromium.memory', |
45 ] | 45 ] |
46 | 46 |
47 | 47 |
48 # Perform initial bootstrap for this module. | 48 # Perform initial bootstrap for this module. |
49 console_template = '' | 49 console_template = '' |
50 def bootstrap(): | 50 def bootstrap(): |
51 global console_template | 51 global console_template |
52 with open('templates/merger.html', 'r') as fh: | 52 with open('templates/merger.html', 'r') as fh: |
53 console_template = fh.read() | 53 console_template = fh.read() |
54 | 54 |
55 | 55 |
56 def get_pagedata_from_cache(localpath): | 56 ########## |
57 memcache_data = memcache.get(localpath) | 57 # Page class definition and related functions. |
58 if not memcache_data: | 58 ########## |
59 return None | 59 class Page(db.Model): |
60 logging.debug('content for %s found in memcache' % localpath) | 60 fetch_timestamp = db.DateTimeProperty(required=True) |
61 return json.loads(memcache_data) | 61 localpath = db.StringProperty(required=True) |
| 62 content = db.TextProperty() |
| 63 title = db.StringProperty() |
| 64 offsite_base = db.StringProperty() |
| 65 body_class = db.StringProperty() |
| 66 remoteurl = db.TextProperty() |
| 67 # Data updated separately, after creation. |
| 68 content_blob = blobstore.BlobReferenceProperty() |
62 | 69 |
63 | 70 |
64 def put_pagedata_into_cache(localpath, page_data): | 71 def get_or_create_page(localpath, remoteurl, maxage): |
65 memcache_data = json.dumps(page_data) | 72 return Page.get_or_insert( |
66 if not memcache.set(key=localpath, value=memcache_data, time=2*60): | 73 key_name=localpath, |
67 logging.error('put_pagedata_into_cache(\'%s\'): memcache.set() failed' % ( | 74 localpath=localpath, |
68 localpath)) | 75 remoteurl=remoteurl, |
| 76 maxage=maxage, |
| 77 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), |
| 78 content=None, |
| 79 content_blob=None) |
69 | 80 |
70 | 81 |
71 def get_and_cache_pagedata(localpath): | 82 def get_and_cache_pagedata(localpath): |
72 """Returns a page_data dict, optionally caching and looking up a blob. | 83 """Returns a page_data dict, optionally caching and looking up a blob. |
73 | 84 |
74 get_and_cache_pagedata takes a localpath which is used to fetch data | 85 get_and_cache_pagedata takes a localpath which is used to fetch data |
75 from the cache. If the data is present and there's no content blob, | 86 from the cache. If the data is present and there's no content blob, |
76 then we have all of the data we need to return a page view to the user | 87 then we have all of the data we need to return a page view to the user |
77 and we return early. | 88 and we return early. |
78 | 89 |
79 Otherwise, we need to fetch the page object and set up the page data | 90 Otherwise, we need to fetch the page object and set up the page data |
80 for the page view. If the page has a blob associated with it, then we | 91 for the page view. If the page has a blob associated with it, then we |
81 mark the page data as having a blob and cache it as-is without the blob. | 92 mark the page data as having a blob and cache it as-is without the blob. |
82 If there's no blob, we associate the content with the page data and | 93 If there's no blob, we associate the content with the page data and |
83 cache that. This is so the next time get_and_cache_pagedata is called | 94 cache that. This is so the next time get_and_cache_pagedata is called |
84 for either case, we'll get the same behavior (a page-lookup for blobful | 95 for either case, we'll get the same behavior (a page-lookup for blobful |
85 content and a page cache hit for blobless content). | 96 content and a page cache hit for blobless content). |
86 | 97 |
87 Here we assume localpath is already unquoted. | 98 Here we assume localpath is already unquoted. |
88 """ | 99 """ |
89 page_data = get_pagedata_from_cache(localpath) | 100 page_data = get_data_from_cache(localpath) |
90 if page_data and not page_data.get('content_blob'): | 101 if page_data and not page_data.get('content_blob'): |
91 return page_data | 102 return page_data |
92 page = Page.all().filter('localpath =', localpath).get() | 103 page = Page.all().filter('localpath =', localpath).get() |
93 if not page: | 104 if not page: |
94 logging.error('get_and_cache_pagedata(\'%s\'): no matching localpath in ' | 105 logging.error('get_and_cache_pagedata(\'%s\'): no matching localpath in ' |
95 'datastore' % localpath) | 106 'datastore' % localpath) |
96 return {'content': None} | 107 return {'content': None} |
97 page_data = { | 108 page_data = { |
98 'body_class': page.body_class, | 109 'body_class': page.body_class, |
99 'offsite_base': page.offsite_base, | 110 'offsite_base': page.offsite_base, |
100 'title': page.title, | 111 'title': page.title, |
101 } | 112 } |
102 if page.content_blob: | 113 if page.content_blob: |
103 # Get the blob. | 114 # Get the blob. |
104 logging.debug('content for %s found in blobstore' % localpath) | 115 logging.debug('content for %s found in blobstore' % localpath) |
105 blob_reader = blobstore.BlobReader(page.content_blob) | 116 blob_reader = blobstore.BlobReader(page.content_blob) |
106 page_data['content_blob'] = True | 117 page_data['content_blob'] = True |
107 put_pagedata_into_cache(localpath, page_data) | 118 put_data_into_cache(localpath, page_data) |
108 page_data['content'] = blob_reader.read().decode('utf-8', 'replace') | 119 page_data['content'] = blob_reader.read().decode('utf-8', 'replace') |
109 else: | 120 else: |
110 logging.debug('content for %s found in datastore' % localpath) | 121 logging.debug('content for %s found in datastore' % localpath) |
111 page_data['content'] = page.content | 122 page_data['content'] = page.content |
112 put_pagedata_into_cache(localpath, page_data) | 123 put_data_into_cache(localpath, page_data) |
113 return page_data | 124 return page_data |
114 | 125 |
115 | 126 |
| 127 def save_page(page, localpath, fetch_timestamp, page_data): |
| 128 body_class = page_data.get('body_class', '') |
| 129 content = page_data.get('content') |
| 130 offsite_base = page_data.get('offsite_base', '') |
| 131 title = page_data.get('title', '') |
| 132 |
| 133 content_blob_key = None |
| 134 try: |
| 135 content = content.decode('utf-8', 'replace') |
| 136 except UnicodeEncodeError: |
| 137 logging.debug('save_page: content was already in unicode') |
| 138 logging.debug('save_page: content size is %d' % len(content)) |
| 139 if len(content.encode('utf-8')) >= 10**6: |
| 140 logging.debug('save_page: saving to blob') |
| 141 content_blob_key = write_blob(content, path_to_mime_type(localpath)) |
| 142 content = None |
| 143 def tx_page(page_key): |
| 144 page = Page.get(page_key) |
| 145 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no |
| 146 # 'fetch_timestamp' member (but some types could not be inferred) |
| 147 # pylint: disable=E1103 |
| 148 if page.fetch_timestamp > fetch_timestamp: |
| 149 return |
| 150 page.content = content |
| 151 page.content_blob = content_blob_key |
| 152 page.fetch_timestamp = fetch_timestamp |
| 153 # title, offsite_base, body_class can all be empty strings for some |
| 154 # content. Where that's true, they're not used for displaying a console- |
| 155 # like resource, and the content alone is returned to the web user. |
| 156 page.title = title |
| 157 page.offsite_base = offsite_base |
| 158 page.body_class = body_class |
| 159 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member |
| 160 # (but some types could not be inferred) |
| 161 # pylint: disable=E1103 |
| 162 page.put() |
| 163 db.run_in_transaction(tx_page, page.key()) |
| 164 page_data = { |
| 165 'body_class': body_class, |
| 166 'content': content, |
| 167 'offsite_base': offsite_base, |
| 168 'title': title, |
| 169 } |
| 170 if content_blob_key: |
| 171 page_data['content_blob'] = True |
| 172 put_data_into_cache(localpath, page_data) |
| 173 logging.info('Saved and cached page with localpath %s' % localpath) |
| 174 |
| 175 |
| 176 ########## |
| 177 # Row class definition and related functions. |
| 178 ########## |
| 179 class Row(db.Model): |
| 180 fetch_timestamp = db.DateTimeProperty(required=True) |
| 181 rev_number = db.StringProperty(required=True) |
| 182 localpath = db.StringProperty(required=True) |
| 183 revision = db.TextProperty() |
| 184 name = db.TextProperty() |
| 185 status = db.TextProperty() |
| 186 comment = db.TextProperty() |
| 187 details = db.TextProperty() |
| 188 |
| 189 |
| 190 def get_or_create_row(localpath, revision): |
| 191 return Row.get_or_insert( |
| 192 key_name=localpath, |
| 193 rev_number=revision, |
| 194 localpath=localpath, |
| 195 fetch_timestamp=datetime.datetime.now()) |
| 196 |
| 197 |
| 198 def get_and_cache_rowdata(localpath): |
| 199 """Returns a row_data dict. |
| 200 |
| 201 get_and_cache_rowdata takes a localpath which is used to fetch data from the |
| 202 cache. If the data is present, then we have all of the data we need and we |
| 203 return early. |
| 204 |
| 205 Otherwise, we need to fetch the row object and set up the row data. |
| 206 |
| 207 Here we assume localpath is already unquoted. |
| 208 """ |
| 209 row_data = get_data_from_cache(localpath) |
| 210 if row_data: |
| 211 return row_data |
| 212 row = Row.all().filter('localpath =', localpath).get() |
| 213 if not row: |
| 214 logging.error('get_and_cache_rowdata(\'%s\'): no matching localpath in ' |
| 215 'datastore' % localpath) |
| 216 return {} |
| 217 row_data = {} |
| 218 row_data['rev'] = row.revision |
| 219 row_data['name'] = row.name |
| 220 row_data['status'] = row.status |
| 221 row_data['comment'] = row.comment |
| 222 row_data['details'] = row.details |
| 223 row_data['rev_number'] = row.rev_number |
| 224 logging.debug('content for %s found in datastore' % localpath) |
| 225 put_data_into_cache(localpath, row_data) |
| 226 return row_data |
| 227 |
| 228 |
| 229 def save_row(row_data, localpath, timestamp): |
| 230 rev_number = row_data['rev_number'] |
| 231 row = get_or_create_row(localpath, rev_number) |
| 232 row_key = row.key() |
| 233 def tx_row(row_key): |
| 234 row = Row.get(row_key) |
| 235 # E1103:959,7:save_row.tx_row: Instance of 'list' has no |
| 236 # 'fetch_timestamp' member (but some types could not be inferred) |
| 237 # pylint: disable=E1103 |
| 238 # if row.fetch_timestamp > timestamp: |
| 239 # return |
| 240 row.fetch_timestamp = timestamp |
| 241 row.revision = row_data['rev'] |
| 242 row.name = row_data['name'] |
| 243 row.status = row_data['status'] |
| 244 row.comment = row_data['comment'] |
| 245 row.details = row_data['details'] |
| 246 # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member |
| 247 # (but some types could not be inferred) |
| 248 # pylint: disable=E1103 |
| 249 row.put() |
| 250 db.run_in_transaction(tx_row, row_key) |
| 251 prev_rev = memcache.get(key='latest_rev') |
| 252 if prev_rev is None or int(rev_number) > int(prev_rev): |
| 253 memcache.set(key='latest_rev', value=rev_number) |
| 254 put_data_into_cache(localpath, row_data) |
| 255 logging.info('Saved and cached row with localpath %s' % localpath) |
| 256 |
| 257 |
| 258 ########## |
| 259 # ConsoleData class definition and related functions. |
| 260 ########## |
116 class ConsoleData(object): | 261 class ConsoleData(object): |
117 def __init__(self): | 262 def __init__(self): |
118 self.row_orderedkeys = [] | 263 self.row_orderedkeys = [] |
119 self.row_data = {} | 264 self.row_data = {} |
120 | 265 |
121 # Retain order of observed masters. | 266 # Retain order of observed masters. |
122 self.masters = [] | 267 self.masters = [] |
123 | 268 |
124 # Map(k,v): k=Master, v=List of categories | 269 # Map(k,v): k=Master, v=List of categories |
125 self.category_order = {} | 270 self.category_order = {} |
(...skipping 45 matching lines...)
171 def SetDetail(self, detail): | 316 def SetDetail(self, detail): |
172 self.last_row['detail'] = detail | 317 self.last_row['detail'] = detail |
173 | 318 |
174 def AddCategory(self, category, builder_status): | 319 def AddCategory(self, category, builder_status): |
175 self.category_order[self.lastMasterSeen].append(category) | 320 self.category_order[self.lastMasterSeen].append(category) |
176 # Map(k,v): k=Master/category, v=Dict of category data (last build status) | 321 # Map(k,v): k=Master/category, v=Dict of category data (last build status) |
177 self.category_data[self.lastMasterSeen].setdefault(category, {}) | 322 self.category_data[self.lastMasterSeen].setdefault(category, {}) |
178 self.category_data[self.lastMasterSeen][category] = builder_status | 323 self.category_data[self.lastMasterSeen][category] = builder_status |
179 self.category_count += 1 | 324 self.category_count += 1 |
180 | 325 |
| 326 def AddRow(self, row): |
| 327 revision = row['rev_number'] |
| 328 self.SawRevision(revision) |
| 329 revlink = BeautifulSoup(row['rev']).td.a['href'] |
| 330 self.SetLink(revlink) |
| 331 name = BeautifulSoup(row['name']).td.contents |
| 332 self.SetName(self.ContentsToHtml(name)) |
| 333 status = BeautifulSoup(row['status']).findAll('table') |
| 334 for i, stat in enumerate(status): |
| 335 self.SetStatus(self.category_order[self.lastMasterSeen][i], |
| 336 unicode(stat)) |
| 337 comment = BeautifulSoup(row['comment']).td.contents |
| 338 self.SetComment(self.ContentsToHtml(comment)) |
| 339 if row['details']: |
| 340 details = BeautifulSoup(row['details']).td.contents |
| 341 self.SetDetail(self.ContentsToHtml(details)) |
| 342 |
181 def ParseRow(self, row): | 343 def ParseRow(self, row): |
182 cells = row.findAll('td', recursive=False) | 344 cells = row.findAll('td', recursive=False) |
183 # Figure out which row this is. | 345 # Figure out which row this is. |
184 for attrname, attrvalue in cells[0].attrs: | 346 for attrname, attrvalue in cells[0].attrs: |
185 if attrname != 'class': | 347 if attrname != 'class': |
186 continue | 348 continue |
187 attrvalue = re.sub(r'^(\S+).*', r'\1', attrvalue) | 349 attrvalue = re.sub(r'^(\S+).*', r'\1', attrvalue) |
188 if attrvalue == 'DevRev': | 350 if attrvalue == 'DevRev': |
189 revision = cells[0] | 351 revision = cells[0] |
190 self.SawRevision(revision=revision.findAll('a')[0].contents[0]) | 352 self.SawRevision(revision=revision.findAll('a')[0].contents[0]) |
(...skipping 10 matching lines...)
201 if attrvalue == 'DevDetails': | 363 if attrvalue == 'DevDetails': |
202 self.SetDetail(detail=self.ContentsToHtml(cells[0].contents)) | 364 self.SetDetail(detail=self.ContentsToHtml(cells[0].contents)) |
203 | 365 |
204 def Finish(self): | 366 def Finish(self): |
205 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) | 367 self.row_orderedkeys = sorted(self.row_orderedkeys, key=int, reverse=True) |
206 # TODO(cmp): Look for row/master/categories that are unset. If they are | 368 # TODO(cmp): Look for row/master/categories that are unset. If they are |
207 # at the latest revisions, leave them unset. If they are at | 369 # at the latest revisions, leave them unset. If they are at |
208 # the earliest revisions, set them to ''. | 370 # the earliest revisions, set them to ''. |
209 | 371 |
210 | 372 |
211 # W0613:169,39:console_merger: Unused argument 'remoteurl' | 373 ########## |
212 # W0613:169,19:console_merger: Unused argument 'unquoted_localpath' | 374 # Heavy-lifting functions that do most of the console processing. |
213 # pylint: disable=W0613 | 375 # AKA postfetch and postsave functions/handlers. |
214 def console_merger(unquoted_localpath, remote_url, page_data=None, | 376 ########## |
215 masters_to_merge=None): | 377 def console_merger(localpath, remoteurl, page_data, |
216 page_data = page_data or {} | 378 masters_to_merge=None, num_rows_to_merge=25): |
217 | |
218 masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE | 379 masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE |
219 mergedconsole = ConsoleData() | 380 mergedconsole = ConsoleData() |
220 merged_page = None | 381 surroundings = get_and_cache_pagedata('surroundings') |
221 merged_tag = None | 382 merged_page = BeautifulSoup(surroundings['content']) |
| 383 merged_tag = merged_page.find('table', 'ConsoleData') |
| 384 latest_rev = int(memcache.get(key='latest_rev') or 0) |
| 385 if not latest_rev: |
| 386 logging.error('console_merger(\'%s\', \'%s\', \'%s\'): cannot get latest ' |
| 387 'revision number.' % ( |
| 388 localpath, remoteurl, page_data)) |
| 389 return |
222 fetch_timestamp = datetime.datetime.now() | 390 fetch_timestamp = datetime.datetime.now() |
223 for master in masters_to_merge: | 391 for master in masters_to_merge: |
224 page_data = get_and_cache_pagedata('%s/console' % master) | 392 # Fetch the summary one-box-per-builder for the master. |
225 master_content = page_data['content'] | 393 # If we can't get it, something is wrong; skip this master entirely. |
226 if master_content is None: | 394 master_summary = get_and_cache_pagedata('%s/console/summary' % master) |
| 395 if not master_summary['content']: |
227 continue | 396 continue |
228 master_content = master_content.encode('ascii', 'replace') | |
229 this_page = BeautifulSoup(master_content) | |
230 this_tag = this_page.find('table', {'class': 'ConsoleData'}) | |
231 # The first console is special, we reuse all of the console page. | |
232 if not merged_page: | |
233 merged_page = this_page | |
234 merged_tag = this_tag | |
235 mergedconsole.SawMaster(master) | 397 mergedconsole.SawMaster(master) |
| 398 # Get the categories for this master. If the master doesn't have any |
| 399 # categories, just use the default empty-string category. |
| 400 category_list = [] |
| 401 master_categories = get_and_cache_pagedata('%s/console/categories' % master) |
| 402 if not master_categories['content']: |
| 403 category_list.append('') |
| 404 else: |
| 405 category_row = BeautifulSoup(master_categories['content']) |
| 406 category_list = map(lambda x: x.text, |
| 407 category_row.findAll('td', 'DevStatus')) |
| 408 # Get the corresponding summary box(es). |
| 409 summary_row = BeautifulSoup(master_summary['content']) |
| 410 summary_list = summary_row.findAll('table') |
| 411 for category, summary in zip(category_list, summary_list): |
| 412 mergedconsole.AddCategory(category, summary) |
236 | 413 |
237 # Parse each of the rows. | 414 # Fetch all of the rows that we need. |
238 CATEGORY_ROW = 0 | 415 rows_fetched = 0 |
239 trs = this_tag.findAll('tr', recursive=False) | 416 current_rev = latest_rev |
240 | 417 while rows_fetched < num_rows_to_merge and current_rev >= 0: |
241 # Get the list of categories in |master|. | 418 row_data = get_and_cache_rowdata('%s/console/%s' % (master, current_rev)) |
242 category_tds = trs[CATEGORY_ROW].findAll('td', recursive=False)[2:] | 419 if not row_data: |
243 third_cell = category_tds[0] | 420 current_rev -= 1 |
244 third_cell_class = third_cell.attrs[0][1] | 421 continue |
245 categories = [] | 422 mergedconsole.AddRow(row_data) |
246 if third_cell_class.startswith('DevStatus '): | 423 current_rev -= 1 |
247 BUILDER_STATUS_ROW = 2 | 424 rows_fetched += 1 |
248 FIRST_CL_ROW = 3 | |
249 for index, category_td in enumerate(category_tds): | |
250 categories.append(category_td.contents[0].strip()) | |
251 else: | |
252 # There's no categories + spacing row, the first row will be the builder | |
253 # status row. | |
254 categories.append('') | |
255 BUILDER_STATUS_ROW = 0 | |
256 FIRST_CL_ROW = 1 | |
257 | |
258 # For each category in |master|, add the category plus its |builder_status|. | |
259 builder_tds = trs[BUILDER_STATUS_ROW].findAll('td', recursive=False)[2:] | |
260 for index, category in enumerate(categories): | |
261 builder_status = builder_tds[index].findAll('table', recursive=False)[0] | |
262 mergedconsole.AddCategory(category=category, | |
263 builder_status=builder_status) | |
264 | |
265 # For each of the remaining rows, add them to the console data. | |
266 for console_index in range(FIRST_CL_ROW, len(trs)): | |
267 console_row = trs[console_index] | |
268 mergedconsole.ParseRow(console_row) | |
269 # Add GC memory profiling. | |
270 # import gc | |
271 # gc.set_debug(gc.DEBUG_LEAK) | |
272 # logging.debug(gc.garbage) | |
273 # del gc.garbage[:] | |
274 mergedconsole.Finish() | |
275 | 425 |
276 # Convert the merged content into console content. | 426 # Convert the merged content into console content. |
| 427 mergedconsole.Finish() |
277 template_environment = Environment() | 428 template_environment = Environment() |
278 template_environment.loader = FileSystemLoader('.') | 429 template_environment.loader = FileSystemLoader('.') |
279 def notstarted(builder_status): | 430 def notstarted(builder_status): |
280 """Convert a BeautifulSoup Tag from builder status to a notstarted line.""" | 431 """Convert a BeautifulSoup Tag from builder status to a notstarted line.""" |
281 builder_status = re.sub(r'DevSlaveBox', 'DevStatusBox', str(builder_status)) | 432 builder_status = re.sub(r'DevSlaveBox', 'DevStatusBox', str(builder_status)) |
282 builder_status = re.sub(r'class=\'([^\']*)\' target=', | 433 builder_status = re.sub(r'class=\'([^\']*)\' target=', |
283 'class=\'DevStatusBox notstarted\' target=', | 434 'class=\'DevStatusBox notstarted\' target=', |
284 builder_status) | 435 builder_status) |
285 builder_status = re.sub(r'class="([^"]*)" target=', | 436 builder_status = re.sub(r'class="([^"]*)" target=', |
286 'class="DevStatusBox notstarted" target=', | 437 'class="DevStatusBox notstarted" target=', |
(...skipping 22 matching lines...)
309 r'\'\<div\>\'', r"'<div ' + attributes + '>'", merged_content) | 460 r'\'\<div\>\'', r"'<div ' + attributes + '>'", merged_content) |
310 merged_content = re.sub( | 461 merged_content = re.sub( |
311 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) | 462 r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content) |
312 merged_content = re.sub( | 463 merged_content = re.sub( |
313 r'\<iframe\>\</iframe\>', | 464 r'\<iframe\>\</iframe\>', |
314 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', | 465 '<iframe \' + attributes + \' src="\' + url + \'"></iframe>', |
315 merged_content) | 466 merged_content) |
316 | 467 |
317 # Update the merged console page. | 468 # Update the merged console page. |
318 merged_page = get_or_create_page('chromium/console', None, maxage=30) | 469 merged_page = get_or_create_page('chromium/console', None, maxage=30) |
319 logging.debug('console_merger: saving merged console') | 470 logging.info('console_merger: saving merged console') |
| 471 page_data = get_and_cache_pagedata('chromium/console') |
320 page_data['title'] = 'BuildBot: Chromium' | 472 page_data['title'] = 'BuildBot: Chromium' |
321 page_data['offsite_base'] = 'http://build.chromium.org/p/chromium' | 473 page_data['offsite_base'] = 'http://build.chromium.org/p/chromium' |
322 page_data['body_class'] = 'interface' | 474 page_data['body_class'] = 'interface' |
323 page_data['content'] = merged_content | 475 page_data['content'] = merged_content |
324 save_page(merged_page, 'chromium/console', fetch_timestamp, page_data) | 476 save_page(merged_page, 'chromium/console', fetch_timestamp, page_data) |
325 return | 477 return |
326 | 478 |
327 | 479 |
328 def console_handler(_unquoted_localpath, remoteurl, page_data=None): | 480 def console_handler(unquoted_localpath, remoteurl, page_data=None): |
329 page_data = page_data or {} | 481 page_data = page_data or {} |
330 content = page_data.get('content') | 482 content = page_data.get('content') |
331 if not content: | 483 if not content: |
| 484 logging.error('console_handler(\'%s\', \'%s\', \'%s\'): page has no ' |
| 485 'content' % ( |
| 486 unquoted_localpath, remoteurl, page_data)) |
332 return page_data | 487 return page_data |
333 | 488 |
334 # Decode content from utf-8 to unicode, replacing bad characters. | 489 # Decode content from utf-8 to unicode, replacing bad characters. |
335 content = content.decode('utf-8', 'replace') | 490 content = content.decode('utf-8', 'replace') |
336 | 491 |
337 # Scrub in sheriff file content to console. | 492 # Scrub in sheriff file content to console. |
338 sheriff_files = [ | 493 sheriff_files = [ |
339 'sheriff', | 494 'sheriff', |
340 'sheriff_android', | 495 'sheriff_android', |
341 'sheriff_cr_cros_gardeners', | 496 'sheriff_cr_cros_gardeners', |
(...skipping 111 matching lines...)
453 md = re.search( | 608 md = re.search( |
454 r'^(.*)</body>.*$', | 609 r'^(.*)</body>.*$', |
455 page_data['content'], | 610 page_data['content'], |
456 re.MULTILINE|re.DOTALL) | 611 re.MULTILINE|re.DOTALL) |
457 if not md: | 612 if not md: |
458 raise Exception('failed to locate trailing body and html tags') | 613 raise Exception('failed to locate trailing body and html tags') |
459 page_data['content'] = md.group(1) | 614 page_data['content'] = md.group(1) |
460 | 615 |
461 return page_data | 616 return page_data |
462 | 617 |
463 | 618 # W0613:600,28:parse_master: Unused argument 'remoteurl' |
464 def get_or_create_row(localpath, revision): | 619 # pylint: disable=W0613 |
465 return Row.get_or_insert( | |
466 key_name=revision + ' '+ localpath, | |
467 rev_number=revision, | |
468 localpath=localpath, | |
469 fetch_timestamp=datetime.datetime.now()) | |
470 | |
471 | |
472 def save_row(row_data, localpath, timestamp): | |
473 rev_number = row_data['rev_number'] | |
474 row = get_or_create_row(localpath, rev_number) | |
475 row_key = row.key() | |
476 def tx_row(row_key): | |
477 row = Row.get(row_key) | |
478 # E1103:959,7:save_row.tx_row: Instance of 'list' has no | |
479 # 'fetch_timestamp' member (but some types could not be inferred) | |
480 # pylint: disable=E1103 | |
481 # if row.fetch_timestamp > timestamp: | |
482 # return | |
483 row.fetch_timestamp = timestamp | |
484 row.revision = row_data['rev'] | |
485 row.name = row_data['name'] | |
486 row.status = row_data['status'] | |
487 row.comment = row_data['comment'] | |
488 row.details = row_data['details'] | |
489 # E1103:967,4:save_row.tx_row: Instance of 'list' has no 'put' member | |
490 # (but some types could not be inferred) | |
491 # pylint: disable=E1103 | |
492 row.put() | |
493 db.run_in_transaction(tx_row, row_key) | |
494 memcache_data = json.dumps(row_data) | |
495 # A row should never be large enough to hit the blobstore, so we | |
496 # explicitly don't handle rows larger than 10^6 bytes. | |
497 if not memcache.set(key=str(row_key), value=memcache_data, time=2*60): | |
498 logging.error('save_row(\'%s\'): memcache.set() failed' % (row_key)) | |
499 | |
500 | |
501 def parse_master(localpath, remoteurl, page_data=None): | 620 def parse_master(localpath, remoteurl, page_data=None): |
502 """Part of the new pipeline to store individual rows rather than | 621 """Part of the new pipeline to store individual rows rather than |
503 whole pages of html. Parses the master data into a set of rows, | 622 whole pages of html. Parses the master data into a set of rows, |
504 and writes them out to the datastore in an easily retrievable format. | 623 and writes them out to the datastore in an easily retrievable format. |
505 | 624 |
506 Returns the same page_data as it was passed, so as to not interrupt | 625 Does not modify the page_data dict. |
507 the current pipeline. This may change when we switch over. | |
508 """ | 626 """ |
509 ts = datetime.datetime.now() | 627 ts = datetime.datetime.now() |
510 page_data = page_data or {} | 628 page_data = page_data or {} |
511 content = page_data.get('content') | 629 content = page_data.get('content') |
512 if not content: | 630 if not content: |
513 return page_data | 631 return page_data |
514 content = content.decode('utf-8', 'replace') | 632 content = content.decode('utf-8', 'replace') |
515 | 633 |
516 # Split page into surroundings (announce, legend, footer) and data (rows). | 634 # Split page into surroundings (announce, legend, footer) and data (rows). |
517 surroundings = BeautifulSoup(content) | 635 surroundings = BeautifulSoup(content) |
518 data = surroundings.find('table', 'ConsoleData') | 636 data = surroundings.find('table', 'ConsoleData') |
519 data.extract() | 637 new_data = Tag(surroundings, 'table', [('class', 'ConsoleData'), |
| 638 ('width', '96%')]) |
| 639 data.replaceWith(new_data) |
520 | 640 |
521 surroundings_page = get_or_create_page(localpath + '/surroundings', | 641 surroundings_page = get_or_create_page('surroundings', |
522 None, maxage=30) | 642 None, maxage=30) |
523 surroundings_data = {} | 643 surroundings_data = {} |
524 surroundings_data['title'] = 'Surroundings for ' + localpath | 644 surroundings_data['title'] = 'Surroundings' |
525 surroundings_data['content'] = unicode(surroundings) | 645 surroundings_data['content'] = unicode(surroundings) |
526 save_page(surroundings_page, localpath + '/surroundings', ts, | 646 save_page(surroundings_page, 'surroundings', ts, |
527 surroundings_data) | 647 surroundings_data) |
528 | 648 |
529 rows = data.tbody.findAll('tr', recursive=False) | 649 rows = data.findAll('tr', recursive=False) |
530 # The first table row can be special: the list of categories. | 650 # The first table row can be special: the list of categories. |
531 categories = None | 651 categories = None |
532 # If the first row contains a DevStatus cell... | 652 # If the first row contains a DevStatus cell... |
533 if rows[0].find('td', 'DevStatus') != None: | 653 if rows[0].find('td', 'DevStatus') != None: |
534 # ...extract it into the categories... | 654 # ...extract it into the categories... |
535 categories = rows[0] | 655 categories = rows[0] |
536 # ...and get rid of the next (spacer) row too. | 656 # ...and get rid of the next (spacer) row too. |
537 rows = rows[2:] | 657 rows = rows[2:] |
538 | 658 |
539 if categories: | 659 if categories: |
(...skipping 20 matching lines...)
560 # or a spacer row (in which case we finalize the row and save it). | 680 # or a spacer row (in which case we finalize the row and save it). |
561 for row in rows: | 681 for row in rows: |
562 if row.find('td', 'DevComment'): | 682 if row.find('td', 'DevComment'): |
563 curr_row['comment'] = unicode(row) | 683 curr_row['comment'] = unicode(row) |
564 elif row.find('td', 'DevDetails'): | 684 elif row.find('td', 'DevDetails'): |
565 curr_row['details'] = unicode(row) | 685 curr_row['details'] = unicode(row) |
566 elif row.find('td', 'DevStatus'): | 686 elif row.find('td', 'DevStatus'): |
567 curr_row['rev'] = unicode(row.find('td', 'DevRev')) | 687 curr_row['rev'] = unicode(row.find('td', 'DevRev')) |
568 curr_row['rev_number'] = unicode(row.find('td', 'DevRev').a.string) | 688 curr_row['rev_number'] = unicode(row.find('td', 'DevRev').a.string) |
569 curr_row['name'] = unicode(row.find('td', 'DevName')) | 689 curr_row['name'] = unicode(row.find('td', 'DevName')) |
570 curr_row['status'] = unicode(row.findAll('td', 'DevStatus')) | 690 curr_row['status'] = unicode(row.findAll('table')) |
571 else: | 691 else: |
572 if 'details' not in curr_row: | 692 if 'details' not in curr_row: |
573 curr_row['details'] = '' | 693 curr_row['details'] = '' |
574 save_row(curr_row, localpath, ts) | 694 save_row(curr_row, localpath + '/' + curr_row['rev_number'], ts) |
575 curr_row = {} | 695 curr_row = {} |
576 | 696 |
577 return page_data | 697 return page_data |
578 | 698 |
579 | 699 |
580 def one_box_handler(unquoted_localpath, remoteurl, page_data=None): | 700 def one_box_handler(unquoted_localpath, remoteurl, page_data=None): |
581 page_data = page_data or {} | 701 page_data = page_data or {} |
582 content = page_data.get('content') | 702 content = page_data.get('content') |
583 if content is None: | 703 if content is None: |
584 return page_data | 704 return page_data |
585 # Get the site name from the local path. | 705 # Get the site name from the local path. |
586 md = re.match('^([^\/]+)/.*$', unquoted_localpath) | 706 md = re.match('^([^\/]+)/.*$', unquoted_localpath) |
587 if not md: | 707 if not md: |
588 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' | 708 logging.error('one_box_handler(\'%s\', \'%s\', \'%s\'): cannot get site ' |
589 'from local path' % ( | 709 'from local path' % ( |
590 unquoted_localpath, remoteurl, page_data)) | 710 unquoted_localpath, remoteurl, page_data)) |
591 return page_data | 711 return page_data |
592 site = md.group(1) | 712 site = md.group(1) |
593 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site | 713 new_waterfall_url = 'http://build.chromium.org/p/%s/waterfall' % site |
594 page_data['content'] = re.sub( | 714 page_data['content'] = re.sub( |
595 r'waterfall', | 715 r'waterfall', |
596 new_waterfall_url, | 716 new_waterfall_url, |
597 page_data['content']) | 717 page_data['content']) |
598 return page_data | 718 return page_data |
599 | 719 |
600 | 720 |
| 721 ########## |
| 722 # Utility functions for blobstore and memcache. |
| 723 ########## |
| 724 def get_data_from_cache(localpath): |
| 725 memcache_data = memcache.get(localpath) |
| 726 if not memcache_data: |
| 727 return None |
| 728 logging.debug('content for %s found in memcache' % localpath) |
| 729 return json.loads(memcache_data) |
| 730 |
| 731 |
| 732 def put_data_into_cache(localpath, data): |
| 733 memcache_data = json.dumps(data) |
| 734 if not memcache.set(key=localpath, value=memcache_data, time=2*60): |
| 735 logging.error('put_data_into_cache(\'%s\'): memcache.set() failed' % ( |
| 736 localpath)) |
| 737 |
| 738 |
| 739 def write_blob(data, mime_type): |
| 740 """Saves a Unicode string as a new blob, returns the blob's key.""" |
| 741 file_name = files.blobstore.create(mime_type=mime_type) |
| 742 data = data.encode('utf-8') |
| 743 with files.open(file_name, 'a') as blob_file: |
| 744 blob_file.write(data) |
| 745 files.finalize(file_name) |
| 746 return files.blobstore.get_blob_key(file_name) |
| 747 |
| 748 |
| 749 def path_to_mime_type(path): |
| 750 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') |
| 751 |
| 752 |
| 753 EXT_TO_MIME = { |
| 754 '.css': 'text/css', |
| 755 '.js': 'text/javascript', |
| 756 '.json': 'application/json', |
| 757 '.html': 'text/html', |
| 758 } |
| 759 |
| 760 |
| 761 ########## |
| 762 # Functions for actually fetching original pages. |
| 763 ########## |
| 764 def fetch_pages(): |
| 765 """Starts a background fetch operation for pages that need it.""" |
| 766 logging.debug('fetch_pages()') |
| 767 for url in URLS: |
| 768 deferred.defer(fetch_page, **url) |
| 769 |
| 770 |
| 771 def nonfatal_fetch_url(url, *args, **kwargs): |
| 772 # Temporary workaround to disable AppEngine global cache of these pages. |
| 773 if '?' in url: |
| 774 url += '&' + str(random.random()) |
| 775 else: |
| 776 url += '?' + str(random.random()) |
| 777 |
| 778 try: |
| 779 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) |
| 780 except urlfetch.DownloadError: |
| 781 logging.warn('urlfetch failed: %s' % url, exc_info=1) |
| 782 return None |
| 783 |
| 784 |
| 785 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, |
| 786 fetch_url=nonfatal_fetch_url): |
| 787 """Fetches data about a set of pages.""" |
| 788 if type(localpath) != type(''): |
| 789 logging.error('fetch_page: localpath is %r, expected a string' % ( |
| 790 repr(localpath))) |
| 791 return |
| 792 unquoted_localpath = urllib.unquote(localpath) |
| 793 logging.debug('fetch_page("%s", "%s", "%s")' % ( |
| 794 unquoted_localpath, remoteurl, maxage)) |
| 795 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) |
| 796 |
| 797 # Check if our copy of the page is younger than maxage. If it is, we'll |
| 798 # skip the fetch. |
| 799 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( |
| 800 seconds=maxage) |
| 801 if (page.fetch_timestamp and |
| 802 page.fetch_timestamp > oldest_acceptable_timestamp): |
| 803 logging.debug('fetch_page: too recent, skipping') |
| 804 return |
| 805 |
| 806 # Perform the actual page fetch. |
| 807 fetch_timestamp = datetime.datetime.now() |
| 808 response = fetch_url(remoteurl) |
| 809 if not response: |
| 810 logging.warning('fetch_page: got empty response') |
| 811 return |
| 812 if response.status_code != 200: |
| 813 logging.warning('fetch_page: got non-empty response but code ' |
| 814 '%d' % response.status_code) |
| 815 return |
| 816 |
| 817 # We have actual content. If there's one or more handlers, call them. |
| 818 page_data = {} |
| 819 page_data['content'] = response.content |
| 820 if postfetch: |
| 821 if not isinstance(postfetch, list): |
| 822 postfetch = [postfetch] |
| 823 for handler in postfetch: |
| 824 logging.debug('fetch_page: calling postfetch handler ' |
| 825 '%s' % handler.__name__) |
| 826 page_data = handler(unquoted_localpath, remoteurl, page_data) |
| 827 |
| 828 # Save the returned content into the DB and caching layers. |
| 829 logging.debug('fetch_page: saving page') |
| 830 save_page(page, unquoted_localpath, fetch_timestamp, page_data) |
| 831 if postsave: |
| 832 if not isinstance(postsave, list): |
| 833 postsave = [postsave] |
| 834 for handler in postsave: |
| 835 logging.debug('fetch_page: calling postsave handler ' |
| 836 '%s' % handler.__name__) |
| 837 handler(unquoted_localpath, remoteurl, page_data) |
| 838 |
601 | 839 |
602 # List of URLs to fetch. | 840 # List of URLs to fetch. |
603 URLS = [ | 841 URLS = [ |
604 # Console URLs. | 842 # Console URLs. |
605 { | 843 { |
606 'remoteurl': 'http://build.chromium.org/p/chromium.chrome/console', | 844 'remoteurl': 'http://build.chromium.org/p/chromium.chrome/console', |
607 'localpath': 'chromium.chrome/console', | 845 'localpath': 'chromium.chrome/console', |
608 'postfetch': [console_handler, parse_master], | 846 'postfetch': [console_handler, parse_master], |
609 'postsave': console_merger, | 847 'postsave': console_merger, |
610 'maxage': 30, # 30 secs | 848 'maxage': 30, # 30 secs |
(...skipping 303 matching lines...)
914 # LKGR JSON. | 1152 # LKGR JSON. |
915 { | 1153 { |
916 'remoteurl': | 1154 'remoteurl': |
917 ('http://build.chromium.org/p/chromium.lkgr/json/builders/Linux%20x64/' | 1155 ('http://build.chromium.org/p/chromium.lkgr/json/builders/Linux%20x64/' |
918 'builds/-1?as_text=1'), | 1156 'builds/-1?as_text=1'), |
919 'localpath': | 1157 'localpath': |
920 'chromium.lkgr/json/builders/Linux%20x64/builds/-1/as_text=1.json', | 1158 'chromium.lkgr/json/builders/Linux%20x64/builds/-1/as_text=1.json', |
921 'maxage': 2*60, # 2 mins | 1159 'maxage': 2*60, # 2 mins |
922 }, | 1160 }, |
923 ] | 1161 ] |
924 | |
925 | |
926 def nonfatal_fetch_url(url, *args, **kwargs): | |
927 # Temporary workaround to disable AppEngine global cache of these pages. | |
928 if '?' in url: | |
929 url += '&' + str(random.random()) | |
930 else: | |
931 url += '?' + str(random.random()) | |
932 | |
933 try: | |
934 return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE, *args, **kwargs) | |
935 except urlfetch.DownloadError: | |
936 logging.warn('urlfetch failed: %s' % url, exc_info=1) | |
937 return None | |
938 | |
939 | |
940 class Row(db.Model): | |
941 fetch_timestamp = db.DateTimeProperty(required=True) | |
942 rev_number = db.StringProperty(required=True) | |
943 localpath = db.StringProperty(required=True) | |
944 revision = db.TextProperty() | |
945 name = db.TextProperty() | |
946 status = db.TextProperty() | |
947 comment = db.TextProperty() | |
948 details = db.TextProperty() | |
949 | |
950 | |
951 class Page(db.Model): | |
952 fetch_timestamp = db.DateTimeProperty(required=True) | |
953 localpath = db.StringProperty(required=True) | |
954 content = db.TextProperty() | |
955 title = db.StringProperty() | |
956 offsite_base = db.StringProperty() | |
957 body_class = db.StringProperty() | |
958 remoteurl = db.TextProperty() | |
959 # Data updated separately, after creation. | |
960 content_blob = blobstore.BlobReferenceProperty() | |
961 | |
962 | |
963 def write_blob(data, mime_type): | |
964 """Saves a Unicode string as a new blob, returns the blob's key.""" | |
965 file_name = files.blobstore.create(mime_type=mime_type) | |
966 data = data.encode('utf-8') | |
967 with files.open(file_name, 'a') as blob_file: | |
968 blob_file.write(data) | |
969 files.finalize(file_name) | |
970 return files.blobstore.get_blob_key(file_name) | |
971 | |
972 | |
973 def save_page(page, localpath, fetch_timestamp, page_data): | |
974 body_class = page_data.get('body_class', '') | |
975 content = page_data.get('content') | |
976 offsite_base = page_data.get('offsite_base', '') | |
977 title = page_data.get('title', '') | |
978 | |
979 content_blob_key = None | |
980 try: | |
981 content = content.decode('utf-8', 'replace') | |
982 except UnicodeEncodeError: | |
983 logging.debug('save_page: content was already in unicode') | |
984 logging.debug('save_page: content size is %d' % len(content)) | |
985 if len(content.encode('utf-8')) >= 10**6: | |
986 logging.debug('save_page: saving to blob') | |
987 content_blob_key = write_blob(content, path_to_mime_type(localpath)) | |
988 content = None | |
989 def tx_page(page_key): | |
990 page = Page.get(page_key) | |
991 # E1103:225,7:fetch_page.tx_page: Instance of 'list' has no | |
992 # 'fetch_timestamp' member (but some types could not be inferred) | |
993 # pylint: disable=E1103 | |
994 if page.fetch_timestamp > fetch_timestamp: | |
995 return | |
996 page.content = content | |
997 page.content_blob = content_blob_key | |
998 page.fetch_timestamp = fetch_timestamp | |
999 # title, offsite_base, body_class can all be empty strings for some | |
1000 # content. Where that's true, they're not used for displaying a console- | |
1001 # like resource, and the content alone is returned to the web user. | |
1002 page.title = title | |
1003 page.offsite_base = offsite_base | |
1004 page.body_class = body_class | |
1005 # E1103:231,4:fetch_page.tx_page: Instance of 'list' has no 'put' member | |
1006 # (but some types could not be inferred) | |
1007 # pylint: disable=E1103 | |
1008 page.put() | |
1009 db.run_in_transaction(tx_page, page.key()) | |
1010 page_data = { | |
1011 'body_class': body_class, | |
1012 'content': content, | |
1013 'offsite_base': offsite_base, | |
1014 'title': title, | |
1015 } | |
1016 if content_blob_key: | |
1017 page_data['content_blob'] = True | |
1018 put_pagedata_into_cache(localpath, page_data) | |
1019 | |
1020 | |
1021 def get_or_create_page(localpath, remoteurl, maxage): | |
1022 return Page.get_or_insert( | |
1023 key_name=localpath, | |
1024 localpath=localpath, | |
1025 remoteurl=remoteurl, | |
1026 maxage=maxage, | |
1027 fetch_timestamp=datetime.datetime.now() - datetime.timedelta(hours=24), | |
1028 content=None, | |
1029 content_blob=None) | |
1030 | |
1031 | |
1032 def fetch_page(localpath, remoteurl, maxage, postfetch=None, postsave=None, | |
1033 fetch_url=nonfatal_fetch_url): | |
1034 """Fetches data about a set of pages.""" | |
1035 if type(localpath) != type(''): | |
1036 logging.error('fetch_page: localpath is %r, expected a string' % ( | |
1037 repr(localpath))) | |
1038 return | |
1039 unquoted_localpath = urllib.unquote(localpath) | |
1040 logging.debug('fetch_page("%s", "%s", "%s")' % ( | |
1041 unquoted_localpath, remoteurl, maxage)) | |
1042 page = get_or_create_page(unquoted_localpath, remoteurl, maxage) | |
1043 | |
1044 # Check if our copy of the page is younger than maxage. If it is, we'll | |
1045 # skip the fetch. | |
1046 oldest_acceptable_timestamp = datetime.datetime.now() - datetime.timedelta( | |
1047 seconds=maxage) | |
1048 if (page.fetch_timestamp and | |
1049 page.fetch_timestamp > oldest_acceptable_timestamp): | |
1050 logging.debug('fetch_page: too recent, skipping') | |
1051 return | |
1052 | |
1053 # Perform the actual page fetch. | |
1054 fetch_timestamp = datetime.datetime.now() | |
1055 response = fetch_url(remoteurl) | |
1056 if not response: | |
1057 logging.warning('fetch_page: got empty response') | |
1058 return | |
1059 if response.status_code != 200: | |
1060 logging.warning('fetch_page: got non-empty response but code ' | |
1061 '%d' % response.status_code) | |
1062 return | |
1063 | |
1064 # We have actual content. If there's one or more handlers, call them. | |
1065 page_data = {} | |
1066 page_data['content'] = response.content | |
1067 if postfetch: | |
1068 if not isinstance(postfetch, list): | |
1069 postfetch = [postfetch] | |
1070 for handler in postfetch: | |
1071 logging.debug('fetch_page: calling postfetch handler ' | |
1072 '%s' % handler.__name__) | |
1073 page_data = handler(unquoted_localpath, remoteurl, page_data) | |
1074 | |
1075 # Save the returned content into the DB and caching layers. | |
1076 logging.debug('fetch_page: saving page') | |
1077 save_page(page, unquoted_localpath, fetch_timestamp, page_data) | |
1078 if postsave: | |
1079 if not isinstance(postsave, list): | |
1080 postsave = [postsave] | |
1081 for handler in postsave: | |
1082 logging.debug('fetch_page: calling postsave handler ' | |
1083 '%s' % handler.__name__) | |
1084 handler(unquoted_localpath, remoteurl, page_data) | |
1085 | |
1086 | |
1087 EXT_TO_MIME = { | |
1088 '.css': 'text/css', | |
1089 '.js': 'text/javascript', | |
1090 '.json': 'application/json', | |
1091 '.html': 'text/html', | |
1092 } | |
1093 | |
1094 | |
1095 def path_to_mime_type(path): | |
1096 return EXT_TO_MIME.get(os.path.splitext(path)[1], 'text/html') | |
1097 | |
1098 | |
1099 def fetch_pages(): | |
1100 """Starts a background fetch operation for pages that need it.""" | |
1101 logging.debug('fetch_pages()') | |
1102 for url in URLS: | |
1103 deferred.defer(fetch_page, **url) | |
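Similarly, the fetch pipeline is driven entirely by the URLS table: fetch_pages(), presumably triggered by a cron or task handler outside this file, defers one fetch_page() task per entry, and each dict's keys map directly onto fetch_page()'s keyword arguments (postfetch handlers transform the raw response, postsave handlers such as console_merger run after the page is saved and cached). A hedged sketch of wiring up one more console source; the 'chromium.example' master is hypothetical:

# Hypothetical extra console source; each key mirrors the existing URLS
# entries and feeds fetch_page(localpath, remoteurl, maxage, ...).
URLS.append({
    'remoteurl': 'http://build.chromium.org/p/chromium.example/console',
    'localpath': 'chromium.example/console',
    'postfetch': [console_handler, parse_master],  # scrub page, split into rows
    'postsave': console_merger,                    # rebuild chromium/console
    'maxage': 30,  # seconds before the cached page is considered stale
})

# Schedule one deferred fetch_page() task per URLS entry.
fetch_pages()

For the new master to actually appear in the merged console, it would also need to be listed in DEFAULT_MASTERS_TO_MERGE (or passed explicitly via masters_to_merge).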