OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/env python | |
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 """Prepares a Chrome HTML file by inlining resources and adding references to high DPI resources. | |
tony
2012/05/17 17:57:25
Why can't we reuse html_inline.py? Can we just ad
| |
7 | |
8 This is a small script that takes a HTML file, looks for src attributes | |
9 and inlines the specified file, producing one HTML file with no external | |
10 dependencies. It recursively inlines the included files. When inlining CSS | |
11 image files this script also checks for the existence of high DPI versions | |
12 of the inlined file including those on relevant platforms. | |
13 """ | |
14 | |
15 import os | |
16 import re | |
17 import sys | |
18 import types | |
19 import base64 | |
20 import mimetypes | |
21 | |
22 from grit.gather import interface | |
23 from grit import lazy_re | |
24 from grit import util | |
25 | |
# Scale factors, in addition to the implicit 1x, for which high DPI variants
# of an image are added to the generated image set.
scale_factors = ['2x']

# Distribution name used when the DIST_ENV_VAR environment variable is unset.
DIST_DEFAULT = 'chromium'
# Environment variable that selects the distribution.
DIST_ENV_VAR = 'CHROMIUM_BUILD'
# Placeholder in file names that is replaced with the distribution.
DIST_SUBSTR = '%DISTRIBUTION%'

# Matches beginning of an "if" block with trailing spaces.
# (Raw string so that "\s" reaches the regex engine as written instead of
# relying on Python passing an unknown string escape through.)
_BEGIN_IF_BLOCK = lazy_re.compile(
    r'<if [^>]*?expr="(?P<expression>[^"]*)"[^>]*?>\s*')

# Matches ending of an "if" block with preceding spaces.
_END_IF_BLOCK = lazy_re.compile(r'\s*</if>')

# Matches a chrome theme source URL.
_THEME_SOURCE = lazy_re.compile('chrome://theme/IDR_[A-Z0-9_]*')
41 | |
def ReadFile(input_filename):
  """Helper function that returns the contents of input_filename.

  The file is opened in binary mode and is closed again even when reading
  raises.

  Args:
    input_filename: name of file to be read

  Returns:
    the raw contents of the file
  """
  with open(input_filename, 'rb') as f:
    return f.read()
55 | |
def FileDataUrl(path):
  """Returns the file at |path| encoded as a base64 data: URL.

  The MIME type is guessed from the file name; 'text/plain' is used when no
  type can be guessed.

  Args:
    path: name of the file to encode

  Returns:
    string of the form "data:<mimetype>;base64,<encoded contents>"
  """
  guessed_type = mimetypes.guess_type(path)[0]
  if not guessed_type:
    guessed_type = 'text/plain'
  encoded = base64.standard_b64encode(ReadFile(path))
  return "data:%s;base64,%s" % (guessed_type, encoded)
60 | |
def SrcInlineAsDataURL(
    src_match, base_path, distribution, inlined_files, names_only=False):
  """Regex replace function that inlines a referenced file as a data URI.

  Given a match for src="filename", resolves 'filename' against base_path
  (after substituting the distribution for '%DISTRIBUTION%'), records the
  resulting path and returns the matched text with the file name replaced by
  a data URI holding the file contents. File names containing a ':' are
  treated as URLs and the matched text is returned unchanged.

  Args:
    src_match: regex match object with 'filename' named capturing group
    base_path: directory that 'filename' is resolved against
    distribution: string that replaces '%DISTRIBUTION%' in the file name
    inlined_files: set to which the resolved file path is added
    names_only: If true, the function will not read the file but just return
        "". It will still add the file path to |inlined_files|.

  Returns:
    string
  """
  name = src_match.group('filename')

  # A colon means this is most likely a URL; leave the reference as it is.
  if ':' in name:
    return src_match.group(0)

  resolved = os.path.join(
      base_path, name.replace('%DISTRIBUTION%', distribution))
  inlined_files.add(resolved)

  if names_only:
    return ""

  # Keep everything that was matched ahead of the file name's opening quote.
  quote_pos = src_match.start('filename') - 1
  leading_text = src_match.string[src_match.start():quote_pos]
  return "%s\"%s\"" % (leading_text, FileDataUrl(resolved))
96 | |
def InsertImageSet(
    src_match, base_path, distribution, inlined_files, names_only=False):
  """Regex replace function that turns a CSS url() into a -webkit-image-set.

  Takes a match for '<attribute>: url("filename"' and returns
  '<attribute>: -webkit-image-set(' followed by one 'url("...") <scale>' entry
  for 1x and one for every entry of scale_factors that is available. The
  returned text does not close the parenthesis itself.

  chrome://theme/IDR_* URLs are not inlined; every scale factor is requested
  as 'filename@<scale>'. Any other file name containing a ':' is treated as a
  URL and the matched text is returned unchanged. Local files are inlined as
  data URIs; a scaled variant is looked up at <dir>/<scale>/<name> and only
  added when that file exists.

  Args:
    src_match: regex match object with 'attribute' and 'filename' named
        capturing groups
    base_path: directory that 'filename' is resolved against
    distribution: string that replaces '%DISTRIBUTION%' in the file name
    inlined_files: set to which every inlined file path is added
    names_only: not consulted by this function; local files are always read.

  Returns:
    string
  """
  filename = src_match.group('filename')
  attr = src_match.group('attribute')

  # Any matches for which a chrome URL handler will serve all scale factors
  # can simply request all scale factors.
  if _THEME_SOURCE.match(filename):
    images = ["url(\"%s\") %s" % (filename, '1x')]
    images.extend(
        "url(\"%s@%s\") %s" % (filename, sc, sc) for sc in scale_factors)
    return "%s: -webkit-image-set(%s" % (attr, ', '.join(images))

  if ':' in filename:
    # filename is probably a URL, which we don't want to bother inlining
    return src_match.group(0)

  filename = filename.replace('%DISTRIBUTION%', distribution)
  filepath = os.path.join(base_path, filename)
  inlined_files.add(filepath)
  images = ["url(\"%s\") %s" % (FileDataUrl(filepath), '1x')]

  # The scaled variants live in a per-scale subdirectory next to the 1x image;
  # the directory/name split does not depend on the scale factor.
  image_dir, image_name = os.path.split(filepath)
  for sc in scale_factors:
    # check for existence of file and add to image set.
    scale_image_path = "%s/%s/%s" % (image_dir, sc, image_name)
    if os.path.isfile(scale_image_path):
      inlined_files.add(scale_image_path)
      images.append("url(\"%s\") %s" % (FileDataUrl(scale_image_path), sc))
  return "%s: -webkit-image-set(%s" % (attr, ', '.join(images))
129 | |
class InlinedData:
  """Result container for DoInline().

  Attributes (set by the constructor):
    inlined_data: the inlined data
    inlined_files: the set of filenames of all the inlined files
  """
  def __init__(self, inlined_data, inlined_files):
    self.inlined_data, self.inlined_files = inlined_data, inlined_files
139 | |
def DoInline(
    input_filename, grd_node, allow_external_script=False, names_only=False):
  """Helper function that inlines the resources in a specified file.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then returns the result and
  the set of inlined files.

  The passes run in this order: external scripts (unless
  allow_external_script is set), stylesheets, <include> elements, <if>
  blocks, remaining src attributes, CSS images and finally icon links.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag. When None,
        every <if> condition is treated as satisfied.
    allow_external_script: if true, <script src="..."> elements are left
        alone instead of being inlined.
    names_only: if true, None is returned for the inlined contents (faster);
        only the set of file names is meaningful.
  Returns:
    an InlinedData object holding the inlined data as a string and the set
    of filenames of all the inlined files
  """
  input_filepath = os.path.dirname(input_filename)

  distribution = DIST_DEFAULT
  if DIST_ENV_VAR in os.environ:
    distribution = os.environ[DIST_ENV_VAR]
    if len(distribution) > 1 and distribution[0] == '_':
      distribution = distribution[1:].lower()

  # Keep track of all the files we inline.
  inlined_files = set()

  def SrcReplace(src_match, filepath=input_filepath,
                 inlined_files=inlined_files):
    """Helper function to provide SrcInlineAsDataURL with the base file path"""
    return SrcInlineAsDataURL(
        src_match, filepath, distribution, inlined_files, names_only=names_only)

  def SrcImageSet(src_match, filepath=input_filepath,
                  inlined_files=inlined_files):
    """Helper function to provide InsertImageSet with the base file path"""
    return InsertImageSet(
        src_match, filepath, distribution, inlined_files, names_only=names_only)

  def GetFilepath(src_match):
    """Returns the path of the matched file, or None if it looks like a URL"""
    filename = src_match.group('filename')

    if filename.find(':') != -1:
      # filename is probably a URL, which we don't want to bother inlining
      return None

    filename = filename.replace('%DISTRIBUTION%', distribution)
    return os.path.join(input_filepath, filename)

  def IsConditionSatisfied(src_match):
    """Evaluates the expression of a matched <if> against the grd node"""
    expression = src_match.group('expression')
    return grd_node is None or grd_node.EvaluateCondition(expression)

  def CheckConditionalElements(text):
    """Helper function to conditionally inline inner elements"""
    while True:
      begin_if = _BEGIN_IF_BLOCK.search(text)
      if begin_if is None:
        return text

      condition_satisfied = IsConditionSatisfied(begin_if)
      leading = text[0:begin_if.start()]
      content_start = begin_if.end()

      # Find matching "if" block end. |count| is the nesting depth: it grows
      # for every nested <if> seen before the next </if> and shrinks for
      # every </if>; the block ends when it drops to zero.
      count = 1
      pos = begin_if.end()
      while True:
        end_if = _END_IF_BLOCK.search(text, pos)
        if end_if is None:
          raise Exception('Unmatched <if>')

        next_if = _BEGIN_IF_BLOCK.search(text, pos)
        if next_if is None or next_if.start() >= end_if.end():
          count = count - 1
          if count == 0:
            break
          pos = end_if.end()
        else:
          count = count + 1
          pos = next_if.end()

      content = text[content_start:end_if.start()]
      trailing = text[end_if.end():]

      if condition_satisfied:
        text = leading + CheckConditionalElements(content) + trailing
      else:
        text = leading + trailing

  def InlineFileContents(src_match, pattern, inlined_files=inlined_files):
    """Helper function to inline external files of various types"""
    filepath = GetFilepath(src_match)
    if filepath is None:
      return src_match.group(0)
    inlined_files.add(filepath)

    if names_only:
      # Even if names_only is set, html files need to be opened, because they
      # can link to images that need to be added to the file set. Scan the
      # nested file in names_only mode too and merge what it found, so those
      # resources are reported to our caller.
      if filepath.endswith('.html'):
        try:
          nested = DoInline(filepath, grd_node,
                            allow_external_script=allow_external_script,
                            names_only=True)
        except IOError as e:
          raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                          (e.filename, filepath, e.strerror))
        inlined_files.update(nested.inlined_files)
      return ""

    return pattern % InlineToString(filepath, grd_node, allow_external_script)

  def InlineIncludeFiles(src_match):
    """Helper function to directly inline generic external files (without
       wrapping them with any kind of tags).
    """
    return InlineFileContents(src_match, '%s')

  def InlineScript(match):
    """Helper function to inline external script files"""
    attrs = (match.group('attrs1') + match.group('attrs2')).strip()
    if attrs:
      attrs = ' ' + attrs
    return InlineFileContents(match, '<script' + attrs + '>%s</script>')

  def InlineCSSText(text, css_filepath):
    """Helper function that inlines external resources in CSS text"""
    filepath = os.path.dirname(css_filepath)
    return InlineCSSImages(text, filepath)

  def InlineCSSFile(src_match, inlined_files=inlined_files):
    """Helper function to inline external css files.

    Args:
      src_match: A regular expression match with a named group named "filename".

    Returns:
      The text that should replace the reference to the CSS file.
    """
    filepath = GetFilepath(src_match)
    if filepath is None:
      return src_match.group(0)

    # Even if names_only is set, the CSS file needs to be opened, because it
    # can link to images that need to be added to the file set.
    inlined_files.add(filepath)
    # When resolving CSS files we need to pass in the path so that relative URLs
    # can be resolved.
    return '<style>%s</style>' % InlineCSSText(ReadFile(filepath), filepath)

  def InlineCSSImages(text, filepath=input_filepath):
    """Helper function that inlines external images in CSS backgrounds."""
    # Replace contents of url() for css attributes: content, background,
    # or *-image.
    return re.sub(
        r'(?P<attribute>content|background|[\w-]*-image):[ ]*'
        r'''url\((?:'|")(?P<filename>[^"'\)\(]*)(?:'|")''',
        lambda m: SrcImageSet(m, filepath),
        text)

  flat_text = ReadFile(input_filename)

  if not allow_external_script:
    # We need to inline css and js before we inline images so that image
    # references gets inlined in the css and js
    flat_text = re.sub('<script (?P<attrs1>.*?)src="(?P<filename>[^"\']*)"' +
                       '(?P<attrs2>.*?)></script>',
                       InlineScript,
                       flat_text)

  flat_text = re.sub(
      '<link rel="stylesheet".+?href="(?P<filename>[^"]*)".*?>',
      InlineCSSFile,
      flat_text)

  flat_text = re.sub(
      r'''<include\s+src="(?P<filename>[^"']*)".*>''',
      InlineIncludeFiles,
      flat_text)

  # Check conditional elements, remove unsatisfied ones from the file.
  flat_text = CheckConditionalElements(flat_text)

  flat_text = re.sub('<(?!script)[^>]+?src="(?P<filename>[^"\']*)"',
                     SrcReplace,
                     flat_text)

  # TODO(arv): Only do this inside <style> tags.
  flat_text = InlineCSSImages(flat_text)
  flat_text = re.sub('<link rel="icon".+?href="(?P<filename>[^"\']*)"',
                     SrcReplace,
                     flat_text)

  if names_only:
    flat_text = None  # Would contain garbage if the flag is set anyway.
  return InlinedData(flat_text, inlined_files)
327 | |
328 | |
def InlineToString(input_filename, grd_node, allow_external_script=False):
  """Inlines the resources in a specified file and returns it as a string.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag
    allow_external_script: passed through to DoInline()
  Returns:
    the inlined data as a string
  Raises:
    Exception: if a file could not be opened (an IOError was raised) while
        inlining.
  """
  try:
    return DoInline(input_filename,
                    grd_node,
                    allow_external_script=allow_external_script).inlined_data
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, input_filename, e.strerror))
345 | |
346 | |
def InlineToFile(input_filename, output_filename, grd_node):
  """Inlines the resources in a specified file and writes it.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then writes the result
  to output_filename.

  Args:
    input_filename: name of file to read in
    output_filename: name of file to be written to
    grd_node: html node from the grd file for this include tag
  Returns:
    None. The inlined contents are only written to output_filename.
  """
  inlined_data = InlineToString(input_filename, grd_node)
  # The context manager closes the output file even if the write fails.
  with open(output_filename, 'wb') as out_file:
    out_file.write(inlined_data)
365 | |
366 | |
def GetResourceFilenames(filename, allow_external_script=False):
  """Returns a set of all the files that would be inlined into |filename|.

  Runs DoInline() in names_only mode without a grd node.

  Args:
    filename: name of the file whose references are collected
    allow_external_script: passed through to DoInline()
  Returns:
    the set of filenames of all the files that would be inlined
  Raises:
    Exception: if a file could not be opened (an IOError was raised) while
        scanning.
  """
  try:
    return DoInline(filename, None, names_only=True,
                    allow_external_script=allow_external_script).inlined_files
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, filename, e.strerror))
375 | |
376 | |
class ChromeHtml(interface.GathererBase):
  '''Represents an HTML document.'''

  def __init__(self, html):
    '''Creates a new object that represents the file 'html'.

    The file is not read here; call Parse() to produce the inlined text.

    Args:
      html: 'filename.html'
    '''
    # Name the class explicitly: super(type(self), self) would recurse
    # forever as soon as this class is subclassed.
    super(ChromeHtml, self).__init__()
    self.filename_ = html
    self.inlined_text_ = None
    self.scale_factors_ = []

  def SetAttributes(self, attrs):
    '''Sets node attributes used by the gatherer.

    This checks the scale_factors attribute, which is a space separated list.

    Args:
      attrs: The mapping of node attributes.
    '''
    if 'scale_factors' in attrs:
      self.scale_factors_ = attrs['scale_factors'].split(' ')

  def GetText(self):
    '''Returns the inlined text of the HTML document (None before Parse())'''
    return self.inlined_text_

  def GetData(self, lang, encoding):
    '''Return inlined text of the HTML document; both arguments are ignored'''
    return self.inlined_text_

  def Translate(self, lang, pseudo_if_not_available=True,
                skeleton_gatherer=None, fallback_to_english=False):
    '''Returns the inlined text of the document; no translation is applied.'''
    return self.inlined_text_

  def Parse(self):
    '''Inlines the resources of the file and stores the resulting text.'''
    self.inlined_text_ = InlineToString(self.filename_, None)

  @staticmethod
  def FromFile(html, extkey=None, encoding='utf-8'):
    '''Creates a ChromeHtml object for the contents of 'html'.  Returns a new
    ChromeHtml object.

    Args:
      html: file('') | 'filename.html'
      extkey: ignored
      encoding: 'utf-8' (encoding is ignored)

    Return:
      ChromeHtml(filename)
    '''
    # A file object was passed in; only its name is needed.
    if not isinstance(html, types.StringTypes):
      html = html.name

    return ChromeHtml(html)
OLD | NEW |