Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(886)

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/download_from_google_storage_unittests.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Download files from Google Storage based on SHA1 sums."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import sys
15 import threading
16 import time
17
18 import subprocess2
19
20
# Location of the gsutil client bundled with this checkout:
# <directory containing this script>/third_party/gsutil/gsutil.
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')
24
25
class FileNotFoundError(IOError):
  """Raised when an expected local file (e.g. gsutil or a .sha1 file) is
  missing."""
28
29
class InvalidFileError(IOError):
  """Raised when a .sha1 file does not contain a parseable sha1 sum."""
32
33
34 # Common utilities
class Gsutil(object):
  """Call gsutil with some predefined settings. This is a convenience object,
  and is also immutable."""

  def __init__(self, path, boto_path=None, timeout=None):
    """Args:
      path: Filesystem path of the gsutil script to run.
      boto_path: Optional custom .boto credential file for gsutil.
      timeout: Optional timeout in seconds applied to each gsutil run.

    Raises:
      FileNotFoundError: if |path| does not exist.
    """
    if not os.path.exists(path):
      raise FileNotFoundError('GSUtil not found in %s' % path)
    self.path = path
    self.timeout = timeout
    self.boto_path = boto_path

  def _environ(self):
    # Single place to build the subprocess environment; previously this
    # logic was duplicated in call() and check_call().
    env = os.environ.copy()
    if self.boto_path:
      env['AWS_CREDENTIAL_FILE'] = self.boto_path
    return env

  def call(self, *args):
    """Run gsutil with |args|, inheriting stdout/stderr.

    Returns the subprocess exit code.
    """
    return subprocess2.call((sys.executable, self.path) + args,
                            env=self._environ(),
                            timeout=self.timeout)

  def check_call(self, *args):
    """Run gsutil with |args|, capturing output.

    Returns:
      (code, stdout, stderr), where |code| is the subprocess exit code or,
      when recognizable, an HTTP-like status parsed out of stderr.
    """
    ((out, err), code) = subprocess2.communicate(
        (sys.executable, self.path) + args,
        stdout=subprocess2.PIPE,
        stderr=subprocess2.PIPE,
        env=self._environ(),
        timeout=self.timeout)

    # Parse output.  Map well-known gsutil error text to HTTP-like status
    # codes so callers deal with a simple int instead of scraping
    # tracebacks (e.g. "GSResponseError: status=403 code=AccessDenied",
    # per the original review discussion).
    status_code_match = re.search(r'status=([0-9]+)', err)
    if status_code_match:
      return (int(status_code_match.group(1)), out, err)
    if ('You are attempting to access protected data with '
        'no configured credentials.' in err):
      return (403, out, err)
    if 'No such object' in err:
      return (404, out, err)
    return (code, out, err)
74
75
def check_bucket_permissions(bucket, gsutil):
  """Verify that |bucket| exists and that we can list it with |gsutil|.

  Returns:
    (base_url, code): the gs:// URL of the bucket and 0 on success;
    (None, 1) when no bucket name was given; otherwise a nonzero code.
  """
  if not bucket:
    # Bug fix: the old message contained a stray '%s' with no format
    # argument and printed the literal text "Missing bucket %s.".
    print >> sys.stderr, 'Missing bucket name.'
    return (None, 1)
  base_url = 'gs://%s' % bucket

  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code == 403:
    # Permission denied: give the user a chance to set up credentials
    # interactively via "gsutil config".
    code, _, _ = gsutil.call('config')
    if code != 0:
      print >> sys.stderr, 'Error while authenticating to %s.' % base_url
  elif code == 404:
    print >> sys.stderr, '%s not found.' % base_url
  elif code != 0:
    print >> sys.stderr, ls_err
  return (base_url, code)
92
93
def get_sha1(filename):
  """Return the hex SHA1 digest of the contents of |filename|."""
  digest = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Stream in 1mb chunks so large files never sit fully in memory.
    for chunk in iter(lambda: f.read(1024*1024), b''):
      digest.update(chunk)
  return digest.hexdigest()
104
105
106 # Download-specific code starts here
107
def enumerate_work_queue(input_filename, work_queue, directory,
                         recursive, ignore_errors, output, sha1_file):
  """Fill |work_queue| with (sha1_hex, output_filename) download items.

  Depending on the flags, |input_filename| is interpreted as a raw sha1
  sum (queued as-is, targeting |output|), a .sha1 file, or a directory to
  scan for .sha1 files.

  Returns:
    The number of items added to |work_queue|.

  Raises:
    FileNotFoundError: the .sha1 file does not exist (unless
        |ignore_errors| is set).
    InvalidFileError: a .sha1 file has no parseable sum (unless
        |ignore_errors| is set).
  """
  if sha1_file:
    if not os.path.exists(input_filename):
      if not ignore_errors:
        raise FileNotFoundError('%s not found.' % input_filename)
      print >> sys.stderr, '%s not found.' % input_filename
      # Bug fix: previously execution fell through to open() on the
      # missing file and crashed, defeating --ignore_errors.
      return 0
    with open(input_filename, 'rb') as f:
      # Only the first 1kb is read; a valid file has the sum on line one.
      sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
      if sha1_match:
        work_queue.put(
            (sha1_match.group(1), input_filename.replace('.sha1', '')))
        return 1
    if not ignore_errors:
      raise InvalidFileError('No sha1 sum found in %s.' % input_filename)
    print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename
    return 0

  if not directory:
    # Target is a raw sha1 sum; download it to |output|.
    work_queue.put((input_filename, output))
    return 1

  work_queue_size = 0
  for root, dirs, files in os.walk(input_filename):
    if not recursive:
      # Prune all subdirectories in place so os.walk stays at top level.
      del dirs[:]
    else:
      # Don't descend into VCS metadata directories.
      for exclude in ['.svn', '.git']:
        if exclude in dirs:
          dirs.remove(exclude)
    for filename in files:
      full_path = os.path.join(root, filename)
      if full_path.endswith('.sha1'):
        with open(full_path, 'rb') as f:
          sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
          if sha1_match:
            work_queue.put(
                (sha1_match.group(1), full_path.replace('.sha1', '')))
            work_queue_size += 1
          else:
            if not ignore_errors:
              raise InvalidFileError('No sha1 sum found in %s.' % filename)
            print >> sys.stderr, 'No sha1 sum found in %s.' % filename
  return work_queue_size
153
154
def _downloader_worker_thread(thread_num, q, force, base_url,
                              gsutil, out_q, ret_codes):
  """Worker loop: pull (sha1, filename) pairs off |q| and download each one.

  Stops on a (None, None) sentinel.  Status lines go to |out_q|; failures
  are recorded as (code, message) tuples on |ret_codes|.
  """
  while True:
    input_sha1_sum, output_filename = q.get()
    if input_sha1_sum is None:
      return  # Sentinel: no more work.
    # Skip the download if a local copy already matches the wanted sha1.
    if not force and os.path.exists(output_filename):
      if get_sha1(output_filename) == input_sha1_sum:
        out_q.put(
            '%d> File %s exists and SHA1 matches. Skipping.' % (
                thread_num, output_filename))
        continue
    file_url = '%s/%s' % (base_url, input_sha1_sum)
    # Check if file exists.
    ls_code = gsutil.check_call('ls', file_url)[0]
    if ls_code != 0:
      out_q.put('%d> File %s for %s does not exist, skipping.' % (
          thread_num, file_url, output_filename))
      ret_codes.put((1, 'File %s for %s does not exist.' % (
          file_url, output_filename)))
      continue
    # Fetch the file.
    out_q.put('%d> Downloading %s...' % (
        thread_num, output_filename))
    code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
    if code != 0:
      out_q.put('%d> %s' % (thread_num, err))
      ret_codes.put((code, err))
182
183
184 def printer_worker(output_queue):
185 while True:
186 line = output_queue.get()
187 # Its plausible we want to print empty lines.
188 if line is None:
189 break
190 print line
191
192
193 def download_from_google_storage(
194 input_filename, base_url, gsutil, num_threads, directory, recursive,
195 force, output, ignore_errors, sha1_file):
196 # Start up all the worker threads.
197 all_threads = []
198 download_start = time.time()
199 stdout_queue = Queue.Queue()
200 work_queue = Queue.Queue()
201 ret_codes = Queue.Queue()
202 ret_codes.put((0, None))
203 for thread_num in range(num_threads):
204 t = threading.Thread(
205 target=_downloader_worker_thread,
206 args=[thread_num, work_queue, force, base_url,
207 gsutil, stdout_queue, ret_codes])
208 t.daemon = True
209 t.start()
210 all_threads.append(t)
211 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
212 printer_thread.daemon = True
213 printer_thread.start()
214
215 # Enumerate our work queue.
216 work_queue_size = enumerate_work_queue(
217 input_filename, work_queue, directory, recursive,
218 ignore_errors, output, sha1_file)
219 for _ in all_threads:
220 work_queue.put((None, None)) # Used to tell worker threads to stop.
221
222 # Wait for all downloads to finish.
223 for t in all_threads:
224 t.join()
225 stdout_queue.put(None)
226 printer_thread.join()
227
228 # See if we ran into any errors.
229 max_ret_code = 0
230 for ret_code, message in ret_codes.queue:
231 max_ret_code = max(ret_code, max_ret_code)
232 if message:
233 print >> sys.stderr, message
234 if not max_ret_code:
235 print 'Success!'
236
237 print 'Downloading %d files took %1f second(s)' % (
238 work_queue_size, time.time() - download_start)
239 return max_ret_code
240
241
def main(args):
  """Parse the command line, locate gsutil, and run the download.

  Returns a process exit code (0 on success).
  """
  usage = ('usage: %prog [options] target\n'
           'Target must be:\n'
           ' (default) a sha1 sum ([A-Za-z0-9]{40}).\n'
           ' (-s or --sha1_file) a .sha1 file, containing a sha1 sum on '
           'the first line.\n'
           ' (-d or --directory) A directory to scan for .sha1 files.')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output',
                    help='Specify the output file name. Defaults to: '
                         '(a) Given a SHA1 hash, the name is the SHA1 hash. '
                         '(b) Given a .sha1 file or directory, the name will '
                         'match (.*).sha1.')
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto',
                    help='Specify a custom boto file.')
  # Bug fix: the old help text ('Resume download if file is partially
  # downloaded.') described the opposite of what this flag does -- setting
  # it skips the "output file already exists" abort below.
  parser.add_option('-c', '--no_resume', action='store_true',
                    help='Don\'t abort if the output file already exists; '
                         'proceed with the download anyway.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force download even if local file exists.')
  parser.add_option('-i', '--ignore_errors', action='store_true',
                    help='Don\'t throw error if we find an invalid .sha1 file.')
  parser.add_option('-r', '--recursive', action='store_true',
                    help='Scan folders recursively for .sha1 files. '
                         'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true',
                    help='The target is a directory. '
                         'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true',
                    help='The target is a file containing a sha1 sum. '
                         'Cannot be used with -d/--directory.')

  # NOTE(review): |args| is accepted but unused -- parse_args() reads
  # sys.argv.  Confirm no caller passes a custom argument list before
  # relying on |args| here.
  (options, args) = parser.parse_args()
  if not args:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if not options.bucket:
    parser.error('Missing bucket.  Specify bucket with --bucket.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  if options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  if options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if not options.sha1_file:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename
    elif options.sha1_file:
      # Target is a .sha1 file.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]
    else:
      parser.error('Unreachable state.')

  # Check if output file already exists.
  if not options.directory and not options.force and not options.no_resume:
    if os.path.exists(options.output):
      # Bug fix: the old message claimed '--no_resume is specified', but
      # this branch runs only when neither --force nor --no_resume was
      # given.
      parser.error('Output file %s exists; use --force or --no_resume to '
                   'download anyway.' % options.output)

  # Make sure we can find a working instance of gsutil.
  if os.path.exists(GSUTIL_DEFAULT_PATH):
    gsutil = Gsutil(GSUTIL_DEFAULT_PATH)
  else:
    gsutil = None
    for path in os.environ["PATH"].split(os.pathsep):
      if os.path.exists(path) and 'gsutil' in os.listdir(path):
        gsutil = Gsutil(os.path.join(path, 'gsutil'))
        # Bug fix: stop at the first hit to honor PATH precedence; the old
        # loop kept the LAST matching directory.
        break
    if not gsutil:
      parser.error('gsutil not found in %s, bad depot_tools checkout?' %
                   GSUTIL_DEFAULT_PATH)

  # Check we have a valid bucket with valid permissions.
  base_url, code = check_bucket_permissions(options.bucket, gsutil)
  if code:
    return code

  return download_from_google_storage(
      input_filename, base_url, gsutil, options.num_threads, options.directory,
      options.recursive, options.force, options.output, options.ignore_errors,
      options.sha1_file)
336
337
# Propagate main()'s return value as the process exit status.
if __name__ == '__main__':
  sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « no previous file | tests/download_from_google_storage_unittests.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698