Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(723)

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Removed gstools.py, added more error messages Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tests/gstools/download_test_data/rootfolder_text.txt » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Download files from Google Storage based on SHA1 sums."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import sys
15 import threading
16 import time
17
18 import subprocess2
19
20
21 GSUTIL_DEFAULT_PATH = os.path.join(
22 os.path.dirname(os.path.abspath(__file__)),
23 'third_party', 'gsutil', 'gsutil')
24
25
26 # Common utilities
# Common utilities
class Gsutil(object):
  """Call gsutil with some predefined settings.

  Attributes:
    path: Path to the gsutil script.
    boto_path: Optional path to a custom .boto credentials file.
    timeout: Optional timeout (in seconds) applied to every gsutil call.
  """
  def __init__(self, path, boto_path=None, timeout=None):
    if not os.path.exists(path):
      raise OSError('GSUtil not found in %s' % path)
    self.path = path
    self.timeout = timeout
    self.boto_path = boto_path

  def _environ(self):
    """Return a copy of os.environ with the boto credentials injected."""
    env = os.environ.copy()
    if self.boto_path is not None:
      env['AWS_CREDENTIAL_FILE'] = self.boto_path
    return env

  def call(self, *args):
    """Run gsutil with |args|, inheriting stdout/stderr; return exit code."""
    return subprocess2.call((sys.executable, self.path) + args,
                            env=self._environ(),
                            timeout=self.timeout)

  def check_call(self, *args):
    """Run gsutil with |args|, capturing its output.

    Returns:
      A (code, out, err) tuple.  An HTTP-style status code found in
      gsutil's stderr (e.g. 403, 404) takes precedence over the process
      exit code.
    """
    ((out, err), code) = subprocess2.communicate(
        (sys.executable, self.path) + args,
        stdout=subprocess2.PIPE,
        stderr=subprocess2.PIPE,
        env=self._environ(),
        timeout=self.timeout)

    # Parse the HTTP status code out of gsutil's stderr, if present.
    status_code_match = re.search('status=([0-9]+)', err)
    if status_code_match:
      # BUG FIX: the original did int(status_code_match.groups(1)), which
      # raises TypeError (groups() returns a tuple), and returned a bare
      # int, breaking the (code, out, err) contract all callers unpack.
      return (int(status_code_match.group(1)), out, err)
    elif ('You are attempting to access protected data with '
          'no configured credentials.' in err):
      return (403, out, err)
    elif 'No such object' in err:
      return (404, out, err)
    else:
      return (code, out, err)

  def clone(self):
    """Return a new Gsutil instance with identical settings."""
    return Gsutil(self.path, self.boto_path, self.timeout)
69
70
def CheckBucketPermissions(bucket, gsutil):
  """Verify that |bucket| exists and is reachable with our credentials.

  Args:
    bucket: Name of the Google Storage bucket (without the gs:// prefix).
    gsutil: A Gsutil instance used to probe the bucket.

  Returns:
    A (base_url, code) tuple where base_url is 'gs://<bucket>' (or None if
    no bucket was given) and code is 0 on success, non-zero otherwise.
  """
  if not bucket:
    # BUG FIX: the message contained a stray unfilled '%s' placeholder.
    print >> sys.stderr, 'Missing bucket.'
    return (None, 1)
  base_url = 'gs://%s' % bucket

  # Check if we have permissions to the Google Storage bucket.
  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code == 403:
    # Access denied: give the user a chance to set up credentials.
    code, _, _ = gsutil.call('config')
    if code != 0:
      print >> sys.stderr, 'Error while authenticating to %s.' % base_url
  elif code == 404:
    print >> sys.stderr, '%s not found.' % base_url
  elif code != 0:
    print >> sys.stderr, ls_err
  return (base_url, code)
88
89
def GetSHA1(filename):
  """Return the hex SHA1 digest of the file at |filename|."""
  digest = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Stream the file in 1MB chunks so large files never have to be
    # resident in memory all at once.
    for chunk in iter(lambda: f.read(1024 * 1024), b''):
      digest.update(chunk)
  return digest.hexdigest()
100
101
def GetMD5(filename, lock):
  """Return the hex MD5 digest of |filename|.

  The file is read while holding |lock|, serializing disk access across
  worker threads.
  """
  digest = hashlib.md5()
  with lock:
    with open(filename, 'rb') as f:
      # Read in 1MB chunks to bound memory usage on large files.
      for chunk in iter(lambda: f.read(1024 * 1024), b''):
        digest.update(chunk)
  return digest.hexdigest()
112
113
def GetMD5Cached(filename, lock):
  """Don't calculate the MD5 if we can find a .md5 file.

  If a sibling '<filename>.md5' file exists and contains a valid-looking
  digest, return that digest without rereading |filename|.  Otherwise
  compute the MD5 (under |lock|) and write it to the cache file.
  """
  md5_filename = '%s.md5' % filename
  # See if we can find an existing MD5 sum stored in a file.
  if os.path.exists(md5_filename):
    with open(md5_filename) as f:
      md5_match = re.search('([a-z0-9]{32})', f.read())
    if md5_match:
      return md5_match.group(1)
  # BUG FIX: the original returned None when the cache file existed but
  # held no valid digest; now we recompute (and refresh the cache).
  md5_hash = GetMD5(filename, lock)
  with open(md5_filename, 'w') as f:
    f.write(md5_hash)
  return md5_hash
127
128
129 # Download-specific code starts here
130
def enumerate_work_queue(input_filename, work_queue, directory,
                         recursive, ignore_errors, output, sha1_file):
  """Fill |work_queue| with (sha1, output_filename) download tasks.

  Args:
    input_filename: A bare sha1 sum, a .sha1 file, or a directory,
        depending on the |sha1_file| / |directory| flags.
    work_queue: Queue receiving (sha1, target_filename) tuples.
    directory: If True, scan |input_filename| for .sha1 files.
    recursive: If True (with |directory|), recurse into subdirectories.
    ignore_errors: If True, warn instead of raising on bad input.
    output: Target filename when |input_filename| is a bare sha1 sum.
    sha1_file: If True, |input_filename| is a .sha1 file.

  Returns:
    The number of items added to the queue.
  """
  if sha1_file:
    if not os.path.exists(input_filename):
      print >> sys.stderr, '%s not found.' % input_filename
      if not ignore_errors:
        raise Exception('%s not found.' % input_filename)
      # BUG FIX: previously fell through and crashed opening the missing
      # file even when --ignore_errors was given.
      return 0
    with open(input_filename, 'rb') as f:
      sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
    if sha1_match:
      # group(1) instead of groups(1)[0]: same value, clearer intent.
      work_queue.put(
          (sha1_match.group(1), input_filename.replace('.sha1', '')))
      return 1
    print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename
    if not ignore_errors:
      raise Exception('No sha1 sum found in %s.' % input_filename)
    return 0

  if not directory:
    # The target is a bare sha1 sum; download it to |output|.
    work_queue.put((input_filename, output))
    return 1

  work_queue_size = 0
  for root, dirs, files in os.walk(input_filename):
    if not recursive:
      # Prune every subdirectory so os.walk stays in the top level.
      for item in dirs[:]:
        dirs.remove(item)
    else:
      # Skip VCS metadata directories.
      for exclude in ['.svn', '.git']:
        if exclude in dirs:
          dirs.remove(exclude)
    for filename in files:
      full_path = os.path.join(root, filename)
      if full_path.endswith('.sha1'):
        with open(full_path, 'rb') as f:
          sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
        if sha1_match:
          work_queue.put(
              (sha1_match.group(1), full_path.replace('.sha1', '')))
          work_queue_size += 1
        else:
          print >> sys.stderr, 'No sha1 sum found in %s.' % filename
          if not ignore_errors:
            raise Exception('No sha1 sum found in %s.' % filename)
  return work_queue_size
176
177
178 def _downloader_worker_thread(thread_num, q, force, base_url, gsutil, out_q):
179 while True:
180 input_sha1_sum, output_filename = q.get()
181 if input_sha1_sum is None:
182 out_q.put('Thread %d is done' % thread_num)
183 return
184 if os.path.exists(output_filename) and not force:
185 if GetSHA1(output_filename) == input_sha1_sum:
186 out_q.put(
187 'File %s exists and SHA1 sum (%s) matches. Skipping.' % (
188 output_filename , input_sha1_sum))
189 continue
190 # Check if file exists.
191 file_url = '%s/%s' % (base_url, input_sha1_sum)
192 if gsutil.check_call('ls', file_url)[0] != 0:
193 out_q.put('File %s for %s does not exist, skipping.' % (
194 file_url, output_filename))
195 continue
196 # Fetch the file.
197 out_q.put('Downloading %s to %s...' % (file_url, output_filename))
198 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
199 if code != 0:
200 out_q.put(err)
201 return code
202
203
204 def download_from_google_storage(
205 input_filename, base_url, gsutil, num_threads, directory, recursive,
206 force, output, ignore_errors, sha1_file):
207 # Start up all the worker threads.
208 all_threads = []
209 download_timer = time.time()
210 stdout_queue = Queue.Queue()
211 work_queue = Queue.Queue()
212 for thread_num in range(num_threads):
213 t = threading.Thread(
214 target=_downloader_worker_thread,
215 args=[thread_num, work_queue, force, base_url,
216 gsutil.clone(), stdout_queue])
217 t.daemon = True
218 t.start()
219 all_threads.append(t)
220
221 # Enumerate our work queue.
222 work_queue_size = enumerate_work_queue(
223 input_filename, work_queue, directory, recursive,
224 ignore_errors, output, sha1_file)
225 for _ in all_threads:
226 work_queue.put((None, None)) # Used to tell worker threads to stop.
227
228 # Wait for all downloads to finish.
229 while not work_queue.empty() or any(t.is_alive() for t in all_threads):
Marc-Antoine Ruel (Google) 2013/03/05 02:04:08 There's a race condition in there; - The last thre
Ryan Tseng 2013/03/06 19:03:56 changed or -> and * If the queue not empty, then t
230 print stdout_queue.get()
231 while not stdout_queue.empty():
232 print stdout_queue.get()
233
234 print 'Success.'
235 print 'Downloading %d files took %1f second(s)' % (
236 work_queue_size, time.time() - download_timer)
237 return 0
238
239
def main(args):
  """Entry point: parse and validate options, then start the download.

  Returns 0 on success, non-zero on error.
  """
  # NOTE(review): parse_args() below is called without arguments, so it
  # reads sys.argv directly and the |args| parameter is ignored — looks
  # like it should be parser.parse_args(args); confirm before changing.
  usage = ('usage: %prog [options] target\nTarget must be:\n'
           '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '
           '.sha1 file, containing a sha1 sum on the first line. (-d or '
           '--directory) A directory to scan for .sha1 files. ')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output',
                    help='Specify the output file name. Defaults to:\n'
                    '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'
                    '(b) Given a .sha1 file or directory, the name will '
                    'match (.*).sha1.')
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto',
                    help='Specify a custom boto file.')
  # NOTE(review): the help text says 'Resume download...' but the flag is
  # named no_resume — the wording contradicts the name; confirm intent.
  parser.add_option('-c', '--no_resume', action='store_true',
                    help='Resume download if file is partially downloaded.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force download even if local file exists.')
  parser.add_option('-i', '--ignore_errors', action='store_true',
                    help='Don\'t throw error if we find an invalid .sha1 file.')
  parser.add_option('-r', '--recursive', action='store_true',
                    help='Scan folders recursively for .sha1 files. '
                    'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true',
                    help='The target is a directory. '
                    'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true',
                    help='The target is a file containing a sha1 sum. '
                    'Cannot be used with -d/--directory.')

  (options, args) = parser.parse_args()
  # Validate the positional target and the option combinations.
  if not args:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if not options.bucket:
    parser.error('Missing bucket. Specify bucket with --bucket.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  elif options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  elif options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  else:
    input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if not options.sha1_file:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename
    elif options.sha1_file:
      # Target is a .sha1 file; strip the extension to get the output name.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]
    else:
      raise NotImplementedError('Unreachable state.')

  # Check if output file already exists.
  # NOTE(review): this errors when --no_resume is NOT passed, yet the
  # message claims it was specified — the polarity of the check or the
  # message looks inverted; confirm intended behavior.
  if not options.directory and not options.force and not options.no_resume:
    if os.path.exists(options.output):
      parser.error('Output file %s exists and --no_resume is specified.'
                   % options.output)

  # Make sure we can find a working instance of gsutil.
  if os.path.exists(GSUTIL_DEFAULT_PATH):
    gsutil = Gsutil(GSUTIL_DEFAULT_PATH)
  else:
    print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
                          GSUTIL_DEFAULT_PATH)
    return 1

  # Check we have a valid bucket with valid permissions.
  base_url, code = CheckBucketPermissions(options.bucket, gsutil)
  if code:
    return code

  return download_from_google_storage(
      input_filename, base_url, gsutil, options.num_threads, options.directory,
      options.recursive, options.force, options.output, options.ignore_errors,
      options.sha1_file)
329
330
if __name__ == '__main__':
  # Propagate main()'s return value as the process exit code.
  sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « no previous file | tests/gstools/download_test_data/rootfolder_text.txt » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698