download_from_google_storage.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Download files from Google Storage based on SHA1 sums."""

	7

	8

	9 import hashlib

	10 import optparse

	11 import os

	12 import Queue

	13 import re

	14 import sys

	15 import threading

	16 import time

	17

	18 import subprocess2

	19

	20

	21 GSUTIL_DEFAULT_PATH = os.path.join(

	22 os.path.dirname(os.path.abspath(__file__)),

	23 'third_party', 'gsutil', 'gsutil')

	24

	25

	26 class FileNotFoundError(IOError):

	27 pass

	28

	29

	30 class InvalidFileError(IOError):

	31 pass

	32

	33

	34 # Common utilities

	35 class Gsutil(object):

	36 """Call gsutil with some predefined settings. This is a convenience object,

	37 and is also immutable."""

	38 def __init__(self, path, boto_path=None, timeout=None):

	39 if not os.path.exists(path):

	40 raise FileNotFoundError('GSUtil not found in %s' % path)

	41 self.path = path

	42 self.timeout = timeout

	43 self.boto_path = boto_path

	44

	45 def call(self, *args):

	46 env = os.environ.copy()

	47 if self.boto_path:

	48 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	49 return subprocess2.call((sys.executable, self.path) + args,

	50 env=env,

	51 timeout=self.timeout)

	52

	53 def check_call(self, *args):

	54 env = os.environ.copy()

	55 if self.boto_path:

	56 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	57 ((out, err), code) = subprocess2.communicate(

	58 (sys.executable, self.path) + args,

	59 stdout=subprocess2.PIPE,

	60 stderr=subprocess2.PIPE,

	61 env=env,

	62 timeout=self.timeout)

	63

	64 # Parse output.

	65 status_code_match = re.search('status=([0-9]+)', err)

	66 if status_code_match:

	67 return (int(status_code_match.group(1)), out, err)

	68 if ('You are attempting to access protected data with '

	69 'no configured credentials.' in err):

	70 return (403, out, err)
	Isaac (away) 2013/09/19 12:25:16 These lines appear to silence a server error messa These lines appear to silence a server error messages and cause the script to incorrectly report receiving a 403. (std_error is not printed in case on line 83 below) Can I ask why we did it this way? Ryan Tseng 2013/09/19 17:42:19 Its to replace GSResponseError: status=403, code=A Show quoted text On 2013/09/19 12:25:16, Isaac wrote: > These lines appear to silence a server error messages and cause the script to > incorrectly report receiving a 403. (std_error is not printed in case on line > 83 below) > > Can I ask why we did it this way? Its to replace GSResponseError: status=403, code=AccessDenied, reason="Forbidden", message="Access denied.", detail="..." With a more better looking error without losing information. That error message isn't very friendly looking and doesn't give much of a resolution (In our case the most common failure is due to lack of credentials, in which case it should ask the user to run the script with the "config" argument). Of course if we're losing information we should fix that.
	71 if 'No such object' in err:

	72 return (404, out, err)

	73 return (code, out, err)

	74

	75

	76 def check_bucket_permissions(bucket, gsutil):

	77 if not bucket:

	78 print >> sys.stderr, 'Missing bucket %s.'

	79 return (None, 1)

	80 base_url = 'gs://%s' % bucket

	81

	82 code, _, ls_err = gsutil.check_call('ls', base_url)

	83 if code == 403:

	84 code, _, _ = gsutil.call('config')

	85 if code != 0:

	86 print >> sys.stderr, 'Error while authenticating to %s.' % base_url

	87 elif code == 404:

	88 print >> sys.stderr, '%s not found.' % base_url

	89 elif code != 0:

	90 print >> sys.stderr, ls_err

	91 return (base_url, code)

	92

	93

	94 def get_sha1(filename):

	95 sha1 = hashlib.sha1()

	96 with open(filename, 'rb') as f:

	97 while True:

	98 # Read in 1mb chunks, so it doesn't all have to be loaded into memory.

	99 chunk = f.read(1024*1024)

	100 if not chunk:

	101 break

	102 sha1.update(chunk)

	103 return sha1.hexdigest()

	104

	105

	106 # Download-specific code starts here

	107

	108 def enumerate_work_queue(input_filename, work_queue, directory,

	109 recursive, ignore_errors, output, sha1_file):

	110 if sha1_file:

	111 if not os.path.exists(input_filename):

	112 if not ignore_errors:

	113 raise FileNotFoundError('%s not found.' % input_filename)

	114 print >> sys.stderr, '%s not found.' % input_filename

	115 with open(input_filename, 'rb') as f:

	116 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	117 if sha1_match:

	118 work_queue.put(

	119 (sha1_match.groups(1)[0], input_filename.replace('.sha1', '')))

	120 return 1

	121 if not ignore_errors:

	122 raise InvalidFileError('No sha1 sum found in %s.' % input_filename)

	123 print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename

	124 return 0

	125

	126 if not directory:

	127 work_queue.put((input_filename, output))

	128 return 1

	129

	130 work_queue_size = 0

	131 for root, dirs, files in os.walk(input_filename):

	132 if not recursive:

	133 for item in dirs[:]:

	134 dirs.remove(item)

	135 else:

	136 for exclude in ['.svn', '.git']:

	137 if exclude in dirs:

	138 dirs.remove(exclude)

	139 for filename in files:

	140 full_path = os.path.join(root, filename)

	141 if full_path.endswith('.sha1'):

	142 with open(full_path, 'rb') as f:

	143 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	144 if sha1_match:

	145 work_queue.put(

	146 (sha1_match.groups(1)[0], full_path.replace('.sha1', '')))

	147 work_queue_size += 1

	148 else:

	149 if not ignore_errors:

	150 raise InvalidFileError('No sha1 sum found in %s.' % filename)

	151 print >> sys.stderr, 'No sha1 sum found in %s.' % filename

	152 return work_queue_size

	153

	154

	155 def _downloader_worker_thread(thread_num, q, force, base_url,

	156 gsutil, out_q, ret_codes):

	157 while True:

	158 input_sha1_sum, output_filename = q.get()

	159 if input_sha1_sum is None:

	160 return

	161 if os.path.exists(output_filename) and not force:

	162 if get_sha1(output_filename) == input_sha1_sum:

	163 out_q.put(

	164 '%d> File %s exists and SHA1 matches. Skipping.' % (

	165 thread_num, output_filename))

	166 continue

	167 # Check if file exists.

	168 file_url = '%s/%s' % (base_url, input_sha1_sum)

	169 if gsutil.check_call('ls', file_url)[0] != 0:

	170 out_q.put('%d> File %s for %s does not exist, skipping.' % (

	171 thread_num, file_url, output_filename))

	172 ret_codes.put((1, 'File %s for %s does not exist.' % (

	173 file_url, output_filename)))

	174 continue

	175 # Fetch the file.

	176 out_q.put('%d> Downloading %s...' % (

	177 thread_num, output_filename))

	178 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)

	179 if code != 0:

	180 out_q.put('%d> %s' % (thread_num, err))

	181 ret_codes.put((code, err))

	182

	183

	184 def printer_worker(output_queue):

	185 while True:

	186 line = output_queue.get()

	187 # Its plausible we want to print empty lines.

	188 if line is None:

	189 break

	190 print line

	191

	192

	193 def download_from_google_storage(

	194 input_filename, base_url, gsutil, num_threads, directory, recursive,

	195 force, output, ignore_errors, sha1_file):

	196 # Start up all the worker threads.

	197 all_threads = []

	198 download_start = time.time()

	199 stdout_queue = Queue.Queue()

	200 work_queue = Queue.Queue()

	201 ret_codes = Queue.Queue()

	202 ret_codes.put((0, None))

	203 for thread_num in range(num_threads):

	204 t = threading.Thread(

	205 target=_downloader_worker_thread,

	206 args=[thread_num, work_queue, force, base_url,

	207 gsutil, stdout_queue, ret_codes])

	208 t.daemon = True

	209 t.start()

	210 all_threads.append(t)

	211 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])

	212 printer_thread.daemon = True

	213 printer_thread.start()

	214

	215 # Enumerate our work queue.

	216 work_queue_size = enumerate_work_queue(

	217 input_filename, work_queue, directory, recursive,

	218 ignore_errors, output, sha1_file)

	219 for _ in all_threads:

	220 work_queue.put((None, None)) # Used to tell worker threads to stop.

	221

	222 # Wait for all downloads to finish.

	223 for t in all_threads:

	224 t.join()

	225 stdout_queue.put(None)

	226 printer_thread.join()

	227

	228 # See if we ran into any errors.

	229 max_ret_code = 0

	230 for ret_code, message in ret_codes.queue:

	231 max_ret_code = max(ret_code, max_ret_code)

	232 if message:

	233 print >> sys.stderr, message

	234 if not max_ret_code:

	235 print 'Success!'

	236

	237 print 'Downloading %d files took %1f second(s)' % (

	238 work_queue_size, time.time() - download_start)

	239 return max_ret_code

	240

	241

	242 def main(args):

	243 usage = ('usage: %prog [options] target\n'

	244 'Target must be:\n'

	245 ' (default) a sha1 sum ([A-Za-z0-9]{40}).\n'

	246 ' (-s or --sha1_file) a .sha1 file, containing a sha1 sum on '

	247 'the first line.\n'

	248 ' (-d or --directory) A directory to scan for .sha1 files.')

	249 parser = optparse.OptionParser(usage)

	250 parser.add_option('-o', '--output',

	251 help='Specify the output file name. Defaults to: '

	252 '(a) Given a SHA1 hash, the name is the SHA1 hash. '

	253 '(b) Given a .sha1 file or directory, the name will '

	254 'match (.*).sha1.')

	255 parser.add_option('-b', '--bucket',

	256 help='Google Storage bucket to fetch from.')

	257 parser.add_option('-e', '--boto',

	258 help='Specify a custom boto file.')

	259 parser.add_option('-c', '--no_resume', action='store_true',

	260 help='Resume download if file is partially downloaded.')

	261 parser.add_option('-f', '--force', action='store_true',

	262 help='Force download even if local file exists.')

	263 parser.add_option('-i', '--ignore_errors', action='store_true',

	264 help='Don\'t throw error if we find an invalid .sha1 file.')

	265 parser.add_option('-r', '--recursive', action='store_true',

	266 help='Scan folders recursively for .sha1 files. '

	267 'Must be used with -d/--directory')

	268 parser.add_option('-t', '--num_threads', default=1, type='int',

	269 help='Number of downloader threads to run.')

	270 parser.add_option('-d', '--directory', action='store_true',

	271 help='The target is a directory. '

	272 'Cannot be used with -s/--sha1_file.')

	273 parser.add_option('-s', '--sha1_file', action='store_true',

	274 help='The target is a file containing a sha1 sum. '

	275 'Cannot be used with -d/--directory.')

	276

	277 (options, args) = parser.parse_args()

	278 if not args:

	279 parser.error('Missing target.')

	280 if len(args) > 1:

	281 parser.error('Too many targets.')

	282 if not options.bucket:

	283 parser.error('Missing bucket. Specify bucket with --bucket.')

	284 if options.sha1_file and options.directory:

	285 parser.error('Both --directory and --sha1_file are specified, '

	286 'can only specify one.')

	287 if options.recursive and not options.directory:

	288 parser.error('--recursive specified but --directory not specified.')

	289 if options.output and options.directory:

	290 parser.error('--directory is specified, so --output has no effect.')

	291 input_filename = args[0]

	292

	293 # Set output filename if not specified.

	294 if not options.output and not options.directory:

	295 if not options.sha1_file:

	296 # Target is a sha1 sum, so output filename would also be the sha1 sum.

	297 options.output = input_filename

	298 elif options.sha1_file:

	299 # Target is a .sha1 file.

	300 if not input_filename.endswith('.sha1'):

	301 parser.error('--sha1_file is specified, but the input filename '

	302 'does not end with .sha1, and no --output is specified. '

	303 'Either make sure the input filename has a .sha1 '

	304 'extension, or specify --output.')

	305 options.output = input_filename[:-5]

	306 else:

	307 parser.error('Unreachable state.')

	308

	309 # Check if output file already exists.

	310 if not options.directory and not options.force and not options.no_resume:

	311 if os.path.exists(options.output):

	312 parser.error('Output file %s exists and --no_resume is specified.'

	313 % options.output)

	314

	315 # Make sure we can find a working instance of gsutil.

	316 if os.path.exists(GSUTIL_DEFAULT_PATH):

	317 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)

	318 else:

	319 gsutil = None

	320 for path in os.environ["PATH"].split(os.pathsep):

	321 if os.path.exists(path) and 'gsutil' in os.listdir(path):

	322 gsutil = Gsutil(os.path.join(path, 'gsutil'))

	323 if not gsutil:

	324 parser.error('gsutil not found in %s, bad depot_tools checkout?' %

	325 GSUTIL_DEFAULT_PATH)

	326

	327 # Check we have a valid bucket with valid permissions.

	328 base_url, code = check_bucket_permissions(options.bucket, gsutil)

	329 if code:

	330 return code

	331

	332 return download_from_google_storage(

	333 input_filename, base_url, gsutil, options.num_threads, options.directory,

	334 options.recursive, options.force, options.output, options.ignore_errors,

	335 options.sha1_file)

	336

	337

if __name__ == '__main__':
  # Script entry point: main()'s return value becomes the process exit status.
  sys.exit(main(sys.argv))

OLD	NEW

« no previous file with comments | « no previous file | tests/download_from_google_storage_unittests.py » ('j') | no next file with comments »