download_from_google_storage.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Removed gstools.py, added more error messages Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Download files from Google Storage based on SHA1 sums."""

	7

	8

	9 import hashlib

	10 import optparse

	11 import os

	12 import Queue

	13 import re

	14 import sys

	15 import threading

	16 import time

	17

	18 import subprocess2

	19

	20

	21 GSUTIL_DEFAULT_PATH = os.path.join(

	22 os.path.dirname(os.path.abspath(__file__)),

	23 'third_party', 'gsutil', 'gsutil')

	24

	25

	26 # Common utilities

	27 class Gsutil(object):

	28 """Call gsutil with some predefined settings."""

	29 def __init__(self, path, boto_path=None, timeout=None):

	30 if not os.path.exists(path):

	31 raise OSError('GSUtil not found in %s' % path)

	32 self.path = path

	33 self.timeout = timeout

	34 self.boto_path = boto_path

	35

	36 def call(self, *args):

	37 env = os.environ.copy()

	38 if self.boto_path is not None:

	39 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	40 return subprocess2.call((sys.executable, self.path) + args,

	41 env=env,

	42 timeout=self.timeout)

	43

	44 def check_call(self, *args):

	45 env = os.environ.copy()

	46 if self.boto_path is not None:

	47 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	48 ((out, err), code) = subprocess2.communicate(

	49 (sys.executable, self.path) + args,

	50 stdout=subprocess2.PIPE,

	51 stderr=subprocess2.PIPE,

	52 env=env,

	53 timeout=self.timeout)

	54

	55 # Parse output.

	56 status_code_match = re.search('status=([0-9]+)', err)

	57 if status_code_match:

	58 return int(status_code_match.groups(1))

	59 elif ('You are attempting to access protected data with '

	60 'no configured credentials.' in err):

	61 return (403, out, err)

	62 elif 'No such object' in err:

	63 return (404, out, err)

	64 else:

	65 return (code, out, err)

	66

	67 def clone(self):

	68 return Gsutil(self.path, self.boto_path, self.timeout)

	69

	70

	71 def CheckBucketPermissions(bucket, gsutil):

	72 if not bucket:

	73 print >> sys.stderr, 'Missing bucket %s.'

	74 return (None, 1)

	75 base_url = 'gs://%s' % bucket

	76

	77 # Check if we have permissions to the Google Storage bucket.

	78 code, _, ls_err = gsutil.check_call('ls', base_url)

	79 if code == 403:

	80 code, _, _ = gsutil.call('config')

	81 if code != 0:

	82 print >> sys.stderr, 'Error while authenticating to %s.' % base_url

	83 elif code == 404:

	84 print >> sys.stderr, '%s not found.' % base_url

	85 elif code != 0:

	86 print >> sys.stderr, ls_err

	87 return (base_url, code)

	88

	89

	90 def GetSHA1(filename):

	91 sha1 = hashlib.sha1()

	92 with open(filename, 'rb') as f:

	93 while True:

	94 # Read in 1mb chunks, so it doesn't all have to be loaded into memory.

	95 chunk = f.read(1024*1024)

	96 if not chunk:

	97 break

	98 sha1.update(chunk)

	99 return sha1.hexdigest()

	100

	101

	102 def GetMD5(filename, lock):
	Marc-Antoine Ruel (Google) 2013/03/05 02:04:08 This functions is not used in this file, please mo This functions is not used in this file, please move to upload. Ryan Tseng 2013/03/06 19:03:56 Done. Show quoted text On 2013/03/05 02:04:08, Marc-Antoine Ruel (Google) wrote: > This functions is not used in this file, please move to upload. Done.
	103 md5_calculator = hashlib.md5()

	104 with lock:

	105 with open(filename, 'rb') as f:

	106 while True:

	107 chunk = f.read(1024*1024)

	108 if not chunk:

	109 break

	110 md5_calculator.update(chunk)

	111 return md5_calculator.hexdigest()

	112

	113

	114 def GetMD5Cached(filename, lock):
	Marc-Antoine Ruel (Google) 2013/03/05 02:04:08 This function is not used in this file, please mov This function is not used in this file, please move to upload Ryan Tseng 2013/03/06 19:03:56 Done. Show quoted text On 2013/03/05 02:04:08, Marc-Antoine Ruel (Google) wrote: > This function is not used in this file, please move to upload Done.
	115 """Don't calculate the MD5 if we can find a .md5 file."""

	116 # See if we can find an existing MD5 sum stored in a file.

	117 if os.path.exists('%s.md5' % filename):

	118 with open('%s.md5' % filename) as f:

	119 md5_match = re.search('([a-z0-9]{32})', f.read())

	120 if md5_match:

	121 return md5_match.group(1)

	122 else:

	123 md5_hash = GetMD5(filename, lock)

	124 with open('%s.md5' % filename, 'w') as f:

	125 f.write(md5_hash)

	126 return md5_hash

	127

	128

	129 # Download-specific code starts here

	130

	131 def enumerate_work_queue(input_filename, work_queue, directory,

	132 recursive, ignore_errors, output, sha1_file):

	133 if sha1_file:

	134 if not os.path.exists(input_filename):

	135 print >> sys.stderr, '%s not found.' % input_filename

	136 if not ignore_errors:

	137 raise Exception('%s not found.' % input_filename)

	138 with open(input_filename, 'rb') as f:

	139 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	140 if sha1_match:

	141 work_queue.put(

	142 (sha1_match.groups(1)[0], input_filename.replace('.sha1', '')))

	143 return 1

	144 print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename

	145 if not ignore_errors:

	146 raise Exception('No sha1 sum found in %s.' % input_filename)

	147 return 0

	148

	149 if not directory:

	150 work_queue.put((input_filename, output))

	151 return 1

	152

	153 work_queue_size = 0

	154 for root, dirs, files in os.walk(input_filename):

	155 if not recursive:

	156 for item in dirs[:]:

	157 dirs.remove(item)

	158 else:

	159 for exclude in ['.svn', '.git']:

	160 if exclude in dirs:

	161 dirs.remove(exclude)

	162 for filename in files:

	163 full_path = os.path.join(root, filename)

	164 if full_path.endswith('.sha1'):

	165 with open(full_path, 'rb') as f:

	166 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	167 if sha1_match:

	168 work_queue.put(

	169 (sha1_match.groups(1)[0], full_path.replace('.sha1', '')))

	170 work_queue_size += 1

	171 else:

	172 print >> sys.stderr, 'No sha1 sum found in %s.' % filename

	173 if not ignore_errors:

	174 raise Exception('No sha1 sum found in %s.' % filename)

	175 return work_queue_size

	176

	177

	178 def _downloader_worker_thread(thread_num, q, force, base_url, gsutil, out_q):

	179 while True:

	180 input_sha1_sum, output_filename = q.get()

	181 if input_sha1_sum is None:

	182 out_q.put('Thread %d is done' % thread_num)

	183 return

	184 if os.path.exists(output_filename) and not force:

	185 if GetSHA1(output_filename) == input_sha1_sum:

	186 out_q.put(

	187 'File %s exists and SHA1 sum (%s) matches. Skipping.' % (

	188 output_filename , input_sha1_sum))

	189 continue

	190 # Check if file exists.

	191 file_url = '%s/%s' % (base_url, input_sha1_sum)

	192 if gsutil.check_call('ls', file_url)[0] != 0:

	193 out_q.put('File %s for %s does not exist, skipping.' % (

	194 file_url, output_filename))

	195 continue

	196 # Fetch the file.

	197 out_q.put('Downloading %s to %s...' % (file_url, output_filename))

	198 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)

	199 if code != 0:

	200 out_q.put(err)

	201 return code

	202

	203

	204 def download_from_google_storage(

	205 input_filename, base_url, gsutil, num_threads, directory, recursive,

	206 force, output, ignore_errors, sha1_file):

	207 # Start up all the worker threads.

	208 all_threads = []

	209 download_timer = time.time()

	210 stdout_queue = Queue.Queue()

	211 work_queue = Queue.Queue()

	212 for thread_num in range(num_threads):

	213 t = threading.Thread(

	214 target=_downloader_worker_thread,

	215 args=[thread_num, work_queue, force, base_url,

	216 gsutil.clone(), stdout_queue])

	217 t.daemon = True

	218 t.start()

	219 all_threads.append(t)

	220

	221 # Enumerate our work queue.

	222 work_queue_size = enumerate_work_queue(

	223 input_filename, work_queue, directory, recursive,

	224 ignore_errors, output, sha1_file)

	225 for _ in all_threads:

	226 work_queue.put((None, None)) # Used to tell worker threads to stop.

	227

	228 # Wait for all downloads to finish.

	229 while not work_queue.empty() or any(t.is_alive() for t in all_threads):
	Marc-Antoine Ruel (Google) 2013/03/05 02:04:08 There's a race condition in there; - The last thre There's a race condition in there; - The last thread is exiting but t.is_alive() is still true. - Nothing is going to be queued. This will result in a hang at line 230. Ryan Tseng 2013/03/06 19:03:56 changed or -> and * If the queue not empty, then t Show quoted text On 2013/03/05 02:04:08, Marc-Antoine Ruel (Google) wrote: > There's a race condition in there; > - The last thread is exiting but t.is_alive() is still true. > - Nothing is going to be queued. > > This will result in a hang at line 230. changed or -> and * If the queue not empty, then there will be one or more threads that will queue into stdout_queue. Meaning 230 will not hang since there will guaranteed to be one or more items in queue, or to be queued later. * If the queue is empty, we exit this loop to avoid any race conditions between queueing stdouts and thread exits.
	230 print stdout_queue.get()

	231 while not stdout_queue.empty():

	232 print stdout_queue.get()

	233

	234 print 'Success.'

	235 print 'Downloading %d files took %1f second(s)' % (

	236 work_queue_size, time.time() - download_timer)

	237 return 0

	238

	239

	240 def main(args):

	241 usage = ('usage: %prog [options] target\nTarget must be:\n'

	242 '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '

	243 '.sha1 file, containing a sha1 sum on the first line. (-d or '

	244 '--directory) A directory to scan for .sha1 files. ')

	245 parser = optparse.OptionParser(usage)

	246 parser.add_option('-o', '--output',

	247 help='Specify the output file name. Defaults to:\n'

	248 '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'

	249 '(b) Given a .sha1 file or directory, the name will '

	250 'match (.*).sha1.')

	251 parser.add_option('-b', '--bucket',

	252 help='Google Storage bucket to fetch from.')

	253 parser.add_option('-e', '--boto',

	254 help='Specify a custom boto file.')

	255 parser.add_option('-c', '--no_resume', action='store_true',

	256 help='Resume download if file is partially downloaded.')

	257 parser.add_option('-f', '--force', action='store_true',

	258 help='Force download even if local file exists.')

	259 parser.add_option('-i', '--ignore_errors', action='store_true',

	260 help='Don\'t throw error if we find an invalid .sha1 file.')

	261 parser.add_option('-r', '--recursive', action='store_true',

	262 help='Scan folders recursively for .sha1 files. '

	263 'Must be used with -d/--directory')

	264 parser.add_option('-t', '--num_threads', default=1, type='int',

	265 help='Number of downloader threads to run.')

	266 parser.add_option('-d', '--directory', action='store_true',

	267 help='The target is a directory. '

	268 'Cannot be used with -s/--sha1_file.')

	269 parser.add_option('-s', '--sha1_file', action='store_true',

	270 help='The target is a file containing a sha1 sum. '

	271 'Cannot be used with -d/--directory.')

	272

	273 (options, args) = parser.parse_args()

	274 if not args:

	275 parser.error('Missing target.')

	276 if len(args) > 1:

	277 parser.error('Too many targets.')

	278 if not options.bucket:

	279 parser.error('Missing bucket. Specify bucket with --bucket.')

	280 if options.sha1_file and options.directory:

	281 parser.error('Both --directory and --sha1_file are specified, '

	282 'can only specify one.')

	283 elif options.recursive and not options.directory:

	284 parser.error('--recursive specified but --directory not specified.')

	285 elif options.output and options.directory:

	286 parser.error('--directory is specified, so --output has no effect.')

	287 else:

	288 input_filename = args[0]

	289

	290 # Set output filename if not specified.

	291 if not options.output and not options.directory:

	292 if not options.sha1_file:

	293 # Target is a sha1 sum, so output filename would also be the sha1 sum.

	294 options.output = input_filename

	295 elif options.sha1_file:

	296 # Target is a .sha1 file.

	297 if not input_filename.endswith('.sha1'):

	298 parser.error('--sha1_file is specified, but the input filename '

	299 'does not end with .sha1, and no --output is specified. '

	300 'Either make sure the input filename has a .sha1 '

	301 'extension, or specify --output.')

	302 options.output = input_filename[:-5]

	303 else:

	304 raise NotImplementedError('Unreachable state.')

	305

	306 # Check if output file already exists.

	307 if not options.directory and not options.force and not options.no_resume:

	308 if os.path.exists(options.output):

	309 parser.error('Output file %s exists and --no_resume is specified.'

	310 % options.output)

	311

	312 # Make sure we can find a working instance of gsutil.

	313 if os.path.exists(GSUTIL_DEFAULT_PATH):

	314 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)

	315 else:

	316 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %

	317 GSUTIL_DEFAULT_PATH)

	318 return 1

	319

	320 # Check we have a valid bucket with valid permissions.

	321 base_url, code = CheckBucketPermissions(options.bucket, gsutil)

	322 if code:

	323 return code

	324

	325 return download_from_google_storage(

	326 input_filename, base_url, gsutil, options.num_threads, options.directory,

	327 options.recursive, options.force, options.output, options.ignore_errors,

	328 options.sha1_file)

	329

	330

	331 if __name__ == '__main__':

	332 sys.exit(main(sys.argv))

OLD	NEW

« no previous file with comments | « no previous file | tests/gstools/download_test_data/rootfolder_text.txt » ('j') | no next file with comments »