upload_to_google_storage.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Added some unittests Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Uploads files to Google Storage content addressed."""

	7

	8 import optparse

	9 import os

	10 import Queue

	11 import re

	12 import sys

	13 import threading

	14 import time

	15

	16 import gstools

	17

	18 GSUTIL_DEFAULT_PATH = os.path.join(

	19 os.path.dirname(os.path.abspath(__file__)),

	20 'third_party', 'gsutil', 'gsutil')

	21

	22 USAGE_STRING = """%prog [options] target [target2 ...].

	23 Target is the file intended to be uploaded to Google Storage.

	24 If target is "-", then a list of files will be taken from standard input

	25

	26 This script will generate a file (original filename).sha1 containing the

	27 sha1 sum of the uploaded file.

	28 It is recommended that the .sha1 file is checked into the repository,

	29 the original file removed from the repository, and a hook added to the

	30 DEPS file to call download_from_google_storage.py.

	31

	32 Example usages

	33 --------------

	34

	35 Scan the current directory and upload all files larger than 1MB:

	36 find . -name .svn -prune -o -size +1000k -type f -print0 \| %prog -0 -

	37 """

	38

	39

	40 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):

	41 while True:

	42 filename, sha1_sum = q.get()

	43 if filename is None:
	M-A Ruel 2013/02/27 21:52:07: "if not filename: break". Optional style nit: Personally, I wouldn't keep the comment.
	Ryan Tseng 2013/02/27 23:34:41: Done.
	44 break # A (None, None) item is inserted for each thread to mark EOF.

	45 file_url = '%s/%s' % (base_url, sha1_sum)

	46 if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:

	47 # File exists, check MD5 hash.

	48 _, out, _ = gsutil.check_call('ls', '-L', file_url)

	49 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)

	50 if etag_match:

	51 remote_md5 = etag_match.group(1)

	52 # Calculate the MD5 checksum to match it to Google Storage's ETag.

	53 if options.use_md5:

	54 local_md5 = gstools.GetMD5Cached(filename, md5_lock)

	55 else:

	56 local_md5 = gstools.GetMD5(filename, md5_lock)

	57 if local_md5 == remote_md5:

	58 print ('File %s already exists at %s and MD5 matches, exiting' %

	59 (filename, file_url))

	60 continue

	61 print 'Uploading %s to %s' % (filename, file_url)

	62 code = gsutil.call('cp', '-q', filename, file_url)
	M-A Ruel 2013/02/27 21:52:07: Does this library throw exceptions when receiving an HTTP 500, TCP RST, etc?
	Ryan Tseng 2013/02/27 23:34:41: It doesn't throw an exception, it just returns a non-zero code. It can be made to throw exceptions for non-whitelisted errors though, is that preferable?
	M-A Ruel 2013/02/28 14:53:56: No, my point was more that occasionally, exceptions like IOError can leak through a library even if they try to handle "most" of them. So it's fine to leave it as-is and look for issues in practice.
	63 if code != 0:

	64 print >> sys.stderr, gsutil.stderr

	65 continue

	66

	67

	68 def get_targets(options, args, parser):

	69 if not args:

	70 parser.error('Missing target.')

	71 elif len(args) == 1 and args[0] == '-':
	M-A Ruel 2013/02/27 21:52:07: s/elif/if/ and add an empty line above. It's clearer.
	Ryan Tseng 2013/02/27 23:34:41: Done.
	72 # Take stdin as a newline or null seperated list of files.

	73 if options.use_null_terminator:

	74 input_filenames = sys.stdin.read().split('\0')
	M-A Ruel 2013/02/27 21:52:07: return sys.stdin.read().split('\0')
	Ryan Tseng 2013/02/27 23:34:41: Done.
	75 else:

	76 input_filenames = sys.stdin.read().splitlines()
	M-A Ruel 2013/02/27 21:52:07: return
	Ryan Tseng 2013/02/27 23:34:41: Done.
	77 else:
	M-A Ruel 2013/02/27 21:52:07: "return args"; remove the "else:" line, not needed.
	Ryan Tseng 2013/02/27 23:34:41: Done.
	78 input_filenames = args

	79

	80 return input_filenames

	81

	82

	83 def upload_to_google_storage(input_filenames, base_url, gsutil, options):

	84 # We only want one MD5 calculation happening at a time to avoid HD thrashing.

	85 md5_lock = threading.Lock()

	86

	87 # Start up all the worker threads.

	88 all_threads = []

	89 upload_queue = Queue.Queue()

	90 upload_timer = time.time()

	91 for thread_num in range(options.num_threads):

	92 t = threading.Thread(

	93 target=_upload_worker,

	94 args=[thread_num, upload_queue, base_url,

	95 gsutil.clone(), options, md5_lock])

	96 t.daemon = True

	97 t.start()

	98 all_threads.append(t)

	99

	100 # We want to hash everything in a single thread since its faster.

	101 # The bottleneck is in disk IO, not CPU.

	102 hash_timer = time.time() # For timing statistics.

	103 for filename in input_filenames:

	104 if not os.path.exists(filename):

	105 print 'Error: %s not found, skipping.' % filename

	106 continue

	107 if os.path.exists('%s.sha1' % filename) and options.skip_hashing:

	108 print 'Found hash for %s, skipping.' % filename

	109 upload_queue.put((filename, open('%s.sha1' % filename).read()))

	110 continue

	111 print 'Calculating hash for %s...' % filename,

	112 sha1_sum = gstools.GetSHA1(filename)

	113 with open(filename + '.sha1', 'wb') as f:

	114 f.write(sha1_sum)

	115 print 'done'

	116 upload_queue.put((filename, sha1_sum))

	117 hash_time = time.time() - hash_timer

	118

	119 # Wait for everything to finish.

	120 for _ in all_threads:

	121 upload_queue.put((None, None)) # To mark the end of the work queue.

	122 for t in all_threads:

	123 t.join()

	124

	125 print 'Success.'

	126 print 'Hashing %s files took %1f seconds' % (len(input_filenames), hash_time)

	127 print 'Uploading took %1f seconds' % (time.time() - upload_timer)

	128 return 0

	129

	130

	131 def main(args):

	132 parser = optparse.OptionParser(USAGE_STRING)

	133 parser.add_option('-b', '--bucket',

	134 help='Google Storage bucket to upload to.')

	135 parser.add_option('-e', '--boto', help='Specify a custom boto file.')

	136 parser.add_option('-f', '--force', action='store_true',

	137 help='Force upload even if remote file exists.')

	138 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,

	139 help='Path to the gsutil script.')

	140 parser.add_option('-m', '--use_md5', action='store_true',

	141 help='Generate MD5 files when scanning, and don\'t check '

	142 'the MD5 checksum if a .md5 file is found.')

	143 parser.add_option('-t', '--num_threads', default=1, type='int',

	144 help='Number of uploader threads to run.')

	145 parser.add_option('-s', '--skip_hashing', action='store_true',

	146 help='Skip hashing if .sha1 file exists.')

	147 parser.add_option('-0', '--use_null_terminator', action='store_true',

	148 help='Use \\0 instead of \\n when parsing '

	149 'the file list from stdin. This is useful if the input '

	150 'is coming from "find ... -print0".')

	151 (options, args) = parser.parse_args()

	152

	153 # Enumerate our inputs.

	154 input_filenames = get_targets(options, args, parser)

	155

	156 # Make sure we can find a working instance of gsutil.

	157 if os.path.exists(GSUTIL_DEFAULT_PATH):

	158 gsutil = gstools.Gsutil(GSUTIL_DEFAULT_PATH)

	159 else:

	160 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %

	161 GSUTIL_DEFAULT_PATH)

	162 return 1

	163

	164 # Check we have a valid bucket with valid permissions.

	165 base_url, code = gstools.CheckBucketPermissions(options.bucket, gsutil)

	166 if code:

	167 return code

	168

	169 return upload_to_google_storage(input_filenames, base_url, gsutil, options)

	170

	171

	172 if __name__ == '__main__':

	173 sys.exit(main(sys.argv))

OLD	NEW

« tests/gstools_unittest.py ('K') | « tests/gstools_unittest.py ('k') | no next file » | no next file with comments »