Chromium Code Reviews

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Test fix Created 7 years, 9 months ago
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Uploads files to Google Storage content addressed."""
7
8 import optparse
9 import os
10 import Queue
11 import re
12 import sys
13 import threading
14 import time
15
16 import gstools
17
18 GSUTIL_DEFAULT_PATH = os.path.join(
19     os.path.dirname(os.path.abspath(__file__)),
20     'third_party', 'gsutil', 'gsutil')
21
22 USAGE_STRING = """%prog [options] target [target2 ...].
23 Target is the file intended to be uploaded to Google Storage.
24 If target is "-", then a list of files will be taken from standard input.
25
26 This script will generate a file (original filename).sha1 containing the
27 sha1 sum of the uploaded file.
28 It is recommended that the .sha1 file is checked into the repository,
29 the original file removed from the repository, and a hook added to the
30 DEPS file to call download_from_google_storage.py.
31
32 Example usages
33 --------------
34
35 Scan the current directory and upload all files larger than 1MB:
36 find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
37 """
38
39
40 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):
41   while True:
42     filename, sha1_sum = q.get()
43     if not filename:
44       break
45     file_url = '%s/%s' % (base_url, sha1_sum)
46     if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:
47       # File exists, check MD5 hash.
48       _, out, _ = gsutil.check_call('ls', '-L', file_url)
49       etag_match = re.search(r'ETag:\s+([a-z0-9]{32})', out)
50       if etag_match:
51         remote_md5 = etag_match.group(1)
52         # Calculate the MD5 checksum to match it to Google Storage's ETag.
53         if options.use_md5:
54           local_md5 = gstools.GetMD5Cached(filename, md5_lock)
55         else:
56           local_md5 = gstools.GetMD5(filename, md5_lock)
57         if local_md5 == remote_md5:
58           print ('File %s already exists at %s and MD5 matches, exiting' %
59                  (filename, file_url))
60           continue
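
A note on the ETag comparison above: Google Storage's ETag equals the object's MD5 only for objects uploaded in a single, non-composite operation, which is what the plain "gsutil cp" below performs. gstools.GetMD5 is defined outside this diff; a minimal sketch of the chunked hashing it presumably does:

    import hashlib

    def get_md5(filename):
      # Hash in 1 MB chunks so large files need not fit in memory.
      md5 = hashlib.md5()
      with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), ''):
          md5.update(chunk)
      return md5.hexdigest()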
61     print 'Uploading %s to %s' % (filename, file_url)
62     code = gsutil.call('cp', '-q', filename, file_url)
63     if code != 0:
64       print >> sys.stderr, gsutil.stderr
M-A Ruel 2013/02/28 14:53:56 This won't affect this process' exit code?
Ryan Tseng 2013/03/01 02:41:35 It probably should. Updated.
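
A sketch of the direction the reply suggests: have workers report failures through a shared queue, and fold those into the process exit code instead of only printing to stderr. This is an illustration, not the patch that actually landed; ret_codes and drain_errors are hypothetical names:

    import Queue
    import sys

    ret_codes = Queue.Queue()  # Workers would put((code, message)) on failure.

    def drain_errors(ret_codes):
      # Called after the worker threads are joined; returns the worst
      # failure code seen so the caller can pass it through to sys.exit().
      max_code = 0
      while not ret_codes.empty():
        code, message = ret_codes.get()
        max_code = max(max_code, code)
        print >> sys.stderr, message
      return max_code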
65       continue
66
67
68 def get_targets(options, args, parser):
69   if not args:
70     parser.error('Missing target.')
71
72   if len(args) == 1 and args[0] == '-':
73     # Take stdin as a newline- or null-separated list of files.
74     if options.use_null_terminator:
75       return sys.stdin.read().split('\0')
76     else:
77       return sys.stdin.read().splitlines()
78   else:
79     return args
80
81
82 def upload_to_google_storage(input_filenames, base_url, gsutil, options):
83   # We only want one MD5 calculation happening at a time to avoid HD thrashing.
84   md5_lock = threading.Lock()
85
86   # Start up all the worker threads.
87   all_threads = []
88   upload_queue = Queue.Queue()
89   upload_timer = time.time()
90   for thread_num in range(options.num_threads):
91     t = threading.Thread(
92         target=_upload_worker,
93         args=[thread_num, upload_queue, base_url,
94               gsutil.clone(), options, md5_lock])
95     t.daemon = True
96     t.start()
97     all_threads.append(t)
98
99   # We want to hash everything in a single thread since it's faster.
100   # The bottleneck is in disk IO, not CPU.
101   hash_timer = time.time()  # For timing statistics.
102   for filename in input_filenames:
103     if not os.path.exists(filename):
104       print 'Error: %s not found, skipping.' % filename
105       continue
106     if os.path.exists('%s.sha1' % filename) and options.skip_hashing:
107       print 'Found hash for %s, skipping.' % filename
108       upload_queue.put((filename, open('%s.sha1' % filename).read()))
109       continue
110     print 'Calculating hash for %s...' % filename,
111     sha1_sum = gstools.GetSHA1(filename)
M-A Ruel 2013/02/28 14:53:56 So you may end up doing both a md5 and sha1 calculation?
Ryan Tseng 2013/03/01 02:41:35 gsutil doesn't maximize network transfer bandwidth
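
For the --use_md5 path referenced above, GetMD5Cached (defined in gstools.py, outside this diff) presumably avoids re-hashing by keeping a .md5 sidecar file. A sketch of that caching pattern, with get_md5_cached and md5_func as hypothetical names:

    import os

    def get_md5_cached(filename, md5_func):
      # Reuse a previously computed hash if a .md5 sidecar file exists;
      # otherwise hash once and write the sidecar for the next scan.
      cache = filename + '.md5'
      if os.path.exists(cache):
        return open(cache, 'rb').read().strip()
      md5 = md5_func(filename)  # e.g. the chunked helper sketched earlier
      with open(cache, 'wb') as f:
        f.write(md5)
      return md5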
112     with open(filename + '.sha1', 'wb') as f:
113       f.write(sha1_sum)
114     print 'done'
115     upload_queue.put((filename, sha1_sum))
116   hash_time = time.time() - hash_timer
117
118   # Wait for everything to finish.
119   for _ in all_threads:
120     upload_queue.put((None, None))  # To mark the end of the work queue.
121   for t in all_threads:
122     t.join()
123
124   print 'Success.'
125   print 'Hashing %d files took %.1f seconds' % (len(input_filenames), hash_time)
126   print 'Uploading took %.1f seconds' % (time.time() - upload_timer)
127   return 0
128
129
130 def main(args):
131   parser = optparse.OptionParser(USAGE_STRING)
132   parser.add_option('-b', '--bucket',
133                     help='Google Storage bucket to upload to.')
134   parser.add_option('-e', '--boto', help='Specify a custom boto file.')
135   parser.add_option('-f', '--force', action='store_true',
136                     help='Force upload even if remote file exists.')
137   parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
138                     help='Path to the gsutil script.')
139   parser.add_option('-m', '--use_md5', action='store_true',
140                     help='Generate MD5 files when scanning, and don\'t check '
141                          'the MD5 checksum if a .md5 file is found.')
142   parser.add_option('-t', '--num_threads', default=1, type='int',
143                     help='Number of uploader threads to run.')
144   parser.add_option('-s', '--skip_hashing', action='store_true',
145                     help='Skip hashing if .sha1 file exists.')
146   parser.add_option('-0', '--use_null_terminator', action='store_true',
147                     help='Use \\0 instead of \\n when parsing '
148                          'the file list from stdin. This is useful if the input '
149                          'is coming from "find ... -print0".')
150   (options, args) = parser.parse_args()
151
152   # Enumerate our inputs.
153   input_filenames = get_targets(options, args, parser)
154
155   # Make sure we can find a working instance of gsutil.
156   if os.path.exists(GSUTIL_DEFAULT_PATH):
157     gsutil = gstools.Gsutil(GSUTIL_DEFAULT_PATH)
158   else:
159     print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
160                           GSUTIL_DEFAULT_PATH)
161     return 1
162
163   # Check we have a valid bucket with valid permissions.
164   base_url, code = gstools.CheckBucketPermissions(options.bucket, gsutil)
165   if code:
166     return code
167
168   return upload_to_google_storage(input_filenames, base_url, gsutil, options)
169
170
171 if __name__ == '__main__':
172   sys.exit(main(sys.argv))
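
A typical single-file run, with a placeholder bucket name (the exact base URL comes from gstools.CheckBucketPermissions and is not shown in this diff):

    upload_to_google_storage.py --bucket my-example-bucket big_binary.bin
    git add big_binary.bin.sha1   # Check in the hash, not the binary itself.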