Chromium Code Reviews

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed)
Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review fixes (created 7 years, 9 months ago)
#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Uploads files to Google Storage, content-addressed by SHA-1 sum."""

import optparse
import os
import Queue
import re
import sys
import threading
import time

import gstools

GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')
USAGE_STRING = """%prog [options] target [target2 ...].
Target is the file intended to be uploaded to Google Storage.
If target is "-", then a list of files will be taken from standard input.

This script will generate a file (original filename).sha1 containing the
SHA-1 sum of the uploaded file.
It is recommended that the .sha1 file is checked into the repository,
the original file removed from the repository, and a hook added to the
DEPS file to call download_from_google_storage.py.

Example usages
--------------

Scan the current directory and upload all files larger than 1MB:
find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
"""


def _upload_worker(
    thread_num, q, base_url, gsutil, md5_lock, force, use_md5, ret_codes):
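  # Worker thread: consumes (filename, sha1_sum) pairs from |q| until the
  # (None, None) sentinel arrives, uploading each file to base_url/sha1_sum.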
  while True:
    filename, sha1_sum = q.get()
    if not filename:
      break
    file_url = '%s/%s' % (base_url, sha1_sum)
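    # Objects are named by content hash, so if an object with this SHA-1
    # already exists, the same bytes should already be in the bucket.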
    if gsutil.check_call('ls', file_url)[0] == 0 and not force:
      # File exists, check MD5 hash.
      _, out, _ = gsutil.check_call('ls', '-L', file_url)
      etag_match = re.search(r'ETag:\s+([a-z0-9]{32})', out)
      if etag_match:
        remote_md5 = etag_match.group(1)
        # Calculate the MD5 checksum to match it to Google Storage's ETag.
        if use_md5:
          local_md5 = gstools.GetMD5Cached(filename, md5_lock)
        else:
          local_md5 = gstools.GetMD5(filename, md5_lock)
        if local_md5 == remote_md5:
          print ('File %s already exists at %s and MD5 matches, skipping' %
                 (filename, file_url))
          continue
    print 'Uploading %s to %s' % (filename, file_url)
    code, _, err = gsutil.check_call('cp', '-q', filename, file_url)
    if code != 0:
      ret_codes.put(
          (code,
           'Encountered error on uploading %s to %s\n%s' %
           (filename, file_url, err)))
      continue


def get_targets(args, parser, use_null_terminator):
  if not args:
    parser.error('Missing target.')

  if len(args) == 1 and args[0] == '-':
    # Take stdin as a newline- or null-separated list of files.
    if use_null_terminator:
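      # Note: "find ... -print0" output is itself NUL-terminated, so this
      # split can yield a trailing empty string; it is skipped later because
      # os.path.exists('') is False.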
      return sys.stdin.read().split('\0')
    else:
      return sys.stdin.read().splitlines()
  else:
    return args


def upload_to_google_storage(
    input_filenames, base_url, gsutil, force,
    use_md5, num_threads, skip_hashing):
  # We only want one MD5 calculation happening at a time to avoid HD thrashing.
  md5_lock = threading.Lock()

  # Start up all the worker threads.
  all_threads = []
  ret_codes = Queue.Queue()
  ret_codes.put((0, None))
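  # Seed the return-code queue with a success entry so it is never empty
  # when the results are collected below.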
  upload_queue = Queue.Queue()
  upload_timer = time.time()
  for thread_num in range(num_threads):
    t = threading.Thread(
        target=_upload_worker,
        args=[thread_num, upload_queue, base_url,
              gsutil.clone(), md5_lock, force, use_md5, ret_codes])
    t.daemon = True
    t.start()
    all_threads.append(t)

  # We want to hash everything in a single thread since it's faster.
  # The bottleneck is in disk IO, not CPU.
  hash_timer = time.time()  # For timing statistics.
  for filename in input_filenames:
    if not os.path.exists(filename):
      print 'Error: %s not found, skipping.' % filename
      continue
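    # With --skip_hashing, trust an existing .sha1 file instead of re-reading
    # and re-hashing the target.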
    if os.path.exists('%s.sha1' % filename) and skip_hashing:
      print 'Found hash for %s, skipping.' % filename
      upload_queue.put((filename, open('%s.sha1' % filename).read()))
      continue
    print 'Calculating hash for %s...' % filename,
    sha1_sum = gstools.GetSHA1(filename)
    with open(filename + '.sha1', 'wb') as f:
      f.write(sha1_sum)
    print 'done'
    upload_queue.put((filename, sha1_sum))
  hash_time = time.time() - hash_timer

  # Wait for everything to finish.
  for _ in all_threads:
    upload_queue.put((None, None))  # To mark the end of the work queue.
  for t in all_threads:
    t.join()

  # Print timing information.
  print 'Hashing %s files took %.1f seconds' % (len(input_filenames), hash_time)
  print 'Uploading took %.1f seconds' % (time.time() - upload_timer)

  # See if we ran into any errors.
  max_ret_code = 0
  for ret_code, message in ret_codes.queue:
    max_ret_code = max(ret_code, max_ret_code)
    if message:
      print >> sys.stderr, message

  if not max_ret_code:
    print 'Success.'
  else:
    print 'We encountered some error(s).'

  return max_ret_code


def main(args):
  parser = optparse.OptionParser(USAGE_STRING)
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to upload to.')
  parser.add_option('-e', '--boto', help='Specify a custom boto file.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force upload even if remote file exists.')
  parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
                    help='Path to the gsutil script.')
  parser.add_option('-m', '--use_md5', action='store_true',
                    help='Generate MD5 files when scanning, and don\'t check '
                         'the MD5 checksum if a .md5 file is found.')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of uploader threads to run.')
  parser.add_option('-s', '--skip_hashing', action='store_true',
                    help='Skip hashing if .sha1 file exists.')
  parser.add_option('-0', '--use_null_terminator', action='store_true',
                    help='Use \\0 instead of \\n when parsing '
                         'the file list from stdin. This is useful if the '
                         'input is coming from "find ... -print0".')
  (options, args) = parser.parse_args()

  # Enumerate our inputs.
  input_filenames = get_targets(args, parser, options.use_null_terminator)

  # Make sure we can find a working instance of gsutil.
  if os.path.exists(options.gsutil_path):
    gsutil = gstools.Gsutil(options.gsutil_path)
  else:
    print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
                          options.gsutil_path)
    return 1

  # Check we have a valid bucket with valid permissions.
  base_url, code = gstools.CheckBucketPermissions(options.bucket, gsutil)
  if code:
    return code

  return upload_to_google_storage(
      input_filenames, base_url, gsutil, options.force, options.use_md5,
      options.num_threads, options.skip_hashing)


if __name__ == '__main__':
  sys.exit(main(sys.argv))
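
For context, the end-to-end flow the docstring describes looks roughly like
this. A sketch only: the bucket name, file path, and the
download_from_google_storage.py hook arguments below are illustrative
assumptions, not part of this change.

  # Upload the file; this writes data/big_file.bin.sha1 next to the original.
  upload_to_google_storage.py -b my-bucket data/big_file.bin

  # Check data/big_file.bin.sha1 into the repository, remove the original,
  # and add a DEPS hook along these lines to fetch it at sync time:
  hooks = [
    {
      'name': 'big_file',
      'pattern': '.',
      'action': ['python', 'download_from_google_storage.py',
                 '--bucket', 'my-bucket',
                 '-s', 'data/big_file.bin.sha1'],
    },
  ]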