upload_to_google_storage.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Uploads files to Google Storage content addressed."""

	7

	8 import hashlib

	9 import optparse

	10 import os

	11 import Queue

	12 import re

	13 import sys

	14 import threading

	15 import time

	16

	17 from download_from_google_storage import check_bucket_permissions

	18 from download_from_google_storage import get_sha1

	19 from download_from_google_storage import Gsutil

	20 from download_from_google_storage import printer_worker

	21

	22 GSUTIL_DEFAULT_PATH = os.path.join(

	23 os.path.dirname(os.path.abspath(__file__)),

	24 'third_party', 'gsutil', 'gsutil')

	25

	26 USAGE_STRING = """%prog [options] target [target2 ...].

	27 Target is the file intended to be uploaded to Google Storage.

	28 If target is "-", then a list of files will be taken from standard input

	29

	30 This script will generate a file (original filename).sha1 containing the

	31 sha1 sum of the uploaded file.

	32 It is recommended that the .sha1 file is checked into the repository,

	33 the original file removed from the repository, and a hook added to the

	34 DEPS file to call download_from_google_storage.py.

	35

	36 Example usages

	37 --------------

	38

	39 Scan the current directory and upload all files larger than 1MB:

	40 find . -name .svn -prune -o -size +1000k -type f -print0 \| %prog -0 -b bkt -

	41 (Replace "bkt" with the name of a writable bucket.)

	42 """

	43

	44

	45 def get_md5(filename):

	46 md5_calculator = hashlib.md5()

	47 with open(filename, 'rb') as f:

	48 while True:

	49 chunk = f.read(1024*1024)

	50 if not chunk:

	51 break

	52 md5_calculator.update(chunk)

	53 return md5_calculator.hexdigest()

	54

	55

	56 def get_md5_cached(filename):

	57 """Don't calculate the MD5 if we can find a .md5 file."""

	58 # See if we can find an existing MD5 sum stored in a file.

	59 if os.path.exists('%s.md5' % filename):

	60 with open('%s.md5' % filename, 'rb') as f:

	61 md5_match = re.search('([a-z0-9]{32})', f.read())

	62 if md5_match:

	63 return md5_match.group(1)

	64 else:

	65 md5_hash = get_md5(filename)

	66 with open('%s.md5' % filename, 'wb') as f:

	67 f.write(md5_hash)

	68 return md5_hash

	69

	70

	71 def _upload_worker(

	72 thread_num, upload_queue, base_url, gsutil, md5_lock, force,

	73 use_md5, stdout_queue, ret_codes):

	74 while True:

	75 filename, sha1_sum = upload_queue.get()

	76 if not filename:

	77 break

	78 file_url = '%s/%s' % (base_url, sha1_sum)

	79 if gsutil.check_call('ls', file_url)[0] == 0 and not force:

	80 # File exists, check MD5 hash.

	81 _, out, _ = gsutil.check_call('ls', '-L', file_url)

	82 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)

	83 if etag_match:

	84 remote_md5 = etag_match.group(1)

	85 # Calculate the MD5 checksum to match it to Google Storage's ETag.

	86 with md5_lock:

	87 if use_md5:

	88 local_md5 = get_md5_cached(filename)

	89 else:

	90 local_md5 = get_md5(filename)

	91 if local_md5 == remote_md5:

	92 stdout_queue.put(

	93 '%d> File %s already exists and MD5 matches, upload skipped' %

	94 (thread_num, filename))

	95 continue

	96 stdout_queue.put('%d> Uploading %s...' % (

	97 thread_num, filename))

	98 code, _, err = gsutil.check_call('cp', '-q', filename, file_url)

	99 if code != 0:

	100 ret_codes.put(

	101 (code,

	102 'Encountered error on uploading %s to %s\n%s' %

	103 (filename, file_url, err)))

	104 continue

	105

	106

	107 def get_targets(args, parser, use_null_terminator):

	108 if not args:

	109 parser.error('Missing target.')

	110

	111 if len(args) == 1 and args[0] == '-':

	112 # Take stdin as a newline or null seperated list of files.

	113 if use_null_terminator:

	114 return sys.stdin.read().split('\0')

	115 else:

	116 return sys.stdin.read().splitlines()

	117 else:

	118 return args

	119

	120

	121 def upload_to_google_storage(

	122 input_filenames, base_url, gsutil, force,

	123 use_md5, num_threads, skip_hashing):

	124 # We only want one MD5 calculation happening at a time to avoid HD thrashing.

	125 md5_lock = threading.Lock()

	126

	127 # Start up all the worker threads plus the printer thread.

	128 all_threads = []

	129 ret_codes = Queue.Queue()

	130 ret_codes.put((0, None))

	131 upload_queue = Queue.Queue()

	132 upload_timer = time.time()

	133 stdout_queue = Queue.Queue()

	134 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])

	135 printer_thread.daemon = True

	136 printer_thread.start()

	137 for thread_num in range(num_threads):

	138 t = threading.Thread(

	139 target=_upload_worker,

	140 args=[thread_num, upload_queue, base_url, gsutil, md5_lock,

	141 force, use_md5, stdout_queue, ret_codes])

	142 t.daemon = True

	143 t.start()

	144 all_threads.append(t)

	145

	146 # We want to hash everything in a single thread since its faster.

	147 # The bottleneck is in disk IO, not CPU.

	148 hashing_start = time.time()

	149 for filename in input_filenames:

	150 if not os.path.exists(filename):

	151 stdout_queue.put('Main> Error: %s not found, skipping.' % filename)

	152 continue

	153 if os.path.exists('%s.sha1' % filename) and skip_hashing:

	154 stdout_queue.put(

	155 'Main> Found hash for %s, sha1 calculation skipped.' % filename)

	156 with open(filename + '.sha1', 'rb') as f:

	157 sha1_file = f.read(1024)

	158 if not re.match('^([a-z0-9]{40})$', sha1_file):

	159 print >> sys.stderr, 'Invalid sha1 hash file %s.sha1' % filename

	160 return 1

	161 upload_queue.put((filename, sha1_file))

	162 continue

	163 stdout_queue.put('Main> Calculating hash for %s...' % filename)

	164 sha1_sum = get_sha1(filename)

	165 with open(filename + '.sha1', 'wb') as f:

	166 f.write(sha1_sum)

	167 stdout_queue.put('Main> Done calculating hash for %s.' % filename)

	168 upload_queue.put((filename, sha1_sum))

	169 hashing_duration = time.time() - hashing_start

	170

	171 # Wait for everything to finish.

	172 for _ in all_threads:

	173 upload_queue.put((None, None)) # To mark the end of the work queue.

	174 for t in all_threads:

	175 t.join()

	176 stdout_queue.put(None)

	177 printer_thread.join()

	178

	179 # Print timing information.

	180 print 'Hashing %s files took %1f seconds' % (

	181 len(input_filenames), hashing_duration)

	182 print 'Uploading took %1f seconds' % (time.time() - upload_timer)

	183

	184 # See if we ran into any errors.

	185 max_ret_code = 0

	186 for ret_code, message in ret_codes.queue:

	187 max_ret_code = max(ret_code, max_ret_code)

	188 if message:

	189 print >> sys.stderr, message

	190

	191 if not max_ret_code:

	192 print 'Success!'

	193

	194 return max_ret_code

	195

	196

	197 def main(args):

	198 parser = optparse.OptionParser(USAGE_STRING)

	199 parser.add_option('-b', '--bucket',

	200 help='Google Storage bucket to upload to.')

	201 parser.add_option('-e', '--boto', help='Specify a custom boto file.')

	202 parser.add_option('-f', '--force', action='store_true',

	203 help='Force upload even if remote file exists.')

	204 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,

	205 help='Path to the gsutil script.')

	206 parser.add_option('-m', '--use_md5', action='store_true',

	207 help='Generate MD5 files when scanning, and don\'t check '

	208 'the MD5 checksum if a .md5 file is found.')

	209 parser.add_option('-t', '--num_threads', default=1, type='int',

	210 help='Number of uploader threads to run.')

	211 parser.add_option('-s', '--skip_hashing', action='store_true',

	212 help='Skip hashing if .sha1 file exists.')

	213 parser.add_option('-0', '--use_null_terminator', action='store_true',

	214 help='Use \\0 instead of \\n when parsing '

	215 'the file list from stdin. This is useful if the input '

	216 'is coming from "find ... -print0".')

	217 (options, args) = parser.parse_args()

	218

	219 # Enumerate our inputs.

	220 input_filenames = get_targets(args, parser, options.use_null_terminator)

	221

	222 # Make sure we can find a working instance of gsutil.

	223 if os.path.exists(GSUTIL_DEFAULT_PATH):

	224 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)

	225 else:

	226 gsutil = None

	227 for path in os.environ["PATH"].split(os.pathsep):

	228 if os.path.exists(path) and 'gsutil' in os.listdir(path):

	229 gsutil = Gsutil(os.path.join(path, 'gsutil'))

	230 if not gsutil:

	231 parser.error('gsutil not found in %s, bad depot_tools checkout?' %

	232 GSUTIL_DEFAULT_PATH)

	233

	234 # Check we have a valid bucket with valid permissions.

	235 base_url, code = check_bucket_permissions(options.bucket, gsutil)

	236 if code:

	237 return code

	238

	239 return upload_to_google_storage(

	240 input_filenames, base_url, gsutil, options.force, options.use_md5,

	241 options.num_threads, options.skip_hashing)

	242

	243

	244 if __name__ == '__main__':

	245 sys.exit(main(sys.argv))

OLD	NEW

« download_from_google_storage.py ('K') | « tests/upload_to_google_storage_unittests.py ('k') | no next file » | no next file with comments »