Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(67)

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review fixes, updated gsutil Created 7 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Uploads files to Google Storage content addressed."""
7
8 import optparse
9 import os
10 import Queue
11 import re
12 import sys
13 import threading
14 import time
15
16 from common import Gsutil
M-A Ruel 2013/02/25 15:15:06 Replace with: import common
Ryan Tseng 2013/02/27 02:06:56 Done.
17 from common import GetSHA1
18 from common import GetMD5
19
# Absolute path to the gsutil launcher bundled with this checkout
# (third_party/gsutil/gsutil, relative to this script's directory).
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')

# optparse usage text; %prog is substituted with the program name.
USAGE_STRING = """%prog [options] target [target2 ...].
Target is the file intended to be uploaded to Google Storage.
If target is "-", then a list of files will be taken from standard input

This script will generate a file (original filename).sha1 containing the
sha1 sum of the uploaded file.
It is recommended that the .sha1 file is checked into the repository,
the original file removed from the repository, and a hook added to the
DEPS file to call download_from_google_storage.py.

Example usages
--------------

Scan the current directory and upload all files larger than 1MB:
find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
"""
40
41
42 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):
43 while True:
44 try:
45 filename, sha1_sum = q.get_nowait()
46 file_url = '%s/%s' % (base_url, sha1_sum)
47 if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:
M-A Ruel 2013/02/25 15:15:06 I don't see gsutil being defined anywhere, did you
Ryan Tseng 2013/02/27 02:06:56 The gsutil object is initialized in main() and pas
48 # File exists, check MD5 hash.
49 _, out, _ = gsutil.check_call('ls', '-L', file_url)
50 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)
51 if etag_match:
52 remote_md5 = etag_match.group(1)
53 # Calculate the MD5 checksum to match it to Google Storage's ETag.
54 local_md5 = GetMD5(filename, md5_lock, options.use_md5)
55 if local_md5 == remote_md5:
56 print ('File %s already exists at %s and MD5 matches, exiting' %
57 (filename, file_url))
58 continue
59 print 'Uploading %s to %s' % (filename, file_url)
60 code = gsutil.call('cp', '-q', filename, file_url)
61 if code != 0:
62 print >> sys.stderr, gsutil.stderr
63 continue
64 except Queue.Empty:
65 return
66
67
68 def main(args):
69 parser = optparse.OptionParser(USAGE_STRING)
70 parser.add_option('-b', '--bucket',
71 help='Google Storage bucket to upload to.')
72 parser.add_option('-e', '--boto', help='Specify a custom boto file.')
73 parser.add_option('-f', '--force', action='store_true',
74 help='Force upload even if remote file exists.')
75 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
76 help='Path to the gsutil script.')
77 parser.add_option('-m', '--use_md5', action='store_true', default=False,
M-A Ruel 2013/02/25 15:15:06 Remove default=False everywhere, it's unnecessary.
Ryan Tseng 2013/02/27 02:06:56 Done.
78 help='Generate MD5 files when scanning, and don\'t check '
79 'the MD5 checksum if a .md5 file is found.')
80 parser.add_option('-t', '--num_threads', default=1, type='int',
81 help='Number of uploader threads to run.')
82 parser.add_option('-s', '--skip_hashing', action='store_true', default=False,
83 help='Skip hashing if .sha1 file exists.')
84 parser.add_option('-0', '--use_null_terminator', action='store_true',
85 default=False, help='Use \\0 instead of \\n when parsing '
86 'the file list from stdin. This is useful if the input '
87 'is coming from "find ... -print0".')
88 (options, args) = parser.parse_args()
89
90 if len(args) < 1:
M-A Ruel 2013/02/25 15:15:06 if not args:
Ryan Tseng 2013/02/27 02:06:56 Done.
91 parser.error('Missing target.')
92 elif len(args) == 1 and args[0] == '-':
93 # Take stdin as a newline or null seperated list of files.
94 if options.use_null_terminator:
95 input_filenames = sys.stdin.read().split('\0')
96 else:
97 input_filenames = sys.stdin.read().splitlines()
98 else:
99 input_filenames = args
100
101 if not options.bucket:
102 parser.error('Missing bucket. Specify bucket with --bucket.')
103 base_url = 'gs://%s' % options.bucket
104
105 # Make sure we can find a working instance of gsutil.
106 if os.path.exists(GSUTIL_DEFAULT_PATH):
107 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)
108 else:
109 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
110 GSUTIL_DEFAULT_PATH)
111 return 1
112
113 # Check if we have permissions to the Google Storage bucket.
M-A Ruel 2013/02/25 15:15:06 Can you split the rest of this code into its separ
Ryan Tseng 2013/02/27 02:06:56 Done.
114 code, _, ls_err = gsutil.check_call('ls', base_url)
115 if code == 403:
116 code, _, _ = gsutil.call('config')
117 if code != 0:
118 print >> sys.stderr, 'Error while authenticating to %s.' % base_url
119 return 403
120 elif code == 404:
121 print >> sys.stderr, '%s not found.' % base_url
122 return 404
123 elif code != 0:
124 print >> sys.stderr, ls_err
125 return code
126
127 # We want to hash everything in a single thread since its faster.
128 # The bottleneck is in disk IO, not CPU.
129 upload_queue = Queue.Queue()
130 hash_timer = time.time()
131 for filename in input_filenames:
132 if not os.path.exists(filename):
133 print 'Error: %s not found, skipping.' % filename
134 continue
135 if os.path.exists('%s.sha1' % filename) and options.skip_hashing:
136 print 'Found hash for %s, skipping.' % filename
137 upload_queue.put((filename, open('%s.sha1' % filename).read()))
138 continue
139 print 'Calculating hash for %s...' % filename,
140 sha1_sum = GetSHA1(filename)
141 with open(filename + '.sha1', 'wb') as f:
142 f.write(sha1_sum)
143 print 'done'
144 upload_queue.put((filename, sha1_sum))
145 hash_time = time.time() - hash_timer
146
147 # Start up all the worker threads.
148 all_threads = []
149
150 # We only want one MD5 calculation happening at a time.
151 md5_lock = threading.Lock()
152 upload_timer = time.time()
153
154 for thread_num in range(options.num_threads):
155 t = threading.Thread(target=_upload_worker, args=[thread_num,
M-A Ruel 2013/02/25 15:15:06 Argument alignement Start the threads before enque
Ryan Tseng 2013/02/27 02:06:56 Done.
156 upload_queue, base_url, gsutil.clone(), options, md5_lock])
157 t.daemon = True
158 t.start()
159 all_threads.append(t)
160
161 # Wait for everything to finish.
162 for t in all_threads:
163 t.join()
164
165 print 'Success.'
166 print 'Hashing %s files took %1f seconds' % (len(input_filenames), hash_time)
167 print 'Uploading took %1f seconds' % (time.time() - upload_timer)
168 return 0
169
170
if __name__ == '__main__':
  # NOTE(review): the full argv (including the program name) is passed here,
  # but main()'s option parser calls parse_args() with no arguments and so
  # reads sys.argv[1:] itself — the parameter is effectively unused.
  sys.exit(main(sys.argv))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698