Chromium Code Reviews

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Test fix Created 7 years, 9 months ago
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Uploads files to Google Storage content addressed."""
7
8 import optparse
9 import os
10 import Queue
11 import re
12 import sys
13 import threading
14 import time
15
16 import gstools
17
18 GSUTIL_DEFAULT_PATH = os.path.join(
19     os.path.dirname(os.path.abspath(__file__)),
20     'third_party', 'gsutil', 'gsutil')
21
22 USAGE_STRING = """%prog [options] target [target2 ...].
23 Target is the file intended to be uploaded to Google Storage.
24 If target is "-", then a list of files will be taken from standard input.
25
26 This script will generate a file (original filename).sha1 containing the
27 sha1 sum of the uploaded file.
28 It is recommended that the .sha1 file is checked into the repository,
29 the original file removed from the repository, and a hook added to the
30 DEPS file to call download_from_google_storage.py.
31
32 Example usages
33 --------------
34
35 Scan the current directory and upload all files larger than 1MB:
36 find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
37 """
38
39
40 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):
41   while True:
42     filename, sha1_sum = q.get()
43     if not filename:
44       break
45     file_url = '%s/%s' % (base_url, sha1_sum)
46     if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:
47       # File exists, check MD5 hash.
48       _, out, _ = gsutil.check_call('ls', '-L', file_url)
49       etag_match = re.search(r'ETag:\s+([a-z0-9]{32})', out)
50       if etag_match:
51         remote_md5 = etag_match.group(1)
52         # Calculate the MD5 checksum to match it to Google Storage's ETag.
53         if options.use_md5:
54           local_md5 = gstools.GetMD5Cached(filename, md5_lock)
55         else:
56           local_md5 = gstools.GetMD5(filename, md5_lock)
57         if local_md5 == remote_md5:
58           print ('File %s already exists at %s and MD5 matches, exiting' %
59                  (filename, file_url))
60           continue
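
A note on the ETag comparison above: Google Storage's ETag equals the object's MD5 only for objects uploaded in a single, non-composite operation, which is what the plain "gsutil cp" below performs. gstools.GetMD5 is defined outside this diff; a minimal sketch of the chunked hashing it presumably does:

    import hashlib

    def get_md5(filename):
      # Hash in 1 MB chunks so large files need not fit in memory.
      md5 = hashlib.md5()
      with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), ''):
          md5.update(chunk)
      return md5.hexdigest()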
61     print 'Uploading %s to %s' % (filename, file_url)
62     code = gsutil.call('cp', '-q', filename, file_url)
63     if code != 0:
64       print >> sys.stderr, gsutil.stderr
M-A Ruel 2013/02/28 14:53:56 This won't affect this process' exit code?
Ryan Tseng 2013/03/01 02:41:35 It probably should. Updated.
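
A sketch of the direction the reply suggests: have workers report failures through a shared queue, and fold those into the process exit code instead of only printing to stderr. This is an illustration, not the patch that actually landed; ret_codes and drain_errors are hypothetical names:

    import Queue
    import sys

    ret_codes = Queue.Queue()  # Workers would put((code, message)) on failure.

    def drain_errors(ret_codes):
      # Called after the worker threads are joined; returns the worst
      # failure code seen so the caller can pass it through to sys.exit().
      max_code = 0
      while not ret_codes.empty():
        code, message = ret_codes.get()
        max_code = max(max_code, code)
        print >> sys.stderr, message
      return max_code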
65       continue
66
67
68 def get_targets(options, args, parser):
69   if not args:
70     parser.error('Missing target.')
71
72   if len(args) == 1 and args[0] == '-':
73     # Take stdin as a newline- or null-separated list of files.
74     if options.use_null_terminator:
75       return sys.stdin.read().split('\0')
76     else:
77       return sys.stdin.read().splitlines()
78   else:
79     return args
80
81
82 def upload_to_google_storage(input_filenames, base_url, gsutil, options):
83   # We only want one MD5 calculation happening at a time to avoid HD thrashing.
84   md5_lock = threading.Lock()
85
86   # Start up all the worker threads.
87   all_threads = []
88   upload_queue = Queue.Queue()
89   upload_timer = time.time()
90   for thread_num in range(options.num_threads):
91     t = threading.Thread(
92         target=_upload_worker,
93         args=[thread_num, upload_queue, base_url,
94               gsutil.clone(), options, md5_lock])
95     t.daemon = True
96     t.start()
97     all_threads.append(t)
98
99   # We want to hash everything in a single thread since it's faster.
100   # The bottleneck is in disk IO, not CPU.
101   hash_timer = time.time()  # For timing statistics.
102   for filename in input_filenames:
103     if not os.path.exists(filename):
104       print 'Error: %s not found, skipping.' % filename
105       continue
106     if os.path.exists('%s.sha1' % filename) and options.skip_hashing:
107       print 'Found hash for %s, skipping.' % filename
108       upload_queue.put((filename, open('%s.sha1' % filename).read()))
109       continue
110     print 'Calculating hash for %s...' % filename,
111     sha1_sum = gstools.GetSHA1(filename)
M-A Ruel 2013/02/28 14:53:56 So you may end up doing both a md5 and sha1 calculation?
Ryan Tseng 2013/03/01 02:41:35 gsutil doesn't maximize network transfer bandwidth
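
For the --use_md5 path referenced above, GetMD5Cached (defined in gstools.py, outside this diff) presumably avoids re-hashing by keeping a .md5 sidecar file. A sketch of that caching pattern, with get_md5_cached and md5_func as hypothetical names:

    import os

    def get_md5_cached(filename, md5_func):
      # Reuse a previously computed hash if a .md5 sidecar file exists;
      # otherwise hash once and write the sidecar for the next scan.
      cache = filename + '.md5'
      if os.path.exists(cache):
        return open(cache, 'rb').read().strip()
      md5 = md5_func(filename)  # e.g. the chunked helper sketched earlier
      with open(cache, 'wb') as f:
        f.write(md5)
      return md5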
112     with open(filename + '.sha1', 'wb') as f:
113       f.write(sha1_sum)
114     print 'done'
115     upload_queue.put((filename, sha1_sum))
116   hash_time = time.time() - hash_timer
117
118   # Wait for everything to finish.
119   for _ in all_threads:
120     upload_queue.put((None, None))  # To mark the end of the work queue.
121   for t in all_threads:
122     t.join()
123
124   print 'Success.'
125   print 'Hashing %d files took %.1f seconds' % (len(input_filenames), hash_time)
126   print 'Uploading took %.1f seconds' % (time.time() - upload_timer)
127   return 0
128
129
130 def main(args):
131   parser = optparse.OptionParser(USAGE_STRING)
132   parser.add_option('-b', '--bucket',
133                     help='Google Storage bucket to upload to.')
134   parser.add_option('-e', '--boto', help='Specify a custom boto file.')
135   parser.add_option('-f', '--force', action='store_true',
136                     help='Force upload even if remote file exists.')
137   parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
138                     help='Path to the gsutil script.')
139   parser.add_option('-m', '--use_md5', action='store_true',
140                     help='Generate MD5 files when scanning, and don\'t check '
141                          'the MD5 checksum if a .md5 file is found.')
142   parser.add_option('-t', '--num_threads', default=1, type='int',
143                     help='Number of uploader threads to run.')
144   parser.add_option('-s', '--skip_hashing', action='store_true',
145                     help='Skip hashing if .sha1 file exists.')
146   parser.add_option('-0', '--use_null_terminator', action='store_true',
147                     help='Use \\0 instead of \\n when parsing '
148                          'the file list from stdin. This is useful if the input '
149                          'is coming from "find ... -print0".')
150   (options, args) = parser.parse_args()
151
152   # Enumerate our inputs.
153   input_filenames = get_targets(options, args, parser)
154
155   # Make sure we can find a working instance of gsutil.
156   if os.path.exists(GSUTIL_DEFAULT_PATH):
157     gsutil = gstools.Gsutil(GSUTIL_DEFAULT_PATH)
158   else:
159     print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
160                           GSUTIL_DEFAULT_PATH)
161     return 1
162
163   # Check we have a valid bucket with valid permissions.
164   base_url, code = gstools.CheckBucketPermissions(options.bucket, gsutil)
165   if code:
166     return code
167
168   return upload_to_google_storage(input_filenames, base_url, gsutil, options)
169
170
171 if __name__ == '__main__':
172   sys.exit(main(sys.argv))
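
A typical single-file run, with a placeholder bucket name (the exact base URL comes from gstools.CheckBucketPermissions and is not shown in this diff):

    upload_to_google_storage.py --bucket my-example-bucket big_binary.bin
    git add big_binary.bin.sha1   # Check in the hash, not the binary itself.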