Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(67)

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review fixes, updated gsutil Created 7 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Uploads files to Google Storage content addressed."""
7
8 import optparse
9 import os
10 import Queue
11 import re
12 import sys
13 import threading
14 import time
15
16 from common import Gsutil
M-A Ruel 2013/02/25 15:15:06 Replace with: import common
Ryan Tseng 2013/02/27 02:06:56 Done.
17 from common import GetSHA1
18 from common import GetMD5
19
# Absolute path to the gsutil launcher bundled with this checkout
# (third_party/gsutil/gsutil, relative to this script's directory).
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')

# optparse usage text; %prog is substituted with the program name.
USAGE_STRING = """%prog [options] target [target2 ...].
Target is the file intended to be uploaded to Google Storage.
If target is "-", then a list of files will be taken from standard input

This script will generate a file (original filename).sha1 containing the
sha1 sum of the uploaded file.
It is recommended that the .sha1 file is checked into the repository,
the original file removed from the repository, and a hook added to the
DEPS file to call download_from_google_storage.py.

Example usages
--------------

Scan the current directory and upload all files larger than 1MB:
find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
"""
40
41
42 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):
43 while True:
44 try:
45 filename, sha1_sum = q.get_nowait()
46 file_url = '%s/%s' % (base_url, sha1_sum)
47 if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:
M-A Ruel 2013/02/25 15:15:06 I don't see gsutil being defined anywhere, did you
Ryan Tseng 2013/02/27 02:06:56 The gsutil object is initialized in main() and pas
48 # File exists, check MD5 hash.
49 _, out, _ = gsutil.check_call('ls', '-L', file_url)
50 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)
51 if etag_match:
52 remote_md5 = etag_match.group(1)
53 # Calculate the MD5 checksum to match it to Google Storage's ETag.
54 local_md5 = GetMD5(filename, md5_lock, options.use_md5)
55 if local_md5 == remote_md5:
56 print ('File %s already exists at %s and MD5 matches, exiting' %
57 (filename, file_url))
58 continue
59 print 'Uploading %s to %s' % (filename, file_url)
60 code = gsutil.call('cp', '-q', filename, file_url)
61 if code != 0:
62 print >> sys.stderr, gsutil.stderr
63 continue
64 except Queue.Empty:
65 return
66
67
68 def main(args):
69 parser = optparse.OptionParser(USAGE_STRING)
70 parser.add_option('-b', '--bucket',
71 help='Google Storage bucket to upload to.')
72 parser.add_option('-e', '--boto', help='Specify a custom boto file.')
73 parser.add_option('-f', '--force', action='store_true',
74 help='Force upload even if remote file exists.')
75 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
76 help='Path to the gsutil script.')
77 parser.add_option('-m', '--use_md5', action='store_true', default=False,
M-A Ruel 2013/02/25 15:15:06 Remove default=False everywhere, it's unnecessary.
Ryan Tseng 2013/02/27 02:06:56 Done.
78 help='Generate MD5 files when scanning, and don\'t check '
79 'the MD5 checksum if a .md5 file is found.')
80 parser.add_option('-t', '--num_threads', default=1, type='int',
81 help='Number of uploader threads to run.')
82 parser.add_option('-s', '--skip_hashing', action='store_true', default=False,
83 help='Skip hashing if .sha1 file exists.')
84 parser.add_option('-0', '--use_null_terminator', action='store_true',
85 default=False, help='Use \\0 instead of \\n when parsing '
86 'the file list from stdin. This is useful if the input '
87 'is coming from "find ... -print0".')
88 (options, args) = parser.parse_args()
89
90 if len(args) < 1:
M-A Ruel 2013/02/25 15:15:06 if not args:
Ryan Tseng 2013/02/27 02:06:56 Done.
91 parser.error('Missing target.')
92 elif len(args) == 1 and args[0] == '-':
93 # Take stdin as a newline or null seperated list of files.
94 if options.use_null_terminator:
95 input_filenames = sys.stdin.read().split('\0')
96 else:
97 input_filenames = sys.stdin.read().splitlines()
98 else:
99 input_filenames = args
100
101 if not options.bucket:
102 parser.error('Missing bucket. Specify bucket with --bucket.')
103 base_url = 'gs://%s' % options.bucket
104
105 # Make sure we can find a working instance of gsutil.
106 if os.path.exists(GSUTIL_DEFAULT_PATH):
107 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)
108 else:
109 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
110 GSUTIL_DEFAULT_PATH)
111 return 1
112
113 # Check if we have permissions to the Google Storage bucket.
M-A Ruel 2013/02/25 15:15:06 Can you split the rest of this code into its separ
Ryan Tseng 2013/02/27 02:06:56 Done.
114 code, _, ls_err = gsutil.check_call('ls', base_url)
115 if code == 403:
116 code, _, _ = gsutil.call('config')
117 if code != 0:
118 print >> sys.stderr, 'Error while authenticating to %s.' % base_url
119 return 403
120 elif code == 404:
121 print >> sys.stderr, '%s not found.' % base_url
122 return 404
123 elif code != 0:
124 print >> sys.stderr, ls_err
125 return code
126
127 # We want to hash everything in a single thread since its faster.
128 # The bottleneck is in disk IO, not CPU.
129 upload_queue = Queue.Queue()
130 hash_timer = time.time()
131 for filename in input_filenames:
132 if not os.path.exists(filename):
133 print 'Error: %s not found, skipping.' % filename
134 continue
135 if os.path.exists('%s.sha1' % filename) and options.skip_hashing:
136 print 'Found hash for %s, skipping.' % filename
137 upload_queue.put((filename, open('%s.sha1' % filename).read()))
138 continue
139 print 'Calculating hash for %s...' % filename,
140 sha1_sum = GetSHA1(filename)
141 with open(filename + '.sha1', 'wb') as f:
142 f.write(sha1_sum)
143 print 'done'
144 upload_queue.put((filename, sha1_sum))
145 hash_time = time.time() - hash_timer
146
147 # Start up all the worker threads.
148 all_threads = []
149
150 # We only want one MD5 calculation happening at a time.
151 md5_lock = threading.Lock()
152 upload_timer = time.time()
153
154 for thread_num in range(options.num_threads):
155 t = threading.Thread(target=_upload_worker, args=[thread_num,
M-A Ruel 2013/02/25 15:15:06 Argument alignement Start the threads before enque
Ryan Tseng 2013/02/27 02:06:56 Done.
156 upload_queue, base_url, gsutil.clone(), options, md5_lock])
157 t.daemon = True
158 t.start()
159 all_threads.append(t)
160
161 # Wait for everything to finish.
162 for t in all_threads:
163 t.join()
164
165 print 'Success.'
166 print 'Hashing %s files took %1f seconds' % (len(input_filenames), hash_time)
167 print 'Uploading took %1f seconds' % (time.time() - upload_timer)
168 return 0
169
170
if __name__ == '__main__':
  # NOTE(review): the full argv (including the program name) is passed here,
  # but main()'s option parser calls parse_args() with no arguments and so
  # reads sys.argv[1:] itself — the parameter is effectively unused.
  sys.exit(main(sys.argv))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698