Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(568)

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Removed gsutil/tests and gsutil/docs Created 7 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« download_from_google_storage.py ('K') | « third_party/gsutil/tox.ini ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Script to upload files to Google Storage."""
M-A Ruel 2013/02/22 01:15:56 """Uploads files to Google Storage."""
Ryan Tseng 2013/02/22 02:38:00 Done.
7
8 import optparse
9 import os
10 import Queue
11 import re
12 import sys
13 import threading
14 import time
15
16 from common import Gsutil
17 from common import GetSHA1
18 from common import GetMD5
19
20 # TODO(hinoka): This is currently incorrect. Should find a better default.
21 GSUTIL_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.normpath(__file__)),
M-A Ruel 2013/02/22 01:15:56 s/normpath/abspath/ But you can't commit this as-
Ryan Tseng 2013/02/22 02:38:00 Done.
22 '..', '..', 'third_party', 'gsutil', 'gsutil')
23
24 USAGE_STRING = """%prog [options] target [target2 ...].
25 Target is the file intended to be uploaded to Google Storage.
26 If target is "-", then a list of files will be taken from standard input
27
28 This script will generate a file (original filename).sha1 containing the
29 sha1 sum of the uploaded file.
30 It is recommended that the .sha1 file is checked into the repository,
31 the original file removed from the repository, and a hook added to the
32 DEPS file to call download_from_google_storage.py.
33
34 Example usages
35 --------------
36
37 Scan the current directory and upload all files larger than 1MB:
38 find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
39 """
40
41
M-A Ruel 2013/02/22 01:15:56 Remove one line
Ryan Tseng 2013/02/22 02:38:00 Done.
42
43 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):
44 while True:
45 try:
46 filename, sha1_sum = q.get_nowait()
47 file_url = '%s/%s' % (base_url, sha1_sum)
48 if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:
49 # File exists, check MD5 hash.
50 _, out, _ = gsutil.check_call('ls', '-L', file_url)
51 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)
52 if etag_match:
53 remote_md5 = etag_match.groups()[0]
M-A Ruel 2013/02/22 01:15:56 remote_md5 = etag_match.group(1)
Ryan Tseng 2013/02/22 02:38:00 Done.
54 # Calculate the MD5 checksum to match it to Google Storage's ETag.
55 local_md5 = GetMD5(filename, md5_lock, options.use_md5)
56 if local_md5 == remote_md5:
57 print ('File %s already exists at %s and MD5 matches, exiting' %
58 (filename, file_url))
59 continue
60 print 'Uploading %s to %s' % (filename, file_url)
61 code = gsutil.call('cp', '-q', filename, file_url)
62 if code != 0:
63 print >> sys.stderr, gsutil.stderr
64 continue
65 except Queue.Empty:
66 return
67
M-A Ruel 2013/02/22 01:15:56 2 lines
Ryan Tseng 2013/02/22 02:38:00 Done.
68 def main(args):
69 parser = optparse.OptionParser(USAGE_STRING)
70 parser.add_option('-b', '--bucket', default='chrome-artifacts',
M-A Ruel 2013/02/22 01:15:56 I prefer no default, at least not if this file is
Ryan Tseng 2013/02/22 02:38:00 Done.
71 help='Google Storage bucket to upload to.')
72 parser.add_option('-e', '--boto', default=None,
M-A Ruel 2013/02/22 01:15:56 No need for default=None
Ryan Tseng 2013/02/22 02:38:00 Done.
73 help='Specify a custom boto file.')
74 parser.add_option('-f', '--force', action='store_true', default=False,
M-A Ruel 2013/02/22 01:15:56 No need for default=False, same below.
Ryan Tseng 2013/02/22 02:38:00 Done.
75 help='Force upload even if remote file exists.')
76 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
M-A Ruel 2013/02/22 01:15:56 Why this argument at all if gsutil is included in
Ryan Tseng 2013/02/22 02:38:00 Removed.
77 help='Path to the gsutil script.')
78 parser.add_option('-m', '--use_md5', action='store_true', default=False,
79 help='Generate MD5 files when scanning, and don\'t check '
80 'the MD5 checksum if a .md5 file is found.')
81 parser.add_option('-t', '--num_threads', default=1, type='int',
82 help='Number of uploader threads to run.')
83 parser.add_option('-s', '--skip_hashing', action='store_true', default=False,
M-A Ruel 2013/02/22 01:15:56 Why not the default? Same for --use_md5.
Ryan Tseng 2013/02/22 02:38:00 I'm avoiding the situation where you modify the fi
84 help='Skip hashing if .sha1 file exists.')
85 parser.add_option('-0', '--use_null_terminator', action='store_true',
86 default=False, help='Use \\0 instead of \\n when parsing '
87 'the file list from stdin. This is useful if the input '
88 'is coming from "find ... -print0".')
89 (options, args) = parser.parse_args()
90
91 if len(args) < 1:
92 parser.error('Missing target.')
93 elif len(args) == 1 and args[0] == '-':
94 # Take stdin as a newline or null separated list of files.
95 if options.use_null_terminator:
96 input_filenames = [line for line in sys.stdin.read().split('\0')]
M-A Ruel 2013/02/22 01:15:56 input_filenames = sys.stdin.read().split('\0')
Ryan Tseng 2013/02/22 02:38:00 Done.
97 else:
98 input_filenames = [line.strip() for line in sys.stdin.readlines()]
M-A Ruel 2013/02/22 01:15:56 Technically, you would want to have it work with a
Ryan Tseng 2013/02/22 02:38:00 Done.
99 else:
100 input_filenames = args
101 base_url = 'gs://%s' % options.bucket
102
103 # Make sure we can find a working instance of gsutil.
104 if os.path.exists(options.gsutil_path):
105 gsutil = Gsutil(options.gsutil_path)
106 else:
107 for path in os.environ["PATH"].split(os.pathsep):
108 if os.path.exists(path) and 'gsutil' in os.listdir(path):
109 gsutil = Gsutil(os.path.join(path, 'gsutil'))
110
111 # Check if we have permissions to the Google Storage bucket.
112 code, _, ls_err = gsutil.check_call('ls', base_url)
113 if code == 403:
114 code, _, _ = gsutil.call('config')
115 if code != 0:
116 print >> sys.stderr, 'Error while authenticating to %s.' % base_url
117 return 403
118 elif code == 404:
119 print >> sys.stderr, '%s not found.' % base_url
120 return 404
121 elif code != 0:
122 print >> sys.stderr, ls_err
123 return code
124
125 # We want to hash everything in a single thread since it's faster.
M-A Ruel 2013/02/22 01:15:56 I don't understand why it'd be faster since it's C
Ryan Tseng 2013/02/22 02:38:00 We are most definitely IO bound at harddrive read
M-A Ruel 2013/02/25 15:15:06 Err right, sorry.
126 # The bottleneck is in disk IO, not CPU.
127 upload_queue = Queue.Queue()
128 hash_timer = time.time()
129 for filename in input_filenames:
130 if not os.path.exists(filename):
131 print 'Error: %s not found, skipping.' % filename
132 continue
133 if os.path.exists('%s.sha1' % filename) and options.skip_hashing:
134 print 'Found hash for %s, skipping.' % filename
135 upload_queue.put((filename, open('%s.sha1' % filename).read()))
136 continue
137 print 'Calculating hash for %s...' % filename,
138 sha1_sum = GetSHA1(filename)
139 with open(filename + '.sha1', 'w') as f:
M-A Ruel 2013/02/22 01:15:56 'wb'
Ryan Tseng 2013/02/22 02:38:00 Done.
140 f.write(sha1_sum)
141 print 'done'
142 upload_queue.put((filename, sha1_sum))
143 hash_time = time.time() - hash_timer
144
145 # Start up all the worker threads.
146 all_threads = []
147
148 # We only want one MD5 calculation happening at a time.
M-A Ruel 2013/02/22 01:15:56 Why?
Ryan Tseng 2013/02/22 02:38:00 Harddrive IO bound. The harddrive read head jitte
149 md5_lock = threading.Lock()
150 upload_timer = time.time()
151
152 for thread_num in range(options.num_threads):
153 t = threading.Thread(target=_upload_worker, args=[thread_num,
M-A Ruel 2013/02/22 01:15:56 Don't split arguments like that
Ryan Tseng 2013/02/22 02:38:00 ??? What did I do?
M-A Ruel 2013/02/25 15:15:06 t = threading.Thread( target=_upload_worker,
Ryan Tseng 2013/02/27 02:06:55 Done.
154 upload_queue, base_url, gsutil.clone(), options, md5_lock])
155 t.daemon = True
156 t.start()
157 all_threads.append(t)
158
159 # Wait for everything to finish.
160 for t in all_threads:
161 t.join()
162
163 print 'Success.'
164 print 'Hashing %s files took %1f seconds' % (len(input_filenames), hash_time)
165 print 'Uploading took %1f seconds' % (time.time() - upload_timer)
166 return 0
167
M-A Ruel 2013/02/22 01:15:56 two lines
Ryan Tseng 2013/02/22 02:38:00 Done.
168 if __name__ == '__main__':
169 sys.exit(main(sys.argv))
OLDNEW
« download_from_google_storage.py ('K') | « third_party/gsutil/tox.ini ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698