Chromium Code Reviews

Unified Diff: third_party/gsutil/boto/boto/glacier/utils.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review fixes, updated gsutil Created 7 years, 10 months ago
Index: third_party/gsutil/boto/boto/glacier/utils.py
diff --git a/third_party/gsutil/boto/boto/glacier/utils.py b/third_party/gsutil/boto/boto/glacier/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..af779f5cc30c829a24dd33e2f389184d5b1496ad
--- /dev/null
+++ b/third_party/gsutil/boto/boto/glacier/utils.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2012 Amazon.com, Inc. or its affiliates. All Rights Reserved
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish, dis-
+# tribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the fol-
+# lowing conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+import hashlib
+import math
+
+
+_MEGABYTE = 1024 * 1024
+DEFAULT_PART_SIZE = 4 * _MEGABYTE
+MAXIMUM_NUMBER_OF_PARTS = 10000
+
+
+def minimum_part_size(size_in_bytes, default_part_size=DEFAULT_PART_SIZE):
+    """Calculate the minimum part size needed for a multipart upload.
+
+    Glacier allows a maximum of 10,000 parts per upload.  It also
+    states that the maximum archive size is 10,000 * 4 GB, which means
+    the part size can range from 1 MB to 4 GB (provided it is 1 MB
+    multiplied by a power of 2).
+
+    This function will compute what the minimum part size must be in
+    order to upload a file of size ``size_in_bytes``.
+
+    It will first check if ``default_part_size`` is sufficient for
+    a part size given the ``size_in_bytes``.  If this is not the case,
+    then the smallest part size that can accommodate a file of size
+    ``size_in_bytes`` will be returned.
+
+    If the file size is greater than the maximum allowed archive
+    size of 10,000 * 4 GB, a ``ValueError`` will be raised.
+
+    """
+    # The default part size (4 MB) will be too small for a very large
+    # archive, as there is a limit of 10,000 parts in a multipart upload.
+    # This puts the maximum allowed archive size with the default part size
+    # at 40,000 MB.  We need to do a sanity check on the part size, and find
+    # one that works if the default is too small.
+    part_size = _MEGABYTE
+    if (default_part_size * MAXIMUM_NUMBER_OF_PARTS) < size_in_bytes:
+        if size_in_bytes > (4096 * _MEGABYTE * 10000):
+            raise ValueError("File size too large: %s" % size_in_bytes)
+        min_part_size = size_in_bytes / 10000
+        power = 3
+        while part_size < min_part_size:
+            part_size = math.ldexp(_MEGABYTE, power)
+            power += 1
+        part_size = int(part_size)
+    else:
+        part_size = default_part_size
+    return part_size
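+
+# Editor's note: illustrative check, not part of the upstream file.
+# For a 100 GB archive, 100 GB / 10,000 parts is ~10.24 MiB per part,
+# so the smallest power-of-two part size that fits is 16 MB:
+# >>> minimum_part_size(100 * 1024 ** 3)
+# 16777216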
+
+
+def chunk_hashes(bytestring, chunk_size=_MEGABYTE):
+    chunk_count = int(math.ceil(len(bytestring) / float(chunk_size)))
+    hashes = []
+    for i in xrange(chunk_count):
+        start = i * chunk_size
+        end = (i + 1) * chunk_size
+        hashes.append(hashlib.sha256(bytestring[start:end]).digest())
+    if not hashes:
+        return [hashlib.sha256('').digest()]
+    return hashes
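+
+# Editor's note: illustrative check, not part of the upstream file.
+# chunk_hashes splits the payload into 1 MB pieces and returns one
+# SHA-256 digest per piece; a payload one byte longer than 2 MB
+# therefore yields three digests, the last covering a single byte:
+# >>> len(chunk_hashes('x' * (2 * _MEGABYTE + 1)))
+# 3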
+
+
+def tree_hash(fo):
+    """
+    Given the hash of each 1 MB chunk (from chunk_hashes), hash
+    adjacent pairs together repeatedly until a single root hash
+    remains, forming a tree of hashes.
+    """
+    hashes = []
+    hashes.extend(fo)
+    while len(hashes) > 1:
+        new_hashes = []
+        while True:
+            if len(hashes) > 1:
+                first = hashes.pop(0)
+                second = hashes.pop(0)
+                new_hashes.append(hashlib.sha256(first + second).digest())
+            elif len(hashes) == 1:
+                only = hashes.pop(0)
+                new_hashes.append(only)
+            else:
+                break
+        hashes.extend(new_hashes)
+    return hashes[0]
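+
+# Editor's note: illustrative walk-through, not part of the upstream file.
+# With three chunk digests [a, b, c], the first pass pairs left to right,
+# producing [sha256(a + b).digest(), c]; the second pass reduces that to
+# the single root sha256(sha256(a + b).digest() + c).digest().  A single
+# chunk is returned unchanged:
+# >>> tree_hash(chunk_hashes('hello')) == hashlib.sha256('hello').digest()
+# True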
+
+
+def compute_hashes_from_fileobj(fileobj, chunk_size=1024 * 1024):
+    """Compute the linear and tree hash from a fileobj.
+
+    This function will compute the linear/tree hash of a fileobj
+    in a single pass through the fileobj.
+
+    :param fileobj: A file-like object.
+
+    :param chunk_size: The size of the chunks to use for the tree
+        hash.  This is also the buffer size used to read from
+        `fileobj`.
+
+    :rtype: tuple
+    :return: A tuple of (linear_hash, tree_hash).  Both hashes
+        are returned in hex.
+
+    """
+    linear_hash = hashlib.sha256()
+    chunks = []
+    chunk = fileobj.read(chunk_size)
+    while chunk:
+        linear_hash.update(chunk)
+        chunks.append(hashlib.sha256(chunk).digest())
+        chunk = fileobj.read(chunk_size)
+    if not chunks:
+        chunks = [hashlib.sha256('').digest()]
+    return linear_hash.hexdigest(), bytes_to_hex(tree_hash(chunks))
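+
+# Editor's note: illustrative usage, not part of the upstream file;
+# 'archive.bin' is a hypothetical path.  The linear hash is a plain
+# SHA-256 over the whole stream; the tree hash covers 1 MB chunks.
+# Both values come back as hex strings.
+# >>> with open('archive.bin', 'rb') as f:
+# ...     linear, tree = compute_hashes_from_fileobj(f)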
+
+
+def bytes_to_hex(str_as_bytes):
+    return ''.join(["%02x" % ord(x) for x in str_as_bytes]).strip()
+
+
+def tree_hash_from_str(str_as_bytes):
+    """
+
+    :type str_as_bytes: str
+    :param str_as_bytes: The string for which to compute the tree hash.
+
+    :rtype: str
+    :return: The computed tree hash, returned as hex.
+
+    """
+    return bytes_to_hex(tree_hash(chunk_hashes(str_as_bytes)))
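+
+# Editor's note: illustrative check, not part of the upstream file.
+# Input that fits in a single 1 MB chunk degenerates to a plain SHA-256:
+# >>> tree_hash_from_str('') == hashlib.sha256('').hexdigest()
+# True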
+
+
+class ResettingFileSender(object):
+    def __init__(self, archive):
+        self._archive = archive
+        self._starting_offset = archive.tell()
+
+    def __call__(self, connection, method, path, body, headers):
+        try:
+            connection.request(method, path, self._archive, headers)
+            return connection.getresponse()
+        finally:
+            self._archive.seek(self._starting_offset)
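+
+# Editor's note: illustrative usage sketch, not part of the upstream file.
+# The sender records the archive's starting offset, streams the file as
+# the request body (the ``body`` argument is ignored), and rewinds
+# afterwards so the same archive can be re-sent on a retry.  `conn` is a
+# hypothetical httplib.HTTPSConnection and the path is made up:
+# >>> sender = ResettingFileSender(open('archive.bin', 'rb'))
+# >>> response = sender(conn, 'POST', '/vaults/v/archives', None, {})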
