# Copyright (c) 2012 Amazon.com, Inc. or its affiliates. All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, dis-
# tribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the fol-
# lowing conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
import hashlib
import math


_MEGABYTE = 1024 * 1024
DEFAULT_PART_SIZE = 4 * _MEGABYTE
MAXIMUM_NUMBER_OF_PARTS = 10000


def minimum_part_size(size_in_bytes):
    # The default part size (4 MB) will be too small for a very large
    # archive, as there is a limit of 10,000 parts in a multipart upload.
    # This puts the maximum allowed archive size with the default part size
    # at 40,000 MB. We need to do a sanity check on the part size, and find
    # one that works if the default is too small.
    part_size = _MEGABYTE
    if (DEFAULT_PART_SIZE * MAXIMUM_NUMBER_OF_PARTS) < size_in_bytes:
        # 4096 MB (4 GB) is the largest part size Glacier allows.
        if size_in_bytes > (4096 * _MEGABYTE * MAXIMUM_NUMBER_OF_PARTS):
            raise ValueError("File size too large: %s" % size_in_bytes)
        # Round up, otherwise a part size one byte too small could push
        # the upload over the 10,000 part limit.
        min_part_size = math.ceil(
            size_in_bytes / float(MAXIMUM_NUMBER_OF_PARTS))
        power = 3
        while part_size < min_part_size:
            part_size = math.ldexp(_MEGABYTE, power)
            power += 1
        part_size = int(part_size)
    else:
        part_size = DEFAULT_PART_SIZE
    return part_size
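# Illustrative only, not part of the original module: a 500 GB archive
# would need 128,000 parts at the default 4 MB size, so the loop above
# steps through power-of-two sizes (8 MB, 16 MB, ...) until 10,000
# parts suffice.
#
#     >>> minimum_part_size(500 * 1024 * _MEGABYTE)  # 500 GB archive
#     67108864                                       # 64 MB parts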


def chunk_hashes(bytestring, chunk_size=_MEGABYTE):
    if not bytestring:
        # An empty string still has a well defined tree hash: the hash
        # of the empty string. Without this guard, tree_hash() would be
        # handed an empty list and fail.
        return [hashlib.sha256('').digest()]
    chunk_count = int(math.ceil(len(bytestring) / float(chunk_size)))
    hashes = []
    for i in xrange(chunk_count):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        hashes.append(hashlib.sha256(bytestring[start:end]).digest())
    return hashes
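# Illustrative only: 2.5 MB of data splits into three chunks (1 MB,
# 1 MB, 0.5 MB), each reduced to a 32-byte SHA-256 digest.
#
#     >>> parts = chunk_hashes('\x00' * (2 * _MEGABYTE + _MEGABYTE // 2))
#     >>> len(parts), len(parts[0])
#     (3, 32)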


def tree_hash(fo):
    """
    Given the list of SHA-256 chunk hashes produced by chunk_hashes(),
    pair up adjacent hashes and hash each pair together, repeating
    level by level until a single root hash remains. An odd hash left
    over at any level is carried up to the next level unchanged.
    """
    hashes = list(fo)
    while len(hashes) > 1:
        new_hashes = []
        while len(hashes) > 1:
            first = hashes.pop(0)
            second = hashes.pop(0)
            new_hashes.append(hashlib.sha256(first + second).digest())
        if hashes:
            # Odd hash out: promote it to the next level as-is.
            new_hashes.append(hashes.pop(0))
        hashes = new_hashes
    return hashes[0]
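# Illustrative only: with four chunk hashes h1..h4 the root works out
# to sha256(sha256(h1 + h2) + sha256(h3 + h4)).
#
#     >>> import os
#     >>> h = [hashlib.sha256(os.urandom(8)).digest() for _ in range(4)]
#     >>> left = hashlib.sha256(h[0] + h[1]).digest()
#     >>> right = hashlib.sha256(h[2] + h[3]).digest()
#     >>> tree_hash(h) == hashlib.sha256(left + right).digest()
#     True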


def compute_hashes_from_fileobj(fileobj, chunk_size=_MEGABYTE):
    """Compute the linear and tree hash from a fileobj.

    This function will compute the linear/tree hash of a fileobj
    in a single pass through the fileobj.

    :param fileobj: A file-like object.

    :param chunk_size: The size of the chunks to use for the tree
        hash. This is also the buffer size used to read from
        `fileobj`.

    :rtype: tuple
    :return: A tuple of (linear_hash, tree_hash). Both hashes
        are returned in hex.

    """
    linear_hash = hashlib.sha256()
    chunks = []
    chunk = fileobj.read(chunk_size)
    while chunk:
        linear_hash.update(chunk)
        chunks.append(hashlib.sha256(chunk).digest())
        chunk = fileobj.read(chunk_size)
    if not chunks:
        # Mirror chunk_hashes(): an empty fileobj hashes to the hash
        # of the empty string rather than crashing tree_hash().
        chunks = [hashlib.sha256('').digest()]
    return linear_hash.hexdigest(), bytes_to_hex(tree_hash(chunks))
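# Illustrative usage, assuming a Python 2 interpreter to match the
# module (cStringIO stands in for a real file):
#
#     >>> from cStringIO import StringIO
#     >>> linear, tree = compute_hashes_from_fileobj(StringIO('hello'))
#     >>> linear == tree  # a single chunk: both are sha256('hello')
#     True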


def bytes_to_hex(str_as_bytes):
    # Render each byte of a digest as two lowercase hex characters.
    # (The output can never contain whitespace, so no strip is needed.)
    return ''.join(["%02x" % ord(x) for x in str_as_bytes])
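# Illustrative only:
#
#     >>> bytes_to_hex('\xde\xad\xbe\xef')
#     'deadbeef'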