# Copyright (c) 2012 Amazon.com, Inc. or its affiliates. All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, dis-
# tribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the fol-
# lowing conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
import hashlib
import math


_MEGABYTE = 1024 * 1024
DEFAULT_PART_SIZE = 4 * _MEGABYTE
MAXIMUM_NUMBER_OF_PARTS = 10000


def minimum_part_size(size_in_bytes):
    # The default part size (4 MB) will be too small for a very large
    # archive, as there is a limit of 10,000 parts in a multipart upload.
    # This puts the maximum allowed archive size with the default part size
    # at 40,000 MB. We need to do a sanity check on the part size, and find
    # one that works if the default is too small.
    part_size = _MEGABYTE
    if (DEFAULT_PART_SIZE * MAXIMUM_NUMBER_OF_PARTS) < size_in_bytes:
        # 4096 MB (4 GB) is the largest part size Glacier allows.
        if size_in_bytes > (4096 * _MEGABYTE * MAXIMUM_NUMBER_OF_PARTS):
            raise ValueError("File size too large: %s" % size_in_bytes)
        # Round up, otherwise a part size one byte too small could push
        # the upload over the 10,000 part limit.
        min_part_size = math.ceil(
            size_in_bytes / float(MAXIMUM_NUMBER_OF_PARTS))
        power = 3
        while part_size < min_part_size:
            part_size = math.ldexp(_MEGABYTE, power)
            power += 1
        part_size = int(part_size)
    else:
        part_size = DEFAULT_PART_SIZE
    return part_size
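# Illustrative only, not part of the original module: a 500 GB archive
# would need 128,000 parts at the default 4 MB size, so the loop above
# steps through power-of-two sizes (8 MB, 16 MB, ...) until 10,000
# parts suffice.
#
#     >>> minimum_part_size(500 * 1024 * _MEGABYTE)  # 500 GB archive
#     67108864                                       # 64 MB parts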


def chunk_hashes(bytestring, chunk_size=_MEGABYTE):
    if not bytestring:
        # An empty string still has a well defined tree hash: the hash
        # of the empty string. Without this guard, tree_hash() would be
        # handed an empty list and fail.
        return [hashlib.sha256('').digest()]
    chunk_count = int(math.ceil(len(bytestring) / float(chunk_size)))
    hashes = []
    for i in xrange(chunk_count):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        hashes.append(hashlib.sha256(bytestring[start:end]).digest())
    return hashes
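# Illustrative only: 2.5 MB of data splits into three chunks (1 MB,
# 1 MB, 0.5 MB), each reduced to a 32-byte SHA-256 digest.
#
#     >>> parts = chunk_hashes('\x00' * (2 * _MEGABYTE + _MEGABYTE // 2))
#     >>> len(parts), len(parts[0])
#     (3, 32)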


def tree_hash(fo):
    """
    Given the list of SHA-256 chunk hashes produced by chunk_hashes(),
    pair up adjacent hashes and hash each pair together, repeating
    level by level until a single root hash remains. An odd hash left
    over at any level is carried up to the next level unchanged.
    """
    hashes = list(fo)
    while len(hashes) > 1:
        new_hashes = []
        while len(hashes) > 1:
            first = hashes.pop(0)
            second = hashes.pop(0)
            new_hashes.append(hashlib.sha256(first + second).digest())
        if hashes:
            # Odd hash out: promote it to the next level as-is.
            new_hashes.append(hashes.pop(0))
        hashes = new_hashes
    return hashes[0]
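# Illustrative only: with four chunk hashes h1..h4 the root works out
# to sha256(sha256(h1 + h2) + sha256(h3 + h4)).
#
#     >>> import os
#     >>> h = [hashlib.sha256(os.urandom(8)).digest() for _ in range(4)]
#     >>> left = hashlib.sha256(h[0] + h[1]).digest()
#     >>> right = hashlib.sha256(h[2] + h[3]).digest()
#     >>> tree_hash(h) == hashlib.sha256(left + right).digest()
#     True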


def compute_hashes_from_fileobj(fileobj, chunk_size=_MEGABYTE):
    """Compute the linear and tree hash from a fileobj.

    This function will compute the linear/tree hash of a fileobj
    in a single pass through the fileobj.

    :param fileobj: A file-like object.

    :param chunk_size: The size of the chunks to use for the tree
        hash. This is also the buffer size used to read from
        `fileobj`.

    :rtype: tuple
    :return: A tuple of (linear_hash, tree_hash). Both hashes
        are returned in hex.

    """
    linear_hash = hashlib.sha256()
    chunks = []
    chunk = fileobj.read(chunk_size)
    while chunk:
        linear_hash.update(chunk)
        chunks.append(hashlib.sha256(chunk).digest())
        chunk = fileobj.read(chunk_size)
    if not chunks:
        # Mirror chunk_hashes(): an empty fileobj hashes to the hash
        # of the empty string rather than crashing tree_hash().
        chunks = [hashlib.sha256('').digest()]
    return linear_hash.hexdigest(), bytes_to_hex(tree_hash(chunks))
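# Illustrative usage, assuming a Python 2 interpreter to match the
# module (cStringIO stands in for a real file):
#
#     >>> from cStringIO import StringIO
#     >>> linear, tree = compute_hashes_from_fileobj(StringIO('hello'))
#     >>> linear == tree  # a single chunk: both are sha256('hello')
#     True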


def bytes_to_hex(str_as_bytes):
    # Render each byte of a digest as two lowercase hex characters.
    # (The output can never contain whitespace, so no strip is needed.)
    return ''.join(["%02x" % ord(x) for x in str_as_bytes])
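# Illustrative only:
#
#     >>> bytes_to_hex('\xde\xad\xbe\xef')
#     'deadbeef'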