Index: third_party/gsutil/gslib/wildcard_iterator.py |
diff --git a/third_party/gsutil/gslib/wildcard_iterator.py b/third_party/gsutil/gslib/wildcard_iterator.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..97e7bc7a9793673642abd87c23b3f7ab5d575506 |
--- /dev/null |
+++ b/third_party/gsutil/gslib/wildcard_iterator.py |
@@ -0,0 +1,498 @@ |
+# Copyright 2010 Google Inc. All Rights Reserved. |
+# |
+# Permission is hereby granted, free of charge, to any person obtaining a |
+# copy of this software and associated documentation files (the |
+# "Software"), to deal in the Software without restriction, including |
+# without limitation the rights to use, copy, modify, merge, publish, dis- |
+# tribute, sublicense, and/or sell copies of the Software, and to permit |
+# persons to whom the Software is furnished to do so, subject to the fol- |
+# lowing conditions: |
+# |
+# The above copyright notice and this permission notice shall be included |
+# in all copies or substantial portions of the Software. |
+# |
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- |
+# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT |
+# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
+# IN THE SOFTWARE. |
+ |
+"""Implementation of wildcarding over StorageUris. |
+ |
+StorageUri is an abstraction that Google introduced in the boto library, |
+for representing storage provider-independent bucket and object names with |
+a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current |
+class provides wildcarding support for StorageUri objects (including both |
+bucket and file system objects), allowing one to express collections of |
+objects with syntax like the following: |
+ gs://mybucket/images/*.png |
+ file:///tmp/???abc??? |
+ |
+We provide wildcarding support as part of gsutil rather than as part |
+of boto because wildcarding is really part of shell command-like |
+functionality. |
+ |
+A comment about wildcard semantics: We support both single path component |
+wildcards (e.g., using '*') and recursive wildcards (using '**'), for both |
+file and cloud URIs. For example, |
+ gs://bucket/doc/*/*.html |
+would enumerate HTML files one directory down from gs://bucket/doc, while |
+ gs://bucket/**/*.html |
+would enumerate HTML files in all objects contained in the bucket. |
+ |
+Note also that if you use file system wildcards it's likely your shell |
+interprets the wildcarding before passing the command to gsutil. For example: |
+ % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse |
+would likely be expanded by the shell into the following before running gsutil: |
+ % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse |
+ |
+Note also that most shells don't support '**' wildcarding (I think only |
+zsh does). If you want to use '**' wildcarding with such a shell you can |
+single quote each wildcarded string, so it gets passed uninterpreted by the |
+shell to gsutil (at which point gsutil will perform the wildcarding expansion): |
+ % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse |
+""" |
+ |
+import boto |
+import fnmatch |
+import glob |
+import os |
+import re |
+import sys |
+import urllib |
+ |
+from boto.s3.prefix import Prefix |
+from boto.storage_uri import BucketStorageUri |
+from bucket_listing_ref import BucketListingRef |
+ |
# Regex to determine if a string contains any wildcard characters
# ('*', '?', '[' or ']'). Written as a raw string so the backslash escapes
# reach the regex engine verbatim instead of relying on Python passing
# unrecognized string escapes through unchanged.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')

# Operation names handed to ProjectIdHandler.FillInProjectHeaderIfNeeded()
# to identify which kind of wildcard expansion is being performed.
WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'
+ |
+ |
class WildcardIterator(object):
  """Abstract base for iterators that expand wildcards in StorageUris.

  Subclasses implement the actual iteration and are expected to set
  self.wildcard_uri. Rather than instantiating a subclass directly, obtain
  one through the module-level wildcard_iterator() factory function, which
  chooses the right implementation depending on the StorageUri.
  """

  def __repr__(self):
    """Returns string representation of WildcardIterator."""
    uri = self.wildcard_uri
    return 'WildcardIterator(%s)' % uri
+ |
+ |
class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets and objects.

  Iterates over BucketListingRef matching the StorageUri wildcard. It's
  much more efficient to request the Key from the BucketListingRef (via
  GetKey()) than to request the StorageUri and then call uri.get_key()
  to retrieve the key, for cases where you want to get metadata that's
  available in the Bucket (for example to get the name and size of
  each object), because that information is available in the bucket GET
  results. If you were to iterate over URIs for such cases and then get
  the name and size info from each resulting StorageUri, it would cause
  an additional object GET request for each of the result URIs.
  """

  def __init__(self, wildcard_uri, proj_id_handler,
               bucket_storage_uri_class=BucketStorageUri, all_versions=False,
               headers=None, debug=0):
    """Instantiates an iterator over BucketListingRef matching a wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      proj_id_handler: ProjectIdHandler to use for current command.
      bucket_storage_uri_class: BucketStorageUri interface.
          Settable for testing/mocking.
      all_versions: Passed through to every bucket listing request. When
          false, each yielded object URI is rebuilt from the object name
          alone (a version-less URI).
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    # Make a copy of the headers so any updates we make during wildcard
    # expansion aren't left in the input params (specifically, so we don't
    # include the x-goog-project-id header needed by a subset of cases, in
    # the data returned to caller, which could then be used in other cases
    # where that header must not be passed).
    self.headers = {} if headers is None else headers.copy()
    self.proj_id_handler = proj_id_handler
    self.bucket_storage_uri_class = bucket_storage_uri_class
    self.all_versions = all_versions
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Bucket-name wildcards are expanded first, by filtering the result of
    get_all_buckets(). Object-name wildcards are then expanded one step at
    a time using prefix/delimiter bucket listings.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
      regex = fnmatch.translate(self.wildcard_uri.bucket_name)
      bucket_uris = []
      prog = re.compile(regex)
      self.proj_id_handler.FillInProjectHeaderIfNeeded(
          WILDCARD_BUCKET_ITERATOR, self.wildcard_uri, self.headers)
      for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
        if prog.match(b.name):
          # Use str(b.name) because get_all_buckets() returns Unicode
          # string, which when used to construct x-goog-copy-src metadata
          # requests for object-to-object copies causes pathname '/' chars
          # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
          # which causes the request to fail.
          uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                 urllib.quote_plus(str(b.name)))
          bucket_uris.append(
              boto.storage_uri(
                  uri_str, debug=self.debug,
                  bucket_storage_uri_class=self.bucket_storage_uri_class,
                  suppress_consec_slashes=False))
    else:
      bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(
        WILDCARD_OBJECT_ITERATOR, self.wildcard_uri, self.headers)
    for bucket_uri in bucket_uris:
      if self.wildcard_uri.names_bucket():
        # Bucket-only URI.
        yield BucketListingRef(bucket_uri, key=None, prefix=None,
                               headers=self.headers)
      elif not ContainsWildcard(self.wildcard_uri.object_name):
        # URI contains an object name without any wildcard, so just yield
        # the needed URI.
        uri_to_yield = bucket_uri.clone_replace_name(
            self.wildcard_uri.object_name)
        yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                               headers=self.headers)
      else:
        # URI contains a wildcard. Expand iteratively by building
        # prefix/delimiter bucket listing request, filtering the results per
        # the current level's wildcard, and continuing with the next component
        # of the wildcard. See _BuildBucketFilterStrings() documentation
        # for details.
        #
        # Initialize the iteration with bucket name from bucket_uri but
        # object name from self.wildcard_uri. This is needed to handle cases
        # where both the bucket and object names contain wildcards.
        uris_needing_expansion = [
            bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
        while uris_needing_expansion:
          uri = uris_needing_expansion.pop(0)
          (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
              self._BuildBucketFilterStrings(uri.object_name))
          prog = re.compile(fnmatch.translate(prefix_wildcard))
          # List bucket for objects matching prefix up to delimiter.
          for key in bucket_uri.list_bucket(prefix=prefix,
                                            delimiter=delimiter,
                                            headers=self.headers,
                                            all_versions=self.all_versions):
            # Check that the prefix regex matches rstripped key.name (to
            # correspond with the rstripped prefix_wildcard from
            # _BuildBucketFilterStrings()).
            if prog.match(key.name.rstrip('/')):
              if suffix_wildcard and key.name.rstrip('/') != suffix_wildcard:
                if isinstance(key, Prefix):
                  # There's more wildcard left to expand.
                  uris_needing_expansion.append(
                      uri.clone_replace_name(key.name.rstrip('/') + '/'
                                             + suffix_wildcard))
              else:
                # Done expanding.
                expanded_uri = uri.clone_replace_key(key)

                if isinstance(key, Prefix):
                  yield BucketListingRef(expanded_uri, key=None, prefix=key,
                                         headers=self.headers)
                else:
                  if self.all_versions:
                    yield BucketListingRef(expanded_uri, key=key, prefix=None,
                                           headers=self.headers)
                  else:
                    # Yield BLR wrapping version-less URI.
                    yield BucketListingRef(
                        expanded_uri.clone_replace_name(
                            expanded_uri.object_name),
                        key=key, prefix=None, headers=self.headers)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds the strings used to list a bucket and filter the results.

    These strings implement wildcard object name matching: the prefix and
    delimiter go into the bucket GET request, and the wildcards are applied
    to what comes back.

    Args:
      wildcard: The wildcard string to match to objects. It may also be a
          plain name with no wildcard chars (see below).

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request (None if the
          wildcard starts with a wildcard char).
        delimiter is the delimiter to be sent in bucket GET request (None
          for recursive '**' wildcards).
        prefix_wildcard is the wildcard to be used to filter bucket GET
          results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
          results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix=abc/d, delimiter=/, prefix_wildcard=abc/d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.

      If wildcard contains no wildcard chars, the result is
      (wildcard, '/', wildcard, ''), i.e. a listing that matches exactly
      the given name.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end+1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
      # Everything after the path component holding the first wildcard char
      # is left for the next expansion iteration.
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard contains
      # '**' don't send a delimiter, and combine suffix_wildcard at end of
      # prefix_wildcard so the whole pattern is matched in a single pass.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard = prefix_wildcard + suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    if self.debug > 1:
      sys.stderr.write(
          'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
          'prefix_wildcard=%s, suffix_wildcard=%s\n' %
          (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

  def IterKeys(self):
    """Convenience iterator that returns the Key for each iteration.

    Runs the underlying iterator; BucketListingRefs for which HasKey() is
    false are skipped.

    Yields:
      Subclass of boto.s3.key.Key, or empty iterator if no matches.
    """
    for bucket_listing_ref in self.__iter__():
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetKey()

  def IterUris(self):
    """Convenience iterator that returns the StorageUri for each iteration.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self.__iter__():
      yield bucket_listing_ref.GetUri()

  def IterUrisForKeys(self):
    """Convenience iterator over StorageUris of the results that have a Key.

    Runs the underlying iterator and returns the StorageUri for each
    iterated BucketListingRef for which HasKey() is true.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self.__iter__():
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetUri()
+ |
+ |
class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """

  def __init__(self, wildcard_uri, headers=None, debug=0):
    """Instantiates an iterator over BucketListingRefs matching a wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    self.headers = headers
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over file wildcard.

    A wildcard containing '**' is expanded by walking the directory tree
    below the part of the path that precedes the '**' and matching file
    names against the part that follows it. Any other wildcard is handed
    to glob.glob().

    Yields:
      BucketListingRef wrapping the URI of each matching path.

    Raises:
      WildcardException: if the wildcard has more than 2 consecutive '*'s.
    """
    wildcard = self.wildcard_uri.object_name
    match = re.search(r'\*\*', wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      leading = wildcard[:match.start()]
      # True when '**' is the very first thing in the wildcard, in which
      # case the walk starts from the current directory.
      implicit_cwd = not leading
      if implicit_cwd:
        # E.g. '**/*.txt'. Slicing one char off the front of an empty
        # prefix would instead chop the END of the wildcard and walk a
        # bogus directory name.
        base_dir = os.curdir
      elif not leading.rstrip(os.sep):
        # E.g. '/**': the prefix is only separator(s), so the base is the
        # root directory rather than the empty string.
        base_dir = os.sep
      else:
        # Drop the separator that precedes the '**'.
        base_dir = wildcard[:match.start()-1]
      remaining_wildcard = wildcard[match.start()+2:]
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = []
      for dirpath, unused_dirnames, filenames in os.walk(base_dir):
        for filename in fnmatch.filter(filenames, remaining_wildcard):
          filepath = os.path.join(dirpath, filename)
          if implicit_cwd:
            # Strip the './' that walking os.curdir adds, so results are
            # relative paths just like those glob.glob() returns below.
            filepath = os.path.normpath(filepath)
          filepaths.append(filepath)
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.glob(wildcard)
    for filepath in filepaths:
      expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
      yield BucketListingRef(expanded_uri)

  def IterKeys(self):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Keys not possible for file wildcards')

  def IterUris(self):
    """Convenience iterator that returns the StorageUri for each iteration.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self.__iter__():
      yield bucket_listing_ref.GetUri()
+ |
+ |
class WildcardException(StandardError):
  """Exception thrown for invalid wildcard URIs."""

  def __init__(self, reason):
    """Creates the exception.

    Args:
      reason: Human-readable description of what is wrong with the wildcard.
    """
    # Pass reason on to the base class so that self.args is populated.
    # With empty args, pickling or copying the exception fails, because
    # reconstruction calls the class with *args and __init__ requires reason.
    StandardError.__init__(self, reason)
    # Kept as a separate attribute for callers that read e.reason.
    self.reason = reason

  def __repr__(self):
    return 'WildcardException: %s' % self.reason

  def __str__(self):
    return 'WildcardException: %s' % self.reason
+ |
+ |
def wildcard_iterator(uri_or_str, proj_id_handler,
                      bucket_storage_uri_class=BucketStorageUri,
                      all_versions=False,
                      headers=None, debug=0):
  """Builds the WildcardIterator implementation that fits the given URI.

  Args:
    uri_or_str: StorageUri or URI string naming wildcard objects to iterate.
    proj_id_handler: ProjectIdHandler to use for current command.
    bucket_storage_uri_class: BucketStorageUri interface.
        Settable for testing/mocking.
    all_versions: Forwarded to CloudWildcardIterator; not used for file URIs.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).

  Returns:
    A CloudWildcardIterator for cloud URIs, or a FileWildcardIterator for
    file URIs.

  Raises:
    WildcardException: if the URI is neither a cloud URI nor a file URI.
  """
  uri = uri_or_str
  if isinstance(uri_or_str, basestring):
    # A string was given, so parse it into a StorageUri. validate=False is
    # passed so that bucket names containing wildcard chars are allowed.
    uri = boto.storage_uri(
        uri_or_str, debug=debug, validate=False,
        bucket_storage_uri_class=bucket_storage_uri_class,
        suppress_consec_slashes=False)

  if uri.is_cloud_uri():
    return CloudWildcardIterator(
        uri, proj_id_handler,
        bucket_storage_uri_class=bucket_storage_uri_class,
        all_versions=all_versions,
        headers=headers,
        debug=debug)

  if uri.is_file_uri():
    return FileWildcardIterator(uri, headers=headers, debug=debug)

  raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
+ |
+ |
def ContainsWildcard(uri_or_str):
  """Reports whether a URI (or URI string) has any wildcard characters.

  Args:
    uri_or_str: StorageUri or URI string to check. For anything that is not
        a string, its .uri attribute is the text that gets checked.

  Returns:
    True if the text matches WILDCARD_REGEX anywhere, False otherwise.
  """
  if isinstance(uri_or_str, basestring):
    text = uri_or_str
  else:
    text = uri_or_str.uri
  return WILDCARD_REGEX.search(text) is not None