OLD | NEW |
(Empty) | |
| 1 # Copyright 2010 Google Inc. All Rights Reserved. |
| 2 # |
| 3 # Permission is hereby granted, free of charge, to any person obtaining a |
| 4 # copy of this software and associated documentation files (the |
| 5 # "Software"), to deal in the Software without restriction, including |
| 6 # without limitation the rights to use, copy, modify, merge, publish, dis- |
| 7 # tribute, sublicense, and/or sell copies of the Software, and to permit |
| 8 # persons to whom the Software is furnished to do so, subject to the fol- |
| 9 # lowing conditions: |
| 10 # |
| 11 # The above copyright notice and this permission notice shall be included |
| 12 # in all copies or substantial portions of the Software. |
| 13 # |
| 14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
| 15 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- |
| 16 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT |
| 17 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 18 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| 20 # IN THE SOFTWARE. |
| 21 |
| 22 """Implementation of wildcarding over StorageUris. |
| 23 |
| 24 StorageUri is an abstraction that Google introduced in the boto library, |
| 25 for representing storage provider-independent bucket and object names with |
| 26 a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current |
| 27 class provides wildcarding support for StorageUri objects (including both |
| 28 bucket and file system objects), allowing one to express collections of |
| 29 objects with syntax like the following: |
| 30 gs://mybucket/images/*.png |
| 31 file:///tmp/???abc??? |
| 32 |
| 33 We provide wildcarding support as part of gsutil rather than as part |
| 34 of boto because wildcarding is really part of shell command-like |
| 35 functionality. |
| 36 |
| 37 A comment about wildcard semantics: We support both single path component |
| 38 wildcards (e.g., using '*') and recursive wildcards (using '**'), for both |
| 39 file and cloud URIs. For example, |
| 40 gs://bucket/doc/*/*.html |
| 41 would enumerate HTML files one directory down from gs://bucket/doc, while |
| 42 gs://bucket/**/*.html |
| 43 would enumerate all HTML objects contained anywhere in the bucket. |
| 44 |
| 45 Note also that if you use file system wildcards it's likely your shell |
| 46 interprets the wildcarding before passing the command to gsutil. For example: |
| 47 % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse |
| 48 would likely be expanded by the shell into the following before running gsutil: |
| 49 % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse |
| 50 |
| 51 Note also that most shells don't support '**' wildcarding (I think only |
| 52 zsh does). If you want to use '**' wildcarding with such a shell you can |
| 53 single quote each wildcarded string, so it gets passed uninterpreted by the |
| 54 shell to gsutil (at which point gsutil will perform the wildcarding expansion): |
| 55 % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse |
| 56 """ |
| 57 |
| 58 import boto |
| 59 import fnmatch |
| 60 import glob |
| 61 import os |
| 62 import re |
| 63 import sys |
| 64 import urllib |
| 65 |
| 66 from boto.s3.prefix import Prefix |
| 67 from boto.storage_uri import BucketStorageUri |
| 68 from bucket_listing_ref import BucketListingRef |
| 69 |
# Regex to determine if a string contains any wildcards. Written as a raw
# string so the backslash escapes reach the regex engine unchanged instead of
# relying on Python passing through unrecognized string escapes.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')

# Iterator-type identifiers handed to
# ProjectIdHandler.FillInProjectHeaderIfNeeded() by the cloud wildcard
# iterator in this module.
WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'
| 75 |
| 76 |
class WildcardIterator(object):
  """Common base for iterators that expand wildcarded StorageUris.

  Concrete behavior lives in the subclasses; this base only supplies a
  shared string representation. Obtain instances through the
  wildcard_iterator() factory function, which picks the implementation
  matching the kind of StorageUri it is given.
  """

  def __repr__(self):
    """Returns 'WildcardIterator(<wildcard_uri>)' for this iterator."""
    uri = self.wildcard_uri
    return 'WildcardIterator(%s)' % uri
| 91 |
| 92 |
class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets and objects.

  Iterates over BucketListingRef matching the StorageUri wildcard. It's
  much more efficient to request the Key from the BucketListingRef (via
  GetKey()) than to request the StorageUri and then call uri.get_key()
  to retrieve the key, for cases where you want to get metadata that's
  available in the Bucket (for example to get the name and size of
  each object), because that information is available in the bucket GET
  results. If you were to iterate over URIs for such cases and then get
  the name and size info from each resulting StorageUri, it would cause
  an additional object GET request for each of the result URIs.
  """

  def __init__(self, wildcard_uri, proj_id_handler,
               bucket_storage_uri_class=BucketStorageUri, all_versions=False,
               headers=None, debug=0):
    """Instantiates an iterator over BucketListingRef matching wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      proj_id_handler: ProjectIdHandler to use for current command.
      bucket_storage_uri_class: BucketStorageUri interface.
          Settable for testing/mocking.
      all_versions: Passed through to the bucket listing requests. When
          true, each object result wraps the URI built from the listed key
          as-is; otherwise the URI is re-cloned from its object name so the
          yielded BucketListingRef wraps a version-less URI.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    # Make a copy of the headers so any updates we make during wildcard
    # expansion aren't left in the input params (specifically, so we don't
    # include the x-goog-project-id header needed by a subset of cases, in
    # the data returned to caller, which could then be used in other cases
    # where that header must not be passed).
    if headers is None:
      self.headers = {}
    else:
      self.headers = headers.copy()
    self.proj_id_handler = proj_id_handler
    self.bucket_storage_uri_class = bucket_storage_uri_class
    self.all_versions = all_versions
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Bucket-name wildcards are expanded first; then, for every resulting
    bucket, the object-name part (if any) is expanded.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
      bucket_uris = self._ExpandBucketWildcard()
    else:
      bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR,
                                                     self.wildcard_uri,
                                                     self.headers)
    for bucket_uri in bucket_uris:
      if self.wildcard_uri.names_bucket():
        # Bucket-only URI.
        yield BucketListingRef(bucket_uri, key=None, prefix=None,
                               headers=self.headers)
      elif not ContainsWildcard(self.wildcard_uri.object_name):
        # URI contains an object name without any wildcard, so just yield
        # the needed URI.
        uri_to_yield = bucket_uri.clone_replace_name(
            self.wildcard_uri.object_name)
        yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                               headers=self.headers)
      else:
        for bucket_listing_ref in self._ExpandObjectWildcard(bucket_uri):
          yield bucket_listing_ref

  def _ExpandBucketWildcard(self):
    """Lists all buckets and keeps those matching the bucket-name wildcard.

    Returns:
      List of bucket StorageUris whose names match
      self.wildcard_uri.bucket_name.
    """
    prog = re.compile(fnmatch.translate(self.wildcard_uri.bucket_name))
    self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR,
                                                     self.wildcard_uri,
                                                     self.headers)
    bucket_uris = []
    for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
      if not prog.match(b.name):
        continue
      # Use str(b.name) because get_all_buckets() returns Unicode
      # string, which when used to construct x-goog-copy-src metadata
      # requests for object-to-object copies causes pathname '/' chars
      # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
      # which causes the request to fail.
      uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                             urllib.quote_plus(str(b.name)))
      bucket_uris.append(
          boto.storage_uri(
              uri_str, debug=self.debug,
              bucket_storage_uri_class=self.bucket_storage_uri_class,
              suppress_consec_slashes=False))
    return bucket_uris

  def _ExpandObjectWildcard(self, bucket_uri):
    """Expands the wildcarded object name within a single bucket.

    Expands iteratively by building a prefix/delimiter bucket listing
    request, filtering the results per the current level's wildcard, and
    continuing with the next component of the wildcard. See
    _BuildBucketFilterStrings() documentation for details.

    Args:
      bucket_uri: StorageUri naming the bucket to list.

    Yields:
      BucketListingRef for each matching object or prefix.
    """
    # Initialize the iteration with bucket name from bucket_uri but
    # object name from self.wildcard_uri. This is needed to handle cases
    # where both the bucket and object names contain wildcards.
    uris_needing_expansion = [
        bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
    while uris_needing_expansion:
      uri = uris_needing_expansion.pop(0)
      (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
          self._BuildBucketFilterStrings(uri.object_name))
      prog = re.compile(fnmatch.translate(prefix_wildcard))
      # List bucket for objects matching prefix up to delimiter.
      for key in bucket_uri.list_bucket(prefix=prefix,
                                        delimiter=delimiter,
                                        headers=self.headers,
                                        all_versions=self.all_versions):
        # Check that the prefix regex matches rstripped key.name (to
        # correspond with the rstripped prefix_wildcard from
        # _BuildBucketFilterStrings()).
        stripped_name = key.name.rstrip('/')
        if not prog.match(stripped_name):
          continue

        if suffix_wildcard and stripped_name != suffix_wildcard:
          # There's more wildcard left to expand, which is only possible
          # beneath a prefix; plain keys at this level yield nothing.
          if isinstance(key, Prefix):
            uris_needing_expansion.append(
                uri.clone_replace_name(stripped_name + '/' + suffix_wildcard))
          continue

        # Done expanding.
        expanded_uri = uri.clone_replace_key(key)
        if isinstance(key, Prefix):
          yield BucketListingRef(expanded_uri, key=None, prefix=key,
                                 headers=self.headers)
        elif self.all_versions:
          yield BucketListingRef(expanded_uri, key=key, prefix=None,
                                 headers=self.headers)
        else:
          # Yield BLR wrapping version-less URI.
          yield BucketListingRef(
              expanded_uri.clone_replace_name(expanded_uri.object_name),
              key=key, prefix=None, headers=self.headers)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings for querying a bucket and filtering the results.

    Together the returned strings implement wildcard object name matching
    for one level of the wildcard.

    Args:
      wildcard: The wildcard string to match to objects. It may also contain
          no wildcard chars at all, in which case the returned strings match
          exactly that name.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request (None if the
            wildcard starts with a wildcard char).
        delimiter is the delimiter to be sent in bucket GET request (None
            for recursive '**' wildcards).
        prefix_wildcard is the wildcard to be used to filter bucket GET
            results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
            results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix=abc/d, delimiter=/, prefix_wildcard=abc/d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end+1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard contains
      # '**' don't send a delimiter, and combine suffix_wildcard at end of
      # prefix_wildcard so the whole remainder is matched in one listing.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard = prefix_wildcard + suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    if self.debug > 1:
      sys.stderr.write(
          'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
          'prefix_wildcard=%s, suffix_wildcard=%s\n' %
          (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

  def IterKeys(self):
    """Runs the underlying iterator and returns the Key of each result.

    Results that carry no Key (HasKey() is false) are skipped.

    Yields:
      Subclass of boto.s3.key.Key, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetKey()

  def IterUris(self):
    """Runs the underlying iterator and returns the StorageUri of each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()

  def IterUrisForKeys(self):
    """Runs the underlying iterator and returns URIs of results having a Key.

    Yields:
      StorageUri for each iterated BucketListingRef that has a Key, or empty
      iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetUri()
| 351 |
| 352 |
class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """

  def __init__(self, wildcard_uri, headers=None, debug=0):
    """Instantiates an iterator over BucketListingRefs matching wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    self.headers = headers
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over file wildcard.

    A wildcard containing '**' is expanded by walking the directory tree
    below the part of the path preceding the '**' and matching file names
    against the part following it; any other wildcard is expanded with
    glob.glob().

    Yields:
      BucketListingRef wrapping the URI of each matching path, or empty
      iterator if no matches.

    Raises:
      WildcardException: if the wildcard contains more than 2 consecutive
          '*' characters.
    """
    wildcard = self.wildcard_uri.object_name
    match = re.search(r'\*\*', wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      dir_prefix = wildcard[:match.start()]
      remaining_wildcard = wildcard[match.end():]
      # At this point for the above example dir_prefix = '/tmp/tmp2pQJAX/' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)

      # Work out the directory to walk. Blindly dropping the character before
      # the '**' would corrupt the two edge cases handled first.
      walk_from_cwd = not dir_prefix
      if walk_from_cwd:
        # Wildcard starts with '**' (e.g., '**/*.txt'): walk the current
        # directory.
        base_dir = os.curdir
      elif dir_prefix == os.sep:
        # Wildcard is rooted directly at the file system root (e.g., '/**').
        base_dir = os.sep
      else:
        # Drop the separator preceding the '**'.
        base_dir = dir_prefix[:-1]

      filepaths = []
      for dirpath, unused_dirnames, filenames in os.walk(base_dir):
        for filename in fnmatch.filter(filenames, remaining_wildcard):
          filepath = os.path.join(dirpath, filename)
          if walk_from_cwd:
            # Strip the leading './' that os.walk(os.curdir) introduces, so
            # results are plain relative paths like those glob.glob() returns.
            filepath = os.path.normpath(filepath)
          filepaths.append(filepath)
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.glob(wildcard)
    for filepath in filepaths:
      expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
      yield BucketListingRef(expanded_uri)

  def IterKeys(self):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Keys not possible for file wildcards')

  def IterUris(self):
    """Runs the underlying iterator and returns the StorageUri of each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()
| 429 |
| 430 |
| 431 class WildcardException(StandardError): |
| 432 """Exception thrown for invalid wildcard URIs.""" |
| 433 |
| 434 def __init__(self, reason): |
| 435 StandardError.__init__(self) |
| 436 self.reason = reason |
| 437 |
| 438 def __repr__(self): |
| 439 return 'WildcardException: %s' % self.reason |
| 440 |
| 441 def __str__(self): |
| 442 return 'WildcardException: %s' % self.reason |
| 443 |
| 444 |
def wildcard_iterator(uri_or_str, proj_id_handler,
                      bucket_storage_uri_class=BucketStorageUri,
                      all_versions=False,
                      headers=None, debug=0):
  """Factory returning the WildcardIterator suited to the given URI.

  Args:
    uri_or_str: StorageUri or URI string naming wildcard objects to iterate.
    proj_id_handler: ProjectIdHandler to use for current command.
    bucket_storage_uri_class: BucketStorageUri interface.
        Settable for testing/mocking.
    all_versions: Forwarded to CloudWildcardIterator; not used for file URIs.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).

  Returns:
    A CloudWildcardIterator for cloud URIs, or a FileWildcardIterator for
    file URIs.

  Raises:
    WildcardException: if the URI is neither a cloud URI nor a file URI.
  """
  uri = uri_or_str
  if isinstance(uri_or_str, basestring):
    # Parse with validate=False, to allow bucket names containing wildcard
    # chars.
    uri = boto.storage_uri(
        uri_or_str, debug=debug, validate=False,
        bucket_storage_uri_class=bucket_storage_uri_class,
        suppress_consec_slashes=False)

  if uri.is_cloud_uri():
    return CloudWildcardIterator(
        uri, proj_id_handler,
        bucket_storage_uri_class=bucket_storage_uri_class,
        all_versions=all_versions,
        headers=headers,
        debug=debug)
  if uri.is_file_uri():
    return FileWildcardIterator(uri, headers=headers, debug=debug)
  raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
| 484 |
| 485 |
def ContainsWildcard(uri_or_str):
  """Tells whether the given URI (or URI string) has any wildcard chars.

  Args:
    uri_or_str: StorageUri or URI string to check. For a StorageUri, its
        `uri` attribute is the text that gets examined.

  Returns:
    True if a wildcard character is present, else False.
  """
  text = uri_or_str if isinstance(uri_or_str, basestring) else uri_or_str.uri
  return WILDCARD_REGEX.search(text) is not None
OLD | NEW |