OLD | NEW |
(Empty) | |
| 1 # Copyright 2012 Google Inc. All Rights Reserved. |
| 2 # |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 # you may not use this file except in compliance with the License. |
| 5 # You may obtain a copy of the License at |
| 6 # |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 |
| 8 # |
| 9 # Unless required by applicable law or agreed to in writing, |
| 10 # software distributed under the License is distributed on an |
| 11 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, |
| 12 # either express or implied. See the License for the specific |
| 13 # language governing permissions and limitations under the License. |
| 14 |
| 15 """Helpers shared by cloudstorage_stub and cloudstorage_api.""" |
| 16 |
| 17 |
| 18 |
| 19 |
| 20 |
# Names that form the public interface of this module; everything else is
# implementation detail shared by cloudstorage_stub and cloudstorage_api.
__all__ = ['CS_XML_NS',
           'CSFileStat',
           'dt_str_to_posix',
           'local_api_url',
           'LOCAL_GCS_ENDPOINT',
           'local_run',
           'get_access_token',
           'get_stored_content_length',
           'get_metadata',
           'GCSFileStat',
           'http_time_to_posix',
           'memory_usage',
           'posix_time_to_http',
           'posix_to_dt_str',
           'set_access_token',
           'validate_options',
           'validate_bucket_name',
           'validate_bucket_path',
           'validate_file_path',
          ]
| 41 |
| 42 |
import calendar
import datetime
from email import utils as email_utils
import functools
import logging
import os
import re

try:
  from google.appengine.api import runtime
except ImportError:
  from google.appengine.api import runtime
| 54 |
| 55 |
# A GCS bucket name: 3-63 characters drawn from lowercase letters, digits,
# dots, dashes, and underscores.
_GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
# A bare bucket name, e.g. 'my-bucket'.
_GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$')
# A bucket path, e.g. '/my-bucket' (no trailing slash or object name).
_GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$')
# A listing prefix: '/bucket', '/bucket/', or '/bucket/prefix'.
_GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
# A full object path, e.g. '/bucket/filename'.
_GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
# Header names (or prefixes) treated as user-specified object metadata.
_GCS_METADATA = ['x-goog-meta-',
                 'content-disposition',
                 'cache-control',
                 'content-encoding']
# All user-settable upload options: metadata headers plus the canned-ACL header.
_GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
# XML namespace of the (S3-compatible) GET bucket listing response.
CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
# URL path where the dev appserver mounts its GCS stub.
LOCAL_GCS_ENDPOINT = '/_ah/gcs'
# Shared OAuth2 token; '' means "auto-refresh on appserver / use local stub".
_access_token = ''


# Maximum number of results a single GET bucket request returns.
_MAX_GET_BUCKET_RESULT = 1000
| 72 |
| 73 |
def set_access_token(access_token):
  """Set the shared access token to authenticate with Google Cloud Storage.

  When set, the library will always attempt to communicate with the
  real Google Cloud Storage with this token even when running on dev appserver.
  Note the token could expire so it's up to you to renew it.

  When absent, the library will automatically request and refresh a token
  on appserver, or when on dev appserver, talk to a Google Cloud Storage
  stub.

  Args:
    access_token: an OAuth2 bearer token str. You can get one by running
      'gsutil -d ls' and copying the str after 'Bearer'.
  """
  global _access_token
  _access_token = access_token
| 91 |
| 92 |
def get_access_token():
  """Return the shared access token set via set_access_token.

  Returns:
    The token str; '' if no token has been set.
  """
  return _access_token
| 96 |
| 97 |
class GCSFileStat(object):
  """Container for GCS file stat."""

  def __init__(self,
               filename,
               st_size,
               etag,
               st_ctime,
               content_type=None,
               metadata=None,
               is_dir=False):
    """Initialize.

    For files, the non optional arguments are always set.
    For directories, only filename and is_dir is set.

    Args:
      filename: a Google Cloud Storage filename of form '/bucket/filename'.
      st_size: file size in bytes. long compatible.
      etag: hex digest of the md5 hash of the file's content. str.
      st_ctime: posix file creation time. float compatible.
      content_type: content type. str.
      metadata: a str->str dict of user specified options when creating
        the file. Possible keys are x-goog-meta-, content-disposition,
        content-encoding, and cache-control.
      is_dir: True if this represents a directory. False if this is a real file.
    """
    self.filename = filename
    self.is_dir = is_dir
    # File-only attributes; they stay None for directory entries.
    self.st_size = None
    self.st_ctime = None
    self.etag = None
    self.content_type = content_type
    self.metadata = metadata

    if not is_dir:
      self.st_size = long(st_size)
      self.st_ctime = float(st_ctime)
      # GCS returns the etag wrapped in double quotes; strip them.
      # startswith/endswith (instead of etag[0]/etag[-1]) also tolerates
      # an empty etag string without raising IndexError.
      if etag.startswith('"') and etag.endswith('"'):
        etag = etag[1:-1]
      self.etag = etag

  def __repr__(self):
    if self.is_dir:
      return '(directory: %s)' % self.filename

    return (
        '(filename: %(filename)s, st_size: %(st_size)s, '
        'st_ctime: %(st_ctime)s, etag: %(etag)s, '
        'content_type: %(content_type)s, '
        'metadata: %(metadata)s)' %
        dict(filename=self.filename,
             st_size=self.st_size,
             st_ctime=self.st_ctime,
             etag=self.etag,
             content_type=self.content_type,
             metadata=self.metadata))

  def __cmp__(self, other):
    """Order stats by filename. Raises ValueError on type mismatch."""
    if not isinstance(other, self.__class__):
      # Bug fix: the class names were previously passed as extra positional
      # args to ValueError, so the '%s' placeholders were never interpolated.
      raise ValueError('Argument to cmp must have the same type. '
                       'Expect %s, got %s' % (self.__class__.__name__,
                                              other.__class__.__name__))
    if self.filename > other.filename:
      return 1
    elif self.filename < other.filename:
      return -1
    return 0

  def __hash__(self):
    # Prefer the content hash when available; directories fall back to name.
    if self.etag:
      return hash(self.etag)
    return hash(self.filename)
| 171 |
| 172 |
| 173 CSFileStat = GCSFileStat |
| 174 |
| 175 |
def get_stored_content_length(headers):
  """Return the content length (in bytes) of the object as stored in GCS.

  x-goog-stored-content-length should always be present except when called via
  the local dev_appserver. Therefore if it is not present we default to the
  standard content-length header.

  Args:
    headers: a dict of headers from the http response.

  Returns:
    the stored content length.
  """
  stored = headers.get('x-goog-stored-content-length')
  if stored is not None:
    return stored
  return headers.get('content-length')
| 193 |
| 194 |
def get_metadata(headers):
  """Get user defined options from HTTP response headers."""
  valid_prefixes = tuple(_GCS_METADATA)
  metadata = {}
  for header, value in headers.iteritems():
    if header.lower().startswith(valid_prefixes):
      metadata[header] = value
  return metadata
| 199 |
| 200 |
def validate_bucket_name(name):
  """Validate a Google Storage bucket name.

  Args:
    name: a Google Storage bucket name with no prefix or suffix.

  Raises:
    ValueError: if name is invalid.
  """
  _validate_path(name)
  if _GCS_BUCKET_REGEX.match(name) is None:
    raise ValueError('Bucket should be 3-63 characters long using only a-z,'
                     '0-9, underscore, dash or dot but got %s' % name)
| 214 |
| 215 |
def validate_bucket_path(path):
  """Validate a Google Cloud Storage bucket path.

  Args:
    path: a Google Storage bucket path. It should have form '/bucket'.

  Raises:
    ValueError: if path is invalid.
  """
  _validate_path(path)
  if _GCS_BUCKET_PATH_REGEX.match(path) is None:
    raise ValueError('Bucket should have format /bucket '
                     'but got %s' % path)
| 229 |
| 230 |
def validate_file_path(path):
  """Validate a Google Cloud Storage file path.

  Args:
    path: a Google Storage file path. It should have form '/bucket/filename'.

  Raises:
    ValueError: if path is invalid.
  """
  _validate_path(path)
  if _GCS_FULLPATH_REGEX.match(path) is None:
    raise ValueError('Path should have format /bucket/filename '
                     'but got %s' % path)
| 244 |
| 245 |
def _process_path_prefix(path_prefix):
  """Validate and process a Google Cloud Storage path prefix.

  Args:
    path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
      or '/bucket/' or '/bucket'.

  Raises:
    ValueError: if path is invalid.

  Returns:
    a tuple of /bucket and prefix. prefix can be None.
  """
  _validate_path(path_prefix)
  if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
    raise ValueError('Path prefix should have format /bucket, /bucket/, '
                     'or /bucket/prefix but got %s.' % path_prefix)
  # Find the '/' separating the bucket from the object prefix, skipping the
  # leading '/' of the bucket itself.
  separator = path_prefix.find('/', 1)
  if separator == -1:
    # '/bucket' form: no object prefix at all.
    return path_prefix, None
  # '/bucket/' yields an empty remainder, which is normalized to None.
  return path_prefix[:separator], path_prefix[separator + 1:] or None
| 270 |
| 271 |
| 272 def _validate_path(path): |
| 273 """Basic validation of Google Storage paths. |
| 274 |
| 275 Args: |
| 276 path: a Google Storage path. It should have form '/bucket/filename' |
| 277 or '/bucket'. |
| 278 |
| 279 Raises: |
| 280 ValueError: if path is invalid. |
| 281 TypeError: if path is not of type basestring. |
| 282 """ |
| 283 if not path: |
| 284 raise ValueError('Path is empty') |
| 285 if not isinstance(path, basestring): |
| 286 raise TypeError('Path should be a string but is %s (%s).' % |
| 287 (path.__class__, path)) |
| 288 |
| 289 |
def validate_options(options):
  """Validate Google Cloud Storage options.

  Args:
    options: a str->basestring dict of options to pass to Google Cloud Storage.
      A falsy value is accepted and treated as "no options".

  Raises:
    ValueError: if option is not supported.
    TypeError: if option is not of type str or value of an option
      is not of type basestring.
  """
  if not options:
    return

  valid_prefixes = tuple(_GCS_OPTIONS)
  for key, value in options.iteritems():
    if not isinstance(key, str):
      raise TypeError('option %r should be a str.' % key)
    if not key.lower().startswith(valid_prefixes):
      raise ValueError('option %s is not supported.' % key)
    if not isinstance(value, basestring):
      raise TypeError('value %r for option %s should be of type basestring.' %
                      (value, key))
| 312 |
| 313 |
def http_time_to_posix(http_time):
  """Convert HTTP time format to posix time.

  See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
  for http time format.

  Args:
    http_time: time in RFC 2616 format. e.g.
      "Mon, 20 Nov 1995 19:12:08 GMT". May be None.

  Returns:
    Seconds since the unix epoch, or None if http_time is None.
  """
  if http_time is None:
    return None
  parsed = email_utils.parsedate_tz(http_time)
  return email_utils.mktime_tz(parsed)
| 329 |
| 330 |
def posix_time_to_http(posix_time):
  """Convert posix time to HTML header time format.

  Args:
    posix_time: unix time. Falsy values (None, 0) produce None.

  Returns:
    A datetime str in RFC 2616 format, or None for falsy input.
  """
  if not posix_time:
    return None
  return email_utils.formatdate(posix_time, usegmt=True)
| 342 |
| 343 |
| 344 _DT_FORMAT = '%Y-%m-%dT%H:%M:%S' |
| 345 |
| 346 |
def dt_str_to_posix(dt_str):
  """format str to posix.

  datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
  e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
  between date and time when they are on the same line.
  Z indicates UTC (zero meridian).

  A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html

  This is used to parse LastModified node from GCS's GET bucket XML response.

  Args:
    dt_str: A datetime str.

  Returns:
    Seconds from unix epoch. By posix definition, epoch is midnight
    1970/1/1 UTC.
  """
  # Drop the trailing 'Z' and any fractional seconds before parsing.
  # Bug fix: the previous "parsable, _ = dt_str.split('.')" raised ValueError
  # for timestamps without a fractional part (e.g. '2013-04-12T00:22:27Z');
  # rstrip + partition handles both forms identically for the documented one.
  parsable = dt_str.rstrip('Z').partition('.')[0]
  dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
  return calendar.timegm(dt.utctimetuple())
| 369 |
| 370 |
def posix_to_dt_str(posix):
  """Reverse of str_to_datetime.

  This is used by GCS stub to generate GET bucket XML response.

  Args:
    posix: A float of secs from unix epoch.

  Returns:
    A datetime str with a fixed '.000Z' (UTC, zero fraction) suffix.
  """
  moment = datetime.datetime.utcfromtimestamp(posix)
  return '%s.000Z' % moment.strftime(_DT_FORMAT)
| 385 |
| 386 |
def local_run():
  """Whether we should hit GCS dev appserver stub.

  Returns:
    True when running outside any server or on a local dev/test server;
    False when on production or talking through remote_api.
  """
  software = os.environ.get('SERVER_SOFTWARE')
  if software is None:
    # No server environment at all (e.g. plain unit tests).
    return True
  if 'remote_api' in software:
    return False
  return software.startswith(('Development', 'testutil'))
| 397 |
| 398 |
def local_api_url():
  """Return URL for GCS emulation on dev appserver."""
  host = os.environ.get('HTTP_HOST')
  return 'http://%s%s' % (host, LOCAL_GCS_ENDPOINT)
| 402 |
| 403 |
def memory_usage(method):
  """Decorator that logs memory usage before and after a method call.

  Args:
    method: the callable to wrap.

  Returns:
    A wrapper with the same call signature that logs runtime memory usage
    around each invocation and returns the method's result.
  """
  # Bug fix: without functools.wraps the wrapper hid the wrapped method's
  # __name__ and docstring from callers and debuggers.
  @functools.wraps(method)
  def wrapper(*args, **kwargs):
    logging.info('Memory before method %s is %s.',
                 method.__name__, runtime.memory_usage().current())
    result = method(*args, **kwargs)
    logging.info('Memory after method %s is %s',
                 method.__name__, runtime.memory_usage().current())
    return result
  return wrapper
| 414 |
| 415 |
def _add_ns(tagname):
  """Qualify tagname with the GCS XML namespace in '{ns}tag' form."""
  return '{' + CS_XML_NS + '}' + tagname
| 419 |
| 420 |
# Pre-computed namespace-qualified tag names used when parsing the
# GET bucket XML listing response.
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')
OLD | NEW |