# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Shared utilities used by both downloads and uploads."""

import base64
import hashlib
import logging
import random
import time
import warnings

from six.moves import http_client

from google.resumable_media import common


RANGE_HEADER = u"range"
CONTENT_RANGE_HEADER = u"content-range"
# Status codes for which ``wait_and_retry`` will re-issue the request.
RETRYABLE = (
    common.TOO_MANY_REQUESTS,
    http_client.INTERNAL_SERVER_ERROR,
    http_client.BAD_GATEWAY,
    http_client.SERVICE_UNAVAILABLE,
    http_client.GATEWAY_TIMEOUT,
)

_SLOW_CRC32C_WARNING = (
    "Currently using crcmod in pure python form. This is a slow "
    "implementation. Python 3 has a faster implementation, `google-crc32c`, "
    "which will be used if it is installed."
)
_HASH_HEADER = u"x-goog-hash"
# NOTE: The positional ``{}`` is filled with the media URL and the named
#       ``{checksum_type}`` with the upper-cased checksum label.
_MISSING_CHECKSUM = u"""\
No {checksum_type} checksum was returned from the service while downloading {}
(which happens for composite objects), so client-side content integrity
checking is not being performed."""
_LOGGER = logging.getLogger(__name__)


def do_nothing():
    """Simple default callback."""


def header_required(response, name, get_headers, callback=do_nothing):
    """Checks that a specific header is in a headers dictionary.

    Args:
        response (object): An HTTP response object, expected to have a
            ``headers`` attribute that is a ``Mapping[str, str]``.
        name (str): The name of a required header.
        get_headers (Callable[Any, Mapping[str, str]]): Helper to get headers
            from an HTTP response.
        callback (Optional[Callable]): A callback that takes no arguments,
            to be executed when an exception is being raised.

    Returns:
        str: The desired header.

    Raises:
        ~google.resumable_media.common.InvalidResponse: If the header
            is missing.
    """
    headers = get_headers(response)
    if name not in headers:
        # Give the caller a chance to react before the exception propagates.
        callback()
        raise common.InvalidResponse(
            response, u"Response headers must contain header", name
        )

    return headers[name]


def require_status_code(response, status_codes, get_status_code, callback=do_nothing):
    """Require a response has a status code among a list.

    Args:
        response (object): The HTTP response object.
        status_codes (tuple): The acceptable status codes.
        get_status_code (Callable[Any, int]): Helper to get a status code
            from a response.
        callback (Optional[Callable]): A callback that takes no arguments,
            to be executed when an exception is being raised.

    Returns:
        int: The status code.

    Raises:
        ~google.resumable_media.common.InvalidResponse: If the status code
            is not one of the values in ``status_codes``.
    """
    status_code = get_status_code(response)
    if status_code not in status_codes:
        # Give the caller a chance to react before the exception propagates.
        callback()
        raise common.InvalidResponse(
            response,
            u"Request failed with status code",
            status_code,
            u"Expected one of",
            *status_codes
        )
    return status_code


def calculate_retry_wait(base_wait, max_sleep):
    """Calculate the amount of time to wait before a retry attempt.

    Wait time grows exponentially with the number of attempts, until
    it hits ``max_sleep``.

    A random amount of jitter (between 0 and 1 seconds) is added to spread out
    retry attempts from different clients.

    Args:
        base_wait (float): The "base" wait time (i.e. without any jitter)
            that will be doubled until it reaches the maximum sleep.
        max_sleep (float): Maximum value that a sleep time is allowed to be.

    Returns:
        Tuple[float, float]: The new base wait time as well as the wait time
        to be applied (with a random amount of jitter between 0 and 1 seconds
        added).
    """
    new_base_wait = 2.0 * base_wait
    if new_base_wait > max_sleep:
        new_base_wait = max_sleep

    # Jitter is drawn in whole milliseconds and converted to seconds.
    jitter_ms = random.randint(0, 1000)
    return new_base_wait, new_base_wait + 0.001 * jitter_ms


def wait_and_retry(func, get_status_code, retry_strategy):
    """Attempts to retry a call to ``func`` until success.

    Expects ``func`` to return an HTTP response and uses ``get_status_code``
    to check if the response is retry-able.

    Will retry until :meth:`~.RetryStrategy.retry_allowed` (on the current
    ``retry_strategy``) returns :data:`False`. Uses
    :func:`calculate_retry_wait` to double the wait time (with jitter) after
    each attempt.

    Args:
        func (Callable): A callable that takes no arguments and produces
            an HTTP response which will be checked as retry-able.
        get_status_code (Callable[Any, int]): Helper to get a status code
            from a response.
        retry_strategy (~google.resumable_media.common.RetryStrategy): The
            strategy to use if the request fails and must be retried.

    Returns:
        object: The return value of ``func``.
    """
    response = func()
    if get_status_code(response) not in RETRYABLE:
        return response

    total_sleep = 0.0
    num_retries = 0
    base_wait = 0.5  # When doubled will give 1.0
    while retry_strategy.retry_allowed(total_sleep, num_retries):
        base_wait, wait_time = calculate_retry_wait(base_wait, retry_strategy.max_sleep)
        num_retries += 1
        total_sleep += wait_time
        time.sleep(wait_time)
        response = func()
        if get_status_code(response) not in RETRYABLE:
            return response

    # Retries are exhausted; hand back the last (still retry-able) response.
    return response


def _get_crc32c_object():
    """Get crc32c object

    Attempt to use the Google-CRC32c package. If it isn't available, try
    to use CRCMod. CRCMod might be using a 'slow' varietal. If so, warn...

    Returns:
        object: A ``google_crc32c.Checksum`` instance, or a crcmod
        predefined ``crc-32c`` object when ``google_crc32c`` cannot be
        imported.

    Raises:
        ImportError: If neither ``google_crc32c`` nor ``crcmod`` can be
            imported.
    """
    try:
        import google_crc32c

        crc_obj = google_crc32c.Checksum()
    except ImportError:
        try:
            import crcmod

            crc_obj = crcmod.predefined.Crc("crc-32c")
            # Called for its side effect: warns when crcmod is the slow form.
            _is_fast_crcmod()
        except ImportError:
            raise ImportError("Failed to import either `google-crc32c` or `crcmod`")

    return crc_obj


def _is_fast_crcmod():
    """Check whether crcmod reports that it is using its extension.

    Reads ``crcmod.crcmod._usingExtension`` (treated as :data:`False` when
    the attribute is absent) and emits a :class:`RuntimeWarning` when it is
    falsy.

    Returns:
        The value of ``crcmod.crcmod._usingExtension``, or :data:`False` if
        the attribute does not exist.
    """
    # Determine if this is using the slow form of crcmod.
    nested_crcmod = __import__(
        "crcmod.crcmod",
        globals(),
        locals(),
        ["_usingExtension"],
        0,
    )
    fast_crc = getattr(nested_crcmod, "_usingExtension", False)
    if not fast_crc:
        warnings.warn(_SLOW_CRC32C_WARNING, RuntimeWarning, stacklevel=2)
    return fast_crc


def _get_metadata_key(checksum_type):
    """Map a checksum type to the key used for it in metadata.

    Args:
        checksum_type (str): The checksum type, e.g. "md5" or "crc32c".

    Returns:
        str: ``"md5Hash"`` when ``checksum_type`` is ``"md5"``; otherwise
        ``checksum_type`` unchanged.
    """
    if checksum_type == "md5":
        return "md5Hash"
    else:
        return checksum_type


def prepare_checksum_digest(digest_bytestring):
    """Convert a checksum digest into a form encoded for an HTTP header.

    Args:
        digest_bytestring (bytes): A checksum digest bytestring.

    Returns:
        str: A base64 string representation of the input.
    """
    encoded_digest = base64.b64encode(digest_bytestring)
    # NOTE: ``b64encode`` returns ``bytes``, but HTTP headers expect ``str``.
    return encoded_digest.decode(u"utf-8")


def _get_expected_checksum(response, get_headers, media_url, checksum_type):
    """Get the expected checksum and checksum object for the download response.

    Args:
        response (~requests.Response): The HTTP response object.
        get_headers (callable: response->dict): returns response headers.
        media_url (str): The URL containing the media to be downloaded.
        checksum_type (Optional[str]): The checksum type to read from the
            headers, exactly as it will appear in the headers
            (case-sensitive). Must be "md5", "crc32c" or None.

    Returns:
        Tuple (Optional[str], object): The expected checksum of the response,
        if it can be detected from the ``X-Goog-Hash`` header, and the
        appropriate checksum object for the expected checksum.

    Raises:
        ValueError: If ``checksum_type`` is not "md5", "crc32c" or None.
    """
    if checksum_type not in ["md5", "crc32c", None]:
        raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")
    elif checksum_type in ["md5", "crc32c"]:
        headers = get_headers(response)
        expected_checksum = _parse_checksum_header(
            headers.get(_HASH_HEADER), response, checksum_label=checksum_type
        )

        if expected_checksum is None:
            # Nothing to verify against: log it and skip hashing entirely.
            msg = _MISSING_CHECKSUM.format(
                media_url, checksum_type=checksum_type.upper()
            )
            _LOGGER.info(msg)
            checksum_object = _DoNothingHash()
        else:
            if checksum_type == "md5":
                checksum_object = hashlib.md5()
            else:
                checksum_object = _get_crc32c_object()
    else:
        expected_checksum = None
        checksum_object = _DoNothingHash()

    return (expected_checksum, checksum_object)


def _parse_checksum_header(header_value, response, checksum_label):
    """Parses the checksum header from an ``X-Goog-Hash`` value.

    .. _header reference: https://cloud.google.com/storage/docs/\
                          xml-api/reference-headers#xgooghash

    Expects ``header_value`` (if not :data:`None`) to be in one of the three
    following formats:

    * ``crc32c=n03x6A==``
    * ``md5=Ojk9c3dhfxgoKVVHYwFbHQ==``
    * ``crc32c=n03x6A==,md5=Ojk9c3dhfxgoKVVHYwFbHQ==``

    See the `header reference`_ for more information.

    Whitespace around each comma-separated segment is ignored, and segments
    that are not ``name=value`` pairs (for example the empty segment left by
    a trailing comma) are skipped.

    Args:
        header_value (Optional[str]): The ``X-Goog-Hash`` header from
            a download response.
        response (~requests.Response): The HTTP response object.
        checksum_label (str): The label of the header value to read, as in the
            examples above. Typically "md5" or "crc32c"

    Returns:
        Optional[str]: The expected checksum of the response, if it
        can be detected from the ``X-Goog-Hash`` header; otherwise, None.

    Raises:
        ~google.resumable_media.common.InvalidResponse: If there are
            multiple checksums of the requested type in ``header_value``.
    """
    if header_value is None:
        return None

    matches = []
    for segment in header_value.split(u","):
        # Official docs say "," is the separator, but real-world responses
        # have encountered ", ", so whitespace around a segment is dropped.
        # Only the first "=" separates name from value; base64 padding in the
        # value may contain further "=" characters.
        name, separator, value = segment.strip().partition(u"=")
        if not separator:
            # Not a ``name=value`` pair, so it cannot hold the checksum.
            continue
        if name == checksum_label:
            matches.append(value)

    if not matches:
        return None
    elif len(matches) == 1:
        return matches[0]
    else:
        raise common.InvalidResponse(
            response,
            u"X-Goog-Hash header had multiple ``{}`` values.".format(checksum_label),
            header_value,
            matches,
        )


def _get_checksum_object(checksum_type):
    """Respond with a checksum object for a supported type, if not None.

    Raises ValueError if checksum_type is unsupported.
    """
    if checksum_type == "md5":
        return hashlib.md5()
    elif checksum_type == "crc32c":
        return _get_crc32c_object()
    elif checksum_type is None:
        return None
    else:
        raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")


class _DoNothingHash(object):
    """Do-nothing hash object.

    Intended as a stand-in for ``hashlib.md5`` or a crc32c checksum
    implementation in cases where it isn't necessary to compute the hash.
    """

    def update(self, unused_chunk):
        """Do-nothing ``update`` method.

        Intended to match the interface of ``hashlib.md5`` and other checksums.

        Args:
            unused_chunk (bytes): A chunk of data.
        """