501 lines
19 KiB
Python
501 lines
19 KiB
Python
# Copyright 2017 Google Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Support for downloading media from Google APIs."""
|
|
|
|
import urllib3.response
|
|
|
|
from google.resumable_media import _download
|
|
from google.resumable_media import common
|
|
from google.resumable_media import _helpers
|
|
from google.resumable_media.requests import _request_helpers
|
|
|
|
|
|
_CHECKSUM_MISMATCH = u"""\
|
|
Checksum mismatch while downloading:
|
|
|
|
{}
|
|
|
|
The X-Goog-Hash header indicated an {checksum_type} checksum of:
|
|
|
|
{}
|
|
|
|
but the actual {checksum_type} checksum of the downloaded contents was:
|
|
|
|
{}
|
|
"""
|
|
|
|
|
|
class Download(_request_helpers.RequestsMixin, _download.Download):
|
|
"""Helper to manage downloading a resource from a Google API.
|
|
|
|
"Slices" of the resource can be retrieved by specifying a range
|
|
with ``start`` and / or ``end``. However, in typical usage, neither
|
|
``start`` nor ``end`` is expected to be provided.
|
|
|
|
Args:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
stream (IO[bytes]): A write-able stream (i.e. file-like object) that
|
|
the downloaded resource can be written to.
|
|
start (int): The first byte in a range to be downloaded. If not
|
|
provided, but ``end`` is provided, will download from the
|
|
beginning to ``end`` of the media.
|
|
end (int): The last byte in a range to be downloaded. If not
|
|
provided, but ``start`` is provided, will download from the
|
|
``start`` to the end of the media.
|
|
headers (Optional[Mapping[str, str]]): Extra headers that should
|
|
be sent with the request, e.g. headers for encrypted data.
|
|
checksum Optional([str]): The type of checksum to compute to verify
|
|
the integrity of the object. The response headers must contain
|
|
a checksum of the requested type. If the headers lack an
|
|
appropriate checksum (for instance in the case of transcoded or
|
|
ranged downloads where the remote service does not know the
|
|
correct checksum) an INFO-level log will be emitted. Supported
|
|
values are "md5", "crc32c" and None. The default is "md5".
|
|
|
|
Attributes:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
start (Optional[int]): The first byte in a range to be downloaded.
|
|
end (Optional[int]): The last byte in a range to be downloaded.
|
|
"""
|
|
|
|
def _write_to_stream(self, response):
|
|
"""Write response body to a write-able stream.
|
|
|
|
.. note:
|
|
|
|
This method assumes that the ``_stream`` attribute is set on the
|
|
current download.
|
|
|
|
Args:
|
|
response (~requests.Response): The HTTP response object.
|
|
|
|
Raises:
|
|
~google.resumable_media.common.DataCorruption: If the download's
|
|
checksum doesn't agree with server-computed checksum.
|
|
"""
|
|
|
|
# `_get_expected_checksum()` may return None even if a checksum was
|
|
# requested, in which case it will emit an info log _MISSING_CHECKSUM.
|
|
# If an invalid checksum type is specified, this will raise ValueError.
|
|
expected_checksum, checksum_object = _helpers._get_expected_checksum(
|
|
response, self._get_headers, self.media_url, checksum_type=self.checksum
|
|
)
|
|
|
|
with response:
|
|
# NOTE: In order to handle compressed streams gracefully, we try
|
|
# to insert our checksum object into the decompression stream. If
|
|
# the stream is indeed compressed, this will delegate the checksum
|
|
# object to the decoder and return a _DoNothingHash here.
|
|
local_checksum_object = _add_decoder(response.raw, checksum_object)
|
|
body_iter = response.iter_content(
|
|
chunk_size=_request_helpers._SINGLE_GET_CHUNK_SIZE, decode_unicode=False
|
|
)
|
|
for chunk in body_iter:
|
|
self._stream.write(chunk)
|
|
local_checksum_object.update(chunk)
|
|
|
|
if expected_checksum is None:
|
|
return
|
|
else:
|
|
actual_checksum = _helpers.prepare_checksum_digest(checksum_object.digest())
|
|
if actual_checksum != expected_checksum:
|
|
msg = _CHECKSUM_MISMATCH.format(
|
|
self.media_url,
|
|
expected_checksum,
|
|
actual_checksum,
|
|
checksum_type=self.checksum.upper(),
|
|
)
|
|
raise common.DataCorruption(response, msg)
|
|
|
|
def consume(
|
|
self,
|
|
transport,
|
|
timeout=(
|
|
_request_helpers._DEFAULT_CONNECT_TIMEOUT,
|
|
_request_helpers._DEFAULT_READ_TIMEOUT,
|
|
),
|
|
):
|
|
"""Consume the resource to be downloaded.
|
|
|
|
If a ``stream`` is attached to this download, then the downloaded
|
|
resource will be written to the stream.
|
|
|
|
Args:
|
|
transport (~requests.Session): A ``requests`` object which can
|
|
make authenticated requests.
|
|
timeout (Optional[Union[float, Tuple[float, float]]]):
|
|
The number of seconds to wait for the server response.
|
|
Depending on the retry strategy, a request may be repeated
|
|
several times using the same timeout each time.
|
|
|
|
Can also be passed as a tuple (connect_timeout, read_timeout).
|
|
See :meth:`requests.Session.request` documentation for details.
|
|
|
|
Returns:
|
|
~requests.Response: The HTTP response returned by ``transport``.
|
|
|
|
Raises:
|
|
~google.resumable_media.common.DataCorruption: If the download's
|
|
checksum doesn't agree with server-computed checksum.
|
|
ValueError: If the current :class:`Download` has already
|
|
finished.
|
|
"""
|
|
method, url, payload, headers = self._prepare_request()
|
|
# NOTE: We assume "payload is None" but pass it along anyway.
|
|
request_kwargs = {
|
|
u"data": payload,
|
|
u"headers": headers,
|
|
u"retry_strategy": self._retry_strategy,
|
|
u"timeout": timeout,
|
|
}
|
|
if self._stream is not None:
|
|
request_kwargs[u"stream"] = True
|
|
|
|
result = _request_helpers.http_request(transport, method, url, **request_kwargs)
|
|
|
|
self._process_response(result)
|
|
|
|
if self._stream is not None:
|
|
self._write_to_stream(result)
|
|
|
|
return result
|
|
|
|
|
|
class RawDownload(_request_helpers.RawRequestsMixin, _download.Download):
|
|
"""Helper to manage downloading a raw resource from a Google API.
|
|
|
|
"Slices" of the resource can be retrieved by specifying a range
|
|
with ``start`` and / or ``end``. However, in typical usage, neither
|
|
``start`` nor ``end`` is expected to be provided.
|
|
|
|
Args:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
stream (IO[bytes]): A write-able stream (i.e. file-like object) that
|
|
the downloaded resource can be written to.
|
|
start (int): The first byte in a range to be downloaded. If not
|
|
provided, but ``end`` is provided, will download from the
|
|
beginning to ``end`` of the media.
|
|
end (int): The last byte in a range to be downloaded. If not
|
|
provided, but ``start`` is provided, will download from the
|
|
``start`` to the end of the media.
|
|
headers (Optional[Mapping[str, str]]): Extra headers that should
|
|
be sent with the request, e.g. headers for encrypted data.
|
|
checksum Optional([str]): The type of checksum to compute to verify
|
|
the integrity of the object. The response headers must contain
|
|
a checksum of the requested type. If the headers lack an
|
|
appropriate checksum (for instance in the case of transcoded or
|
|
ranged downloads where the remote service does not know the
|
|
correct checksum) an INFO-level log will be emitted. Supported
|
|
values are "md5", "crc32c" and None. The default is "md5".
|
|
Attributes:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
start (Optional[int]): The first byte in a range to be downloaded.
|
|
end (Optional[int]): The last byte in a range to be downloaded.
|
|
"""
|
|
|
|
def _write_to_stream(self, response):
|
|
"""Write response body to a write-able stream.
|
|
|
|
.. note:
|
|
|
|
This method assumes that the ``_stream`` attribute is set on the
|
|
current download.
|
|
|
|
Args:
|
|
response (~requests.Response): The HTTP response object.
|
|
|
|
Raises:
|
|
~google.resumable_media.common.DataCorruption: If the download's
|
|
checksum doesn't agree with server-computed checksum.
|
|
"""
|
|
|
|
# `_get_expected_checksum()` may return None even if a checksum was
|
|
# requested, in which case it will emit an info log _MISSING_CHECKSUM.
|
|
# If an invalid checksum type is specified, this will raise ValueError.
|
|
expected_checksum, checksum_object = _helpers._get_expected_checksum(
|
|
response, self._get_headers, self.media_url, checksum_type=self.checksum
|
|
)
|
|
|
|
with response:
|
|
body_iter = response.raw.stream(
|
|
_request_helpers._SINGLE_GET_CHUNK_SIZE, decode_content=False
|
|
)
|
|
for chunk in body_iter:
|
|
self._stream.write(chunk)
|
|
checksum_object.update(chunk)
|
|
response._content_consumed = True
|
|
|
|
if expected_checksum is None:
|
|
return
|
|
else:
|
|
actual_checksum = _helpers.prepare_checksum_digest(checksum_object.digest())
|
|
|
|
if actual_checksum != expected_checksum:
|
|
msg = _CHECKSUM_MISMATCH.format(
|
|
self.media_url,
|
|
expected_checksum,
|
|
actual_checksum,
|
|
checksum_type=self.checksum.upper(),
|
|
)
|
|
raise common.DataCorruption(response, msg)
|
|
|
|
def consume(
|
|
self,
|
|
transport,
|
|
timeout=(
|
|
_request_helpers._DEFAULT_CONNECT_TIMEOUT,
|
|
_request_helpers._DEFAULT_READ_TIMEOUT,
|
|
),
|
|
):
|
|
"""Consume the resource to be downloaded.
|
|
|
|
If a ``stream`` is attached to this download, then the downloaded
|
|
resource will be written to the stream.
|
|
|
|
Args:
|
|
transport (~requests.Session): A ``requests`` object which can
|
|
make authenticated requests.
|
|
timeout (Optional[Union[float, Tuple[float, float]]]):
|
|
The number of seconds to wait for the server response.
|
|
Depending on the retry strategy, a request may be repeated
|
|
several times using the same timeout each time.
|
|
|
|
Can also be passed as a tuple (connect_timeout, read_timeout).
|
|
See :meth:`requests.Session.request` documentation for details.
|
|
|
|
Returns:
|
|
~requests.Response: The HTTP response returned by ``transport``.
|
|
|
|
Raises:
|
|
~google.resumable_media.common.DataCorruption: If the download's
|
|
checksum doesn't agree with server-computed checksum.
|
|
ValueError: If the current :class:`Download` has already
|
|
finished.
|
|
"""
|
|
method, url, payload, headers = self._prepare_request()
|
|
# NOTE: We assume "payload is None" but pass it along anyway.
|
|
result = _request_helpers.http_request(
|
|
transport,
|
|
method,
|
|
url,
|
|
data=payload,
|
|
headers=headers,
|
|
retry_strategy=self._retry_strategy,
|
|
stream=True,
|
|
timeout=timeout,
|
|
)
|
|
|
|
self._process_response(result)
|
|
|
|
if self._stream is not None:
|
|
self._write_to_stream(result)
|
|
|
|
return result
|
|
|
|
|
|
class ChunkedDownload(_request_helpers.RequestsMixin, _download.ChunkedDownload):
|
|
"""Download a resource in chunks from a Google API.
|
|
|
|
Args:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
chunk_size (int): The number of bytes to be retrieved in each
|
|
request.
|
|
stream (IO[bytes]): A write-able stream (i.e. file-like object) that
|
|
will be used to concatenate chunks of the resource as they are
|
|
downloaded.
|
|
start (int): The first byte in a range to be downloaded. If not
|
|
provided, defaults to ``0``.
|
|
end (int): The last byte in a range to be downloaded. If not
|
|
provided, will download to the end of the media.
|
|
headers (Optional[Mapping[str, str]]): Extra headers that should
|
|
be sent with each request, e.g. headers for data encryption
|
|
key headers.
|
|
|
|
Attributes:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
start (Optional[int]): The first byte in a range to be downloaded.
|
|
end (Optional[int]): The last byte in a range to be downloaded.
|
|
chunk_size (int): The number of bytes to be retrieved in each request.
|
|
|
|
Raises:
|
|
ValueError: If ``start`` is negative.
|
|
"""
|
|
|
|
def consume_next_chunk(
|
|
self,
|
|
transport,
|
|
timeout=(
|
|
_request_helpers._DEFAULT_CONNECT_TIMEOUT,
|
|
_request_helpers._DEFAULT_READ_TIMEOUT,
|
|
),
|
|
):
|
|
"""Consume the next chunk of the resource to be downloaded.
|
|
|
|
Args:
|
|
transport (~requests.Session): A ``requests`` object which can
|
|
make authenticated requests.
|
|
timeout (Optional[Union[float, Tuple[float, float]]]):
|
|
The number of seconds to wait for the server response.
|
|
Depending on the retry strategy, a request may be repeated
|
|
several times using the same timeout each time.
|
|
|
|
Can also be passed as a tuple (connect_timeout, read_timeout).
|
|
See :meth:`requests.Session.request` documentation for details.
|
|
|
|
Returns:
|
|
~requests.Response: The HTTP response returned by ``transport``.
|
|
|
|
Raises:
|
|
ValueError: If the current download has finished.
|
|
"""
|
|
method, url, payload, headers = self._prepare_request()
|
|
# NOTE: We assume "payload is None" but pass it along anyway.
|
|
result = _request_helpers.http_request(
|
|
transport,
|
|
method,
|
|
url,
|
|
data=payload,
|
|
headers=headers,
|
|
retry_strategy=self._retry_strategy,
|
|
timeout=timeout,
|
|
)
|
|
self._process_response(result)
|
|
return result
|
|
|
|
|
|
class RawChunkedDownload(_request_helpers.RawRequestsMixin, _download.ChunkedDownload):
|
|
"""Download a raw resource in chunks from a Google API.
|
|
|
|
Args:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
chunk_size (int): The number of bytes to be retrieved in each
|
|
request.
|
|
stream (IO[bytes]): A write-able stream (i.e. file-like object) that
|
|
will be used to concatenate chunks of the resource as they are
|
|
downloaded.
|
|
start (int): The first byte in a range to be downloaded. If not
|
|
provided, defaults to ``0``.
|
|
end (int): The last byte in a range to be downloaded. If not
|
|
provided, will download to the end of the media.
|
|
headers (Optional[Mapping[str, str]]): Extra headers that should
|
|
be sent with each request, e.g. headers for data encryption
|
|
key headers.
|
|
|
|
Attributes:
|
|
media_url (str): The URL containing the media to be downloaded.
|
|
start (Optional[int]): The first byte in a range to be downloaded.
|
|
end (Optional[int]): The last byte in a range to be downloaded.
|
|
chunk_size (int): The number of bytes to be retrieved in each request.
|
|
|
|
Raises:
|
|
ValueError: If ``start`` is negative.
|
|
"""
|
|
|
|
def consume_next_chunk(
|
|
self,
|
|
transport,
|
|
timeout=(
|
|
_request_helpers._DEFAULT_CONNECT_TIMEOUT,
|
|
_request_helpers._DEFAULT_READ_TIMEOUT,
|
|
),
|
|
):
|
|
"""Consume the next chunk of the resource to be downloaded.
|
|
|
|
Args:
|
|
transport (~requests.Session): A ``requests`` object which can
|
|
make authenticated requests.
|
|
timeout (Optional[Union[float, Tuple[float, float]]]):
|
|
The number of seconds to wait for the server response.
|
|
Depending on the retry strategy, a request may be repeated
|
|
several times using the same timeout each time.
|
|
|
|
Can also be passed as a tuple (connect_timeout, read_timeout).
|
|
See :meth:`requests.Session.request` documentation for details.
|
|
|
|
Returns:
|
|
~requests.Response: The HTTP response returned by ``transport``.
|
|
|
|
Raises:
|
|
ValueError: If the current download has finished.
|
|
"""
|
|
method, url, payload, headers = self._prepare_request()
|
|
# NOTE: We assume "payload is None" but pass it along anyway.
|
|
result = _request_helpers.http_request(
|
|
transport,
|
|
method,
|
|
url,
|
|
data=payload,
|
|
headers=headers,
|
|
stream=True,
|
|
retry_strategy=self._retry_strategy,
|
|
timeout=timeout,
|
|
)
|
|
self._process_response(result)
|
|
return result
|
|
|
|
|
|
def _add_decoder(response_raw, checksum):
|
|
"""Patch the ``_decoder`` on a ``urllib3`` response.
|
|
|
|
This is so that we can intercept the compressed bytes before they are
|
|
decoded.
|
|
|
|
Only patches if the content encoding is ``gzip``.
|
|
|
|
Args:
|
|
response_raw (urllib3.response.HTTPResponse): The raw response for
|
|
an HTTP request.
|
|
checksum (object):
|
|
A checksum which will be updated with compressed bytes.
|
|
|
|
Returns:
|
|
object: Either the original ``checksum`` if ``_decoder`` is not
|
|
patched, or a ``_DoNothingHash`` if the decoder is patched, since the
|
|
caller will no longer need to hash to decoded bytes.
|
|
"""
|
|
encoding = response_raw.headers.get(u"content-encoding", u"").lower()
|
|
if encoding != u"gzip":
|
|
return checksum
|
|
|
|
response_raw._decoder = _GzipDecoder(checksum)
|
|
return _helpers._DoNothingHash()
|
|
|
|
|
|
class _GzipDecoder(urllib3.response.GzipDecoder):
|
|
"""Custom subclass of ``urllib3`` decoder for ``gzip``-ed bytes.
|
|
|
|
Allows a checksum function to see the compressed bytes before they are
|
|
decoded. This way the checksum of the compressed value can be computed.
|
|
|
|
Args:
|
|
checksum (object):
|
|
A checksum which will be updated with compressed bytes.
|
|
"""
|
|
|
|
def __init__(self, checksum):
|
|
super(_GzipDecoder, self).__init__()
|
|
self._checksum = checksum
|
|
|
|
def decompress(self, data):
|
|
"""Decompress the bytes.
|
|
|
|
Args:
|
|
data (bytes): The compressed bytes to be decompressed.
|
|
|
|
Returns:
|
|
bytes: The decompressed bytes from ``data``.
|
|
"""
|
|
self._checksum.update(data)
|
|
return super(_GzipDecoder, self).decompress(data)
|