diff --git a/news/12208.feature.rst b/news/12208.feature.rst
new file mode 100644
index 00000000000..a4c298505a5
--- /dev/null
+++ b/news/12208.feature.rst
@@ -0,0 +1,2 @@
+Implement a lazier ``lazy_wheel`` that avoids the HEAD request, fetching the
+metadata for many wheels in a single request.
diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py
index 2587740f73a..fd62175ddae 100644
--- a/src/pip/_internal/exceptions.py
+++ b/src/pip/_internal/exceptions.py
@@ -323,12 +323,14 @@ class UnsupportedWheel(InstallationError):
class InvalidWheel(InstallationError):
"""Invalid (e.g. corrupt) wheel."""
- def __init__(self, location: str, name: str):
+ def __init__(self, location: str, name: str, context: Optional[str] = None):
self.location = location
self.name = name
+ self.context = context
def __str__(self) -> str:
- return f"Wheel '{self.name}' located at {self.location} is invalid."
+ suffix = f" ({self.context})" if self.context else "."
+ return f"Wheel '{self.name}' located at {self.location} is invalid{suffix}"
class MetadataInconsistent(InstallationError):
diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py
index 82ec50d5106..834316ea011 100644
--- a/src/pip/_internal/network/lazy_wheel.py
+++ b/src/pip/_internal/network/lazy_wheel.py
@@ -1,23 +1,36 @@
"""Lazy ZIP over HTTP"""
-__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"]
+from __future__ import annotations
+import abc
+import io
+import logging
+import re
from bisect import bisect_left, bisect_right
from contextlib import contextmanager
from tempfile import NamedTemporaryFile
-from typing import Any, Dict, Generator, List, Optional, Tuple
+from typing import Any, BinaryIO, ClassVar, Iterable, Iterator, cast
+from urllib.parse import urlparse
from zipfile import BadZipFile, ZipFile
from pip._vendor.packaging.utils import canonicalize_name
-from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
+from pip._vendor.requests.exceptions import ContentDecodingError
+from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, HTTPError, Response
+from pip._vendor.requests.status_codes import codes
+from pip._internal.exceptions import InvalidWheel, UnsupportedWheel
from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution
from pip._internal.network.session import PipSession
-from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
+from pip._internal.network.utils import HEADERS
+from pip._internal.utils.logging import indent_log
+
+__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url", "LazyWheelOverHTTP"]
+
+logger = logging.getLogger(__name__)
class HTTPRangeRequestUnsupported(Exception):
- pass
+ """Raised when the remote server appears unable to support byte ranges."""
def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistribution:
@@ -25,42 +38,86 @@ def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistrib
This uses HTTP range requests to only fetch the portion of the wheel
containing metadata, just enough for the object to be constructed.
- If such requests are not supported, HTTPRangeRequestUnsupported
- is raised.
+
+ :raises HTTPRangeRequestUnsupported: if range requests are unsupported for ``url``.
+ :raises InvalidWheel: if the zip file contents could not be parsed.
"""
- with LazyZipOverHTTP(url, session) as zf:
- # For read-only ZIP files, ZipFile only needs methods read,
- # seek, seekable and tell, not the whole IO protocol.
- wheel = MemoryWheel(zf.name, zf) # type: ignore
- # After context manager exit, wheel.name
- # is an invalid file by intention.
- return get_wheel_distribution(wheel, canonicalize_name(name))
+ try:
+ # After context manager exit, wheel.name will point to a deleted file path.
+ # Add `delete_backing_file=False` to disable this for debugging.
+ with LazyWheelOverHTTP(url, session) as lazy_file:
+ lazy_file.prefetch_contiguous_dist_info(name)
+ wheel = MemoryWheel(lazy_file.name, lazy_file)
+ return get_wheel_distribution(wheel, canonicalize_name(name))
+ except (BadZipFile, UnsupportedWheel, ContentDecodingError) as e:
+ # We assume that these errors have occurred because the wheel contents
+ # themselves are invalid, not because we've messed up our bookkeeping and
+ # produced an invalid file that pip would otherwise process normally.
+ raise InvalidWheel(url, name, context=str(e))
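+
+ # A usage sketch (the URL here is an illustrative assumption, not a real index):
+ #
+ #   session = PipSession()
+ #   dist = dist_from_wheel_url(
+ #       "pip", "https://example.com/pip-23.2-py3-none-any.whl", session
+ #   )
+ #   print(dist.version)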
-class LazyZipOverHTTP:
- """File-like object mapped to a ZIP file over HTTP.
- This uses HTTP range requests to lazily fetch the file's content,
- which is supposed to be fed to ZipFile. If such requests are not
- supported by the server, raise HTTPRangeRequestUnsupported
- during initialization.
+class MergeIntervals:
+ """Stateful bookkeeping to merge interval graphs."""
+
+ def __init__(self, *, left: Iterable[int] = (), right: Iterable[int] = ()) -> None:
+ self._left = list(left)
+ self._right = list(right)
+
+ def __repr__(self) -> str:
+ return f"{type(self).__name__}(left={tuple(self._left)}, right={tuple(self._right)})" # noqa: E501
+
+ def _merge(
+ self, start: int, end: int, left: int, right: int
+ ) -> Iterator[tuple[int, int]]:
+ """Return an iterator of intervals to be fetched.
+
+ Args:
+ start (int): Start of needed interval
+ end (int): End of needed interval
+ left (int): Index of first overlapping downloaded data
+ right (int): Index after last overlapping downloaded data
+ """
+ lslice, rslice = self._left[left:right], self._right[left:right]
+ i = start = min([start] + lslice[:1])
+ end = max([end] + rslice[-1:])
+ for j, k in zip(lslice, rslice):
+ if j > i:
+ yield i, j - 1
+ i = k + 1
+ if i <= end:
+ yield i, end
+ self._left[left:right], self._right[left:right] = [start], [end]
+
+ def minimal_intervals_covering(
+ self, start: int, end: int
+ ) -> Iterator[tuple[int, int]]:
+ """Provide the intervals needed to cover from ``start <= x <= end``.
+
+ This method mutates internal state so that later calls only return intervals not
+ covered by prior calls. The first call to this method will always return exactly
+ one interval, which was exactly the one requested. Later requests for
+ intervals overlapping that first requested interval will yield only the ranges
+ not previously covered (which may be empty, e.g. if the same interval is
+ requested twice).
+
+ This may be used e.g. to download substrings of remote files on demand.
+ """
+ left = bisect_left(self._right, start)
+ right = bisect_right(self._left, end)
+ for start, end in self._merge(start, end, left, right):
+ yield (start, end)
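+
+ # A doctest-style sketch of the merging behavior (illustrative, not part of
+ # pip's test suite): the first call returns the requested interval, and later
+ # overlapping calls return only the uncovered remainder.
+ #
+ #   >>> intervals = MergeIntervals()
+ #   >>> list(intervals.minimal_intervals_covering(0, 9))
+ #   [(0, 9)]
+ #   >>> list(intervals.minimal_intervals_covering(5, 15))
+ #   [(10, 15)]
+ #   >>> list(intervals.minimal_intervals_covering(0, 15))
+ #   []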
+
+
+class ReadOnlyIOWrapper(BinaryIO):
+ """Implement read-side ``BinaryIO`` methods wrapping an inner ``BinaryIO``.
+
+ This wrapper is useful because Python currently does not distinguish read-only
+ streams at the type level.
"""
- def __init__(
- self, url: str, session: PipSession, chunk_size: int = CONTENT_CHUNK_SIZE
- ) -> None:
- head = session.head(url, headers=HEADERS)
- raise_for_status(head)
- assert head.status_code == 200
- self._session, self._url, self._chunk_size = session, url, chunk_size
- self._length = int(head.headers["Content-Length"])
- self._file = NamedTemporaryFile()
- self.truncate(self._length)
- self._left: List[int] = []
- self._right: List[int] = []
- if "bytes" not in head.headers.get("Accept-Ranges", "none"):
- raise HTTPRangeRequestUnsupported("range request is not supported")
- self._check_zip()
+ def __init__(self, inner: BinaryIO) -> None:
+ self._file = inner
@property
def mode(self) -> str:
@@ -85,6 +142,19 @@ def closed(self) -> bool:
"""Whether the file is closed."""
return self._file.closed
+ def fileno(self) -> int:
+ return self._file.fileno()
+
+ def flush(self) -> None:
+ self._file.flush()
+
+ def isatty(self) -> bool:
+ return False
+
+ def readable(self) -> bool:
+ """Return whether the file is readable, which is True."""
+ return True
+
def read(self, size: int = -1) -> bytes:
"""Read up to size bytes from the object and return them.
@@ -92,16 +162,14 @@ def read(self, size: int = -1) -> bytes:
all bytes until EOF are returned. Fewer than
size bytes may be returned if EOF is reached.
"""
- download_size = max(size, self._chunk_size)
- start, length = self.tell(), self._length
- stop = length if size < 0 else min(start + download_size, length)
- start = max(0, stop - download_size)
- self._download(start, stop - 1)
return self._file.read(size)
- def readable(self) -> bool:
- """Return whether the file is readable, which is True."""
- return True
+ def readline(self, limit: int = -1) -> bytes:
+ # Explicit impl needed to satisfy mypy.
+ raise NotImplementedError
+
+ def readlines(self, hint: int = -1) -> list[bytes]:
+ raise NotImplementedError
def seek(self, offset: int, whence: int = 0) -> int:
"""Change stream position and return the new absolute position.
@@ -117,7 +185,7 @@ def tell(self) -> int:
"""Return the current position."""
return self._file.tell()
- def truncate(self, size: Optional[int] = None) -> int:
+ def truncate(self, size: int | None = None) -> int:
"""Resize the stream to the given size in bytes.
If size is unspecified resize to the current position.
@@ -131,15 +199,81 @@ def writable(self) -> bool:
"""Return False."""
return False
- def __enter__(self) -> "LazyZipOverHTTP":
+ def write(self, s: bytes) -> int: # type: ignore[override]
+ raise NotImplementedError
+
+ def writelines(self, lines: Iterable[bytes]) -> None: # type: ignore[override]
+ raise NotImplementedError
+
+ def __enter__(self) -> ReadOnlyIOWrapper:
self._file.__enter__()
return self
def __exit__(self, *exc: Any) -> None:
self._file.__exit__(*exc)
+ def __iter__(self) -> Iterator[bytes]:
+ raise NotImplementedError
+
+ def __next__(self) -> bytes:
+ raise NotImplementedError
+
+
+class LazyRemoteResource(ReadOnlyIOWrapper):
+ """Abstract class for a binary file object that lazily downloads its contents.
+
+ This class uses a ``MergeIntervals`` instance to keep track of what it's downloaded.
+ """
+
+ def __init__(self, inner: BinaryIO) -> None:
+ super().__init__(inner)
+ self._merge_intervals: MergeIntervals | None = None
+
+ @abc.abstractmethod
+ def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]:
+ """Call to the remote backend to provide this byte range in one or more chunks.
+
+ NB: For compatibility with HTTP range requests, the range provided to this
+ method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1`` is 2
+ bytes long, and the range can never be empty).
+
+ :raises Exception: implementations may raise an exception for e.g. intermittent
+ errors accessing the remote resource.
+ """
+ ...
+
+ def _setup_content(self) -> None:
+ """Ensure ``self._merge_intervals`` is initialized.
+
+ Called in ``__enter__``; repeated invocations should be no-ops.
+ Subclasses may override this method."""
+ if self._merge_intervals is None:
+ self._merge_intervals = MergeIntervals()
+
+ def _reset_content(self) -> None:
+ """Unset ``self._merge_intervals``.
+
+ Called in ``__exit__``; repeated invocations should be no-ops.
+ Subclasses may override this method."""
+ if self._merge_intervals is not None:
+ logger.debug(
+ "unsetting merge intervals (were: %s)", repr(self._merge_intervals)
+ )
+ self._merge_intervals = None
+
+ def __enter__(self) -> LazyRemoteResource:
+ """Call ``self._setup_content()``, then return self."""
+ super().__enter__()
+ self._setup_content()
+ return self
+
+ def __exit__(self, *exc: Any) -> None:
+ """Call ``self._reset_content()``."""
+ self._reset_content()
+ super().__exit__(*exc)
+
@contextmanager
- def _stay(self) -> Generator[None, None, None]:
+ def _stay(self) -> Iterator[None]:
"""Return a context manager keeping the position.
At the end of the block, seek back to original position.
@@ -150,61 +284,465 @@ def _stay(self) -> Generator[None, None, None]:
finally:
self.seek(pos)
- def _check_zip(self) -> None:
- """Check and download until the file is a valid ZIP."""
- end = self._length - 1
- for start in reversed(range(0, end, self._chunk_size)):
- self._download(start, end)
- with self._stay():
- try:
- # For read-only ZIP files, ZipFile only needs
- # methods read, seek, seekable and tell.
- ZipFile(self) # type: ignore
- except BadZipFile:
- pass
- else:
- break
+ def ensure_downloaded(self, start: int, end: int) -> None:
+ """Ensures bytes start to end (inclusive) have been downloaded and written to
+ the backing file.
+
+ :raises ValueError: if ``__enter__`` was not called beforehand.
+ """
+ if self._merge_intervals is None:
+ raise ValueError(".__enter__() must be called to set up merge intervals")
+ # Reducing by 1 to get an inclusive end range.
+ end -= 1
+ with self._stay():
+ for start, end in self._merge_intervals.minimal_intervals_covering(
+ start, end
+ ):
+ self.seek(start)
+ for chunk in self.fetch_content_range(start, end):
+ self._file.write(chunk)
+
- def _stream_response(
- self, start: int, end: int, base_headers: Dict[str, str] = HEADERS
- ) -> Response:
- """Return HTTP response to a range request from start to end."""
- headers = base_headers.copy()
+class FixedSizeLazyResource(LazyRemoteResource):
+ """Fetches a fixed content length from the remote server when initialized in order
+ to support ``.read(-1)`` and seek calls against ``io.SEEK_END``."""
+
+ def __init__(self, inner: BinaryIO) -> None:
+ super().__init__(inner)
+ self._length: int | None = None
+
+ @abc.abstractmethod
+ def _fetch_content_length(self) -> int:
+ """Query the remote resource for the total length of the file.
+
+ This method may also mutate internal state, such as writing to the backing
+ file. It's marked private because it will only be called within this class's
+ ``__enter__`` implementation to populate an internal length field.
+
+ :raises Exception: implementations may raise any type of exception if the length
+ value could not be parsed, or for any other issue which might
+ cause valid calls to ``self.fetch_content_range()`` to fail.
+ """
+ ...
+
+ def _setup_content(self) -> None:
+ """Initialize the internal length field and other bookkeeping.
+
+ After parsing the remote file length with ``self._fetch_content_length()``,
+ this method truncates the backing file (provided by the parent class
+ ``ReadOnlyIOWrapper``) to that size in order to support seek operations against
+ ``io.SEEK_END`` in ``self.read()``.
+
+ This method is called in ``__enter__``.
+ """
+ super()._setup_content()
+
+ if self._length is None:
+ logger.debug("begin fetching content length")
+ with indent_log():
+ self._length = self._fetch_content_length()
+ logger.debug("done fetching content length (is: %d)", self._length)
+ else:
+ logger.debug("content length already fetched (is: %d)", self._length)
+
+ # Enable us to seek and write anywhere in the backing file up to this
+ # known length.
+ self.truncate(self._length)
+
+ def _reset_content(self) -> None:
+ """Unset the internal length field and other bookkeeping.
+
+ This method is called in ``__exit__``."""
+ super()._reset_content()
+
+ if self._length is not None:
+ logger.debug("unsetting content length (was: %d)", self._length)
+ self._length = None
+
+ def read(self, size: int = -1) -> bytes:
+ """Read up to size bytes from the object and return them.
+
+ As a convenience, if size is unspecified or -1,
+ all bytes until EOF are returned. Fewer than
+ size bytes may be returned if EOF is reached.
+
+ :raises ValueError: if ``__enter__`` was not called beforehand.
+ """
+ if self._length is None:
+ raise ValueError(".__enter__() must be called to set up content length")
+ cur = self.tell()
+ logger.debug("read size %d at %d from lazy file %s", size, cur, self.name)
+ if size < 0:
+ assert cur <= self._length
+ download_size = self._length - cur
+ elif size == 0:
+ return b""
+ else:
+ download_size = size
+ stop = min(cur + download_size, self._length)
+ self.ensure_downloaded(cur, stop)
+ return super().read(download_size)
+
+
+class LazyHTTPFile(FixedSizeLazyResource):
+ """File-like object representing a fixed-length file over HTTP.
+
+ This uses HTTP range requests to lazily fetch the file's content into a temporary
+ file. If such requests are not supported by the server, raises
+ ``HTTPRangeRequestUnsupported`` in the ``__enter__`` method."""
+
+ def __init__(
+ self,
+ url: str,
+ session: PipSession,
+ delete_backing_file: bool = True,
+ ) -> None:
+ super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file)))
+
+ self._request_count = 0
+ self._session = session
+ self._url = url
+
+ def _fetch_content_length(self) -> int:
+ """Get the remote file's length from a HEAD request."""
+ # NB: This is currently dead code, as _fetch_content_length() is overridden
+ # again in LazyWheelOverHTTP.
+ return self._content_length_from_head()
+
+ def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]:
+ """Perform a series of HTTP range requests to cover the specified byte range."""
+ for chunk in self._stream_response(start, end).iter_content(CONTENT_CHUNK_SIZE):
+ yield chunk
+
+ def __enter__(self) -> LazyHTTPFile:
+ """Fetch the remote file length and reset the log of downloaded intervals.
+
+ This method must be called before ``.read()``.
+
+ :raises HTTPRangeRequestUnsupported: if the remote server fails to indicate
+ support for "bytes" ranges.
+ """
+ super().__enter__()
+ return self
+
+ def __exit__(self, *exc: Any) -> None:
+ """Logs request count to quickly identify any pathological cases in log data."""
+ logger.debug("%d requests for url %s", self._request_count, self._url)
+ super().__exit__(*exc)
+
+ def _content_length_from_head(self) -> int:
+ """Performs a HEAD request to extract the Content-Length.
+
+ :raises HTTPRangeRequestUnsupported: if the response fails to indicate support
+ for "bytes" ranges."""
+ self._request_count += 1
+ head = self._session.head(self._url, headers=self._uncached_headers())
+ head.raise_for_status()
+ assert head.status_code == codes.ok
+ accepted_range = head.headers.get("Accept-Ranges", None)
+ if accepted_range != "bytes":
+ raise HTTPRangeRequestUnsupported(
+ f"server does not support byte ranges: header was '{accepted_range}'"
+ )
+ return int(head.headers["Content-Length"])
+
+ def _stream_response(self, start: int, end: int) -> Response:
+ """Return streaming HTTP response to a range request from start to end."""
+ headers = self._uncached_headers()
headers["Range"] = f"bytes={start}-{end}"
- # TODO: Get range requests to be correctly cached
- headers["Cache-Control"] = "no-cache"
- return self._session.get(self._url, headers=headers, stream=True)
+ logger.debug("streamed bytes request: %s", headers["Range"])
+ self._request_count += 1
+ response = self._session.get(self._url, headers=headers, stream=True)
+ response.raise_for_status()
+ assert int(response.headers["Content-Length"]) == (end - start + 1)
+ return response
+
+ @classmethod
+ def _uncached_headers(cls) -> dict[str, str]:
+ """HTTP headers to bypass any HTTP caching.
+
+ The requests we perform in this file are intentionally small, and any caching
+ should be done at a higher level e.g. https://github.com/pypa/pip/issues/12184.
+ """
+ # "no-cache" is the correct value for "up to date every time", so this will also
+ # ensure we get the most recent value from the server:
+ # https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching#provide_up-to-date_content_every_time
+ return {**HEADERS, "Cache-Control": "no-cache"}
- def _merge(
- self, start: int, end: int, left: int, right: int
- ) -> Generator[Tuple[int, int], None, None]:
- """Return a generator of intervals to be fetched.
- Args:
- start (int): Start of needed interval
- end (int): End of needed interval
- left (int): Index of first overlapping downloaded data
- right (int): Index after last overlapping downloaded data
+class LazyWheelOverHTTP(LazyHTTPFile):
+ """File-like object mapped to a ZIP file over HTTP.
+
+ This uses HTTP range requests to lazily fetch the file's content, which should be
+ provided as the first argument to a ``ZipFile``.
+ """
+
+ # Cache this on the type to avoid trying and failing our initial lazy wheel request
+ # multiple times in the same pip invocation against an index without this support.
+ _domains_without_negative_range: ClassVar[set[str]] = set()
+
+ # This override is needed for mypy so we can call .prefetch_contiguous_dist_info()
+ # within a `with` block.
+ def __enter__(self) -> LazyWheelOverHTTP:
+ """Fetch the remote file length and reset the log of downloaded intervals.
+
+ This method must be called before ``.read()`` or
+ ``.prefetch_contiguous_dist_info()``.
"""
- lslice, rslice = self._left[left:right], self._right[left:right]
- i = start = min([start] + lslice[:1])
- end = max([end] + rslice[-1:])
- for j, k in zip(lslice, rslice):
- if j > i:
- yield i, j - 1
- i = k + 1
- if i <= end:
- yield i, end
- self._left[left:right], self._right[left:right] = [start], [end]
+ super().__enter__()
+ return self
- def _download(self, start: int, end: int) -> None:
- """Download bytes from start to end inclusively."""
- with self._stay():
- left = bisect_left(self._right, start)
- right = bisect_right(self._left, end)
- for start, end in self._merge(start, end, left, right):
- response = self._stream_response(start, end)
- response.raise_for_status()
- self.seek(start)
- for chunk in response_chunks(response, self._chunk_size):
- self._file.write(chunk)
+ @classmethod
+ def _initial_chunk_length(cls) -> int:
+ """Return the size of the chunk (in bytes) to download from the end of the file.
+
+ This method is called in ``self._fetch_content_length()``. If
+ the chunk size from this method is larger than the size of an entire wheel, that
+ may raise an HTTP error, but this is gracefully handled in
+ ``self._fetch_content_length()`` with a small performance penalty.
+
+ Performance testing for values returned by this method (radoering):
+ https://github.com/pypa/pip/pull/12208#discussion_r1434335276
+ > could not find any significant differences for values between 1.000 and
+ > 100.000 in a real-world example with more than 300 dependencies. Values below
+ > 1.000 resulted in a slightly worse performance, larger values like 1.000.000
+ > resulted in a significant worse performance.
+ """
+ return 10_000
+
+ def _fetch_content_length(self) -> int:
+ """Get the total remote file length, but also download a chunk from the end.
+
+ This method is called within ``__enter__``. In an attempt to reduce
+ the total number of requests needed to populate this lazy file's contents, this
+ method will also attempt to fetch a chunk of the file's actual content. This
+ chunk will be ``self._initial_chunk_length()`` bytes in size, or just the remote
+ file's length if that's smaller, and the chunk will come from the *end* of
+ the file.
+
+ This method will first attempt to download with a negative byte range request,
+ i.e. a GET with the headers ``Range: bytes=-N`` for ``N`` equal to
+ ``self._initial_chunk_length()``. If negative offsets are unsupported, it will
+ instead fall back to making a HEAD request first to extract the length, followed
+ by a GET request with the double-ended range header ``Range: bytes=X-Y`` to
+ extract the final ``N`` bytes from the remote resource.
+ """
+ initial_chunk_size = self._initial_chunk_length()
+ ret_length, tail = self._extract_content_length(initial_chunk_size)
+
+ # Need to explicitly truncate here in order to perform the write and seek
+ # operations below when we write the chunk of file contents to disk.
+ self.truncate(ret_length)
+
+ if tail is None:
+ # If we could not download any file contents yet (e.g. if negative byte
+ # ranges were not supported, or the requested range was larger than the file
+ # size), then download all of this at once, hopefully pulling in the entire
+ # central directory.
+ initial_start = max(0, ret_length - initial_chunk_size)
+ self.ensure_downloaded(initial_start, ret_length)
+ else:
+ # If we *could* download some file contents, then write them to the end of
+ # the file and set up our bisect boundaries by hand.
+ with self._stay():
+ response_length = int(tail.headers["Content-Length"])
+
+ # Some servers return a 200 OK in response to an overflowing negative
+ # range request. In that case we want to write the entire file contents.
+ if tail.status_code == codes.ok:
+ assert ret_length == response_length
+ assert ret_length > 0
+ assert response_length <= initial_chunk_size
+ for chunk in tail.iter_content(CONTENT_CHUNK_SIZE):
+ self._file.write(chunk)
+ assert self._merge_intervals is not None
+ assert ((0, response_length - 1),) == tuple(
+ self._merge_intervals.minimal_intervals_covering(
+ 0, response_length - 1
+ )
+ )
+ return response_length
+ else:
+ assert response_length == initial_chunk_size
+ self.seek(-response_length, io.SEEK_END)
+ # The initial chunk size is currently 10 kB (see _initial_chunk_length()),
+ # but streaming the content here allows it to be set arbitrarily large.
+ for chunk in tail.iter_content(CONTENT_CHUNK_SIZE):
+ self._file.write(chunk)
+
+ # We now need to update our bookkeeping to cover the interval we
+ # just wrote to file so we know not to do it in later read()s.
+ init_chunk_start = ret_length - response_length
+ # MergeIntervals uses inclusive boundaries i.e. start <= x <= end.
+ init_chunk_end = ret_length - 1
+ assert self._merge_intervals is not None
+ assert ((init_chunk_start, init_chunk_end),) == tuple(
+ # NB: We expect LazyRemoteResource to reset
+ # `self._merge_intervals` just before it calls the current
+ # method, so our assertion here checks that indeed no prior
+ # overlapping intervals have been covered.
+ self._merge_intervals.minimal_intervals_covering(
+ init_chunk_start, init_chunk_end
+ )
+ )
+ return ret_length
+
+ @staticmethod
+ def _parse_full_length_from_content_range(arg: str) -> int:
+ """Parse the file's full underlying length from the Content-Range header.
+
+ This supports both * and numeric ranges, from success or error responses:
+ https://www.rfc-editor.org/rfc/rfc9110#field.content-range.
+ """
+ m = re.match(r"bytes [^/]+/([0-9]+)", arg)
+ if m is None:
+ raise HTTPRangeRequestUnsupported(f"could not parse Content-Range: '{arg}'")
+ return int(m.group(1))
+
+ def _try_initial_chunk_request(
+ self, initial_chunk_size: int
+ ) -> tuple[int, Response]:
+ """Attempt to fetch a chunk from the end of the file with a negative offset."""
+ headers = self._uncached_headers()
+ # Perform a negative range request, which is not supported by some servers.
+ headers["Range"] = f"bytes=-{initial_chunk_size}"
+ logger.debug("initial bytes request: %s", headers["Range"])
+
+ self._request_count += 1
+ tail = self._session.get(self._url, headers=headers, stream=True)
+ tail.raise_for_status()
+
+ code = tail.status_code
+ if code != codes.partial_content:
+ # According to
+ # https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests, a 200 OK
+ # implies that range requests are not supported, regardless of the
+ # requested size.
+ # However, some servers that support negative range requests also return a
+ # 200 OK if the requested range from the end was larger than the file size.
+ if code == codes.ok:
+ accept_ranges = tail.headers.get("Accept-Ranges", None)
+ content_length = int(tail.headers["Content-Length"])
+ if accept_ranges == "bytes" and content_length <= initial_chunk_size:
+ return content_length, tail
+ raise HTTPRangeRequestUnsupported(
+ f"did not receive partial content: got code {code}"
+ )
+
+ file_length = self._parse_full_length_from_content_range(
+ tail.headers["Content-Range"]
+ )
+ return (file_length, tail)
+
+ def _extract_content_length(
+ self, initial_chunk_size: int
+ ) -> tuple[int, Response | None]:
+ """Get the Content-Length of the remote file, and possibly a chunk of it."""
+ domain = urlparse(self._url).netloc
+ if domain in self._domains_without_negative_range:
+ return (self._content_length_from_head(), None)
+
+ try:
+ # Initial range request for just the end of the file.
+ file_length, tail = self._try_initial_chunk_request(initial_chunk_size)
+ except HTTPError as e:
+ resp = e.response
+ code = resp.status_code
+ # Our initial request using a negative byte range was not supported.
+ if code in [codes.not_implemented, codes.method_not_allowed]:
+ # pypi notably does not support negative byte ranges: see
+ # https://github.com/pypi/warehouse/issues/12823.
+ logger.debug(
+ "Negative byte range not supported for domain '%s': "
+ "using HEAD request before lazy wheel from now on",
+ domain,
+ )
+ # Avoid trying a negative byte range request against this domain for the
+ # rest of the resolve.
+ self._domains_without_negative_range.add(domain)
+ # Apply a HEAD request to get the real size, and nothing else for now.
+ return (self._content_length_from_head(), None)
+ # This indicates that the requested range from the end was larger than the
+ # actual file size: https://www.rfc-editor.org/rfc/rfc9110#status.416.
+ if code == codes.requested_range_not_satisfiable:
+ # In this case, we don't have any file content yet, but we do know the
+ # size the file will be, so we can return that and exit here.
+ file_length = self._parse_full_length_from_content_range(
+ resp.headers["Content-Range"]
+ )
+ return (file_length, None)
+ # If we get some other error, then we expect that non-range requests will
+ # also fail, so we error out here and let the user figure it out.
+ raise
+
+ # Some servers do not actually support negative offsets, but instead sneakily
+ # coerce a negative offset like "-10" into "0-10", which is a *positive* offset!
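+ # E.g. a request of "Range: bytes=-10" may be answered with
+ # "Content-Range: bytes 0-10/1234" and a Content-Length of 11, i.e.
+ # initial_chunk_size + 1, which is exactly what the check below detects.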
+ tail_length = int(tail.headers["Content-Length"])
+ tail_range = tail.headers.get("Content-Range", "")
+ if (tail_length == initial_chunk_size + 1) and tail_range.startswith(
+ f"bytes 0-{initial_chunk_size}"
+ ):
+ logger.debug(
+ "an intended negative range -%d was converted to 0-%d for domain %s: "
+ "using HEAD request before lazy wheel from now on",
+ initial_chunk_size,
+ initial_chunk_size,
+ domain,
+ )
+ # Avoid trying a negative byte range request against this domain for the
+ # rest of the resolve.
+ self._domains_without_negative_range.add(domain)
+ # Apply a HEAD request to get the real size, and nothing else for now.
+ return (self._content_length_from_head(), None)
+ return file_length, tail
+
+ def prefetch_contiguous_dist_info(self, name: str) -> None:
+ """Read contents of entire dist-info section of wheel.
+
+ We know pip will read every entry in this directory when generating a dist from
+ a wheel, so prepopulating the file contents avoids waiting for further
+ range requests.
+ """
+ # Clarify in debug output which requests were sent during __enter__, which
+ # during the prefetch, and which during the dist metadata generation.
+ logger.debug("begin prefetching dist-info for %s", name)
+ with indent_log():
+ self._prefetch_contiguous_dist_info(name)
+ logger.debug("done prefetching dist-info for %s", name)
+
+ def _prefetch_contiguous_dist_info(self, name: str) -> None:
+ """Locate the *.dist-info/ entries from a temporary ``ZipFile`` wrapper, and
+ download them.
+
+ This method assumes that the *.dist-info directory (containing e.g. METADATA) is
+ contained in a single contiguous section of the zip file in order to ensure it
+ can be downloaded in a single ranged GET request."""
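+ # A typical wheel layout (an illustrative sketch) places the .dist-info/
+ # entries last, just before the zip's central directory:
+ #   pkg/__init__.py
+ #   pkg/module.py
+ #   pkg-1.0.dist-info/METADATA
+ #   pkg-1.0.dist-info/RECORD
+ #   <central directory>
+ # so the prefetch usually spans from the first matching header_offset
+ # straight to zf.start_dir.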
+ dist_info_prefix = re.compile(r"^[^/]*\.dist-info/")
+ start: int | None = None
+ end: int | None = None
+
+ # This may perform further requests if __enter__() did not pull in the entire
+ # central directory at the end of the file (although _initial_chunk_length()
+ # should be set large enough to avoid this).
+ zf = ZipFile(self)
+
+ for info in zf.infolist():
+ if start is None:
+ if dist_info_prefix.search(info.filename):
+ start = info.header_offset
+ continue
+ else:
+ # The last .dist-info/ entry may be before the end of the file if the
+ # wheel's entries are sorted lexicographically (which is unusual).
+ if not dist_info_prefix.search(info.filename):
+ end = info.header_offset
+ break
+ if start is None:
+ raise UnsupportedWheel(
+ f"no {dist_info_prefix!r} directory found for {name} in {self.name}"
+ )
+ # If the last entries of the zip are the .dist-info/ dir (as usual), then give
+ # us everything until the start of the central directory.
+ if end is None:
+ end = zf.start_dir
+ self.ensure_downloaded(start, end)
diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py
index e6aa3447200..7ca0855ad76 100644
--- a/src/pip/_internal/operations/prepare.py
+++ b/src/pip/_internal/operations/prepare.py
@@ -273,6 +273,9 @@ def __init__(
# Previous "header" printed for a link-based InstallRequirement
self._previous_requirement_header = ("", "")
+ # Domains for which an HTTPRangeRequestUnsupported was already raised.
+ self._domains_without_http_range: set[str] = set()
+
def _log_preparing_link(self, req: InstallRequirement) -> None:
"""Provide context for the requirement being prepared."""
if req.link.is_file and not req.is_wheel_from_cache:
@@ -430,6 +433,9 @@ def _fetch_metadata_using_lazy_wheel(
)
return None
+ if link.netloc in self._domains_without_http_range:
+ return None
+
wheel = Wheel(link.filename)
name = canonicalize_name(wheel.name)
logger.info(
@@ -437,11 +443,16 @@ def _fetch_metadata_using_lazy_wheel(
name,
wheel.version,
)
- url = link.url.split("#", 1)[0]
try:
- return dist_from_wheel_url(name, url, self._session)
- except HTTPRangeRequestUnsupported:
- logger.debug("%s does not support range requests", url)
+ return dist_from_wheel_url(name, link.url_without_fragment, self._session)
+ except HTTPRangeRequestUnsupported as e:
+ logger.debug(
+ "domain '%s' does not support range requests (%s): "
+ "not attempting lazy wheel further",
+ link.netloc,
+ str(e),
+ )
+ self._domains_without_http_range.add(link.netloc)
return None
def _complete_partial_requirements(
diff --git a/src/pip/_internal/utils/wheel.py b/src/pip/_internal/utils/wheel.py
index f85aee8a3f9..8474d818fcc 100644
--- a/src/pip/_internal/utils/wheel.py
+++ b/src/pip/_internal/utils/wheel.py
@@ -67,6 +67,10 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str:
def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes:
+ if src_fp := source.fp:
+ src_name = getattr(src_fp, "name", None)
+ logger.debug("extracting entry '%s' from zip '%s'", path, src_name)
+
try:
return source.read(path)
# BadZipFile for general corruption, KeyError for missing entry,
diff --git a/tests/conftest.py b/tests/conftest.py
index da4ab5b9dfb..4cb8154fad2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,3 +1,4 @@
+import abc
import compileall
import contextlib
import fnmatch
@@ -8,6 +9,7 @@
import subprocess
import sys
import threading
+from contextlib import AbstractContextManager, contextmanager
from dataclasses import dataclass
from enum import Enum
from hashlib import sha256
@@ -17,6 +19,7 @@
TYPE_CHECKING,
Any,
AnyStr,
+ BinaryIO,
Callable,
ClassVar,
ContextManager,
@@ -27,6 +30,8 @@
Optional,
Set,
Tuple,
+ Type,
+ Union,
)
from unittest.mock import patch
from zipfile import ZipFile
@@ -55,6 +60,7 @@
PipTestEnvironment,
ScriptFactory,
TestData,
+ create_basic_wheel_for_package,
)
from tests.lib.server import MockServer, make_mock_server
from tests.lib.venv import VirtualEnvironment, VirtualEnvironmentType
@@ -651,6 +657,24 @@ def script(
return script_factory(tmpdir.joinpath("workspace"), virtualenv)
+@pytest.fixture(scope="session")
+def session_script(
+ request: pytest.FixtureRequest,
+ tmpdir_factory: pytest.TempPathFactory,
+ virtualenv_factory: Callable[[Path], VirtualEnvironment],
+ script_factory: ScriptFactory,
+) -> PipTestEnvironment:
+ """PipTestEnvironment shared across the whole session.
+
+ This is used by session-scoped fixtures. Tests should use the
+ function-scoped ``script`` fixture instead.
+ """
+ virtualenv = virtualenv_factory(
+ tmpdir_factory.mktemp("session_venv").joinpath("venv")
+ )
+ return script_factory(tmpdir_factory.mktemp("session_workspace"), virtualenv)
+
+
@pytest.fixture(scope="session")
def common_wheels() -> Path:
"""Provide a directory with latest setuptools and wheel wheels"""
@@ -699,7 +723,7 @@ def factory() -> str:
return factory
-@pytest.fixture
+@pytest.fixture(scope="session")
def mock_server() -> Iterator[MockServer]:
server = make_mock_server()
test_server = MockServer(server)
@@ -733,6 +757,39 @@ class MetadataKind(Enum):
NoFile = "no-file"
+@dataclass(frozen=True)
+class FakePackageSource:
+ """A test package file which may be hardcoded or generated dynamically."""
+
+ source_file: Union[str, Path]
+
+ @classmethod
+ def shared_data_package(cls, name: str) -> "FakePackageSource":
+ return cls(source_file=name)
+
+ @property
+ def _is_shared_data(self) -> bool:
+ return isinstance(self.source_file, str)
+
+ @classmethod
+ def generated_wheel(cls, path: Path) -> "FakePackageSource":
+ return cls(source_file=path)
+
+ @property
+ def filename(self) -> str:
+ if self._is_shared_data:
+ assert isinstance(self.source_file, str)
+ return self.source_file
+ assert isinstance(self.source_file, Path)
+ return self.source_file.name
+
+ def source_path(self, shared_data: TestData) -> Path:
+ if self._is_shared_data:
+ return shared_data.packages / self.filename
+ assert isinstance(self.source_file, Path)
+ return self.source_file
+
+
@dataclass(frozen=True)
class FakePackage:
"""Mock package structure used to generate a PyPI repository.
@@ -742,13 +799,20 @@ class FakePackage:
name: str
version: str
- filename: str
+ source_file: FakePackageSource
metadata: MetadataKind
# This will override any dependencies specified in the actual dist's METADATA.
requires_dist: Tuple[str, ...] = ()
# This will override the Name specified in the actual dist's METADATA.
metadata_name: Optional[str] = None
+ @property
+ def filename(self) -> str:
+ return self.source_file.filename
+
+ def source_path(self, shared_data: TestData) -> Path:
+ return self.source_file.source_path(shared_data)
+
def metadata_filename(self) -> str:
"""This is specified by PEP 658."""
return f"{self.filename}.metadata"
@@ -786,14 +850,49 @@ def generate_metadata(self) -> bytes:
@pytest.fixture(scope="session")
-def fake_packages() -> Dict[str, List[FakePackage]]:
+def fake_packages(session_script: PipTestEnvironment) -> Dict[str, List[FakePackage]]:
"""The package database we generate for testing PEP 658 support."""
+ large_compilewheel_metadata_first = create_basic_wheel_for_package(
+ session_script,
+ "compilewheel",
+ "2.0",
+ extra_files={"asdf.txt": b"a" * 10_000},
+ metadata_first=True,
+ )
+ # This wheel must be larger than 10KB to trigger the lazy wheel behavior we want
+ # to test.
+ assert large_compilewheel_metadata_first.stat().st_size > 10_000
+
+ large_translationstring_metadata_last = create_basic_wheel_for_package(
+ session_script,
+ "translationstring",
+ "0.1",
+ extra_files={"asdf.txt": b"a" * 10_000},
+ metadata_first=False,
+ )
+ assert large_translationstring_metadata_last.stat().st_size > 10_000
+
return {
"simple": [
- FakePackage("simple", "1.0", "simple-1.0.tar.gz", MetadataKind.Sha256),
- FakePackage("simple", "2.0", "simple-2.0.tar.gz", MetadataKind.No),
+ FakePackage(
+ "simple",
+ "1.0",
+ FakePackageSource.shared_data_package("simple-1.0.tar.gz"),
+ MetadataKind.Sha256,
+ ),
+ FakePackage(
+ "simple",
+ "2.0",
+ FakePackageSource.shared_data_package("simple-2.0.tar.gz"),
+ MetadataKind.No,
+ ),
# This will raise a hashing error.
- FakePackage("simple", "3.0", "simple-3.0.tar.gz", MetadataKind.WrongHash),
+ FakePackage(
+ "simple",
+ "3.0",
+ FakePackageSource.shared_data_package("simple-3.0.tar.gz"),
+ MetadataKind.WrongHash,
+ ),
],
"simple2": [
# Override the dependencies here in order to force pip to download
@@ -801,17 +900,22 @@ def fake_packages() -> Dict[str, List[FakePackage]]:
FakePackage(
"simple2",
"1.0",
- "simple2-1.0.tar.gz",
+ FakePackageSource.shared_data_package("simple2-1.0.tar.gz"),
MetadataKind.Unhashed,
("simple==1.0",),
),
# This will raise an error when pip attempts to fetch the metadata file.
- FakePackage("simple2", "2.0", "simple2-2.0.tar.gz", MetadataKind.NoFile),
+ FakePackage(
+ "simple2",
+ "2.0",
+ FakePackageSource.shared_data_package("simple2-2.0.tar.gz"),
+ MetadataKind.NoFile,
+ ),
# This has a METADATA file with a mismatched name.
FakePackage(
"simple2",
"3.0",
- "simple2-3.0.tar.gz",
+ FakePackageSource.shared_data_package("simple2-3.0.tar.gz"),
MetadataKind.Sha256,
metadata_name="not-simple2",
),
@@ -822,7 +926,9 @@ def fake_packages() -> Dict[str, List[FakePackage]]:
FakePackage(
"colander",
"0.9.9",
- "colander-0.9.9-py2.py3-none-any.whl",
+ FakePackageSource.shared_data_package(
+ "colander-0.9.9-py2.py3-none-any.whl"
+ ),
MetadataKind.No,
),
],
@@ -832,25 +938,47 @@ def fake_packages() -> Dict[str, List[FakePackage]]:
FakePackage(
"compilewheel",
"1.0",
- "compilewheel-1.0-py2.py3-none-any.whl",
+ FakePackageSource.shared_data_package(
+ "compilewheel-1.0-py2.py3-none-any.whl"
+ ),
MetadataKind.Unhashed,
("simple==1.0",),
),
+ # This inserts a wheel larger than the default fast-deps request size with
+ # .dist-info metadata at the front.
+ FakePackage(
+ "compilewheel",
+ "2.0",
+ FakePackageSource.generated_wheel(large_compilewheel_metadata_first),
+ MetadataKind.No,
+ ),
],
"has-script": [
# Ensure we check PEP 658 metadata hashing errors for wheel files.
FakePackage(
"has-script",
"1.0",
- "has.script-1.0-py2.py3-none-any.whl",
+ FakePackageSource.shared_data_package(
+ "has.script-1.0-py2.py3-none-any.whl"
+ ),
MetadataKind.WrongHash,
),
],
"translationstring": [
+ # This inserts a wheel larger than the default fast-deps request size with
+ # .dist-info metadata at the back.
+ FakePackage(
+ "translationstring",
+ "0.1",
+ FakePackageSource.generated_wheel(
+ large_translationstring_metadata_last
+ ),
+ MetadataKind.No,
+ ),
FakePackage(
"translationstring",
"1.1",
- "translationstring-1.1.tar.gz",
+ FakePackageSource.shared_data_package("translationstring-1.1.tar.gz"),
MetadataKind.No,
),
],
@@ -859,7 +987,9 @@ def fake_packages() -> Dict[str, List[FakePackage]]:
FakePackage(
"priority",
"1.0",
- "priority-1.0-py2.py3-none-any.whl",
+ FakePackageSource.shared_data_package(
+ "priority-1.0-py2.py3-none-any.whl"
+ ),
MetadataKind.NoFile,
),
],
@@ -868,7 +998,9 @@ def fake_packages() -> Dict[str, List[FakePackage]]:
FakePackage(
"requires-simple-extra",
"0.1",
- "requires_simple_extra-0.1-py2.py3-none-any.whl",
+ FakePackageSource.shared_data_package(
+ "requires_simple_extra-0.1-py2.py3-none-any.whl"
+ ),
MetadataKind.Sha256,
metadata_name="Requires_Simple.Extra",
),
@@ -921,9 +1053,11 @@ def html_index_for_packages(
download_links.append(
f'    <a href="{package_link.filename}" {package_link.generate_additional_tag()}>{package_link.filename}</a><br/>\n'  # noqa: E501
)
- # (3.2) Copy over the corresponding file in `shared_data.packages`.
+ # (3.2) Copy over the corresponding file in `shared_data.packages`, or the
+ # generated wheel path if provided.
+ source_path = package_link.source_path(shared_data)
shutil.copy(
- shared_data.packages / package_link.filename,
+ source_path,
pkg_subdir / package_link.filename,
)
# (3.3) Write a metadata file, if applicable.
@@ -959,6 +1093,7 @@ class OneTimeDownloadHandler(http.server.SimpleHTTPRequestHandler):
"""Serve files from the current directory, but error if a file is downloaded more
than once."""
# NB: Needs to be set on a per-function subclass.
_seen_paths: ClassVar[Set[str]] = set()
def do_GET(self) -> None:
@@ -980,7 +1115,8 @@ def html_index_with_onetime_server(
"""Serve files from a generated pypi index, erroring if a file is downloaded more
than once.
- Provide `-i http://localhost:8000` to pip invocations to point them at this server.
+ Provide `-i http://localhost:<port>` to pip invocations to point them at
+ this server.
"""
class InDirectoryServer(http.server.ThreadingHTTPServer):
@@ -995,7 +1131,7 @@ def finish_request(self: "Self", request: Any, client_address: Any) -> None:
class Handler(OneTimeDownloadHandler):
_seen_paths: ClassVar[Set[str]] = set()
- with InDirectoryServer(("", 8000), Handler) as httpd:
+ with InDirectoryServer(("", 0), Handler) as httpd:
server_thread = threading.Thread(target=httpd.serve_forever)
server_thread.start()
@@ -1004,3 +1140,237 @@ class Handler(OneTimeDownloadHandler):
finally:
httpd.shutdown()
server_thread.join()
+
+
+class RangeHandler(Enum):
+ """All the modes of handling range requests we want pip to handle."""
+
+ Always200OK = "always-200-ok"
+ NoNegativeRange = "no-negative-range"
+ SneakilyCoerceNegativeRange = "sneakily-coerce-negative-range"
+ SupportsNegativeRange = "supports-negative-range"
+ NegativeRangeOverflowing = "negative-range-overflowing"
+
+ def supports_range(self) -> bool:
+ return self in [
+ type(self).NoNegativeRange,
+ type(self).SneakilyCoerceNegativeRange,
+ type(self).SupportsNegativeRange,
+ type(self).NegativeRangeOverflowing,
+ ]
+
+ def supports_negative_range(self) -> bool:
+ return self in [
+ type(self).SupportsNegativeRange,
+ type(self).NegativeRangeOverflowing,
+ ]
+
+ def sneakily_coerces_negative_range(self) -> bool:
+ return self == type(self).SneakilyCoerceNegativeRange
+
+ def overflows_negative_range(self) -> bool:
+ return self == type(self).NegativeRangeOverflowing
+
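+ # How the handler below responds to "Range: bytes=-N" under each mode
+ # (a summary of the logic in ContentRangeDownloadHandler.do_GET):
+ #   Always200OK                 -> 200 OK with the whole file (no range support)
+ #   NoNegativeRange             -> 501 Not Implemented
+ #   SneakilyCoerceNegativeRange -> 206 with "bytes 0-N/<total>" (a positive range!)
+ #   SupportsNegativeRange       -> 206 with the final N bytes (416 if N > total)
+ #   NegativeRangeOverflowing    -> 200 OK with the whole file when N > total
+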
+
+class ContentRangeDownloadHandler(
+ http.server.SimpleHTTPRequestHandler, metaclass=abc.ABCMeta
+):
+ """Extend the basic ``http.server`` to support content ranges."""
+
+ @abc.abstractproperty
+ def range_handler(self) -> RangeHandler: ...
+
+ # NB: Needs to be set on per-function subclasses.
+ get_request_counts: ClassVar[Dict[str, int]] = {}
+ positive_range_request_paths: ClassVar[Set[str]] = set()
+ negative_range_request_paths: ClassVar[Set[str]] = set()
+ head_request_paths: ClassVar[Set[str]] = set()
+ ok_response_counts: ClassVar[Dict[str, int]] = {}
+
+ @contextmanager
+ def _translate_path(self) -> Iterator[Optional[Tuple[BinaryIO, str, int]]]:
+ # Only test fast-deps, not PEP 658.
+ if self.path.endswith(".metadata"):
+ self.send_error(http.HTTPStatus.NOT_FOUND, "File not found")
+ yield None
+ return
+
+ path = self.translate_path(self.path)
+ if os.path.isdir(path):
+ path = os.path.join(path, "index.html")
+
+ ctype = self.guess_type(path)
+ try:
+ with open(path, "rb") as f:
+ fs = os.fstat(f.fileno())
+ full_file_length = fs[6]
+
+ yield (f, ctype, full_file_length)
+ except OSError:
+ self.send_error(http.HTTPStatus.NOT_FOUND, "File not found")
+ yield None
+ return
+
+ def _send_basic_headers(self, ctype: str) -> None:
+ self.send_header("Content-Type", ctype)
+ if self.range_handler.supports_range():
+ self.send_header("Accept-Ranges", "bytes")
+ # NB: callers must call self.end_headers()!
+
+ def _send_full_file_headers(self, ctype: str, full_file_length: int) -> None:
+ self.send_response(http.HTTPStatus.OK)
+ self.ok_response_counts.setdefault(self.path, 0)
+ self.ok_response_counts[self.path] += 1
+ self._send_basic_headers(ctype)
+ self.send_header("Content-Length", str(full_file_length))
+ self.end_headers()
+
+ def do_HEAD(self) -> None:
+ self.head_request_paths.add(self.path)
+
+ with self._translate_path() as x:
+ if x is None:
+ return
+ (_, ctype, full_file_length) = x
+ self._send_full_file_headers(ctype, full_file_length)
+
+ def do_GET(self) -> None:
+ self.get_request_counts.setdefault(self.path, 0)
+ self.get_request_counts[self.path] += 1
+
+ with self._translate_path() as x:
+ if x is None:
+ return
+ (f, ctype, full_file_length) = x
+ range_arg = self.headers.get("Range", None)
+ if range_arg is not None:
+ m = re.match(r"bytes=([0-9]+)?-([0-9]+)", range_arg)
+ if m is not None:
+ if m.group(1) is None:
+ self.negative_range_request_paths.add(self.path)
+ else:
+ self.positive_range_request_paths.add(self.path)
+ # If no range given, return the whole file.
+ if range_arg is None or not self.range_handler.supports_range():
+ self._send_full_file_headers(ctype, full_file_length)
+ self.copyfile(f, self.wfile) # type: ignore[misc]
+ return
+ # Otherwise, return the requested contents.
+ assert m is not None
+ # This is a "start-end" range.
+ if m.group(1) is not None:
+ start = int(m.group(1))
+ end = int(m.group(2))
+ assert start <= end
+ was_out_of_bounds = (end + 1) > full_file_length
+ else:
+ # This is a "-end" range.
+ if self.range_handler.sneakily_coerces_negative_range():
+ end = int(m.group(2))
+ self.send_response(http.HTTPStatus.PARTIAL_CONTENT)
+ self._send_basic_headers(ctype)
+ self.send_header("Content-Length", str(end + 1))
+ self.send_header(
+ "Content-Range", f"bytes 0-{end}/{full_file_length}"
+ )
+ self.end_headers()
+ f.seek(0)
+ self.wfile.write(f.read(end + 1))
+ return
+ if not self.range_handler.supports_negative_range():
+ self.send_response(http.HTTPStatus.NOT_IMPLEMENTED)
+ self._send_basic_headers(ctype)
+ self.end_headers()
+ return
+ end = full_file_length - 1
+ start = end - int(m.group(2)) + 1
+ was_out_of_bounds = start < 0
+ if was_out_of_bounds:
+ if self.range_handler.overflows_negative_range():
+ self._send_full_file_headers(ctype, full_file_length)
+ self.copyfile(f, self.wfile) # type: ignore[misc]
+ return
+ self.send_response(http.HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE)
+ self._send_basic_headers(ctype)
+ self.send_header("Content-Range", f"bytes */{full_file_length}")
+ self.end_headers()
+ return
+ sent_length = end - start + 1
+ self.send_response(http.HTTPStatus.PARTIAL_CONTENT)
+ self._send_basic_headers(ctype)
+ self.send_header("Content-Length", str(sent_length))
+ self.send_header("Content-Range", f"bytes {start}-{end}/{full_file_length}")
+ self.end_headers()
+ f.seek(start)
+ self.wfile.write(f.read(sent_length))
+
+
+@pytest.fixture(scope="session")
+def html_index_no_metadata(
+ html_index_for_packages: Path,
+ tmpdir_factory: pytest.TempPathFactory,
+) -> Path:
+ """Return an index like ``html_index_for_packages`` without any PEP 658 metadata.
+
+ While we already return a 404 in ``ContentRangeDownloadHandler`` for ``.metadata``
+ paths, we need to also remove ``data-dist-info-metadata`` attrs on ``<a>`` tags,
+ otherwise pip will error after attempting to retrieve the metadata files."""
+ new_html_dir = tmpdir_factory.mktemp("fake_index_html_content_no_metadata")
+ new_html_dir.rmdir()
+ shutil.copytree(html_index_for_packages, new_html_dir)
+ for index_page in new_html_dir.rglob("index.html"):
+ prev_index = index_page.read_text()
+ no_metadata_index = re.sub(r'data-dist-info-metadata="[^"]+"', "", prev_index)
+ index_page.write_text(no_metadata_index)
+ return new_html_dir
+
+
+HTMLIndexWithRangeServer = Callable[
+ [RangeHandler],
+ "AbstractContextManager[Tuple[Type[ContentRangeDownloadHandler], int]]",
+]
+
+
+@pytest.fixture
+def html_index_with_range_server(
+ html_index_no_metadata: Path,
+) -> HTMLIndexWithRangeServer:
+ """Serve files from a generated pypi index, with support for range requests.
+
+ Provide `-i http://localhost:<port>` to pip invocations to point them at
+ this server.
+ """
+
+ class InDirectoryServer(http.server.ThreadingHTTPServer):
+ def finish_request(self, request: Any, client_address: Any) -> None:
+ self.RequestHandlerClass(
+ request, client_address, self, directory=str(html_index_no_metadata) # type: ignore[call-arg,arg-type]
+ )
+
+ @contextmanager
+ def inner(
+ range_handler: RangeHandler,
+ ) -> Iterator[Tuple[Type[ContentRangeDownloadHandler], int]]:
+ class Handler(ContentRangeDownloadHandler):
+ @property
+ def range_handler(self) -> RangeHandler:
+ return range_handler
+
+ get_request_counts: ClassVar[Dict[str, int]] = {}
+ positive_range_request_paths: ClassVar[Set[str]] = set()
+ negative_range_request_paths: ClassVar[Set[str]] = set()
+ head_request_paths: ClassVar[Set[str]] = set()
+ ok_response_counts: ClassVar[Dict[str, int]] = {}
+
+ with InDirectoryServer(("", 0), Handler) as httpd:
+ server_thread = threading.Thread(target=httpd.serve_forever)
+ server_thread.start()
+
+ _, server_port = httpd.server_address
+ try:
+ yield (Handler, server_port)
+ finally:
+ httpd.shutdown()
+ server_thread.join()
+
+ return inner
diff --git a/tests/functional/test_download.py b/tests/functional/test_download.py
index d469e71c360..714f6670411 100644
--- a/tests/functional/test_download.py
+++ b/tests/functional/test_download.py
@@ -1273,6 +1273,7 @@ def download_server_html_index(
) -> Callable[..., Tuple[TestPipResult, Path]]:
"""Execute `pip download` against a generated PyPI index."""
download_dir = tmpdir / "download_dir"
+ _, server_port = html_index_with_onetime_server.server_address
def run_for_generated_index(
args: List[str],
@@ -1287,7 +1288,7 @@ def run_for_generated_index(
"-d",
str(download_dir),
"-i",
- "http://localhost:8000",
+ f"http://localhost:{server_port}",
*args,
]
result = script.pip(*pip_args, allow_error=allow_error)
@@ -1306,7 +1307,7 @@ def run_for_generated_index(
["colander-0.9.9-py2.py3-none-any.whl", "translationstring-1.1.tar.gz"],
),
(
- "compilewheel",
+ "compilewheel==1.0",
["compilewheel-1.0-py2.py3-none-any.whl", "simple-1.0.tar.gz"],
),
],
@@ -1339,7 +1340,7 @@ def test_download_metadata(
"/colander/colander-0.9.9-py2.py3-none-any.whl",
),
(
- "compilewheel",
+ "compilewheel==1.0",
[
"compilewheel-1.0-py2.py3-none-any.whl",
"simple-1.0.tar.gz",
diff --git a/tests/functional/test_fast_deps.py b/tests/functional/test_fast_deps.py
index 5a910b89763..b691642edb3 100644
--- a/tests/functional/test_fast_deps.py
+++ b/tests/functional/test_fast_deps.py
@@ -4,12 +4,14 @@
import pathlib
import re
from os.path import basename
-from typing import Iterable
+from pathlib import Path
+from typing import Iterable, List
import pytest
from pip._vendor.packaging.utils import canonicalize_name
from pip._internal.utils.misc import hash_file
+from tests.conftest import HTMLIndexWithRangeServer, RangeHandler
from tests.lib import PipTestEnvironment, TestData, TestPipResult
@@ -20,6 +22,7 @@ def pip(script: PipTestEnvironment, command: str, requirement: str) -> TestPipRe
"--no-cache-dir",
"--use-feature=fast-deps",
requirement,
+ # TODO: remove this when fast-deps is on by default.
allow_stderr_warning=True,
)
@@ -45,6 +48,31 @@ def test_install_from_pypi(
assert_installed(script, expected)
+@pytest.mark.network
+@pytest.mark.parametrize(
+ "requirement, url, expected",
+ [
+ (
+ "wcwidth==0.2.1",
+ "https://files.pythonhosted.org/packages/6c/a6/cdb485093ad4017d874d7a2e6a736d02720258f57876548eea2bf04c76f0/wcwidth-0.2.1-py2.py3-none-any.whl",
+ "multiple .dist-info directories found",
+ ),
+ ],
+)
+def test_invalid_wheel_parse_error(
+ requirement: str, url: str, expected: str, script: PipTestEnvironment
+) -> None:
+ """Check for both the full download URL and the reason for the error."""
+ result = script.pip(
+ "install",
+ "--use-feature=fast-deps",
+ requirement,
+ expect_error=True,
+ )
+ assert url in result.stderr
+ assert expected in result.stderr
+
+
@pytest.mark.network
@pytest.mark.parametrize(
"requirement, expected",
@@ -136,3 +164,189 @@ def test_hash_mismatch_existing_download_for_metadata_only_wheel(
hash_file(str(idna_wheel))[0].hexdigest()
== "b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
)
+
+
+@pytest.mark.parametrize("range_handler", list(RangeHandler))
+def test_download_range(
+ script: PipTestEnvironment,
+ tmpdir: Path,
+ html_index_with_range_server: HTMLIndexWithRangeServer,
+ range_handler: RangeHandler,
+) -> None:
+ """Execute `pip download` against a generated PyPI index."""
+ download_dir = tmpdir / "download_dir"
+
+ def run_for_generated_index(server_port: int, args: List[str]) -> TestPipResult:
+ """
+ Produce a PyPI directory structure pointing to the specified packages, then
+ execute `pip download -i ...` pointing to our generated index.
+ """
+ pip_args = [
+ "download",
+ "--use-feature=fast-deps",
+ "-d",
+ str(download_dir),
+ "-i",
+ f"http://localhost:{server_port}",
+ *args,
+ ]
+ return script.pip(*pip_args, allow_stderr_warning=True)
+
+ with html_index_with_range_server(range_handler) as (handler, server_port):
+ run_for_generated_index(
+ server_port,
+ ["colander", "compilewheel==2.0", "has-script", "translationstring==0.1"],
+ )
+ generated_files = os.listdir(download_dir)
+ assert fnmatch.filter(generated_files, "colander*.whl")
+ assert fnmatch.filter(generated_files, "compilewheel*.whl")
+ assert fnmatch.filter(generated_files, "has.script*.whl")
+ assert fnmatch.filter(generated_files, "translationstring*.whl")
+
+ colander_wheel_path = "/colander/colander-0.9.9-py2.py3-none-any.whl"
+ compile_wheel_path = "/compilewheel/compilewheel-2.0-py2.py3-none-any.whl"
+ has_script_path = "/has-script/has.script-1.0-py2.py3-none-any.whl"
+ translationstring_path = (
+ "/translationstring/translationstring-0.1-py2.py3-none-any.whl"
+ )
+
+ if range_handler == RangeHandler.Always200OK:
+ assert not handler.head_request_paths
+ assert not handler.positive_range_request_paths
+ assert {colander_wheel_path} == handler.negative_range_request_paths
+            # Tries a negative range request on the first wheel, finds that
+            # ranges are unsupported, and falls back to plain full GETs for
+            # every wheel.
+ assert handler.get_request_counts[colander_wheel_path] == 2
+ assert handler.ok_response_counts[colander_wheel_path] == 2
+ assert handler.get_request_counts[compile_wheel_path] == 1
+ assert handler.ok_response_counts[compile_wheel_path] == 1
+ assert handler.get_request_counts[has_script_path] == 1
+ assert handler.ok_response_counts[has_script_path] == 1
+ assert handler.get_request_counts[translationstring_path] == 1
+ assert handler.ok_response_counts[translationstring_path] == 1
+ elif range_handler == RangeHandler.NoNegativeRange:
+ assert {
+ colander_wheel_path,
+ compile_wheel_path,
+ has_script_path,
+ translationstring_path,
+ } == handler.head_request_paths
+ assert {
+ colander_wheel_path,
+ compile_wheel_path,
+ has_script_path,
+ translationstring_path,
+ } == handler.positive_range_request_paths
+            # Tries a negative range request first, finds that negative offsets
+            # are unsupported, so doesn't try one for the remaining wheels.
+ assert {colander_wheel_path} == handler.negative_range_request_paths
+ # Two more for the first wheel, because it has the failing negative
+ # byte request and is larger than the initial chunk size.
+ assert handler.get_request_counts[colander_wheel_path] == 4
+ assert handler.ok_response_counts[colander_wheel_path] == 2
+ # The .dist-info dir at the start requires an additional ranged GET vs
+ # translationstring.
+ assert handler.get_request_counts[compile_wheel_path] == 3
+ assert handler.ok_response_counts[compile_wheel_path] == 2
+ # The entire file should have been pulled in with a single ranged GET.
+ assert handler.get_request_counts[has_script_path] == 2
+ assert handler.ok_response_counts[has_script_path] == 2
+ # The entire .dist-info dir should have been pulled in with a single
+ # ranged GET. The second GET is for the end of the download, pulling down
+ # the entire file contents.
+ assert handler.get_request_counts[translationstring_path] == 2
+ assert handler.ok_response_counts[translationstring_path] == 2
+ elif range_handler == RangeHandler.SneakilyCoerceNegativeRange:
+ assert {
+ colander_wheel_path,
+ compile_wheel_path,
+ has_script_path,
+ translationstring_path,
+ } == handler.head_request_paths
+ assert {
+ colander_wheel_path,
+ compile_wheel_path,
+ has_script_path,
+ translationstring_path,
+ } == handler.positive_range_request_paths
+            # Tries a negative range request first, detects that the server
+            # sneakily coerces it instead of honoring it, so doesn't try one
+            # for the remaining wheels.
+ assert {colander_wheel_path} == handler.negative_range_request_paths
+ # Two more for the first wheel, because it has the failing negative
+ # byte request and is larger than the initial chunk size.
+ assert handler.get_request_counts[colander_wheel_path] == 4
+ assert handler.ok_response_counts[colander_wheel_path] == 2
+ # The .dist-info dir at the start requires an additional ranged GET vs
+ # translationstring.
+ assert handler.get_request_counts[compile_wheel_path] == 3
+ assert handler.ok_response_counts[compile_wheel_path] == 2
+ # The entire file should have been pulled in with a single ranged GET.
+ assert handler.get_request_counts[has_script_path] == 2
+ assert handler.ok_response_counts[has_script_path] == 2
+ # The entire .dist-info dir should have been pulled in with a single
+ # ranged GET. The second GET is for the end of the download, pulling down
+ # the entire file contents.
+ assert handler.get_request_counts[translationstring_path] == 2
+ assert handler.ok_response_counts[translationstring_path] == 2
+ elif range_handler == RangeHandler.SupportsNegativeRange:
+ # The negative byte index worked, so no head requests.
+ assert not handler.head_request_paths
+ # The negative range request was in bounds and pulled in the entire
+ # .dist-info directory (at the end of the zip) for translationstring==0.1,
+ # so we didn't need another range request for it. compilewheel==2.0 has the
+ # .dist-info dir at the start of the zip, so we still need another request
+ # for that.
+            assert {
+                colander_wheel_path,
+                compile_wheel_path,
+                has_script_path,
+            } == handler.positive_range_request_paths
+ assert {
+ colander_wheel_path,
+ compile_wheel_path,
+ has_script_path,
+ translationstring_path,
+ } == handler.negative_range_request_paths
+ assert handler.get_request_counts[colander_wheel_path] == 3
+ assert handler.ok_response_counts[colander_wheel_path] == 1
+ # One more than translationstring, because the .dist-info dir is at the
+ # front of the wheel.
+ assert handler.get_request_counts[compile_wheel_path] == 3
+ assert handler.ok_response_counts[compile_wheel_path] == 1
+ # One more than NoNegativeRange, because the negative byte index failed.
+ assert handler.get_request_counts[has_script_path] == 3
+ assert handler.ok_response_counts[has_script_path] == 1
+ assert handler.get_request_counts[translationstring_path] == 2
+ assert handler.ok_response_counts[translationstring_path] == 1
+ else:
+ assert range_handler == RangeHandler.NegativeRangeOverflowing
+ # The negative byte index worked, so no head requests.
+ assert not handler.head_request_paths
+ # The negative range request was in bounds and pulled in the entire
+ # .dist-info directory (at the end of the zip) for translationstring==0.1,
+ # so we didn't need another range request for it. compilewheel==2.0 has the
+ # .dist-info dir at the start of the zip, so we still need another request
+ # for that.
+ assert {
+ colander_wheel_path,
+ compile_wheel_path,
+ } == handler.positive_range_request_paths
+ assert {
+ colander_wheel_path,
+ compile_wheel_path,
+ has_script_path,
+ translationstring_path,
+ } == handler.negative_range_request_paths
+ assert handler.get_request_counts[colander_wheel_path] == 3
+ assert handler.ok_response_counts[colander_wheel_path] == 1
+ # One more than translationstring, because the .dist-info dir is at the
+ # front of the wheel.
+ assert handler.get_request_counts[compile_wheel_path] == 3
+ assert handler.ok_response_counts[compile_wheel_path] == 1
+ # One *less* request for has-script than SupportsNegativeRange, because the
+ # server returned a full 200 OK response when the negative byte range was
+ # larger than the actual file size.
+ assert handler.get_request_counts[has_script_path] == 2
+ assert handler.ok_response_counts[has_script_path] == 2
+ assert handler.get_request_counts[translationstring_path] == 2
+ assert handler.ok_response_counts[translationstring_path] == 1
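The whole `RangeHandler` matrix above reduces to how a server answers one tail probe. Below is a standalone sketch of that probe (an illustration, not pip's implementation; `url` is any wheel URL, and plain `requests` stands in for pip's vendored copy):

```python
import requests

def probe_tail(url: str, n: int = 8000) -> bytes:
    """Ask for the last ``n`` bytes of ``url``, where the zip central directory lives."""
    resp = requests.get(url, headers={"Range": f"bytes=-{n}"})
    if resp.status_code == 200:
        # Always200OK / NegativeRangeOverflowing: the server sent the whole
        # file, so this single response already contains everything we need.
        return resp.content
    resp.raise_for_status()  # anything else should be 206 Partial Content
    # A 206 alone is not enough: a sneaky server may have coerced the negative
    # range. A careful client would verify the Content-Range actually covers
    # the requested tail; here we only sanity-check its shape, e.g.
    # "bytes 19000-26999/27000" for n=8000 and a 27000-byte file.
    assert resp.headers.get("Content-Range", "").startswith("bytes ")
    return resp.content
```

A server like `RangeHandler.NegativeRangeOverflowing` answers `bytes=-{n}` with a plain 200 OK when `n` exceeds the file size, which is why `has-script` needs one fewer request in the last branch above.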
diff --git a/tests/lib/__init__.py b/tests/lib/__init__.py
index e318f7155d2..df980cc01f1 100644
--- a/tests/lib/__init__.py
+++ b/tests/lib/__init__.py
@@ -1178,6 +1178,7 @@ def create_basic_wheel_for_package(
extras: Optional[Dict[str, List[str]]] = None,
requires_python: Optional[str] = None,
extra_files: Optional[Dict[str, Union[bytes, str]]] = None,
+ metadata_first: bool = True,
) -> pathlib.Path:
if depends is None:
depends = []
@@ -1224,6 +1225,7 @@ def hello():
extra_files=extra_files,
# Have an empty RECORD because we don't want to be checking hashes.
record="",
+ metadata_first=metadata_first,
)
wheel_builder.save_to(archive_path)
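The new keyword simply threads through to `make_wheel` below. A hypothetical caller (sketch only, not part of this PR) that wants the metadata at the back of the archive would look like:

```python
from tests.lib import PipTestEnvironment, create_basic_wheel_for_package

def test_metadata_last_layout(script: PipTestEnvironment) -> None:
    # Build a wheel whose .dist-info directory sits at the *end* of the zip,
    # mimicking wheels whose metadata is not in the first chunk downloaded.
    wheel_path = create_basic_wheel_for_package(
        script, "simple", "1.0", metadata_first=False
    )
    assert wheel_path.name == "simple-1.0-py2.py3-none-any.whl"
```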
diff --git a/tests/lib/wheel.py b/tests/lib/wheel.py
index 342abbacaad..41098c60cb8 100644
--- a/tests/lib/wheel.py
+++ b/tests/lib/wheel.py
@@ -199,22 +199,19 @@ def record_file_maker_wrapper(
version: str,
files: Iterable[File],
record: Defaulted[Optional[AnyStr]],
-) -> Iterable[File]:
- records: List[Record] = []
- for file in files:
- records.append(
- Record(file.name, digest(file.contents), str(len(file.contents)))
- )
- yield file
+) -> Optional[File]:
+ records: List[Record] = [
+ Record(file.name, digest(file.contents), str(len(file.contents)))
+ for file in files
+ ]
if record is None:
- return
+ return None
record_path = dist_info_path(name, version, "RECORD")
if record is not _default:
- yield File(record_path, ensure_binary(record))
- return
+ return File(record_path, ensure_binary(record))
records.append(Record(record_path, "", ""))
@@ -224,7 +221,7 @@ def record_file_maker_wrapper(
writer.writerow(r)
contents = buf.getvalue().encode("utf-8")
- yield File(record_path, contents)
+ return File(record_path, contents)
def wheel_name(
@@ -302,6 +299,7 @@ def make_wheel(
console_scripts: Defaulted[List[str]] = _default,
entry_points: Defaulted[Dict[str, List[str]]] = _default,
record: Defaulted[Optional[AnyStr]] = _default,
+ metadata_first: bool = True,
) -> WheelBuilder:
"""
Helper function for generating test wheels which are compliant by default.
@@ -362,13 +360,14 @@ def make_wheel(
:param entry_points:
:param record: if provided and None, then no RECORD file is generated;
else if a string then sets the content of the RECORD file
+ :param metadata_first: Put the .dist-info metadata at the front of the zip file.
"""
pythons = ["py2", "py3"]
abis = ["none"]
platforms = ["any"]
tags = list(itertools.product(pythons, abis, platforms))
- possible_files = [
+ metadata_files = [
make_metadata_file(name, version, metadata, metadata_updates, metadata_body),
make_wheel_metadata_file(
name, version, wheel_metadata, tags, wheel_metadata_updates
@@ -376,20 +375,31 @@ def make_wheel(
make_entry_points_file(name, version, entry_points, console_scripts),
]
+    non_metadata_files: List[File] = []
+
if extra_files is not _default:
- possible_files.extend(make_files(extra_files))
+ non_metadata_files.extend(make_files(extra_files))
if extra_metadata_files is not _default:
- possible_files.extend(make_metadata_files(name, version, extra_metadata_files))
+ metadata_files.extend(make_metadata_files(name, version, extra_metadata_files))
if extra_data_files is not _default:
- possible_files.extend(make_data_files(name, version, extra_data_files))
+ non_metadata_files.extend(make_data_files(name, version, extra_data_files))
- actual_files = filter(None, possible_files)
+ actual_metadata_files = list(filter(None, metadata_files))
+
+ if metadata_first:
+ actual_files = actual_metadata_files + non_metadata_files
+ else:
+ actual_files = non_metadata_files + actual_metadata_files
+
+ record_file = record_file_maker_wrapper(name, version, actual_files, record)
+    if record_file is not None:
+ if metadata_first:
+ actual_files.insert(0, record_file)
+ else:
+ actual_files.append(record_file)
- files_and_record_file = record_file_maker_wrapper(
- name, version, actual_files, record
- )
wheel_file_name = wheel_name(name, version, pythons, abis, platforms)
- return WheelBuilder(wheel_file_name, files_and_record_file)
+ return WheelBuilder(wheel_file_name, actual_files)
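Member order is the point of this refactor: `RECORD` can no longer be emitted lazily from a generator, since its position in the archive depends on `metadata_first`. A quick way to see where the metadata landed in a built wheel (illustrative helper, not part of the test suite):

```python
import zipfile

def first_dist_info_index(wheel_path: str) -> int:
    """Index of the first .dist-info member in the archive's member list."""
    with zipfile.ZipFile(wheel_path) as zf:
        return next(
            i for i, name in enumerate(zf.namelist()) if ".dist-info/" in name
        )

# metadata_first=True inserts RECORD at the front, so this returns 0;
# metadata_first=False pushes all .dist-info files toward the end.
```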
diff --git a/tests/unit/test_network_lazy_wheel.py b/tests/unit/test_network_lazy_wheel.py
index 356387d2bba..1a6f6686e15 100644
--- a/tests/unit/test_network_lazy_wheel.py
+++ b/tests/unit/test_network_lazy_wheel.py
@@ -25,12 +25,12 @@
}
-@pytest.fixture
+@pytest.fixture(scope="session")
def session() -> PipSession:
return PipSession()
-@pytest.fixture
+@pytest.fixture(scope="session")
def mypy_whl_no_range(mock_server: MockServer, shared_data: TestData) -> Iterator[str]:
mypy_whl = shared_data.packages / "mypy-0.782-py3-none-any.whl"
mock_server.set_responses([file_response(mypy_whl)])
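Widening these fixtures to `scope="session"` means the `PipSession` and the mock server's canned responses are built once and shared across the whole run rather than rebuilt per test. The scoping semantics in isolation (toy example, unrelated to pip's fixtures):

```python
import pytest

@pytest.fixture(scope="session")
def built_once() -> list:
    # Constructed the first time any test requests it, then cached and
    # shared for the rest of the session (the default scope is per-function).
    return []
```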