Use data-dist-info-metadata (PEP 658) to decouple resolution from dow…

…nloading (#11111) Co-authored-by: Tzu-ping Chung <uranusjr@gmail.com>
pypa · Sep 10, 2022 · bad03ef · bad03ef
1 parent a66406a
commit bad03ef
Show file tree

Hide file tree

Showing 14 changed files with 834 additions and 180 deletions.
diff --git a/news/11111.feature.rst b/news/11111.feature.rst
@@ -0,0 +1 @@
+Use the ``data-dist-info-metadata`` attribute from :pep:`658` to resolve distribution metadata without downloading the dist yet.
diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py
@@ -335,8 +335,8 @@ class MetadataInconsistent(InstallationError):
     """Built metadata contains inconsistent information.
 
     This is raised when the metadata contains values (e.g. name and version)
-    that do not match the information previously obtained from sdist filename
-    or user-supplied ``#egg=`` value.
+    that do not match the information previously obtained from sdist filename,
+    user-supplied ``#egg=`` value, or an install requirement name.
     """
 
     def __init__(
@@ -348,11 +348,10 @@ def __init__(
         self.m_val = m_val
 
     def __str__(self) -> str:
-        template = (
-            "Requested {} has inconsistent {}: "
-            "filename has {!r}, but metadata has {!r}"
+        return (
+            f"Requested {self.ireq} has inconsistent {self.field}: "
+            f"expected {self.f_val!r}, but metadata has {self.m_val!r}"
         )
-        return template.format(self.ireq, self.field, self.f_val, self.m_val)
 
 
 class LegacyInstallFailure(DiagnosticPipError):

diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py
@@ -9,10 +9,8 @@
 import json
 import logging
 import os
-import re
 import urllib.parse
 import urllib.request
-import xml.etree.ElementTree
 from html.parser import HTMLParser
 from optparse import Values
 from typing import (
@@ -39,7 +37,7 @@
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
-from pip._internal.utils.misc import pairwise, redact_auth_from_url
+from pip._internal.utils.misc import redact_auth_from_url
 from pip._internal.vcs import vcs
 
 from .sources import CandidatesFromPage, LinkSource, build_source
@@ -51,7 +49,6 @@
 
 logger = logging.getLogger(__name__)
 
-HTMLElement = xml.etree.ElementTree.Element
 ResponseHeaders = MutableMapping[str, str]
 
 
@@ -191,94 +188,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
     return None
 
 
-def _clean_url_path_part(part: str) -> str:
-    """
-    Clean a "part" of a URL path (i.e. after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    return urllib.parse.quote(urllib.parse.unquote(part))
-
-
-def _clean_file_url_path(part: str) -> str:
-    """
-    Clean the first part of a URL path that corresponds to a local
-    filesystem path (i.e. the first part after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    # Also, on Windows the path part might contain a drive letter which
-    # should not be quoted. On Linux where drive letters do not
-    # exist, the colon should be quoted. We rely on urllib.request
-    # to do the right thing here.
-    return urllib.request.pathname2url(urllib.request.url2pathname(part))
-
-
-# percent-encoded:                   /
-_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)
-
-
-def _clean_url_path(path: str, is_local_path: bool) -> str:
-    """
-    Clean the path portion of a URL.
-    """
-    if is_local_path:
-        clean_func = _clean_file_url_path
-    else:
-        clean_func = _clean_url_path_part
-
-    # Split on the reserved characters prior to cleaning so that
-    # revision strings in VCS URLs are properly preserved.
-    parts = _reserved_chars_re.split(path)
-
-    cleaned_parts = []
-    for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
-        cleaned_parts.append(clean_func(to_clean))
-        # Normalize %xx escapes (e.g. %2f -> %2F)
-        cleaned_parts.append(reserved.upper())
-
-    return "".join(cleaned_parts)
-
-
-def _clean_link(url: str) -> str:
-    """
-    Make sure a link is fully quoted.
-    For example, if ' ' occurs in the URL, it will be replaced with "%20",
-    and without double-quoting other characters.
-    """
-    # Split the URL into parts according to the general structure
-    # `scheme://netloc/path;parameters?query#fragment`.
-    result = urllib.parse.urlparse(url)
-    # If the netloc is empty, then the URL refers to a local filesystem path.
-    is_local_path = not result.netloc
-    path = _clean_url_path(result.path, is_local_path=is_local_path)
-    return urllib.parse.urlunparse(result._replace(path=path))
-
-
-def _create_link_from_element(
-    element_attribs: Dict[str, Optional[str]],
-    page_url: str,
-    base_url: str,
-) -> Optional[Link]:
-    """
-    Convert an anchor element's attributes in a simple repository page to a Link.
-    """
-    href = element_attribs.get("href")
-    if not href:
-        return None
-
-    url = _clean_link(urllib.parse.urljoin(base_url, href))
-    pyrequire = element_attribs.get("data-requires-python")
-    yanked_reason = element_attribs.get("data-yanked")
-
-    link = Link(
-        url,
-        comes_from=page_url,
-        requires_python=pyrequire,
-        yanked_reason=yanked_reason,
-    )
-
-    return link
-
-
 class CacheablePageContent:
     def __init__(self, page: "IndexContent") -> None:
         assert page.cache_link_parsing
@@ -326,25 +235,10 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            file_url = file.get("url")
-            if file_url is None:
+            link = Link.from_json(file, page.url)
+            if link is None:
                 continue
-
-            # The Link.yanked_reason expects an empty string instead of a boolean.
-            yanked_reason = file.get("yanked")
-            if yanked_reason and not isinstance(yanked_reason, str):
-                yanked_reason = ""
-            # The Link.yanked_reason expects None instead of False
-            elif not yanked_reason:
-                yanked_reason = None
-
-            yield Link(
-                _clean_link(urllib.parse.urljoin(page.url, file_url)),
-                comes_from=page.url,
-                requires_python=file.get("requires-python"),
-                yanked_reason=yanked_reason,
-                hashes=file.get("hashes", {}),
-            )
+            yield link
         return
 
     parser = HTMLLinkParser(page.url)
@@ -354,11 +248,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = _create_link_from_element(
-            anchor,
-            page_url=url,
-            base_url=base_url,
-        )
+        link = Link.from_element(anchor, page_url=url, base_url=base_url)
         if link is None:
             continue
         yield link

diff --git a/src/pip/_internal/metadata/__init__.py b/src/pip/_internal/metadata/__init__.py
@@ -103,3 +103,25 @@ def get_wheel_distribution(wheel: Wheel, canonical_name: str) -> BaseDistributio
     :param canonical_name: Normalized project name of the given wheel.
     """
     return select_backend().Distribution.from_wheel(wheel, canonical_name)
+
+
+def get_metadata_distribution(
+    metadata_contents: bytes,
+    filename: str,
+    canonical_name: str,
+) -> BaseDistribution:
+    """Get the dist representation of the specified METADATA file contents.
+
+    This returns a Distribution instance from the chosen backend sourced from the data
+    in `metadata_contents`.
+
+    :param metadata_contents: Contents of a METADATA file within a dist, or one served
+                              via PEP 658.
+    :param filename: Filename for the dist this metadata represents.
+    :param canonical_name: Normalized project name of the given dist.
+    """
+    return select_backend().Distribution.from_metadata_file_contents(
+        metadata_contents,
+        filename,
+        canonical_name,
+    )
diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py
@@ -113,6 +113,24 @@ def from_directory(cls, directory: str) -> "BaseDistribution":
         """
         raise NotImplementedError()
 
+    @classmethod
+    def from_metadata_file_contents(
+        cls,
+        metadata_contents: bytes,
+        filename: str,
+        project_name: str,
+    ) -> "BaseDistribution":
+        """Load the distribution from the contents of a METADATA file.
+
+        This is used to implement PEP 658 by generating a "shallow" dist object that can
+        be used for resolution without downloading or building the actual dist yet.
+
+        :param metadata_contents: The contents of a METADATA file.
+        :param filename: File name for the dist with this metadata.
+        :param project_name: Name of the project this dist represents.
+        """
+        raise NotImplementedError()
+
     @classmethod
     def from_wheel(cls, wheel: "Wheel", name: str) -> "BaseDistribution":
         """Load the distribution from a given wheel.

diff --git a/src/pip/_internal/metadata/importlib/_dists.py b/src/pip/_internal/metadata/importlib/_dists.py
@@ -28,6 +28,7 @@
 )
 from pip._internal.utils.misc import normalize_path
 from pip._internal.utils.packaging import safe_extra
+from pip._internal.utils.temp_dir import TempDirectory
 from pip._internal.utils.wheel import parse_wheel, read_wheel_metadata_file
 
 from ._compat import BasePath, get_dist_name
@@ -109,6 +110,23 @@ def from_directory(cls, directory: str) -> BaseDistribution:
         dist = importlib.metadata.Distribution.at(info_location)
         return cls(dist, info_location, info_location.parent)
 
+    @classmethod
+    def from_metadata_file_contents(
+        cls,
+        metadata_contents: bytes,
+        filename: str,
+        project_name: str,
+    ) -> BaseDistribution:
+        # Generate temp dir to contain the metadata file, and write the file contents.
+        temp_dir = pathlib.Path(
+            TempDirectory(kind="metadata", globally_managed=True).path
+        )
+        metadata_path = temp_dir / "METADATA"
+        metadata_path.write_bytes(metadata_contents)
+        # Construct dist pointing to the newly created directory.
+        dist = importlib.metadata.Distribution.at(metadata_path.parent)
+        return cls(dist, metadata_path.parent, None)
+
     @classmethod
     def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
         try:

diff --git a/src/pip/_internal/metadata/pkg_resources.py b/src/pip/_internal/metadata/pkg_resources.py
@@ -33,7 +33,7 @@ class EntryPoint(NamedTuple):
     group: str
 
 
-class WheelMetadata:
+class InMemoryMetadata:
     """IMetadataProvider that reads metadata files from a dictionary.
 
     This also maps metadata decoding exceptions to our internal exception type.
@@ -92,12 +92,29 @@ def from_directory(cls, directory: str) -> BaseDistribution:
         dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata)
         return cls(dist)
 
+    @classmethod
+    def from_metadata_file_contents(
+        cls,
+        metadata_contents: bytes,
+        filename: str,
+        project_name: str,
+    ) -> BaseDistribution:
+        metadata_dict = {
+            "METADATA": metadata_contents,
+        }
+        dist = pkg_resources.DistInfoDistribution(
+            location=filename,
+            metadata=InMemoryMetadata(metadata_dict, filename),
+            project_name=project_name,
+        )
+        return cls(dist)
+
     @classmethod
     def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
         try:
             with wheel.as_zipfile() as zf:
                 info_dir, _ = parse_wheel(zf, name)
-                metadata_text = {
+                metadata_dict = {
                     path.split("/", 1)[-1]: read_wheel_metadata_file(zf, path)
                     for path in zf.namelist()
                     if path.startswith(f"{info_dir}/")
@@ -108,7 +125,7 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
             raise UnsupportedWheel(f"{name} has an invalid wheel, {e}")
         dist = pkg_resources.DistInfoDistribution(
             location=wheel.location,
-            metadata=WheelMetadata(metadata_text, wheel.location),
+            metadata=InMemoryMetadata(metadata_dict, wheel.location),
             project_name=name,
         )
         return cls(dist)