Skip to content

Commit

Permalink
Use data-dist-info-metadata (PEP 658) to decouple resolution from dow…
Browse files Browse the repository at this point in the history
…nloading (#11111)

Co-authored-by: Tzu-ping Chung <uranusjr@gmail.com>
  • Loading branch information
cosmicexplorer and uranusjr authored Sep 10, 2022
1 parent a66406a commit bad03ef
Show file tree
Hide file tree
Showing 14 changed files with 834 additions and 180 deletions.
1 change: 1 addition & 0 deletions news/11111.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Use the ``data-dist-info-metadata`` attribute from :pep:`658` to resolve distribution metadata without downloading the dist yet.
11 changes: 5 additions & 6 deletions src/pip/_internal/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,8 @@ class MetadataInconsistent(InstallationError):
"""Built metadata contains inconsistent information.
This is raised when the metadata contains values (e.g. name and version)
that do not match the information previously obtained from sdist filename
or user-supplied ``#egg=`` value.
that do not match the information previously obtained from sdist filename,
user-supplied ``#egg=`` value, or an install requirement name.
"""

def __init__(
Expand All @@ -348,11 +348,10 @@ def __init__(
self.m_val = m_val

def __str__(self) -> str:
template = (
"Requested {} has inconsistent {}: "
"filename has {!r}, but metadata has {!r}"
return (
f"Requested {self.ireq} has inconsistent {self.field}: "
f"expected {self.f_val!r}, but metadata has {self.m_val!r}"
)
return template.format(self.ireq, self.field, self.f_val, self.m_val)


class LegacyInstallFailure(DiagnosticPipError):
Expand Down
120 changes: 5 additions & 115 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,8 @@
import json
import logging
import os
import re
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from html.parser import HTMLParser
from optparse import Values
from typing import (
Expand All @@ -39,7 +37,7 @@
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.utils.misc import redact_auth_from_url
from pip._internal.vcs import vcs

from .sources import CandidatesFromPage, LinkSource, build_source
Expand All @@ -51,7 +49,6 @@

logger = logging.getLogger(__name__)

HTMLElement = xml.etree.ElementTree.Element
ResponseHeaders = MutableMapping[str, str]


Expand Down Expand Up @@ -191,94 +188,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
return None


def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_file_url_path(part: str) -> str:
"""
Clean the first part of a URL path that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
return urllib.request.pathname2url(urllib.request.url2pathname(part))


# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
"""
Clean the path portion of a URL.
"""
if is_local_path:
clean_func = _clean_file_url_path
else:
clean_func = _clean_url_path_part

# Split on the reserved characters prior to cleaning so that
# revision strings in VCS URLs are properly preserved.
parts = _reserved_chars_re.split(path)

cleaned_parts = []
for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
cleaned_parts.append(clean_func(to_clean))
# Normalize %xx escapes (e.g. %2f -> %2F)
cleaned_parts.append(reserved.upper())

return "".join(cleaned_parts)


def _clean_link(url: str) -> str:
"""
Make sure a link is fully quoted.
For example, if ' ' occurs in the URL, it will be replaced with "%20",
and without double-quoting other characters.
"""
# Split the URL into parts according to the general structure
# `scheme://netloc/path;parameters?query#fragment`.
result = urllib.parse.urlparse(url)
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
return urllib.parse.urlunparse(result._replace(path=path))


def _create_link_from_element(
element_attribs: Dict[str, Optional[str]],
page_url: str,
base_url: str,
) -> Optional[Link]:
"""
Convert an anchor element's attributes in a simple repository page to a Link.
"""
href = element_attribs.get("href")
if not href:
return None

url = _clean_link(urllib.parse.urljoin(base_url, href))
pyrequire = element_attribs.get("data-requires-python")
yanked_reason = element_attribs.get("data-yanked")

link = Link(
url,
comes_from=page_url,
requires_python=pyrequire,
yanked_reason=yanked_reason,
)

return link


class CacheablePageContent:
def __init__(self, page: "IndexContent") -> None:
assert page.cache_link_parsing
Expand Down Expand Up @@ -326,25 +235,10 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
data = json.loads(page.content)
for file in data.get("files", []):
file_url = file.get("url")
if file_url is None:
link = Link.from_json(file, page.url)
if link is None:
continue

# The Link.yanked_reason expects an empty string instead of a boolean.
yanked_reason = file.get("yanked")
if yanked_reason and not isinstance(yanked_reason, str):
yanked_reason = ""
# The Link.yanked_reason expects None instead of False
elif not yanked_reason:
yanked_reason = None

yield Link(
_clean_link(urllib.parse.urljoin(page.url, file_url)),
comes_from=page.url,
requires_python=file.get("requires-python"),
yanked_reason=yanked_reason,
hashes=file.get("hashes", {}),
)
yield link
return

parser = HTMLLinkParser(page.url)
Expand All @@ -354,11 +248,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
url = page.url
base_url = parser.base_url or url
for anchor in parser.anchors:
link = _create_link_from_element(
anchor,
page_url=url,
base_url=base_url,
)
link = Link.from_element(anchor, page_url=url, base_url=base_url)
if link is None:
continue
yield link
Expand Down
22 changes: 22 additions & 0 deletions src/pip/_internal/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,25 @@ def get_wheel_distribution(wheel: Wheel, canonical_name: str) -> BaseDistributio
:param canonical_name: Normalized project name of the given wheel.
"""
return select_backend().Distribution.from_wheel(wheel, canonical_name)


def get_metadata_distribution(
metadata_contents: bytes,
filename: str,
canonical_name: str,
) -> BaseDistribution:
"""Get the dist representation of the specified METADATA file contents.
This returns a Distribution instance from the chosen backend sourced from the data
in `metadata_contents`.
:param metadata_contents: Contents of a METADATA file within a dist, or one served
via PEP 658.
:param filename: Filename for the dist this metadata represents.
:param canonical_name: Normalized project name of the given dist.
"""
return select_backend().Distribution.from_metadata_file_contents(
metadata_contents,
filename,
canonical_name,
)
18 changes: 18 additions & 0 deletions src/pip/_internal/metadata/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,24 @@ def from_directory(cls, directory: str) -> "BaseDistribution":
"""
raise NotImplementedError()

@classmethod
def from_metadata_file_contents(
cls,
metadata_contents: bytes,
filename: str,
project_name: str,
) -> "BaseDistribution":
"""Load the distribution from the contents of a METADATA file.
This is used to implement PEP 658 by generating a "shallow" dist object that can
be used for resolution without downloading or building the actual dist yet.
:param metadata_contents: The contents of a METADATA file.
:param filename: File name for the dist with this metadata.
:param project_name: Name of the project this dist represents.
"""
raise NotImplementedError()

@classmethod
def from_wheel(cls, wheel: "Wheel", name: str) -> "BaseDistribution":
"""Load the distribution from a given wheel.
Expand Down
18 changes: 18 additions & 0 deletions src/pip/_internal/metadata/importlib/_dists.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
)
from pip._internal.utils.misc import normalize_path
from pip._internal.utils.packaging import safe_extra
from pip._internal.utils.temp_dir import TempDirectory
from pip._internal.utils.wheel import parse_wheel, read_wheel_metadata_file

from ._compat import BasePath, get_dist_name
Expand Down Expand Up @@ -109,6 +110,23 @@ def from_directory(cls, directory: str) -> BaseDistribution:
dist = importlib.metadata.Distribution.at(info_location)
return cls(dist, info_location, info_location.parent)

@classmethod
def from_metadata_file_contents(
cls,
metadata_contents: bytes,
filename: str,
project_name: str,
) -> BaseDistribution:
# Generate temp dir to contain the metadata file, and write the file contents.
temp_dir = pathlib.Path(
TempDirectory(kind="metadata", globally_managed=True).path
)
metadata_path = temp_dir / "METADATA"
metadata_path.write_bytes(metadata_contents)
# Construct dist pointing to the newly created directory.
dist = importlib.metadata.Distribution.at(metadata_path.parent)
return cls(dist, metadata_path.parent, None)

@classmethod
def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
try:
Expand Down
23 changes: 20 additions & 3 deletions src/pip/_internal/metadata/pkg_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class EntryPoint(NamedTuple):
group: str


class WheelMetadata:
class InMemoryMetadata:
"""IMetadataProvider that reads metadata files from a dictionary.
This also maps metadata decoding exceptions to our internal exception type.
Expand Down Expand Up @@ -92,12 +92,29 @@ def from_directory(cls, directory: str) -> BaseDistribution:
dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata)
return cls(dist)

@classmethod
def from_metadata_file_contents(
cls,
metadata_contents: bytes,
filename: str,
project_name: str,
) -> BaseDistribution:
metadata_dict = {
"METADATA": metadata_contents,
}
dist = pkg_resources.DistInfoDistribution(
location=filename,
metadata=InMemoryMetadata(metadata_dict, filename),
project_name=project_name,
)
return cls(dist)

@classmethod
def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
try:
with wheel.as_zipfile() as zf:
info_dir, _ = parse_wheel(zf, name)
metadata_text = {
metadata_dict = {
path.split("/", 1)[-1]: read_wheel_metadata_file(zf, path)
for path in zf.namelist()
if path.startswith(f"{info_dir}/")
Expand All @@ -108,7 +125,7 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
raise UnsupportedWheel(f"{name} has an invalid wheel, {e}")
dist = pkg_resources.DistInfoDistribution(
location=wheel.location,
metadata=WheelMetadata(metadata_text, wheel.location),
metadata=InMemoryMetadata(metadata_dict, wheel.location),
project_name=name,
)
return cls(dist)
Expand Down
Loading

0 comments on commit bad03ef

Please sign in to comment.