diff --git a/docs/repositories.md b/docs/repositories.md index 490d78bf785..481437a46d2 100644 --- a/docs/repositories.md +++ b/docs/repositories.md @@ -237,6 +237,20 @@ Note the trailing `/simple/`. This is important when configuring {{% /note %}} +Repositories following the [PEP 503](https://peps.python.org/pep-0503/) +specification should expose a root page with individual links for each +package they serve. This isn't reliably implemented everywhere, which +leads to increased network traffic and slower resolve times. If you're +using a repository which has a valid listing, you can add the +`indexed` property to let Poetry prefetch and cache this package list. + +```toml +[[tool.poetry.source]] +name = "foo" +url = "https://foo.bar/simple/" +indexed = true +``` + In addition to [PEP 503](https://peps.python.org/pep-0503/), Poetry can also handle simple API repositories that implement [PEP 658](https://peps.python.org/pep-0658/) (*Introduced in 1.2.0*). This is helpful in reducing dependency resolution time for packages from these sources as Poetry can diff --git a/src/poetry/config/source.py b/src/poetry/config/source.py index f3af0c589e2..c4836a73139 100644 --- a/src/poetry/config/source.py +++ b/src/poetry/config/source.py @@ -9,6 +9,7 @@ class Source: url: str default: bool = dataclasses.field(default=False) secondary: bool = dataclasses.field(default=False) + indexed: bool = dataclasses.field(default=False) def to_dict(self) -> dict[str, str | bool]: return dataclasses.asdict(self) diff --git a/src/poetry/factory.py b/src/poetry/factory.py index f1ab8ec99fd..15c446f5528 100644 --- a/src/poetry/factory.py +++ b/src/poetry/factory.py @@ -174,6 +174,7 @@ def configure_sources( def create_package_source( cls, source: dict[str, str], auth_config: Config, disable_cache: bool = False ) -> LegacyRepository: + from poetry.repositories.indexed import IndexedLegacyRepository from poetry.repositories.legacy_repository import LegacyRepository from 
poetry.repositories.single_page_repository import SinglePageRepository @@ -185,11 +186,18 @@ def create_package_source( raise RuntimeError("Missing [name] in source.") name = source["name"] url = source["url"] + indexed = bool(source.get("indexed", False)) repository_class = LegacyRepository if re.match(r".*\.(htm|html)$", url): repository_class = SinglePageRepository + if indexed: + raise RuntimeError( + "cannot set indexed=True for a single-page repository" + ) + elif indexed: + repository_class = IndexedLegacyRepository return repository_class( name, diff --git a/src/poetry/repositories/indexed.py b/src/poetry/repositories/indexed.py new file mode 100644 index 00000000000..0ec18bed37e --- /dev/null +++ b/src/poetry/repositories/indexed.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from poetry.repositories.exceptions import RepositoryError +from poetry.repositories.legacy_repository import LegacyRepository +from poetry.repositories.link_sources.html import SimpleIndexPage + + +if TYPE_CHECKING: + from poetry.core.packages.dependency import Dependency + from poetry.core.packages.package import Package + + from poetry.config.config import Config + + +class IndexedLegacyRepository(LegacyRepository): + def __init__( + self, + name: str, + url: str, + config: Config | None = None, + disable_cache: bool = False, + ) -> None: + super().__init__(name, url.rstrip("/"), config, disable_cache) + + self._index_page = self._get_index_page() + + def find_packages(self, dependency: Dependency) -> list[Package]: + if not self._index_page.serves_package(dependency.name): + return [] + + return super().find_packages(dependency) + + def _get_index_page(self) -> SimpleIndexPage: + response = self._get_response("") + if not response: + raise RepositoryError( + f"Failed fetching index page for repository {self.name}" + ) + return SimpleIndexPage(response.url, response.text) diff --git a/src/poetry/repositories/link_sources/html.py 
b/src/poetry/repositories/link_sources/html.py index c3c3cc4ce40..ab002444bc8 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -9,6 +9,7 @@ from poetry.core.packages.utils.link import Link from poetry.repositories.link_sources.base import LinkSource +from poetry.utils.helpers import canonicalize_name if TYPE_CHECKING: @@ -46,3 +47,34 @@ def __init__(self, url: str, content: str) -> None: if not url.endswith("/"): url += "/" super().__init__(url=url, content=content) + + +class SimpleIndexPage: + """Describes the root page of a PEP 503 compliant repository. + + This contains a list of links, each one corresponding to a served project. + """ + + def __init__(self, url: str, content: str) -> None: + if not url.endswith("/"): + url += "/" + + self._url = url + self._content = content + self._parsed = html5lib.parse(content, namespaceHTMLElements=False) + self._cached_packages = set(self.links) + + @property + def links(self) -> Iterator[str]: + # Note: PEP426 specifies that comparisons should be + # case-insensitive. For simplicity, we'll do lookups using + # lowercase-naming, and treating - and _ equivalently. 
+ for anchor in self._parsed.findall(".//a"): + text: str | None = anchor.text + if text is None: + continue + + yield canonicalize_name(text) + + def serves_package(self, name: str) -> bool: + return canonicalize_name(name) in self._cached_packages diff --git a/tests/repositories/fixtures/legacy/index.html b/tests/repositories/fixtures/legacy/index.html new file mode 100644 index 00000000000..a66b1c6d759 --- /dev/null +++ b/tests/repositories/fixtures/legacy/index.html @@ -0,0 +1,3 @@ +pyyaml +missing-version +black diff --git a/tests/repositories/test_legacy_repository.py b/tests/repositories/test_legacy_repository.py index bb0c33f6a76..f3b58750a0f 100644 --- a/tests/repositories/test_legacy_repository.py +++ b/tests/repositories/test_legacy_repository.py @@ -15,7 +15,9 @@ from poetry.factory import Factory from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.exceptions import RepositoryError +from poetry.repositories.indexed import IndexedLegacyRepository from poetry.repositories.legacy_repository import LegacyRepository +from poetry.repositories.link_sources.html import SimpleIndexPage from poetry.repositories.link_sources.html import SimpleRepositoryPage @@ -388,6 +390,64 @@ def test_get_package_retrieves_packages_with_no_hashes(): ] == package.files +class MockIndexedRepository(MockRepository, IndexedLegacyRepository): + def _get_index_page(self) -> SimpleIndexPage | None: + fixture = self.FIXTURES / "index.html" + if not fixture.exists(): + return + + with fixture.open(encoding="utf-8") as f: + return SimpleIndexPage(self._url + "/", f.read()) + + +def test_indexed_root_page_has_valid_content(): + repo = MockIndexedRepository() + assert repo._index_page.serves_package("pyyaml") + + +def test_indexed_fails_on_missing(): + repo = MockIndexedRepository() + + packages = repo.find_packages(Factory.create_dependency("this-doesnt-exist", "*")) + + assert packages == [] + + +def test_indexed_succeeds_on_existing(): + repo = 
MockIndexedRepository() + + packages = repo.find_packages(Factory.create_dependency("pyyaml", "*")) + + assert len(packages) == 1 + + +def test_indexed_pep426_underscore_hyphen(): + repo = MockIndexedRepository() + + # 'missing-version' in the index + assert repo._index_page.serves_package("missing_version") + + +def test_indexed_pep426_case_insensitive(): + repo = MockIndexedRepository() + + # 'black' in the index + assert repo._index_page.serves_package("Black") + + +def test_indexed_retrieves_package_with_no_hashes(): + repo = MockIndexedRepository() + + package = repo.package("jupyter", "1.0.0") + + assert [ + { + "file": "jupyter-1.0.0.tar.gz", + "hash": "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f", # noqa: E501 + } + ] == package.files + + class MockHttpRepository(LegacyRepository): def __init__( self, endpoint_responses: dict, http: type[httpretty.httpretty]