From 0c51d379e284de93773843e84984557c39d3008d Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Tue, 10 Sep 2024 16:19:13 +0200 Subject: [PATCH 1/3] Parse pypi mapping with ruamel.yaml instead of PyYAML For me this reduces loading time from 5.44s to 1.63s. --- conda_lock/lookup.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py index 60fa33d7..6908c3e2 100644 --- a/conda_lock/lookup.py +++ b/conda_lock/lookup.py @@ -1,14 +1,20 @@ +import logging +import time + from functools import cached_property from pathlib import Path from typing import Dict import requests -import yaml +import ruamel.yaml from packaging.utils import NormalizedName, canonicalize_name from typing_extensions import TypedDict +logger = logging.getLogger(__name__) + + class MappingEntry(TypedDict): conda_name: str # legacy field, generally not used by anything anymore @@ -50,7 +56,12 @@ def pypi_lookup(self) -> Dict[NormalizedName, MappingEntry]: else: path = url content = Path(path).read_bytes() - lookup = yaml.safe_load(content) + logger.debug("Parsing PyPI mapping") + load_start = time.monotonic() + yaml = ruamel.yaml.YAML(typ="safe") + lookup = yaml.load(content) + load_duration = time.monotonic() - load_start + logger.debug(f"Loaded {len(lookup)} entries in {load_duration:.2f}s") # lowercase and kebabcase the pypi names assert lookup is not None lookup = {canonicalize_name(k): v for k, v in lookup.items()} From 800f6ff315757fe7a2d1069721bd63e999330d59 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Tue, 10 Sep 2024 16:35:44 +0200 Subject: [PATCH 2/3] Cache the PyPI mapping on disk to avoid redundant downloads Not really necessary since the slowdown is due to parsing the YAML, but I wrote it anyways, so let's include it. 
--- conda_lock/lookup.py | 94 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py index 6908c3e2..b7be0626 100644 --- a/conda_lock/lookup.py +++ b/conda_lock/lookup.py @@ -1,3 +1,4 @@ +import hashlib import logging import time @@ -8,7 +9,9 @@ import requests import ruamel.yaml +from filelock import FileLock, Timeout from packaging.utils import NormalizedName, canonicalize_name +from platformdirs import user_cache_path from typing_extensions import TypedDict @@ -47,9 +50,7 @@ def mapping_url(self, value: str) -> None: def pypi_lookup(self) -> Dict[NormalizedName, MappingEntry]: url = self.mapping_url if url.startswith("http://") or url.startswith("https://"): - res = requests.get(self._mapping_url) - res.raise_for_status() - content = res.content + content = cached_download_file(url) else: if url.startswith("file://"): path = url[len("file://") :] @@ -106,3 +107,90 @@ def pypi_name_to_conda_name(name: str) -> str: """return the conda name for a pypi package""" cname = canonicalize_name(name) return get_forward_lookup().get(cname, {"conda_name": cname})["conda_name"] + + +def cached_download_file(url: str) -> bytes: + """Download a file and cache it in the user cache directory. + + If the file is already cached, return the cached contents. + If the file is not cached, download it and cache the contents + and the ETag. + + Protect against multiple processes downloading the same file. 
+ """ + CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2 # 2 days + DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5 # 5 minutes + current_time = time.time() + cache = user_cache_path("conda-lock", appauthor=False) + cache.mkdir(parents=True, exist_ok=True) + + # clear out old cache files + for file in cache.iterdir(): + if file.name.startswith("pypi-mapping-"): + mtime = file.stat().st_mtime + age = current_time - mtime + if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS: + logger.debug("Removing old cache file %s", file) + file.unlink() + + url_hash = hashlib.sha256(url.encode()).hexdigest()[:4] + destination_mapping = cache / f"pypi-mapping-{url_hash}.yaml" + destination_etag = destination_mapping.with_suffix(".etag") + destination_lock = destination_mapping.with_suffix(".lock") + + # Return the contents immediately if the file is fresh + try: + mtime = destination_mapping.stat().st_mtime + age = current_time - mtime + if age < DONT_CHECK_IF_NEWER_THAN_SECONDS: + contents = destination_mapping.read_bytes() + logger.debug( + f"Using cached mapping {destination_mapping} without " + f"checking for updates" + ) + return contents + except FileNotFoundError: + pass + + # Wait for any other process to finish downloading the file. + # Use the ETag to avoid downloading the file if it hasn't changed. + # Otherwise, download the file and cache the contents and ETag. + while True: + try: + with FileLock(destination_lock, timeout=5): + # Get the ETag from the last download, if it exists + if destination_mapping.exists() and destination_etag.exists(): + logger.debug(f"Old ETag found at {destination_etag}") + try: + old_etag = destination_etag.read_text().strip() + headers = {"If-None-Match": old_etag} + except FileNotFoundError: + logger.warning("Failed to read ETag") + headers = {} + else: + headers = {} + # Download the file and cache the result. 
+ logger.debug(f"Requesting {url}") + res = requests.get(url, headers=headers) + if res.status_code == 304: + logger.debug( + f"{url} has not changed since last download, " + f"using {destination_mapping}" + ) + else: + res.raise_for_status() + # Fresh download succeeded; persist the contents and the ETag. + destination_mapping.write_bytes(res.content) + if "ETag" in res.headers: + destination_etag.write_text(res.headers["ETag"]) + else: + logger.warning("No ETag in response headers") + logger.debug(f"Downloaded {url} to {destination_mapping}") + return destination_mapping.read_bytes() + + except Timeout: + logger.warning( + f"Failed to acquire lock on {destination_lock}, it is likely " + f"being downloaded by another process. Retrying...", + destination_lock, + ) From 016af6c89cdbc7beb601cea3c9a5a3639ca9e0fe Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Wed, 11 Sep 2024 09:17:55 +0200 Subject: [PATCH 3/3] Fix broken string interpolation Co-authored-by: Marius van Niekerk --- conda_lock/lookup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py index b7be0626..f2a8801c 100644 --- a/conda_lock/lookup.py +++ b/conda_lock/lookup.py @@ -191,6 +191,5 @@ def cached_download_file(url: str) -> bytes: except Timeout: logger.warning( f"Failed to acquire lock on {destination_lock}, it is likely " - f"being downloaded by another process. Retrying...", - destination_lock, + f"being downloaded by another process. Retrying..." )