Merge pull request #690 from maresb/pypi-lookup

Improve loading of PyPI mapping
conda · Sep 11, 2024 · 01555e7 · 01555e7
2 parents 7da401d + 016af6c
commit 01555e7
Showing 1 changed file with 103 additions and 5 deletions.
diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
@@ -1,14 +1,23 @@
+import hashlib
+import logging
+import time
+
 from functools import cached_property
 from pathlib import Path
 from typing import Dict
 
 import requests
-import yaml
+import ruamel.yaml
 
+from filelock import FileLock, Timeout
 from packaging.utils import NormalizedName, canonicalize_name
+from platformdirs import user_cache_path
 from typing_extensions import TypedDict
 
 
+logger = logging.getLogger(__name__)
+
+
 class MappingEntry(TypedDict):
     conda_name: str
     # legacy field, generally not used by anything anymore
@@ -41,16 +50,19 @@ def mapping_url(self, value: str) -> None:
     def pypi_lookup(self) -> Dict[NormalizedName, MappingEntry]:
         url = self.mapping_url
         if url.startswith("http://") or url.startswith("https://"):
-            res = requests.get(self._mapping_url)
-            res.raise_for_status()
-            content = res.content
+            content = cached_download_file(url)
         else:
             if url.startswith("file://"):
                 path = url[len("file://") :]
             else:
                 path = url
             content = Path(path).read_bytes()
-        lookup = yaml.safe_load(content)
+        logger.debug("Parsing PyPI mapping")
+        load_start = time.monotonic()
+        yaml = ruamel.yaml.YAML(typ="safe")
+        lookup = yaml.load(content)
+        load_duration = time.monotonic() - load_start
+        logger.debug(f"Loaded {len(lookup)} entries in {load_duration:.2f}s")
         # lowercase and kebabcase the pypi names
         assert lookup is not None
         lookup = {canonicalize_name(k): v for k, v in lookup.items()}
@@ -95,3 +107,89 @@ def pypi_name_to_conda_name(name: str) -> str:
     """return the conda name for a pypi package"""
     cname = canonicalize_name(name)
     return get_forward_lookup().get(cname, {"conda_name": cname})["conda_name"]
+
+
+def cached_download_file(url: str) -> bytes:
+    """Download a file and cache it in the user cache directory.
+
+    If the file is already cached, return the cached contents.
+    If the file is not cached, download it and cache the contents
+    and the ETag.
+
+    Protect against multiple processes downloading the same file.
+    """
+    CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2  # 2 days
+    DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5  # 5 minutes
+    current_time = time.time()
+    cache = user_cache_path("conda-lock", appauthor=False)
+    cache.mkdir(parents=True, exist_ok=True)
+
+    # clear out old cache files
+    for file in cache.iterdir():
+        if file.name.startswith("pypi-mapping-"):
+            mtime = file.stat().st_mtime
+            age = current_time - mtime
+            if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS:
+                logger.debug("Removing old cache file %s", file)
+                file.unlink()
+
+    url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
+    destination_mapping = cache / f"pypi-mapping-{url_hash}.yaml"
+    destination_etag = destination_mapping.with_suffix(".etag")
+    destination_lock = destination_mapping.with_suffix(".lock")
+
+    # Return the contents immediately if the file is fresh
+    try:
+        mtime = destination_mapping.stat().st_mtime
+        age = current_time - mtime
+        if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
+            contents = destination_mapping.read_bytes()
+            logger.debug(
+                f"Using cached mapping {destination_mapping} without "
+                f"checking for updates"
+            )
+            return contents
+    except FileNotFoundError:
+        pass
+
+    # Wait for any other process to finish downloading the file.
+    # Use the ETag to avoid downloading the file if it hasn't changed.
+    # Otherwise, download the file and cache the contents and ETag.
+    while True:
+        try:
+            with FileLock(destination_lock, timeout=5):
+                # Get the ETag from the last download, if it exists
+                if destination_mapping.exists() and destination_etag.exists():
+                    logger.debug(f"Old ETag found at {destination_etag}")
+                    try:
+                        old_etag = destination_etag.read_text().strip()
+                        headers = {"If-None-Match": old_etag}
+                    except FileNotFoundError:
+                        logger.warning("Failed to read ETag")
+                        headers = {}
+                else:
+                    headers = {}
+                # Download the file and cache the result.
+                logger.debug(f"Requesting {url}")
+                res = requests.get(url, headers=headers)
+                if res.status_code == 304:
+                    logger.debug(
+                        f"{url} has not changed since last download, "
+                        f"using {destination_mapping}"
+                    )
+                else:
+                    res.raise_for_status()
+                    time.sleep(10)
+                    destination_mapping.write_bytes(res.content)
+                    if "ETag" in res.headers:
+                        destination_etag.write_text(res.headers["ETag"])
+                    else:
+                        logger.warning("No ETag in response headers")
+                logger.debug(f"Downloaded {url} to {destination_mapping}")
+                return destination_mapping.read_bytes()
+
+        except Timeout:
+            logger.warning(
+                f"Failed to acquire lock on {destination_lock}, it is likely "
+                f"being downloaded by another process. Retrying..."
+            )