Skip to content

Commit

Permalink
Merge pull request #690 from maresb/pypi-lookup
Browse files Browse the repository at this point in the history
Improve loading of PyPI mapping
  • Loading branch information
maresb authored Sep 11, 2024
2 parents 7da401d + 016af6c commit 01555e7
Showing 1 changed file with 103 additions and 5 deletions.
108 changes: 103 additions & 5 deletions conda_lock/lookup.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
import hashlib
import logging
import time

from functools import cached_property
from pathlib import Path
from typing import Dict

import requests
import yaml
import ruamel.yaml

from filelock import FileLock, Timeout
from packaging.utils import NormalizedName, canonicalize_name
from platformdirs import user_cache_path
from typing_extensions import TypedDict


logger = logging.getLogger(__name__)


class MappingEntry(TypedDict):
conda_name: str
# legacy field, generally not used by anything anymore
Expand Down Expand Up @@ -41,16 +50,19 @@ def mapping_url(self, value: str) -> None:
def pypi_lookup(self) -> Dict[NormalizedName, MappingEntry]:
url = self.mapping_url
if url.startswith("http://") or url.startswith("https://"):
res = requests.get(self._mapping_url)
res.raise_for_status()
content = res.content
content = cached_download_file(url)
else:
if url.startswith("file://"):
path = url[len("file://") :]
else:
path = url
content = Path(path).read_bytes()
lookup = yaml.safe_load(content)
logger.debug("Parsing PyPI mapping")
load_start = time.monotonic()
yaml = ruamel.yaml.YAML(typ="safe")
lookup = yaml.load(content)
load_duration = time.monotonic() - load_start
logger.debug(f"Loaded {len(lookup)} entries in {load_duration:.2f}s")
# lowercase and kebabcase the pypi names
assert lookup is not None
lookup = {canonicalize_name(k): v for k, v in lookup.items()}
Expand Down Expand Up @@ -95,3 +107,89 @@ def pypi_name_to_conda_name(name: str) -> str:
"""return the conda name for a pypi package"""
cname = canonicalize_name(name)
return get_forward_lookup().get(cname, {"conda_name": cname})["conda_name"]


def cached_download_file(url: str) -> bytes:
    """Download a file and cache it in the user cache directory.

    If the file is already cached and was written less than
    ``DONT_CHECK_IF_NEWER_THAN_SECONDS`` ago, return the cached contents
    without contacting the server. Otherwise issue a request (conditional on
    the stored ETag, if there is one) and either reuse the cached copy on a
    ``304 Not Modified`` response or store the new contents and ETag.

    A file lock protects against multiple processes downloading the same
    file. If the lock cannot be acquired within 5 seconds, a warning is
    logged and acquisition is retried indefinitely.

    Args:
        url: The HTTP(S) URL to download.

    Returns:
        The contents of the (possibly cached) file.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the server
            answers with an error status.
    """
    CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2  # 2 days
    DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5  # 5 minutes
    current_time = time.time()
    cache = user_cache_path("conda-lock", appauthor=False)
    cache.mkdir(parents=True, exist_ok=True)

    # Clear out old cache files. A negative age means the mtime lies in the
    # future (e.g. clock change), so such files are discarded as well.
    for file in cache.iterdir():
        if not file.name.startswith("pypi-mapping-"):
            continue
        try:
            age = current_time - file.stat().st_mtime
            if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS:
                logger.debug("Removing old cache file %s", file)
                file.unlink()
        except FileNotFoundError:
            # Another process cleaned up the same file between iterdir()
            # and stat()/unlink(); there is nothing left to do for it.
            continue

    url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
    destination_mapping = cache / f"pypi-mapping-{url_hash}.yaml"
    destination_etag = destination_mapping.with_suffix(".etag")
    destination_lock = destination_mapping.with_suffix(".lock")

    # Return the contents immediately if the file is fresh
    try:
        mtime = destination_mapping.stat().st_mtime
        age = current_time - mtime
        if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
            contents = destination_mapping.read_bytes()
            logger.debug(
                f"Using cached mapping {destination_mapping} without "
                f"checking for updates"
            )
            return contents
    except FileNotFoundError:
        pass

    # Wait for any other process to finish downloading the file.
    # Use the ETag to avoid downloading the file if it hasn't changed.
    # Otherwise, download the file and cache the contents and ETag.
    while True:
        try:
            with FileLock(destination_lock, timeout=5):
                # Get the ETag from the last download, if it exists
                if destination_mapping.exists() and destination_etag.exists():
                    logger.debug(f"Old ETag found at {destination_etag}")
                    try:
                        old_etag = destination_etag.read_text().strip()
                        headers = {"If-None-Match": old_etag}
                    except FileNotFoundError:
                        logger.warning("Failed to read ETag")
                        headers = {}
                else:
                    headers = {}
                # Download the file and cache the result.
                # NOTE(review): no timeout is passed to requests.get, so a
                # stalled server blocks here while the lock is held.
                logger.debug(f"Requesting {url}")
                res = requests.get(url, headers=headers)
                if res.status_code == 304:
                    logger.debug(
                        f"{url} has not changed since last download, "
                        f"using {destination_mapping}"
                    )
                else:
                    res.raise_for_status()
                    destination_mapping.write_bytes(res.content)
                    if "ETag" in res.headers:
                        destination_etag.write_text(res.headers["ETag"])
                    else:
                        logger.warning("No ETag in response headers")
                    # Only report a download when one actually happened.
                    logger.debug(f"Downloaded {url} to {destination_mapping}")
                return destination_mapping.read_bytes()

        except Timeout:
            logger.warning(
                f"Failed to acquire lock on {destination_lock}, it is likely "
                f"being downloaded by another process. Retrying..."
            )

0 comments on commit 01555e7

Please sign in to comment.