Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve loading of PyPI mapping #690

Merged
merged 3 commits into from
Sep 11, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 103 additions & 5 deletions conda_lock/lookup.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
import hashlib
import logging
import time

from functools import cached_property
from pathlib import Path
from typing import Dict

import requests
import yaml
import ruamel.yaml

from filelock import FileLock, Timeout
from packaging.utils import NormalizedName, canonicalize_name
from platformdirs import user_cache_path
from typing_extensions import TypedDict


logger = logging.getLogger(__name__)


class MappingEntry(TypedDict):
conda_name: str
# legacy field, generally not used by anything anymore
Expand Down Expand Up @@ -41,16 +50,19 @@ def mapping_url(self, value: str) -> None:
def pypi_lookup(self) -> Dict[NormalizedName, MappingEntry]:
url = self.mapping_url
if url.startswith("http://") or url.startswith("https://"):
res = requests.get(self._mapping_url)
res.raise_for_status()
content = res.content
content = cached_download_file(url)
else:
if url.startswith("file://"):
path = url[len("file://") :]
else:
path = url
content = Path(path).read_bytes()
lookup = yaml.safe_load(content)
logger.debug("Parsing PyPI mapping")
load_start = time.monotonic()
yaml = ruamel.yaml.YAML(typ="safe")
lookup = yaml.load(content)
load_duration = time.monotonic() - load_start
logger.debug(f"Loaded {len(lookup)} entries in {load_duration:.2f}s")
# lowercase and kebabcase the pypi names
assert lookup is not None
lookup = {canonicalize_name(k): v for k, v in lookup.items()}
Expand Down Expand Up @@ -95,3 +107,89 @@ def pypi_name_to_conda_name(name: str) -> str:
"""return the conda name for a pypi package"""
cname = canonicalize_name(name)
return get_forward_lookup().get(cname, {"conda_name": cname})["conda_name"]


def cached_download_file(url: str) -> bytes:
    """Download a file and cache it in the user cache directory.

    If the file is already cached and was written less than five minutes
    ago, return the cached contents without touching the network.
    Otherwise revalidate with the stored ETag (``If-None-Match``) and only
    rewrite the cache when the server returns a new body.

    A per-URL file lock protects against multiple processes downloading
    the same file at once; a process that cannot get the lock within the
    lock timeout logs a warning and retries until it succeeds.

    Args:
        url: HTTP(S) URL of the file to fetch.

    Returns:
        The raw bytes of the (possibly cached) file.

    Raises:
        Whatever ``res.raise_for_status()`` raises when the server answers
        with an error status other than 304.
    """
    CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2  # 2 days
    DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5  # 5 minutes
    current_time = time.time()
    cache = user_cache_path("conda-lock", appauthor=False)
    cache.mkdir(parents=True, exist_ok=True)

    # Clear out old cache files. A negative age means the mtime lies in the
    # future, which is treated as invalid and removed as well.
    for file in cache.iterdir():
        if not file.name.startswith("pypi-mapping-"):
            continue
        try:
            age = current_time - file.stat().st_mtime
            if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS:
                logger.debug("Removing old cache file %s", file)
                file.unlink()
        except FileNotFoundError:
            # Another process cleaned up the same file first; nothing to do.
            continue

    url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
    destination_mapping = cache / f"pypi-mapping-{url_hash}.yaml"
    destination_etag = destination_mapping.with_suffix(".etag")
    destination_lock = destination_mapping.with_suffix(".lock")

    # Return the contents immediately if the file is fresh
    try:
        mtime = destination_mapping.stat().st_mtime
        age = current_time - mtime
        if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
            contents = destination_mapping.read_bytes()
            logger.debug(
                f"Using cached mapping {destination_mapping} without "
                f"checking for updates"
            )
            return contents
    except FileNotFoundError:
        pass

    # Wait for any other process to finish downloading the file.
    # Use the ETag to avoid downloading the file if it hasn't changed.
    # Otherwise, download the file and cache the contents and ETag.
    while True:
        try:
            with FileLock(destination_lock, timeout=5):
                # Get the ETag from the last download, if it exists
                if destination_mapping.exists() and destination_etag.exists():
                    logger.debug(f"Old ETag found at {destination_etag}")
                    try:
                        old_etag = destination_etag.read_text().strip()
                        headers = {"If-None-Match": old_etag}
                    except FileNotFoundError:
                        logger.warning("Failed to read ETag")
                        headers = {}
                else:
                    headers = {}
                # Download the file and cache the result.
                logger.debug(f"Requesting {url}")
                res = requests.get(url, headers=headers)
                if res.status_code == 304:
                    logger.debug(
                        f"{url} has not changed since last download, "
                        f"using {destination_mapping}"
                    )
                else:
                    res.raise_for_status()
                    # Write straight away: the lock is held here, so any
                    # delay would push waiting processes past their lock
                    # timeout.
                    destination_mapping.write_bytes(res.content)
                    if "ETag" in res.headers:
                        destination_etag.write_text(res.headers["ETag"])
                    else:
                        logger.warning("No ETag in response headers")
                    logger.debug(f"Downloaded {url} to {destination_mapping}")
                return destination_mapping.read_bytes()

        except Timeout:
            logger.warning(
                f"Failed to acquire lock on {destination_lock}, it is likely "
                f"being downloaded by another process. Retrying..."
            )
Loading