WIP add pep 503 support for indices #615

Closed · wants to merge 1 commit into from
115 changes: 108 additions & 7 deletions s3_management/manage.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python

import argparse
import hashlib
import io
import tempfile

from os import path
@@ -9,12 +11,15 @@
from re import sub, match

import botocore
import botocore.exceptions
import boto3
import tqdm


S3 = boto3.resource('s3')
CLIENT = boto3.client('s3')
BUCKET = S3.Bucket('pytorch')
BASE_URL = "https://download.pytorch.org"

ACCEPTED_FILE_EXTENSIONS = ("whl", "zip")
ACCEPTED_SUBDIR_PATTERNS = [
@@ -35,9 +40,15 @@


class S3Index:
    def __init__(
        self: S3IndexType,
        objects: List[str],
        prefix: str
    ) -> None:
        self.objects = objects
        self.prefix = prefix.rstrip("/")
        # lazily load checksums since they could take a while to compute
        self.checksums: Dict[str, str] = dict()
        self.html_name = PREFIXES_WITH_HTML[self.prefix]
        # should dynamically grab subdirectories like whl/test/cu101
        # so we don't need to add them manually anymore
@@ -85,10 +96,98 @@ def normalize_package_version(self: S3IndexType, obj: str) -> str:
"-".join(path.basename(obj).split("-")[:2])
)

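    # PEP 503 normalization: runs of "-", "_", and "." collapse to a single "-"
    # and the name is lowercased, e.g. (illustrative) "Torch_Vision.Nightly"
    # normalizes to "torch-vision-nightly".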
    def pep503_normalize(self, obj: str) -> str:
        return sub(r"[-_.]+", "-", obj).lower()

    def get_sha256(self, obj: str) -> str:
        checksum_key = obj.replace(".whl", ".sha256")
        stored = BUCKET.Object(checksum_key)
        try:
            # if a checksum object already exists, reuse it rather than
            # re-downloading and re-hashing the wheel
            stored.load()
            return stored.get()['Body'].read().decode('utf-8')
        except botocore.exceptions.ClientError as exc:
            # if we don't get a 404 then something else went horribly wrong
            if int(exc.response['Error']['Code']) != 404:
                raise exc
        fileobj = io.BytesIO()
        BUCKET.download_fileobj(Key=obj, Fileobj=fileobj)
        return hashlib.sha256(fileobj.getbuffer()).hexdigest()

    def get_checksums(
        self,
        objects: Optional[List[str]] = None
    ) -> Dict[str, str]:
        objects = objects or self.objects
        if not self.checksums:
            for obj in tqdm.tqdm(objects):
                checksum_key = obj.replace(".whl", ".sha256")
                self.checksums[checksum_key] = self.get_sha256(obj)
        return self.checksums

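    # The PEP 503 "simple" layout is one index.html per normalized project name
    # plus a root index.html listing every project. Illustrative output shape:
    #   {"torch/index.html": '<a href=".../torch-1.10.0....whl#sha256=...">...</a><br/>',
    #    "index.html": '<a href="torch/">torch</a><br/>'}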
    def to_pep503_html(self) -> Dict[str, str]:
        objects = (
            self.nightly_packages_to_show() if self.prefix == 'whl/nightly'
            else self.objects
        )
        links: Dict[str, List[str]] = defaultdict(list)
        checksums = self.get_checksums(objects)
        for obj in tqdm.tqdm(objects):
            # escape '+' so the href is a valid URL
            sanitized_obj = obj.replace("+", "%2B")
            sanitized_obj_base = path.basename(sanitized_obj)
            package_name = self.pep503_normalize(
                self.normalize_package_version(obj).split("-")[0]
            )
            # look up the cached checksum by the unsanitized S3 key instead of
            # re-hashing (and instead of querying S3 with an escaped key)
            checksum = checksums[obj.replace(".whl", ".sha256")]
            hyperlink = f'{BASE_URL}/{sanitized_obj}#sha256={checksum}'
            links[package_name].append(
                f'<a href="{hyperlink}">{sanitized_obj_base}</a><br/>'
            )
        out: Dict[str, str] = {
            f"{package_name}/index.html": "\n".join(contents)
            for package_name, contents in links.items()
        }
        out["index.html"] = "\n".join([
            f'<a href="{package_name}/">{package_name}</a><br/>'
            for package_name in links.keys()
        ])
        return out

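    # With the simple/<prefix>/ key layout used below, a generated page would
    # be served from e.g. https://download.pytorch.org/simple/whl/torch/index.html
    # (illustrative URL, assuming the bucket is fronted by BASE_URL).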
    def upload_pep503_html(self) -> None:
        html = self.to_pep503_html()
        for index_name, content in tqdm.tqdm(html.items()):
            print(f"INFO Uploading {index_name}")
            BUCKET.Object(
                key=f"simple/{self.prefix}/{index_name}"
            ).put(
                ACL='public-read',
                CacheControl='no-cache,no-store,must-revalidate',
                ContentType='text/html',
                Body=content
            )

    def upload_checksums(self, force: bool = False) -> None:
        for checksum_key, checksum in tqdm.tqdm(self.get_checksums().items()):
            if not force:
                try:
                    # if the checksum already exists, don't re-upload it
                    BUCKET.Object(checksum_key).load()
                    continue
                except botocore.exceptions.ClientError as exc:
                    # if we don't get a 404 then something else went horribly wrong
                    if int(exc.response['Error']['Code']) != 404:
                        raise exc
            print(f"INFO Uploading {checksum_key}")
            BUCKET.Object(
                key=checksum_key
            ).put(
                ACL='public-read',
                CacheControl='must-revalidate',
                ContentType='text/plain',
                Body=checksum
            )

    def to_legacy_html(self, subdir: Optional[str] = None) -> str:
        """Generates a string that can be used as the HTML index

        Takes our objects and transforms them into HTML that have historically
@@ -110,6 +209,8 @@ def to_legacy_html(self, subdir: Optional[str] = None) -> str:
            obj_at_root = path.dirname(obj) == self.prefix
            if not obj_at_root and not obj.startswith(subdir):
                continue
            # escape '+' so the href is a valid URL
            sanitized_obj = obj.replace("+", "%2B")
            # strip our prefix (from the escaped key, so the escaping survives)
            sanitized_obj = sanitized_obj.replace(subdir, "", 1)
            if sanitized_obj.startswith('/'):
@@ -140,6 +241,7 @@ def upload_legacy_html(self) -> None:
    @classmethod
    def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType:
        objects = []
        prefix = prefix.rstrip("/")
        for obj in BUCKET.objects.filter(Prefix=prefix):
            is_acceptable = any([path.dirname(obj.key) == prefix] + [
@@ -150,8 +252,7 @@ def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType:
                for pattern in ACCEPTED_SUBDIR_PATTERNS
            ]) and obj.key.endswith(ACCEPTED_FILE_EXTENSIONS)
            if is_acceptable:
                objects.append(obj.key)
        return cls(objects, prefix)

def create_parser() -> argparse.ArgumentParser:
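For context on how these pages would be consumed: pip resolves a PEP 503 index by fetching <index-url>/<normalized-name>/, following the anchors, and verifying the #sha256= fragment on download. A minimal sketch of the same walk, assuming the simple/<prefix>/ key layout from upload_pep503_html above (the package name and URL are illustrative, not something this PR pins down):

import re
import urllib.request

# root of one package's generated index (illustrative path)
INDEX_URL = "https://download.pytorch.org/simple/whl/torch/index.html"

with urllib.request.urlopen(INDEX_URL) as resp:
    html = resp.read().decode("utf-8")

# each anchor carries the download URL plus the #sha256= fragment
for href in re.findall(r'href="([^"]+)"', html):
    print(href)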
1 change: 1 addition & 0 deletions s3_management/requirements.txt
@@ -1 +1,2 @@
boto3
tqdm
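As a self-contained sanity check of the two building blocks this PR relies on, PEP 503 name normalization and the sha256 URL fragment, here is a minimal sketch (the wheel bytes and filename are made up):

import hashlib
import re

def pep503_normalize(name: str) -> str:
    # PEP 503: collapse runs of "-", "_", "." to one "-" and lowercase
    return re.sub(r"[-_.]+", "-", name).lower()

assert pep503_normalize("Torch_Vision.Nightly") == "torch-vision-nightly"

payload = b"stand-in for downloaded wheel bytes"  # hypothetical contents
digest = hashlib.sha256(payload).hexdigest()
print(f"example-1.0-py3-none-any.whl#sha256={digest}")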