Fix deployment pull file path issue #874

Merged: 1 commit, Feb 11, 2025
209 changes: 25 additions & 184 deletions lib/ramble/spack/util/url.py
@@ -7,8 +7,6 @@
Utility functions for parsing, formatting, and manipulating URLs.
"""

import itertools
import posixpath
import re
import sys

@@ -23,26 +21,6 @@
is_windows = sys.platform == 'win32'


def _split_all(path):
"""Split path into its atomic components.

Returns the shortest list, L, of strings such that posixpath.join(*L) ==
path and posixpath.split(element) == ('', element) for every element in L
except possibly the first. This first element may possibly have the value
of '/'.
"""
result = []
a = path
old_a = None
while a != old_a:
(old_a, (a, b)) = a, posixpath.split(a)

if a or b:
result.insert(0, b or '/')

return result
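
For reference, a minimal sketch of what this removed helper returns, assuming POSIX-style inputs (the example paths are illustrative, not taken from this PR):

import posixpath

parts = _split_all('/a/b/c')                  # ['/', 'a', 'b', 'c']: the leading '/' is kept as its own element
assert posixpath.join(*parts) == '/a/b/c'     # round-trips, as the docstring promises
assert _split_all('a/b/c') == ['a', 'b', 'c'] # relative paths split into plain components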


def local_file_path(url):
"""Get a local file path from a url.

@@ -124,168 +102,31 @@ def format(parsed_url):
return parsed_url.geturl()


def join(base_url, path, *extra, **kwargs):
"""Joins a base URL with one or more local URL path components

If resolve_href is True, treat the base URL as though it were the locator
of a web page, and the remaining URL path components as though they formed
a relative URL to be resolved against it (i.e.: as in posixpath.join(...)).
The result is an absolute URL to the resource to which a user's browser
would navigate if they clicked on a link with an "href" attribute equal to
the relative URL.

If resolve_href is False (default), then the URL path components are joined
as in posixpath.join().

Note: file:// URL path components are not canonicalized as part of this
operation. To canonicalize, pass the joined url to format().

Examples:
base_url = 's3://bucket/index.html'
body = fetch_body(prefix)
link = get_href(body) # link == '../other-bucket/document.txt'

# wrong - link is a local URL that needs to be resolved against base_url
spack.util.url.join(base_url, link)
's3://bucket/other-bucket/document.txt'

# correct - resolve local URL against base_url
spack.util.url.join(base_url, link, resolve_href=True)
's3://other-bucket/document.txt'

prefix = 'https://mirror.spack.io/build_cache'

# wrong - prefix is just a URL prefix
spack.util.url.join(prefix, 'my-package', resolve_href=True)
'https://mirror.spack.io/my-package'

# correct - simply append additional URL path components
spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
'https://mirror.spack.io/build_cache/my-package'

# For canonicalizing file:// URLs, take care to explicitly differentiate
# between absolute and relative join components.

# '$spack' is not an absolute path component
join_result = spack.util.url.join('/a/b/c', '$spack') ; join_result
'file:///a/b/c/$spack'
spack.util.url.format(join_result)
'file:///a/b/c/opt/spack'

# '/$spack' *is* an absolute path component
join_result = spack.util.url.join('/a/b/c', '/$spack') ; join_result
'file:///$spack'
spack.util.url.format(join_result)
'file:///opt/spack'
"""
paths = [
(x) if isinstance(x, str)
else x.geturl()
for x in itertools.chain((base_url, path), extra)]

paths = [convert_to_posix_path(x) for x in paths]
n = len(paths)
last_abs_component = None
scheme = ''
for i in range(n - 1, -1, -1):
obj = urllib.parse.urlparse(
paths[i], scheme='', allow_fragments=False)

scheme = obj.scheme

# in either case the component is absolute
if scheme or obj.path.startswith('/'):
if not scheme:
# Without a scheme, we have to go back looking for the
# next-last component that specifies a scheme.
for j in range(i - 1, -1, -1):
obj = urllib.parse.urlparse(
paths[j], scheme='', allow_fragments=False)

if obj.scheme:
paths[i] = '{SM}://{NL}{PATH}'.format(
SM=obj.scheme,
NL=(
(obj.netloc + '/')
if obj.scheme != 's3' else ''),
PATH=paths[i][1:])
break

last_abs_component = i
break

if last_abs_component is not None:
paths = paths[last_abs_component:]
if len(paths) == 1:
result = urllib.parse.urlparse(
paths[0], scheme='file', allow_fragments=False)

# another subtlety: If the last argument to join() is an absolute
# file:// URL component with a relative path, the relative path
# needs to be resolved.
if result.scheme == 'file' and result.netloc:
result = urllib.parse.ParseResult(
scheme=result.scheme,
netloc='',
path=posixpath.abspath(result.netloc + result.path),
params=result.params,
query=result.query,
fragment=None)

return result.geturl()

return _join(*paths, **kwargs)


def _join(base_url, path, *extra, **kwargs):
base_url = parse(base_url)
resolve_href = kwargs.get('resolve_href', False)

(scheme, netloc, base_path, params, query, _) = base_url
scheme = scheme.lower()

path_tokens = [
part for part in itertools.chain(
_split_all(path),
itertools.chain.from_iterable(
_split_all(extra_path) for extra_path in extra))
if part and part != '/']

base_path_args = ['/fake-root']
if scheme == 's3':
if netloc:
base_path_args.append(netloc)

if base_path.startswith('/'):
base_path = base_path[1:]

base_path_args.append(base_path)

if resolve_href:
new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
base_path_args = [new_base_path]

base_path_args.extend(path_tokens)
base_path = posixpath.relpath(posixpath.join(*base_path_args), '/fake-root')

if scheme == 's3':
path_tokens = [
part for part in _split_all(base_path)
if part and part != '/']

if path_tokens:
netloc = path_tokens.pop(0)
base_path = posixpath.join('', *path_tokens)

if sys.platform == "win32":
base_path = convert_to_posix_path(base_path)

return format(urllib.parse.ParseResult(scheme=scheme,
netloc=netloc,
path=base_path,
params=params,
query=query,
fragment=None))
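
The removed _join above splices components by anchoring them under a synthetic '/fake-root' prefix, joining with posixpath, and then taking the path relative to that anchor before the scheme and netloc are reattached. A standalone sketch of that idiom, with illustrative values only:

import posixpath

# Anchor under a synthetic root, join, then strip the root back off.
base_path_args = ['/fake-root', 'bucket', 'build_cache', 'my-package']
joined = posixpath.join(*base_path_args)             # '/fake-root/bucket/build_cache/my-package'
base_path = posixpath.relpath(joined, '/fake-root')  # 'bucket/build_cache/my-package'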
def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
"""Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:
1. By default resolve_href=False, which makes the function like os.path.join: for example
https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the
behavior is how a browser would resolve the URL: https://example.com/a/c/d.
2. s3://, gs://, oci:// URLs are joined like http:// URLs.
3. It accepts multiple components for convenience. Note that components[1:] are treated as
literal path components and appended to components[0] separated by slashes."""
# Ensure a trailing slash in the path component of the base URL to get os.path.join-like
# behavior instead of web browser behavior.
if not resolve_href:
parsed = urllib.parse.urlparse(base)
if not parsed.path.endswith("/"):
base = parsed._replace(path=f"{parsed.path}/").geturl()
uses_netloc = urllib.parse.uses_netloc
uses_relative = urllib.parse.uses_relative
try:
# NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http.
# This is non-portable, and may be forward incompatible with future cpython versions.
urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs", "oci"]
urllib.parse.uses_relative = [*uses_relative, "s3", "gs", "oci"]
return urllib.parse.urljoin(base, "/".join(components), **kwargs)
finally:
urllib.parse.uses_netloc = uses_netloc
urllib.parse.uses_relative = uses_relative
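
A brief usage sketch of the replacement join(); the module path follows the spack.util.url naming used in the removed docstring above, and the expected results are derived from the new docstring rather than from output recorded in this PR:

import urllib.parse
import spack.util.url as url_util

url_util.join('https://example.com/a/b', 'c', 'd')
# 'https://example.com/a/b/c/d'   (default: os.path.join-like appending)

url_util.join('https://example.com/a/b', 'c/d', resolve_href=True)
# 'https://example.com/a/c/d'     (browser-style href resolution)

url_util.join('s3://bucket/build_cache', 'my-package')
# 's3://bucket/build_cache/my-package'

# Without the temporary uses_netloc/uses_relative patch, stock urljoin does not
# treat s3:// as hierarchical and simply returns the relative component:
urllib.parse.urljoin('s3://bucket/build_cache/', 'my-package')
# 'my-package'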


git_re = (