Fix deployment pull file path issue #874

Merged: 1 commit, Feb 11, 2025
209 changes: 25 additions & 184 deletions lib/ramble/spack/util/url.py
@@ -7,8 +7,6 @@
Utility functions for parsing, formatting, and manipulating URLs.
"""

import itertools
import posixpath
import re
import sys

@@ -23,26 +21,6 @@
is_windows = sys.platform == 'win32'


def _split_all(path):
"""Split path into its atomic components.

Returns the shortest list, L, of strings such that posixpath.join(*L) ==
path and posixpath.split(element) == ('', element) for every element in L
except possibly the first. This first element may possibly have the value
of '/'.
"""
result = []
a = path
old_a = None
while a != old_a:
(old_a, (a, b)) = a, posixpath.split(a)

if a or b:
result.insert(0, b or '/')

return result
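
For reference, a minimal sketch of what this removed helper returns, assuming POSIX-style inputs (the example paths are illustrative, not taken from this PR):

import posixpath

parts = _split_all('/a/b/c')                  # ['/', 'a', 'b', 'c']: the leading '/' is kept as its own element
assert posixpath.join(*parts) == '/a/b/c'     # round-trips, as the docstring promises
assert _split_all('a/b/c') == ['a', 'b', 'c'] # relative paths split into plain components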


def local_file_path(url):
"""Get a local file path from a url.

@@ -124,168 +102,31 @@ def format(parsed_url):
return parsed_url.geturl()


def join(base_url, path, *extra, **kwargs):
"""Joins a base URL with one or more local URL path components

If resolve_href is True, treat the base URL as though it were the locator
of a web page, and the remaining URL path components as though they formed
a relative URL to be resolved against it (i.e.: as in posixpath.join(...)).
The result is an absolute URL to the resource to which a user's browser
would navigate if they clicked on a link with an "href" attribute equal to
the relative URL.

If resolve_href is False (default), then the URL path components are joined
as in posixpath.join().

Note: file:// URL path components are not canonicalized as part of this
operation. To canonicalize, pass the joined url to format().

Examples:
base_url = 's3://bucket/index.html'
body = fetch_body(prefix)
link = get_href(body) # link == '../other-bucket/document.txt'

# wrong - link is a local URL that needs to be resolved against base_url
spack.util.url.join(base_url, link)
's3://bucket/other-bucket/document.txt'

# correct - resolve local URL against base_url
spack.util.url.join(base_url, link, resolve_href=True)
's3://other-bucket/document.txt'

prefix = 'https://mirror.spack.io/build_cache'

# wrong - prefix is just a URL prefix
spack.util.url.join(prefix, 'my-package', resolve_href=True)
'https://mirror.spack.io/my-package'

# correct - simply append additional URL path components
spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
'https://mirror.spack.io/build_cache/my-package'

# For canonicalizing file:// URLs, take care to explicitly differentiate
# between absolute and relative join components.

# '$spack' is not an absolute path component
join_result = spack.util.url.join('/a/b/c', '$spack') ; join_result
'file:///a/b/c/$spack'
spack.util.url.format(join_result)
'file:///a/b/c/opt/spack'

# '/$spack' *is* an absolute path component
join_result = spack.util.url.join('/a/b/c', '/$spack') ; join_result
'file:///$spack'
spack.util.url.format(join_result)
'file:///opt/spack'
"""
paths = [
(x) if isinstance(x, str)
else x.geturl()
for x in itertools.chain((base_url, path), extra)]

paths = [convert_to_posix_path(x) for x in paths]
n = len(paths)
last_abs_component = None
scheme = ''
for i in range(n - 1, -1, -1):
obj = urllib.parse.urlparse(
paths[i], scheme='', allow_fragments=False)

scheme = obj.scheme

# in either case the component is absolute
if scheme or obj.path.startswith('/'):
if not scheme:
# Without a scheme, we have to go back looking for the
# next-last component that specifies a scheme.
for j in range(i - 1, -1, -1):
obj = urllib.parse.urlparse(
paths[j], scheme='', allow_fragments=False)

if obj.scheme:
paths[i] = '{SM}://{NL}{PATH}'.format(
SM=obj.scheme,
NL=(
(obj.netloc + '/')
if obj.scheme != 's3' else ''),
PATH=paths[i][1:])
break

last_abs_component = i
break

if last_abs_component is not None:
paths = paths[last_abs_component:]
if len(paths) == 1:
result = urllib.parse.urlparse(
paths[0], scheme='file', allow_fragments=False)

# another subtlety: If the last argument to join() is an absolute
# file:// URL component with a relative path, the relative path
# needs to be resolved.
if result.scheme == 'file' and result.netloc:
result = urllib.parse.ParseResult(
scheme=result.scheme,
netloc='',
path=posixpath.abspath(result.netloc + result.path),
params=result.params,
query=result.query,
fragment=None)

return result.geturl()

return _join(*paths, **kwargs)


def _join(base_url, path, *extra, **kwargs):
base_url = parse(base_url)
resolve_href = kwargs.get('resolve_href', False)

(scheme, netloc, base_path, params, query, _) = base_url
scheme = scheme.lower()

path_tokens = [
part for part in itertools.chain(
_split_all(path),
itertools.chain.from_iterable(
_split_all(extra_path) for extra_path in extra))
if part and part != '/']

base_path_args = ['/fake-root']
if scheme == 's3':
if netloc:
base_path_args.append(netloc)

if base_path.startswith('/'):
base_path = base_path[1:]

base_path_args.append(base_path)

if resolve_href:
new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
base_path_args = [new_base_path]

base_path_args.extend(path_tokens)
base_path = posixpath.relpath(posixpath.join(*base_path_args), '/fake-root')

if scheme == 's3':
path_tokens = [
part for part in _split_all(base_path)
if part and part != '/']

if path_tokens:
netloc = path_tokens.pop(0)
base_path = posixpath.join('', *path_tokens)

if sys.platform == "win32":
base_path = convert_to_posix_path(base_path)

return format(urllib.parse.ParseResult(scheme=scheme,
netloc=netloc,
path=base_path,
params=params,
query=query,
fragment=None))
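
The removed _join above splices components by anchoring them under a synthetic '/fake-root' prefix, joining with posixpath, and then taking the path relative to that anchor before the scheme and netloc are reattached. A standalone sketch of that idiom, with illustrative values only:

import posixpath

# Anchor under a synthetic root, join, then strip the root back off.
base_path_args = ['/fake-root', 'bucket', 'build_cache', 'my-package']
joined = posixpath.join(*base_path_args)             # '/fake-root/bucket/build_cache/my-package'
base_path = posixpath.relpath(joined, '/fake-root')  # 'bucket/build_cache/my-package'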
def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
"""Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:
1. By default resolve_href=False, which makes the function like os.path.join: for example
https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the
behavior is how a browser would resolve the URL: https://example.com/a/c/d.
2. s3://, gs://, oci:// URLs are joined like http:// URLs.
3. It accepts multiple components for convenience. Note that components[1:] are treated as
literal path components and appended to components[0] separated by slashes."""
# Ensure a trailing slash in the path component of the base URL to get os.path.join-like
# behavior instead of web browser behavior.
if not resolve_href:
parsed = urllib.parse.urlparse(base)
if not parsed.path.endswith("/"):
base = parsed._replace(path=f"{parsed.path}/").geturl()
uses_netloc = urllib.parse.uses_netloc
uses_relative = urllib.parse.uses_relative
try:
# NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http.
# This is non-portable, and may be forward incompatible with future cpython versions.
urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs", "oci"]
urllib.parse.uses_relative = [*uses_relative, "s3", "gs", "oci"]
return urllib.parse.urljoin(base, "/".join(components), **kwargs)
finally:
urllib.parse.uses_netloc = uses_netloc
urllib.parse.uses_relative = uses_relative
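
A brief usage sketch of the replacement join(); the module path follows the spack.util.url naming used in the removed docstring above, and the expected results are derived from the new docstring rather than from output recorded in this PR:

import urllib.parse
import spack.util.url as url_util

url_util.join('https://example.com/a/b', 'c', 'd')
# 'https://example.com/a/b/c/d'   (default: os.path.join-like appending)

url_util.join('https://example.com/a/b', 'c/d', resolve_href=True)
# 'https://example.com/a/c/d'     (browser-style href resolution)

url_util.join('s3://bucket/build_cache', 'my-package')
# 's3://bucket/build_cache/my-package'

# Without the temporary uses_netloc/uses_relative patch, stock urljoin does not
# treat s3:// as hierarchical and simply returns the relative component:
urllib.parse.urljoin('s3://bucket/build_cache/', 'my-package')
# 'my-package'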


git_re = (