Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download: Seqera containers Patch 2 #3293

Merged
merged 11 commits into from
Dec 2, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
### Download

- First steps towards fixing [#3179](https://github.com/nf-core/tools/issues/3179): Modify `prioritize_direct_download()` to retain Seqera Singularity https:// Container URIs and hardcode Seqera Containers into `gather_registries()` ([#3244](https://github.com/nf-core/tools/pull/3244)).
- Further steps towards fixing [#3179](https://github.com/nf-core/tools/issues/3179): Enable limited support for `oras://` container paths (_only absolute URIs, no flexible registries like with Docker_) and prevent unnecessary image downloads for Seqera Container modules with `reconcile_seqera_container_uris()` ([#3293](https://github.com/nf-core/tools/pull/3293)).
- Update dawidd6/action-download-artifact action to v7 ([#3306](https://github.com/nf-core/tools/pull/3306))

### Linting
Expand Down
71 changes: 65 additions & 6 deletions nf_core/pipelines/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,11 +839,12 @@ def rectify_raw_container_matches(self, raw_findings):
url_regex = (
r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
)
oras_regex = r"oras:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
# Thanks Stack Overflow for the regex: https://stackoverflow.com/a/39672069/713980
docker_regex = r"^(?:(?=[^:\/]{1,253})(?!-)[a-zA-Z0-9-]{1,63}(?<!-)(?:\.(?!-)[a-zA-Z0-9-]{1,63}(?<!-))*(?::[0-9]{1,5})?/)?((?![._-])(?:[a-z0-9._-]*)(?<![._-])(?:/(?![._-])[a-z0-9._-]*(?<![._-]))*)(?::(?![.-])[a-zA-Z0-9_.-]{1,128})?$"

# at this point, we don't have to distinguish anymore, because we will later prioritize direct downloads over Docker URIs.
either_url_or_docker = re.compile(f"{url_regex}|{docker_regex}", re.S)
either_url_or_docker = re.compile(f"{url_regex}|{oras_regex}|{docker_regex}", re.S)

for _, container_value, search_space, file_path in raw_findings:
"""
Expand Down Expand Up @@ -1000,14 +1001,18 @@ def prioritize_direct_download(self, container_list: List[str]) -> List[str]:
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data'
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data'

Lastly, we want to remove at least a few Docker URIs for those modules, that have an oras:// download link.
"""
d: Dict[str, str] = {}
seqera_containers: List[str] = []
seqera_containers_http: List[str] = []
seqera_containers_oras: List[str] = []
all_others: List[str] = []

for c in container_list:
if bool(re.search(r"/data$", c)):
seqera_containers.append(c)
seqera_containers_http.append(c)
elif bool(re.search(r"^oras://", c)):
seqera_containers_oras.append(c)
else:
all_others.append(c)

Expand All @@ -1016,8 +1021,47 @@ def prioritize_direct_download(self, container_list: List[str]) -> List[str]:
log.debug(f"{c} matches and will be saved as {k}")
d[k] = c

# combine deduplicated others and Seqera containers
return sorted(list(d.values()) + seqera_containers)
combined_with_oras = self.reconcile_seqera_container_uris(seqera_containers_oras, list(d.values()))

# combine deduplicated others (Seqera containers oras, http others and Docker URI others) and Seqera containers http
return sorted(list(set(combined_with_oras + seqera_containers_http)))

@staticmethod
def reconcile_seqera_container_uris(prioritized_container_list: List[str], other_list: List[str]) -> List[str]:
"""
Helper function that takes a list of Seqera container URIs,
extracts the software string and builds a regex from them to filter out
similar containers from the second container list.

prioritzed_container_list = [
... "oras://community.wave.seqera.io/library/multiqc:1.25.1--f0e743d16869c0bf",
... "oras://community.wave.seqera.io/library/multiqc_pip_multiqc-plugins:e1f4877f1515d03c"
... ]

will be cleaned to

['library/multiqc:1.25.1', 'library/multiqc_pip_multiqc-plugins']

Subsequently, build a regex from those and filter out matching duplicates in other_list:
"""
if not prioritized_container_list:
return other_list
else:
# trim the URIs to the stem that contains the tool string, assign with Walrus operator to account for non-matching patterns
trimmed_priority_list = [
match.group()
for c in set(prioritized_container_list)
if (match := re.search(r"library/.*?:[\d.]+", c) if "--" in c else re.search(r"library/[^\s:]+", c))
]

# build regex
prioritized_containers = re.compile("|".join(f"{re.escape(c)}" for c in trimmed_priority_list))

# filter out matches in other list
filtered_containers = [c for c in other_list if not re.search(prioritized_containers, c)]

# combine prioritized and regular container lists
return sorted(list(set(prioritized_container_list + filtered_containers)))

def gather_registries(self, workflow_directory: str) -> None:
"""Fetch the registries from the pipeline config and CLI arguments and store them in a set.
Expand Down Expand Up @@ -1419,9 +1463,10 @@ def singularity_pull_image(
# Sometimes, container still contain an explicit library specification, which
# resulted in attempted pulls e.g. from docker://quay.io/quay.io/qiime2/core:2022.11
# Thus, if an explicit registry is specified, the provided -l value is ignored.
# Additionally, check if the container to be pulled is native Singularity: oras:// protocol.
container_parts = container.split("/")
if len(container_parts) > 2:
address = f"docker://{container}"
address = container if container.startswith("oras://") else f"docker://{container}"
absolute_URI = True
else:
address = f"docker://{library}/{container.replace('docker://', '')}"
Expand Down Expand Up @@ -1843,6 +1888,9 @@ def __init__(
elif re.search(r"manifest\sunknown", line):
self.error_type = self.InvalidTagError(self)
break
elif re.search(r"ORAS\sSIF\simage\sshould\shave\sa\ssingle\slayer", line):
mashehu marked this conversation as resolved.
Show resolved Hide resolved
self.error_type = self.NoSingularityContainerError(self)
break
elif re.search(r"Image\sfile\salready\sexists", line):
self.error_type = self.ImageExistsError(self)
break
Expand Down Expand Up @@ -1907,6 +1955,17 @@ def __init__(self, error_log):
self.helpmessage = f'Saving image of "{self.error_log.container}" failed, because "{self.error_log.out_path}" exists.\nPlease troubleshoot the command \n"{" ".join(self.error_log.singularity_command)}" manually.\n'
super().__init__(self.message)

class NoSingularityContainerError(RuntimeError):
"""The container image is no native Singularity Image Format."""

def __init__(self, error_log):
self.error_log = error_log
self.message = (
f'[bold red]"{self.error_log.container}" is no valid Singularity Image Format container.[/]\n'
)
self.helpmessage = f"Pulling \"{self.error_log.container}\" failed, because it appears invalid. To convert form Docker's OCI format, prefix the URI with 'docker://' instead of 'oras://'.\n"
MatthiasZepper marked this conversation as resolved.
Show resolved Hide resolved
super().__init__(self.message)

class OtherError(RuntimeError):
"""Undefined error with the container"""

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
process UMI_TRANSFER {
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6' :
'community.wave.seqera.io/library/umi-transfer:1.0.0--d30e8812ea280fa1' }"

// truncated

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
process UMI_TRANSFER_MULLED {
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'oras://community.wave.seqera.io/library/umi-transfer_umicollapse:796a995ff53da9e3' :
'community.wave.seqera.io/library/umi-transfer_umicollapse:3298d4f1b49e33bd' }"

// truncated

}
93 changes: 89 additions & 4 deletions tests/pipelines/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,20 @@ def test_find_container_images_modules(self, tmp_path, mock_fetch_wf_config):
not in download_obj.containers
)

# mock_seqera_container.nf
# mock_seqera_container_oras.nf
assert "oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6" in download_obj.containers
assert "community.wave.seqera.io/library/umi-transfer:1.0.0--d30e8812ea280fa1" not in download_obj.containers

# mock_seqera_container_oras_mulled.nf
assert (
"oras://community.wave.seqera.io/library/umi-transfer_umicollapse:796a995ff53da9e3"
in download_obj.containers
)
assert (
"community.wave.seqera.io/library/umi-transfer_umicollapse:3298d4f1b49e33bd" not in download_obj.containers
)

# mock_seqera_container_http.nf
assert (
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data"
in download_obj.containers
Expand Down Expand Up @@ -294,6 +307,7 @@ def test_prioritize_direct_download(self, tmp_path):
"https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1",
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data",
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data",
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data",
]

result = download_obj.prioritize_direct_download(test_container)
Expand All @@ -316,7 +330,7 @@ def test_prioritize_direct_download(self, tmp_path):
assert "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_0" in result
assert "https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1" in result

# Verify that Seqera containers are not deduplicated
# Verify that Seqera containers are not deduplicated...
assert (
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data"
in result
Expand All @@ -325,6 +339,58 @@ def test_prioritize_direct_download(self, tmp_path):
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data"
in result
)
# ...but identical ones are.
assert (
result.count(
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data"
)
== 1
)

#
# Test for 'reconcile_seqera_container_uris'
#
@with_temporary_folder
def test_reconcile_seqera_container_uris(self, tmp_path):
download_obj = DownloadWorkflow(pipeline="dummy", outdir=tmp_path)

prioritized_container = [
"oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6",
"oras://community.wave.seqera.io/library/sylph:0.6.1--b97274cdc1caa649",
]

test_container = [
"https://depot.galaxyproject.org/singularity/ubuntu:22.04",
"nf-core/ubuntu:22.04",
"nf-core/ubuntu:22.04",
"nf-core/ubuntu:22.04",
"community.wave.seqera.io/library/umi-transfer:1.5.0--73c1a6b65e5b0b81",
"community.wave.seqera.io/library/sylph:0.6.1--a21713a57a65a373",
"biocontainers/sylph:0.6.1--b97274cdc1caa649",
]

# test that the test_container list is returned as it is, if no prioritized_containers are specified
result_empty = download_obj.reconcile_seqera_container_uris([], test_container)
assert result_empty == test_container

result = download_obj.reconcile_seqera_container_uris(prioritized_container, test_container)

# Verify that unrelated images are retained
assert "https://depot.galaxyproject.org/singularity/ubuntu:22.04" in result
assert "nf-core/ubuntu:22.04" in result

# Verify that the priority works for regular Seqera container (Native Singularity over Docker, but only for Seqera registry)
assert "oras://community.wave.seqera.io/library/sylph:0.6.1--b97274cdc1caa649" in result
assert "community.wave.seqera.io/library/sylph:0.6.1--a21713a57a65a373" not in result
assert "biocontainers/sylph:0.6.1--b97274cdc1caa649" in result

# Verify that version strings are respected: Version 1.0.0 does not replace version 1.5.0
assert "oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6" in result
assert "community.wave.seqera.io/library/umi-transfer:1.5.0--73c1a6b65e5b0b81" in result

# assert that the deduplication works
assert test_container.count("nf-core/ubuntu:22.04") == 3
assert result.count("nf-core/ubuntu:22.04") == 1

#
# Tests for 'singularity_pull_image'
Expand Down Expand Up @@ -356,11 +422,30 @@ def test_singularity_pull_image_singularity_installed(self, tmp_dir, mock_rich_p
"docker.io/bschiffthaler/sed", f"{tmp_dir}/sed.sif", None, "docker.io", mock_rich_progress
)

# Test successful pull with absolute oras:// URI
download_obj.singularity_pull_image(
"oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6",
f"{tmp_dir}/umi-transfer-oras.sif",
None,
"docker.io",
mock_rich_progress,
)

# try pulling Docker container image with oras://
with pytest.raises(ContainerError.NoSingularityContainerError):
download_obj.singularity_pull_image(
"oras://ghcr.io/matthiaszepper/umi-transfer:dev",
f"{tmp_dir}/umi-transfer-oras_impostor.sif",
None,
"docker.io",
mock_rich_progress,
)

# try to pull from non-existing registry (Name change hello-world_new.sif is needed, otherwise ImageExistsError is raised before attempting to pull.)
with pytest.raises(ContainerError.RegistryNotFoundError):
download_obj.singularity_pull_image(
"hello-world",
f"{tmp_dir}/hello-world_new.sif",
f"{tmp_dir}/break_the_registry_test.sif",
None,
"register-this-domain-to-break-the-test.io",
mock_rich_progress,
Expand Down Expand Up @@ -396,7 +481,7 @@ def test_singularity_pull_image_singularity_installed(self, tmp_dir, mock_rich_p
with pytest.raises(ContainerError.InvalidTagError):
download_obj.singularity_pull_image(
"ewels/multiqc:go-rewrite",
f"{tmp_dir}/umi-transfer.sif",
f"{tmp_dir}/multiqc-go.sif",
None,
"ghcr.io",
mock_rich_progress,
Expand Down
Loading