From bc0ba13c80b03c4f51a45695add19bc8847520b7 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 23 Oct 2024 17:18:52 +0100 Subject: [PATCH 1/7] Add seqera containers example --- .../modules/mock_seqera_container.nf | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tests/data/mock_module_containers/modules/mock_seqera_container.nf diff --git a/tests/data/mock_module_containers/modules/mock_seqera_container.nf b/tests/data/mock_module_containers/modules/mock_seqera_container.nf new file mode 100644 index 000000000..4364a389b --- /dev/null +++ b/tests/data/mock_module_containers/modules/mock_seqera_container.nf @@ -0,0 +1,79 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' : + 'community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + echo '' | gzip > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + """ + echo '' | gzip > ${prefix}_1.merged.fastq.gz + echo '' | gzip > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} From 993031c1211aee2705eabbf2021487872ecfda03 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 23 Oct 2024 17:20:42 +0100 Subject: [PATCH 2/7] bump ci --- .../data/mock_module_containers/modules/mock_seqera_container.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data/mock_module_containers/modules/mock_seqera_container.nf b/tests/data/mock_module_containers/modules/mock_seqera_container.nf index 4364a389b..6274b1eec 100644 --- a/tests/data/mock_module_containers/modules/mock_seqera_container.nf +++ b/tests/data/mock_module_containers/modules/mock_seqera_container.nf @@ -76,4 +76,5 @@ process CAT_FASTQ { """ } } + } From 32f1b22e9b6d42bdf6e1b0e0e92f61ae6f1f4c25 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 23 Oct 2024 19:31:52 +0200 Subject: [PATCH 3/7] Modify test_find_container_images_modules() for Seqera Containers. --- .../modules/mock_seqera_container.nf | 71 +------------------ tests/pipelines/test_download.py | 15 ++++ 2 files changed, 16 insertions(+), 70 deletions(-) diff --git a/tests/data/mock_module_containers/modules/mock_seqera_container.nf b/tests/data/mock_module_containers/modules/mock_seqera_container.nf index 6274b1eec..20c707548 100644 --- a/tests/data/mock_module_containers/modules/mock_seqera_container.nf +++ b/tests/data/mock_module_containers/modules/mock_seqera_container.nf @@ -1,5 +1,4 @@ process CAT_FASTQ { - tag "$meta.id" label 'process_single' conda "${moduleDir}/environment.yml" @@ -7,74 +6,6 @@ process CAT_FASTQ { 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' : 'community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264' }" - input: - tuple val(meta), path(reads, stageAs: "input*/*") - - output: - tuple val(meta), path("*.merged.fastq.gz"), emit: reads - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] - if (meta.single_end) { - if (readList.size >= 1) { - """ - cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') - END_VERSIONS - """ - } - } else { - if (readList.size >= 2) { - def read1 = [] - def read2 = [] - readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } - """ - cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz - cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') - END_VERSIONS - """ - } - } - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] - if (meta.single_end) { - if (readList.size >= 1) { - """ - echo '' | gzip > ${prefix}.merged.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') - END_VERSIONS - """ - } - } else { - if (readList.size >= 2) { - """ - echo '' | gzip > ${prefix}_1.merged.fastq.gz - echo '' | gzip > ${prefix}_2.merged.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') - END_VERSIONS - """ - } - } + // truncated } diff --git a/tests/pipelines/test_download.py b/tests/pipelines/test_download.py index e55258252..fbda70e56 100644 --- a/tests/pipelines/test_download.py +++ b/tests/pipelines/test_download.py @@ -257,6 +257,21 @@ def test_find_container_images_modules(self, tmp_path, mock_fetch_wf_config): not in download_obj.containers ) + # mock_seqera_container.nf + assert ( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data" + in download_obj.containers + ) + + # ToDO: This URI should actually NOT be in there, but prioritize_direct_download() can not handle this case. + # + # It works purely by comparing the strings, thus can establish the equivalence of 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.5--py39hf95cd2a_0' + # and 'biocontainers/umi_tools:1.1.5--py39hf95cd2a_0' because of the identical string 'umi_tools:1.1.5--py39hf95cd2a_0', but has no means to establish that + # 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' and + # 'community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264' are the equivalent container. It would need to query an API at Seqera for this. + + assert "community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264" in download_obj.containers + # # Tests for 'singularity_pull_image' # From c1f21725c099cddf06c35cebaebf50e275eb4cd8 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 23 Oct 2024 23:22:49 +0200 Subject: [PATCH 4/7] Enable Seqera Containers download. --- nf_core/pipelines/download.py | 17 +++- tests/pipelines/test_download.py | 133 +++++++++++++++++++++++++++---- 2 files changed, 131 insertions(+), 19 deletions(-) diff --git a/nf_core/pipelines/download.py b/nf_core/pipelines/download.py index 917665ff3..51babb84e 100644 --- a/nf_core/pipelines/download.py +++ b/nf_core/pipelines/download.py @@ -1023,7 +1023,13 @@ def gather_registries(self, workflow_directory: str) -> None: self.registry_set.add(self.nf_config[registry]) # add depot.galaxyproject.org to the set, because it is the default registry for singularity hardcoded in modules - self.registry_set.add("depot.galaxyproject.org") + self.registry_set.add("depot.galaxyproject.org/singularity") + + # add community.wave.seqera.io/library to the set to support the new Seqera Docker container registry + self.registry_set.add("community.wave.seqera.io/library") + + # add chttps://community-cr-prod.seqera.io/docker/registry/v2/ to the set to support the new Seqera Singularity container registry + self.registry_set.add("community-cr-prod.seqera.io/docker/registry/v2") def symlink_singularity_images(self, image_out_path: str) -> None: """Create a symlink for each registry in the registry set that points to the image. @@ -1040,10 +1046,13 @@ def symlink_singularity_images(self, image_out_path: str) -> None: if self.registry_set: # Create a regex pattern from the set, in case trimming is needed. - trim_pattern = "|".join(f"^{re.escape(registry)}-?" for registry in self.registry_set) + trim_pattern = "|".join(f"^{re.escape(registry)}-?".replace("/", "[/-]") for registry in self.registry_set) for registry in self.registry_set: - if not os.path.basename(image_out_path).startswith(registry): + # Nextflow will convert it like this as well, so we need it mimic its behavior + registry = registry.replace("/", "-") + + if not bool(re.search(trim_pattern, os.path.basename(image_out_path))): symlink_name = os.path.join("./", f"{registry}-{os.path.basename(image_out_path)}") else: trimmed_name = re.sub(f"{trim_pattern}", "", os.path.basename(image_out_path)) @@ -1263,7 +1272,7 @@ def singularity_image_filenames(self, container: str) -> Tuple[str, Optional[str # if docker.registry / singularity.registry are set to empty strings at runtime, which can be included in the HPC config profiles easily. if self.registry_set: # Create a regex pattern from the set of registries - trim_pattern = "|".join(f"^{re.escape(registry)}-?" for registry in self.registry_set) + trim_pattern = "|".join(f"^{re.escape(registry)}-?".replace("/", "[/-]") for registry in self.registry_set) # Use the pattern to trim the string out_name = re.sub(f"{trim_pattern}", "", out_name) diff --git a/tests/pipelines/test_download.py b/tests/pipelines/test_download.py index fbda70e56..4ce238e4e 100644 --- a/tests/pipelines/test_download.py +++ b/tests/pipelines/test_download.py @@ -268,7 +268,7 @@ def test_find_container_images_modules(self, tmp_path, mock_fetch_wf_config): # It works purely by comparing the strings, thus can establish the equivalence of 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.5--py39hf95cd2a_0' # and 'biocontainers/umi_tools:1.1.5--py39hf95cd2a_0' because of the identical string 'umi_tools:1.1.5--py39hf95cd2a_0', but has no means to establish that # 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' and - # 'community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264' are the equivalent container. It would need to query an API at Seqera for this. + # 'community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264' are the equivalent container. It would need to query an API at Seqera for that. assert "community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264" in download_obj.containers @@ -391,10 +391,72 @@ def test_get_singularity_images(self, tmp_path, mock_fetch_wf_config): @mock.patch("os.symlink") @mock.patch("os.open") @mock.patch("os.close") - @mock.patch("re.sub") @mock.patch("os.path.basename") @mock.patch("os.path.dirname") def test_symlink_singularity_images( + self, + tmp_path, + mock_dirname, + mock_basename, + mock_close, + mock_open, + mock_symlink, + mock_makedirs, + ): + # Setup + mock_dirname.return_value = f"{tmp_path}/path/to" + mock_basename.return_value = "singularity-image.img" + mock_open.return_value = 12 # file descriptor + mock_close.return_value = 12 # file descriptor + + download_obj = DownloadWorkflow( + pipeline="dummy", + outdir=tmp_path, + container_library=( + "quay.io", + "community-cr-prod.seqera.io/docker/registry/v2", + "depot.galaxyproject.org/singularity", + ), + ) + + # Call the method + download_obj.symlink_singularity_images(f"{tmp_path}/path/to/singularity-image.img") + + # Check that os.makedirs was called with the correct arguments + mock_makedirs.assert_any_call(f"{tmp_path}/path/to", exist_ok=True) + + # Check that os.open was called with the correct arguments + mock_open.assert_any_call(f"{tmp_path}/path/to", os.O_RDONLY) + + # Check that os.symlink was called with the correct arguments + expected_calls = [ + mock.call( + "./singularity-image.img", + "./quay.io-singularity-image.img", + dir_fd=12, + ), + mock.call( + "./singularity-image.img", + "./community-cr-prod.seqera.io-docker-registry-v2-singularity-image.img", + dir_fd=12, + ), + mock.call( + "./singularity-image.img", + "./depot.galaxyproject.org-singularity-singularity-image.img", + dir_fd=12, + ), + ] + mock_symlink.assert_has_calls(expected_calls, any_order=True) + + @with_temporary_folder + @mock.patch("os.makedirs") + @mock.patch("os.symlink") + @mock.patch("os.open") + @mock.patch("os.close") + @mock.patch("re.sub") + @mock.patch("os.path.basename") + @mock.patch("os.path.dirname") + def test_symlink_singularity_images_registry( self, tmp_path, mock_dirname, @@ -415,23 +477,22 @@ def test_symlink_singularity_images( download_obj = DownloadWorkflow( pipeline="dummy", outdir=tmp_path, - container_library=("mirage-the-imaginative-registry.io", "quay.io"), + container_library=("quay.io", "community-cr-prod.seqera.io/docker/registry/v2"), ) - # Call the method + download_obj.registry_set = {"quay.io", "community-cr-prod.seqera.io/docker/registry/v2"} + + # Call the method with registry - should not happen, but preserve it then. download_obj.symlink_singularity_images(f"{tmp_path}/path/to/quay.io-singularity-image.img") print(mock_resub.call_args) # Check that os.makedirs was called with the correct arguments mock_makedirs.assert_any_call(f"{tmp_path}/path/to", exist_ok=True) - # Check that os.open was called with the correct arguments - mock_open.assert_called_once_with(f"{tmp_path}/path/to", os.O_RDONLY) - # Check that os.symlink was called with the correct arguments - mock_symlink.assert_any_call( + mock_symlink.assert_called_with( "./quay.io-singularity-image.img", - "./mirage-the-imaginative-registry.io-quay.io-singularity-image.img", + "./community-cr-prod.seqera.io-docker-registry-v2-singularity-image.img", dir_fd=12, ) # Check that there is no attempt to symlink to itself (test parameters would result in that behavior if not checked in the function) @@ -440,6 +501,10 @@ def test_symlink_singularity_images( not in mock_symlink.call_args_list ) + # Normally it would be called for each registry, but since quay.io is part of the name, it + # will only be called once, as no symlink to itself must be created. + mock_open.assert_called_once_with(f"{tmp_path}/path/to", os.O_RDONLY) + # # Test for gather_registries' # @@ -461,10 +526,16 @@ def test_gather_registries(self, tmp_path, mock_fetch_wf_config): download_obj.gather_registries(tmp_path) assert download_obj.registry_set assert isinstance(download_obj.registry_set, set) - assert len(download_obj.registry_set) == 6 + assert len(download_obj.registry_set) == 8 assert "quay.io" in download_obj.registry_set # default registry, if no container library is provided. - assert "depot.galaxyproject.org" in download_obj.registry_set # default registry, often hardcoded in modules + assert ( + "depot.galaxyproject.org/singularity" in download_obj.registry_set + ) # default registry, often hardcoded in modules + assert "community.wave.seqera.io/library" in download_obj.registry_set # Seqera containers Docker + assert ( + "community-cr-prod.seqera.io/docker/registry/v2" in download_obj.registry_set + ) # Seqera containers Singularity https:// download assert "apptainer-registry.io" in download_obj.registry_set assert "docker.io" in download_obj.registry_set assert "podman-registry.io" in download_obj.registry_set @@ -498,7 +569,14 @@ def test_singularity_image_filenames(self, tmp_path): download_obj = DownloadWorkflow(pipeline="dummy", outdir=tmp_path) download_obj.outdir = tmp_path download_obj.container_cache_utilisation = "amend" - download_obj.registry_set = {"docker.io", "quay.io", "depot.galaxyproject.org"} + + download_obj.registry_set = { + "docker.io", + "quay.io", + "depot.galaxyproject.org/singularity", + "community.wave.seqera.io/library", + "community-cr-prod.seqera.io/docker/registry/v2", + } ## Test phase I: Container not yet cached, should be amended to cache # out_path: str, Path to cache @@ -516,11 +594,13 @@ def test_singularity_image_filenames(self, tmp_path): self.assertTrue(all((isinstance(element, str), element is None) for element in result)) # assert that the correct out_path is returned that points to the cache - assert result[0].endswith("/cachedir/singularity-bbmap-38.93--he522d1c_0.img") + assert result[0].endswith("/cachedir/bbmap-38.93--he522d1c_0.img") ## Test phase II: Test various container names # out_path: str, Path to cache # cache_path: None + + # Test --- mulled containers # result = download_obj.singularity_image_filenames( "quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0" ) @@ -528,9 +608,32 @@ def test_singularity_image_filenames(self, tmp_path): "/cachedir/biocontainers-mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2-59cdd445419f14abac76b31dd0d71217994cbcc9-0.img" ) + # Test --- Docker containers without registry # result = download_obj.singularity_image_filenames("nf-core/ubuntu:20.04") assert result[0].endswith("/cachedir/nf-core-ubuntu-20.04.img") + # Test --- Docker container with explicit registry -> should be trimmed # + result = download_obj.singularity_image_filenames("docker.io/nf-core/ubuntu:20.04") + assert result[0].endswith("/cachedir/nf-core-ubuntu-20.04.img") + + # Test --- Docker container with explicit registry not in registry set -> can't be trimmed + result = download_obj.singularity_image_filenames("mirage-the-imaginative-registry.io/nf-core/ubuntu:20.04") + assert result[0].endswith("/cachedir/mirage-the-imaginative-registry.io-nf-core-ubuntu-20.04.img") + + # Test --- Seqera Docker containers: Trimmed, because it is hard-coded in the registry set. + result = download_obj.singularity_image_filenames( + "community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264" + ) + assert result[0].endswith("/cachedir/coreutils-9.5--ae99c88a9b28c264.img") + + # Test --- Seqera Singularity containers: Trimmed, because it is hard-coded in the registry set. + result = download_obj.singularity_image_filenames( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data" + ) + assert result[0].endswith( + "cachedir/blobs-sha256-c2-c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975-data.img" + ) + ## Test phase III: Container will be cached but also copied to out_path # out_path: str, Path to cache # cache_path: str, Path to cache @@ -540,8 +643,8 @@ def test_singularity_image_filenames(self, tmp_path): ) self.assertTrue(all(isinstance(element, str) for element in result)) - assert result[0].endswith("/singularity-images/singularity-bbmap-38.93--he522d1c_0.img") - assert result[1].endswith("/cachedir/singularity-bbmap-38.93--he522d1c_0.img") + assert result[0].endswith("/singularity-images/bbmap-38.93--he522d1c_0.img") + assert result[1].endswith("/cachedir/bbmap-38.93--he522d1c_0.img") ## Test phase IV: Expect an error if no NXF_SINGULARITY_CACHEDIR is defined os.environ["NXF_SINGULARITY_CACHEDIR"] = "" From b79525bb0d5e203575cea6e385df062ad70a18fd Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Thu, 24 Oct 2024 12:48:01 +0200 Subject: [PATCH 5/7] nf-core download: Fix forgotten symlinks in cache. --- .../.github/workflows/download_pipeline.yml | 2 +- nf_core/pipelines/download.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nf_core/pipeline-template/.github/workflows/download_pipeline.yml b/nf_core/pipeline-template/.github/workflows/download_pipeline.yml index 6ba251e52..20f4fc0fa 100644 --- a/nf_core/pipeline-template/.github/workflows/download_pipeline.yml +++ b/nf_core/pipeline-template/.github/workflows/download_pipeline.yml @@ -69,7 +69,7 @@ jobs: --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ --compress "none" \ --container-system 'singularity' \ - --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io" \ + --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io/library/" \ --container-cache-utilisation 'amend' \ --download-configuration 'yes' diff --git a/nf_core/pipelines/download.py b/nf_core/pipelines/download.py index 51babb84e..234265e2c 100644 --- a/nf_core/pipelines/download.py +++ b/nf_core/pipelines/download.py @@ -1354,9 +1354,10 @@ def singularity_download_image( log.debug(f"Copying {container} from cache: '{os.path.basename(out_path)}'") progress.update(task, description="Copying from cache to target directory") shutil.copyfile(cache_path, out_path) + self.symlink_singularity_images(cache_path) # symlinks inside the cache directory # Create symlinks to ensure that the images are found even with different registries being used. - self.symlink_singularity_images(output_path) + self.symlink_singularity_images(out_path) progress.remove_task(task) @@ -1465,9 +1466,10 @@ def singularity_pull_image( log.debug(f"Copying {container} from cache: '{os.path.basename(out_path)}'") progress.update(task, current_log="Copying from cache to target directory") shutil.copyfile(cache_path, out_path) + self.symlink_singularity_images(cache_path) # symlinks inside the cache directory # Create symlinks to ensure that the images are found even with different registries being used. - self.symlink_singularity_images(output_path) + self.symlink_singularity_images(out_path) progress.remove_task(task) From fec99292efe707c2fa1fb5ab761ba42da5a1adaa Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Thu, 24 Oct 2024 19:14:41 +0200 Subject: [PATCH 6/7] Ensure that prioritize_direct_download() retains Seqera Singularity Containers and write additional test. --- CHANGELOG.md | 2 ++ nf_core/pipelines/download.py | 24 ++++++++++++-- tests/pipelines/test_download.py | 54 ++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eac5adcdd..2c820882f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ ### Download +- First steps towards fixing [#3179](https://github.com/nf-core/tools/issues/3179): Modify `prioritize_direct_download()` to retain Seqera Singularity https:// Container URIs and hardcode Seqera Containers into `gather_registries()` ([#3244](https://github.com/nf-core/tools/pull/3244)). + ### Linting ### Modules diff --git a/nf_core/pipelines/download.py b/nf_core/pipelines/download.py index 234265e2c..94763b0fc 100644 --- a/nf_core/pipelines/download.py +++ b/nf_core/pipelines/download.py @@ -970,7 +970,7 @@ def rectify_raw_container_matches(self, raw_findings): """ return self.prioritize_direct_download(cleaned_matches) - def prioritize_direct_download(self, container_list): + def prioritize_direct_download(self, container_list: List[str]) -> List[str]: """ Helper function that takes a list of container images (URLs and Docker URIs), eliminates all Docker URIs for which also a URL is contained and returns the @@ -993,13 +993,31 @@ def prioritize_direct_download(self, container_list): we want to keep it and not replace with with whatever we have now (which might be the Docker URI). A regex that matches http, r"^$|^http" could thus be used to prioritize the Docker URIs over http Downloads + + We also need to handle a special case: The https:// Singularity downloads from Seqera Containers all end in 'data', although + they are not equivalent: + + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data' + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' + """ - d = {} + d: dict[str, str] = {} + seqera_containers: list[str] = [] + all_others: list[str] = [] + for c in container_list: + if bool(re.search(r"/data$", c)): + seqera_containers.append(c) + else: + all_others.append(c) + + for c in all_others: if re.match(r"^$|(?!^http)", d.get(k := re.sub(".*/(.*)", "\\1", c), "")): log.debug(f"{c} matches and will be saved as {k}") d[k] = c - return sorted(list(d.values())) + + # combine deduplicated others and Seqera containers + return sorted(list(d.values()) + seqera_containers) def gather_registries(self, workflow_directory: str) -> None: """Fetch the registries from the pipeline config and CLI arguments and store them in a set. diff --git a/tests/pipelines/test_download.py b/tests/pipelines/test_download.py index 4ce238e4e..86b07ef7f 100644 --- a/tests/pipelines/test_download.py +++ b/tests/pipelines/test_download.py @@ -272,6 +272,60 @@ def test_find_container_images_modules(self, tmp_path, mock_fetch_wf_config): assert "community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264" in download_obj.containers + # + # Test for 'prioritize_direct_download' + # + @with_temporary_folder + def test_prioritize_direct_download(self, tmp_path): + download_obj = DownloadWorkflow(pipeline="dummy", outdir=tmp_path) + + # tests deduplication and https priority as well as Seqera Container exception + + test_container = [ + "https://depot.galaxyproject.org/singularity/ubuntu:22.04", + "nf-core/ubuntu:22.04", + "biocontainers/umi-transfer:1.5.0--h715e4b3_0", + "https://depot.galaxyproject.org/singularity/umi-transfer:1.5.0--h715e4b3_0", + "biocontainers/umi-transfer:1.5.0--h715e4b3_0", + "quay.io/nf-core/sortmerna:4.3.7--6502243397c065ba", + "nf-core/sortmerna:4.3.7--6502243397c065ba", + "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_1", + "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_0", + "https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1", + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data", + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data", + ] + + result = download_obj.prioritize_direct_download(test_container) + + # Verify that the priority works for regular https downloads (https encountered first) + assert "https://depot.galaxyproject.org/singularity/ubuntu:22.04" in result + assert "nf-core/ubuntu:22.04" not in result + + # Verify that the priority works for regular https downloads (https encountered second) + assert "biocontainers/umi-transfer:1.5.0--h715e4b3_0" not in result + assert "https://depot.galaxyproject.org/singularity/umi-transfer:1.5.0--h715e4b3_0" in result + + # Verify that the priority works for images with and without explicit registry + # No priority here, though - the first is retained. + assert "nf-core/sortmerna:4.3.7--6502243397c065ba" in result + assert "quay.io/nf-core/sortmerna:4.3.7--6502243397c065ba" not in result + + # Verify that different versions of the same tool and different build numbers are retained + assert "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_1" in result + assert "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_0" in result + assert "https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1" in result + + # Verify that Seqera containers are not deduplicated + assert ( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data" + in result + ) + assert ( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data" + in result + ) + # # Tests for 'singularity_pull_image' # From 32a7e6b049906ad7cf9366a355672a29d3dcc958 Mon Sep 17 00:00:00 2001 From: Matthias Zepper <6963520+MatthiasZepper@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:26:14 +0200 Subject: [PATCH 7/7] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- nf_core/pipelines/download.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nf_core/pipelines/download.py b/nf_core/pipelines/download.py index 94763b0fc..9a329aeaf 100644 --- a/nf_core/pipelines/download.py +++ b/nf_core/pipelines/download.py @@ -995,15 +995,15 @@ def prioritize_direct_download(self, container_list: List[str]) -> List[str]: A regex that matches http, r"^$|^http" could thus be used to prioritize the Docker URIs over http Downloads We also need to handle a special case: The https:// Singularity downloads from Seqera Containers all end in 'data', although - they are not equivalent: + they are not equivalent, e.g.: 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data' 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' """ - d: dict[str, str] = {} - seqera_containers: list[str] = [] - all_others: list[str] = [] + d: Dict[str, str] = {} + seqera_containers: List[str] = [] + all_others: List[str] = [] for c in container_list: if bool(re.search(r"/data$", c)):