diff --git a/CHANGELOG.md b/CHANGELOG.md index 286603206a..f2bf6fcae8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ ### Download +- First steps towards fixing [#3179](https://github.com/nf-core/tools/issues/3179): Modify `prioritize_direct_download()` to retain Seqera Singularity https:// Container URIs and hardcode Seqera Containers into `gather_registries()` ([#3244](https://github.com/nf-core/tools/pull/3244)). + ### Linting ### Modules diff --git a/nf_core/pipeline-template/.github/workflows/download_pipeline.yml b/nf_core/pipeline-template/.github/workflows/download_pipeline.yml index 56208537c5..1bc42469c4 100644 --- a/nf_core/pipeline-template/.github/workflows/download_pipeline.yml +++ b/nf_core/pipeline-template/.github/workflows/download_pipeline.yml @@ -69,7 +69,7 @@ jobs: --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ --compress "none" \ --container-system 'singularity' \ - --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io" \ + --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io/library/" \ --container-cache-utilisation 'amend' \ --download-configuration 'yes' diff --git a/nf_core/pipelines/download.py b/nf_core/pipelines/download.py index 917665ff36..9a329aeaff 100644 --- a/nf_core/pipelines/download.py +++ b/nf_core/pipelines/download.py @@ -970,7 +970,7 @@ def rectify_raw_container_matches(self, raw_findings): """ return self.prioritize_direct_download(cleaned_matches) - def prioritize_direct_download(self, container_list): + def prioritize_direct_download(self, container_list: List[str]) -> List[str]: """ Helper function that takes a list of container images (URLs and Docker URIs), eliminates all Docker URIs for which also a URL is contained and returns the @@ -993,13 +993,31 @@ def prioritize_direct_download(self, container_list): we want to keep it and not replace with with whatever we have now (which might be the Docker URI). A regex that matches http, r"^$|^http" could thus be used to prioritize the Docker URIs over http Downloads + + We also need to handle a special case: The https:// Singularity downloads from Seqera Containers all end in 'data', although + they are not equivalent, e.g.: + + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data' + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' + """ - d = {} + d: Dict[str, str] = {} + seqera_containers: List[str] = [] + all_others: List[str] = [] + for c in container_list: + if bool(re.search(r"/data$", c)): + seqera_containers.append(c) + else: + all_others.append(c) + + for c in all_others: if re.match(r"^$|(?!^http)", d.get(k := re.sub(".*/(.*)", "\\1", c), "")): log.debug(f"{c} matches and will be saved as {k}") d[k] = c - return sorted(list(d.values())) + + # combine deduplicated others and Seqera containers + return sorted(list(d.values()) + seqera_containers) def gather_registries(self, workflow_directory: str) -> None: """Fetch the registries from the pipeline config and CLI arguments and store them in a set. @@ -1023,7 +1041,13 @@ def gather_registries(self, workflow_directory: str) -> None: self.registry_set.add(self.nf_config[registry]) # add depot.galaxyproject.org to the set, because it is the default registry for singularity hardcoded in modules - self.registry_set.add("depot.galaxyproject.org") + self.registry_set.add("depot.galaxyproject.org/singularity") + + # add community.wave.seqera.io/library to the set to support the new Seqera Docker container registry + self.registry_set.add("community.wave.seqera.io/library") + + # add chttps://community-cr-prod.seqera.io/docker/registry/v2/ to the set to support the new Seqera Singularity container registry + self.registry_set.add("community-cr-prod.seqera.io/docker/registry/v2") def symlink_singularity_images(self, image_out_path: str) -> None: """Create a symlink for each registry in the registry set that points to the image. @@ -1040,10 +1064,13 @@ def symlink_singularity_images(self, image_out_path: str) -> None: if self.registry_set: # Create a regex pattern from the set, in case trimming is needed. - trim_pattern = "|".join(f"^{re.escape(registry)}-?" for registry in self.registry_set) + trim_pattern = "|".join(f"^{re.escape(registry)}-?".replace("/", "[/-]") for registry in self.registry_set) for registry in self.registry_set: - if not os.path.basename(image_out_path).startswith(registry): + # Nextflow will convert it like this as well, so we need it mimic its behavior + registry = registry.replace("/", "-") + + if not bool(re.search(trim_pattern, os.path.basename(image_out_path))): symlink_name = os.path.join("./", f"{registry}-{os.path.basename(image_out_path)}") else: trimmed_name = re.sub(f"{trim_pattern}", "", os.path.basename(image_out_path)) @@ -1263,7 +1290,7 @@ def singularity_image_filenames(self, container: str) -> Tuple[str, Optional[str # if docker.registry / singularity.registry are set to empty strings at runtime, which can be included in the HPC config profiles easily. if self.registry_set: # Create a regex pattern from the set of registries - trim_pattern = "|".join(f"^{re.escape(registry)}-?" for registry in self.registry_set) + trim_pattern = "|".join(f"^{re.escape(registry)}-?".replace("/", "[/-]") for registry in self.registry_set) # Use the pattern to trim the string out_name = re.sub(f"{trim_pattern}", "", out_name) @@ -1345,9 +1372,10 @@ def singularity_download_image( log.debug(f"Copying {container} from cache: '{os.path.basename(out_path)}'") progress.update(task, description="Copying from cache to target directory") shutil.copyfile(cache_path, out_path) + self.symlink_singularity_images(cache_path) # symlinks inside the cache directory # Create symlinks to ensure that the images are found even with different registries being used. - self.symlink_singularity_images(output_path) + self.symlink_singularity_images(out_path) progress.remove_task(task) @@ -1456,9 +1484,10 @@ def singularity_pull_image( log.debug(f"Copying {container} from cache: '{os.path.basename(out_path)}'") progress.update(task, current_log="Copying from cache to target directory") shutil.copyfile(cache_path, out_path) + self.symlink_singularity_images(cache_path) # symlinks inside the cache directory # Create symlinks to ensure that the images are found even with different registries being used. - self.symlink_singularity_images(output_path) + self.symlink_singularity_images(out_path) progress.remove_task(task) diff --git a/tests/data/mock_module_containers/modules/mock_seqera_container.nf b/tests/data/mock_module_containers/modules/mock_seqera_container.nf new file mode 100644 index 0000000000..20c7075481 --- /dev/null +++ b/tests/data/mock_module_containers/modules/mock_seqera_container.nf @@ -0,0 +1,11 @@ +process CAT_FASTQ { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' : + 'community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264' }" + + // truncated + +} diff --git a/tests/pipelines/test_download.py b/tests/pipelines/test_download.py index e552582527..86b07ef7f8 100644 --- a/tests/pipelines/test_download.py +++ b/tests/pipelines/test_download.py @@ -257,6 +257,75 @@ def test_find_container_images_modules(self, tmp_path, mock_fetch_wf_config): not in download_obj.containers ) + # mock_seqera_container.nf + assert ( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data" + in download_obj.containers + ) + + # ToDO: This URI should actually NOT be in there, but prioritize_direct_download() can not handle this case. + # + # It works purely by comparing the strings, thus can establish the equivalence of 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.5--py39hf95cd2a_0' + # and 'biocontainers/umi_tools:1.1.5--py39hf95cd2a_0' because of the identical string 'umi_tools:1.1.5--py39hf95cd2a_0', but has no means to establish that + # 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data' and + # 'community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264' are the equivalent container. It would need to query an API at Seqera for that. + + assert "community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264" in download_obj.containers + + # + # Test for 'prioritize_direct_download' + # + @with_temporary_folder + def test_prioritize_direct_download(self, tmp_path): + download_obj = DownloadWorkflow(pipeline="dummy", outdir=tmp_path) + + # tests deduplication and https priority as well as Seqera Container exception + + test_container = [ + "https://depot.galaxyproject.org/singularity/ubuntu:22.04", + "nf-core/ubuntu:22.04", + "biocontainers/umi-transfer:1.5.0--h715e4b3_0", + "https://depot.galaxyproject.org/singularity/umi-transfer:1.5.0--h715e4b3_0", + "biocontainers/umi-transfer:1.5.0--h715e4b3_0", + "quay.io/nf-core/sortmerna:4.3.7--6502243397c065ba", + "nf-core/sortmerna:4.3.7--6502243397c065ba", + "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_1", + "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_0", + "https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1", + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data", + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data", + ] + + result = download_obj.prioritize_direct_download(test_container) + + # Verify that the priority works for regular https downloads (https encountered first) + assert "https://depot.galaxyproject.org/singularity/ubuntu:22.04" in result + assert "nf-core/ubuntu:22.04" not in result + + # Verify that the priority works for regular https downloads (https encountered second) + assert "biocontainers/umi-transfer:1.5.0--h715e4b3_0" not in result + assert "https://depot.galaxyproject.org/singularity/umi-transfer:1.5.0--h715e4b3_0" in result + + # Verify that the priority works for images with and without explicit registry + # No priority here, though - the first is retained. + assert "nf-core/sortmerna:4.3.7--6502243397c065ba" in result + assert "quay.io/nf-core/sortmerna:4.3.7--6502243397c065ba" not in result + + # Verify that different versions of the same tool and different build numbers are retained + assert "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_1" in result + assert "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_0" in result + assert "https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1" in result + + # Verify that Seqera containers are not deduplicated + assert ( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data" + in result + ) + assert ( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data" + in result + ) + # # Tests for 'singularity_pull_image' # @@ -376,10 +445,72 @@ def test_get_singularity_images(self, tmp_path, mock_fetch_wf_config): @mock.patch("os.symlink") @mock.patch("os.open") @mock.patch("os.close") - @mock.patch("re.sub") @mock.patch("os.path.basename") @mock.patch("os.path.dirname") def test_symlink_singularity_images( + self, + tmp_path, + mock_dirname, + mock_basename, + mock_close, + mock_open, + mock_symlink, + mock_makedirs, + ): + # Setup + mock_dirname.return_value = f"{tmp_path}/path/to" + mock_basename.return_value = "singularity-image.img" + mock_open.return_value = 12 # file descriptor + mock_close.return_value = 12 # file descriptor + + download_obj = DownloadWorkflow( + pipeline="dummy", + outdir=tmp_path, + container_library=( + "quay.io", + "community-cr-prod.seqera.io/docker/registry/v2", + "depot.galaxyproject.org/singularity", + ), + ) + + # Call the method + download_obj.symlink_singularity_images(f"{tmp_path}/path/to/singularity-image.img") + + # Check that os.makedirs was called with the correct arguments + mock_makedirs.assert_any_call(f"{tmp_path}/path/to", exist_ok=True) + + # Check that os.open was called with the correct arguments + mock_open.assert_any_call(f"{tmp_path}/path/to", os.O_RDONLY) + + # Check that os.symlink was called with the correct arguments + expected_calls = [ + mock.call( + "./singularity-image.img", + "./quay.io-singularity-image.img", + dir_fd=12, + ), + mock.call( + "./singularity-image.img", + "./community-cr-prod.seqera.io-docker-registry-v2-singularity-image.img", + dir_fd=12, + ), + mock.call( + "./singularity-image.img", + "./depot.galaxyproject.org-singularity-singularity-image.img", + dir_fd=12, + ), + ] + mock_symlink.assert_has_calls(expected_calls, any_order=True) + + @with_temporary_folder + @mock.patch("os.makedirs") + @mock.patch("os.symlink") + @mock.patch("os.open") + @mock.patch("os.close") + @mock.patch("re.sub") + @mock.patch("os.path.basename") + @mock.patch("os.path.dirname") + def test_symlink_singularity_images_registry( self, tmp_path, mock_dirname, @@ -400,23 +531,22 @@ def test_symlink_singularity_images( download_obj = DownloadWorkflow( pipeline="dummy", outdir=tmp_path, - container_library=("mirage-the-imaginative-registry.io", "quay.io"), + container_library=("quay.io", "community-cr-prod.seqera.io/docker/registry/v2"), ) - # Call the method + download_obj.registry_set = {"quay.io", "community-cr-prod.seqera.io/docker/registry/v2"} + + # Call the method with registry - should not happen, but preserve it then. download_obj.symlink_singularity_images(f"{tmp_path}/path/to/quay.io-singularity-image.img") print(mock_resub.call_args) # Check that os.makedirs was called with the correct arguments mock_makedirs.assert_any_call(f"{tmp_path}/path/to", exist_ok=True) - # Check that os.open was called with the correct arguments - mock_open.assert_called_once_with(f"{tmp_path}/path/to", os.O_RDONLY) - # Check that os.symlink was called with the correct arguments - mock_symlink.assert_any_call( + mock_symlink.assert_called_with( "./quay.io-singularity-image.img", - "./mirage-the-imaginative-registry.io-quay.io-singularity-image.img", + "./community-cr-prod.seqera.io-docker-registry-v2-singularity-image.img", dir_fd=12, ) # Check that there is no attempt to symlink to itself (test parameters would result in that behavior if not checked in the function) @@ -425,6 +555,10 @@ def test_symlink_singularity_images( not in mock_symlink.call_args_list ) + # Normally it would be called for each registry, but since quay.io is part of the name, it + # will only be called once, as no symlink to itself must be created. + mock_open.assert_called_once_with(f"{tmp_path}/path/to", os.O_RDONLY) + # # Test for gather_registries' # @@ -446,10 +580,16 @@ def test_gather_registries(self, tmp_path, mock_fetch_wf_config): download_obj.gather_registries(tmp_path) assert download_obj.registry_set assert isinstance(download_obj.registry_set, set) - assert len(download_obj.registry_set) == 6 + assert len(download_obj.registry_set) == 8 assert "quay.io" in download_obj.registry_set # default registry, if no container library is provided. - assert "depot.galaxyproject.org" in download_obj.registry_set # default registry, often hardcoded in modules + assert ( + "depot.galaxyproject.org/singularity" in download_obj.registry_set + ) # default registry, often hardcoded in modules + assert "community.wave.seqera.io/library" in download_obj.registry_set # Seqera containers Docker + assert ( + "community-cr-prod.seqera.io/docker/registry/v2" in download_obj.registry_set + ) # Seqera containers Singularity https:// download assert "apptainer-registry.io" in download_obj.registry_set assert "docker.io" in download_obj.registry_set assert "podman-registry.io" in download_obj.registry_set @@ -483,7 +623,14 @@ def test_singularity_image_filenames(self, tmp_path): download_obj = DownloadWorkflow(pipeline="dummy", outdir=tmp_path) download_obj.outdir = tmp_path download_obj.container_cache_utilisation = "amend" - download_obj.registry_set = {"docker.io", "quay.io", "depot.galaxyproject.org"} + + download_obj.registry_set = { + "docker.io", + "quay.io", + "depot.galaxyproject.org/singularity", + "community.wave.seqera.io/library", + "community-cr-prod.seqera.io/docker/registry/v2", + } ## Test phase I: Container not yet cached, should be amended to cache # out_path: str, Path to cache @@ -501,11 +648,13 @@ def test_singularity_image_filenames(self, tmp_path): self.assertTrue(all((isinstance(element, str), element is None) for element in result)) # assert that the correct out_path is returned that points to the cache - assert result[0].endswith("/cachedir/singularity-bbmap-38.93--he522d1c_0.img") + assert result[0].endswith("/cachedir/bbmap-38.93--he522d1c_0.img") ## Test phase II: Test various container names # out_path: str, Path to cache # cache_path: None + + # Test --- mulled containers # result = download_obj.singularity_image_filenames( "quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0" ) @@ -513,9 +662,32 @@ def test_singularity_image_filenames(self, tmp_path): "/cachedir/biocontainers-mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2-59cdd445419f14abac76b31dd0d71217994cbcc9-0.img" ) + # Test --- Docker containers without registry # result = download_obj.singularity_image_filenames("nf-core/ubuntu:20.04") assert result[0].endswith("/cachedir/nf-core-ubuntu-20.04.img") + # Test --- Docker container with explicit registry -> should be trimmed # + result = download_obj.singularity_image_filenames("docker.io/nf-core/ubuntu:20.04") + assert result[0].endswith("/cachedir/nf-core-ubuntu-20.04.img") + + # Test --- Docker container with explicit registry not in registry set -> can't be trimmed + result = download_obj.singularity_image_filenames("mirage-the-imaginative-registry.io/nf-core/ubuntu:20.04") + assert result[0].endswith("/cachedir/mirage-the-imaginative-registry.io-nf-core-ubuntu-20.04.img") + + # Test --- Seqera Docker containers: Trimmed, because it is hard-coded in the registry set. + result = download_obj.singularity_image_filenames( + "community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264" + ) + assert result[0].endswith("/cachedir/coreutils-9.5--ae99c88a9b28c264.img") + + # Test --- Seqera Singularity containers: Trimmed, because it is hard-coded in the registry set. + result = download_obj.singularity_image_filenames( + "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data" + ) + assert result[0].endswith( + "cachedir/blobs-sha256-c2-c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975-data.img" + ) + ## Test phase III: Container will be cached but also copied to out_path # out_path: str, Path to cache # cache_path: str, Path to cache @@ -525,8 +697,8 @@ def test_singularity_image_filenames(self, tmp_path): ) self.assertTrue(all(isinstance(element, str) for element in result)) - assert result[0].endswith("/singularity-images/singularity-bbmap-38.93--he522d1c_0.img") - assert result[1].endswith("/cachedir/singularity-bbmap-38.93--he522d1c_0.img") + assert result[0].endswith("/singularity-images/bbmap-38.93--he522d1c_0.img") + assert result[1].endswith("/cachedir/bbmap-38.93--he522d1c_0.img") ## Test phase IV: Expect an error if no NXF_SINGULARITY_CACHEDIR is defined os.environ["NXF_SINGULARITY_CACHEDIR"] = ""