-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #221 from EGA-archive/EE-2610_autocorrect
EE-2610 Infer or autocorrect format in the genomic range cli args
- Loading branch information
Showing
6 changed files
with
172 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
5.1.0 | ||
5.2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import logging | ||
import re | ||
|
||
|
||
def autocorrect_format_in_genomic_range_args(name: str, genomic_range_args: tuple, possible_format_list: list) -> tuple:
    """Reconcile the user-requested file format with the one found in the file name.

    The format lives at index 4 of ``genomic_range_args``. A format detected
    in ``name`` takes precedence over the one supplied by the user. A warning
    is logged when the two disagree, and another when neither is available.

    Args:
        name: File name searched for one of the known format extensions.
        genomic_range_args: Genomic range arguments. The user format is read
            from index 4 only when the tuple has exactly five elements.
        possible_format_list: Candidate formats passed to
            ``detect_file_format``.

    Returns:
        A new tuple whose element at index 4 is the detected format, falling
        back to the user-supplied one (``None`` when neither exists). Tuples
        shorter than five elements are padded with ``None`` up to index 4.
    """
    format_index = 4
    file_format_from_user = genomic_range_args[format_index] if len(genomic_range_args) == 5 else None
    detected_file_format = detect_file_format(name, possible_format_list)

    if file_format_from_user and detected_file_format and file_format_from_user != detected_file_format:
        logging.warning(
            f"Warning: The specified format {file_format_from_user} does not match the detected format in the "
            f"file name, {name} , detected format: {detected_file_format}. The detected format will be used "
            f"since transcoding is not yet supported by the file distribution service.")

    if not file_format_from_user and not detected_file_format:
        logging.warning(
            "Warning: No file format was specified nor detected. The file distribution service will use 'BAM' as "
            "the default format. If you require a different format, please specify it using the '--format' option,"
            "followed by the desired format (e.g., '--format CRAM'). For a list of supported formats, "
            "use the '--help' option.")

    genomic_range_args_list = list(genomic_range_args)
    # Pad short tuples so the assignment below cannot raise IndexError when
    # the caller did not provide a format slot at all.
    missing_slots = format_index + 1 - len(genomic_range_args_list)
    if missing_slots > 0:
        genomic_range_args_list.extend([None] * missing_slots)
    genomic_range_args_list[format_index] = detected_file_format or file_format_from_user
    return tuple(genomic_range_args_list)
|
||
|
||
def is_bam_or_cram_file(filename: str):
    """Tell whether ``filename`` contains a ``.bam`` or ``.cram`` extension.

    Returns the truthy result of the first successful search (BAM is tried
    before CRAM), or the falsy result of the CRAM search when neither matches.
    """
    bam_match = search_format_in_filename("bam", filename)
    if bam_match:
        return bam_match
    return search_format_in_filename("cram", filename)
|
||
|
||
def search_format_in_filename(file_format: str, filename: str):
    """Search ``filename`` for ``.<file_format>``, ignoring case.

    Args:
        file_format: Extension to look for, without the leading dot. It is
            matched literally.
        filename: File name to search.

    Returns:
        The ``re.Match`` of the first occurrence, or ``None`` when the
        extension does not appear in ``filename``.
    """
    # A raw string avoids the invalid "\." escape sequence, and re.escape keeps
    # metacharacters in the format from being interpreted as regex syntax.
    return re.search(rf"\.{re.escape(file_format)}", filename, re.IGNORECASE)
|
||
|
||
def detect_file_format(filename, possible_format_list):
    """Return the first known format whose extension appears in ``filename``.

    Each candidate is lower-cased before being searched for; candidates are
    tried in list order and the first hit wins. Returns ``None`` when no
    candidate is found.
    """
    return next(
        (
            candidate
            for candidate in possible_format_list
            if search_format_in_filename(candidate.lower(), filename)
        ),
        None,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from unittest.mock import patch | ||
|
||
import pytest | ||
from pyega3.libs.file_format import autocorrect_format_in_genomic_range_args, is_bam_or_cram_file | ||
|
||
|
||
@pytest.fixture
def mock_warning():
    """Patch ``logging.warning`` for the duration of a test and yield the mock."""
    warning_patcher = patch('logging.warning')
    with warning_patcher as patched_warning:
        yield patched_warning
|
||
|
||
def test_format_matches_detected(mock_warning):
    """A user format that agrees with the file name is kept and nothing is logged."""
    original_args = ("chr1", 100, 200, "reference", "BAM")

    corrected_args = autocorrect_format_in_genomic_range_args(
        "example.bam.cip",
        original_args,
        ["BAM", "CRAM"],
    )

    assert corrected_args == original_args
    mock_warning.assert_not_called()
|
||
|
||
def test_format_does_not_match_detected(mock_warning):
    """A user format that contradicts the file name is replaced and a warning is logged."""
    file_name = "example.cram.cip"
    supported_formats = ["BAM", "CRAM"]
    user_args = ("chr1", 100, 200, "reference", "BAM")
    expected_warning = ("Warning: The specified format BAM does not match the detected format in "
                        "the file name, example.cram.cip , detected format: CRAM. The detected "
                        "format will be used since transcoding is not yet supported by the file "
                        "distribution service.")

    corrected_args = autocorrect_format_in_genomic_range_args(file_name, user_args, supported_formats)

    assert corrected_args == ("chr1", 100, 200, "reference", "CRAM")
    mock_warning.assert_called_once_with(expected_warning)
|
||
|
||
def test_no_format_specified_and_not_detected(mock_warning):
    """With no user format and no detectable extension, the args are unchanged and the default-format warning is logged."""
    file_name = "example.unknown.cip"
    supported_formats = ["BAM", "CRAM"]
    user_args = ("chr1", 100, 200, "reference", None)
    expected_warning = (
        "Warning: No file format was specified nor detected. The file distribution service will use 'BAM' as the "
        "default format. If you require a different format, please specify it using the '--format' option,"
        "followed by the desired format (e.g., '--format CRAM'). For a list of supported formats, "
        "use the '--help' option.")

    corrected_args = autocorrect_format_in_genomic_range_args(file_name, user_args, supported_formats)

    assert corrected_args == user_args
    mock_warning.assert_called_once_with(expected_warning)
|
||
|
||
def test_no_format_specified_but_detected(mock_warning):
    """With no user format, the format detected in the file name is filled in silently."""
    user_args = ("chr1", 100, 200, "reference", None)

    corrected_args = autocorrect_format_in_genomic_range_args(
        "example.cram.cip",
        user_args,
        ["BAM", "CRAM"],
    )

    assert corrected_args == ("chr1", 100, 200, "reference", "CRAM")
    mock_warning.assert_not_called()
|
||
|
||
def test_is_bam_or_cram_file_returns_true():
    """File names carrying a BAM or CRAM extension are recognised."""
    matching_names = (
        "example.bam",
        "example.cram",
        "example.cram.cip",
        "example.2.bam.cip",
        "example.bam.cip",
    )
    for candidate in matching_names:
        assert is_bam_or_cram_file(candidate)
|
||
|
||
def test_is_bam_or_cram_file_returns_false():
    """File names without a BAM or CRAM extension are rejected."""
    for candidate in ("example.vcf.cip", "example.txt"):
        assert not is_bam_or_cram_file(candidate)