From af4683ca91ac60db1deafdc6ad2ac12f26c3f937 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 22 Nov 2024 15:18:32 -0700 Subject: [PATCH 01/23] Wrap pandas functions to support not including `None` with the NA values argument --- schematic/models/validate_attribute.py | 3 +- .../database/synapse_database_wrapper.py | 3 +- schematic/store/synapse.py | 21 ++++++-- schematic/utils/df_utils.py | 34 ++++++++++--- schematic_api/api/routes.py | 3 +- tests/integration/test_commands.py | 12 ++--- tests/integration/test_manifest_submission.py | 3 +- tests/integration/test_metadata_model.py | 3 +- tests/test_api.py | 8 +-- tests/test_store.py | 9 ++-- tests/test_utils.py | 6 +-- tests/test_viz.py | 9 ++-- tests/unit/test_df_utils.py | 49 +++++++++++++++++++ 13 files changed, 126 insertions(+), 37 deletions(-) create mode 100644 tests/unit/test_df_utils.py diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 9b13bebaf..52c30414f 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -17,6 +17,7 @@ from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import read_csv from schematic.utils.validate_rules_utils import validation_rule_info from schematic.utils.validate_utils import ( comma_separated_list_regex, @@ -868,7 +869,7 @@ def _get_target_manifest_dataframes( entity: File = self.synStore.getDatasetManifest( datasetId=dataset_id, downloadFile=True ) - manifests.append(pd.read_csv(entity.path)) + manifests.append(read_csv(entity.path)) return dict(zip(manifest_ids, manifests)) def get_target_manifests( diff --git a/schematic/store/database/synapse_database_wrapper.py b/schematic/store/database/synapse_database_wrapper.py index b827b140f..ba0ed2dc9 100644 --- a/schematic/store/database/synapse_database_wrapper.py +++ b/schematic/store/database/synapse_database_wrapper.py @@ -8,6 +8,7 @@ from opentelemetry import trace from schematic.store.synapse_tracker import SynapseEntityTracker +from schematic.utils.df_utils import read_csv class SynapseTableNameError(Exception): @@ -108,7 +109,7 @@ def execute_sql_query( pandas.DataFrame: The queried table """ result = self.execute_sql_statement(query, include_row_data) - table = pandas.read_csv(result.filepath) + table = read_csv(result.filepath) return table def execute_sql_statement( diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 7ccd98362..bf5bb6c49 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -58,7 +58,12 @@ from schematic.store.base import BaseStorage from schematic.store.database.synapse_database import SynapseDatabase from schematic.store.synapse_tracker import SynapseEntityTracker -from schematic.utils.df_utils import col_in_dataframe, load_df, update_df +from schematic.utils.df_utils import ( + STR_NA_VALUES_FILTERED, + col_in_dataframe, + load_df, + update_df, +) # entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment # Please do not remove these import statements @@ -401,7 +406,7 @@ def query_fileview( try: self.storageFileviewTable = self.syn.tableQuery( query=self.fileview_query, - ).asDataFrame() + ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) except SynapseHTTPError as exc: exception_text = str(exc) if "Unknown column path" in exception_text: @@ -1433,7 +1438,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable """ results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) - df = results.asDataFrame(rowIdAndVersionInIndex=False) + df = results.asDataFrame( + rowIdAndVersionInIndex=False, + na_values=STR_NA_VALUES_FILTERED, + keep_default_na=False, + ) return df, results @@ -3485,7 +3494,11 @@ def query(self, tidy=True, force=False): if self.table is None or force: fileview_id = self.view_schema["id"] self.results = self.synapse.tableQuery(f"select * from {fileview_id}") - self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False) + self.table = self.results.asDataFrame( + rowIdAndVersionInIndex=False, + na_values=STR_NA_VALUES_FILTERED, + keep_default_na=False, + ) if tidy: self.tidy_table() return self.table diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index b25e7db82..cc9f5ed9b 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -4,17 +4,41 @@ import logging from copy import deepcopy -from time import perf_counter -from typing import Union, Any, Optional from datetime import datetime +from time import perf_counter +from typing import Any, Optional, Union + import dateparser as dp -import pandas as pd import numpy as np +import pandas as pd from pandarallel import pandarallel # type: ignore +from pandas._libs.parsers import STR_NA_VALUES + +STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES) + +try: + STR_NA_VALUES_FILTERED.remove("None") +except KeyError: + pass logger = logging.getLogger(__name__) +def read_csv( + path_or_buffer: str, keep_default_na=False, encoding="utf8", **load_args: Any +) -> pd.DataFrame: + na_values = load_args.pop( + "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None + ) + return pd.read_csv( # type: ignore + path_or_buffer, + na_values=na_values, + keep_default_na=keep_default_na, + encoding=encoding, + **load_args, + ) + + def load_df( file_path: str, preserve_raw_input: bool = True, @@ -45,9 +69,7 @@ def load_df( t_load_df = perf_counter() # Read CSV to df as type specified in kwargs - org_df = pd.read_csv( # type: ignore - file_path, keep_default_na=True, encoding="utf8", **load_args - ) + org_df = read_csv(file_path, encoding="utf8", **load_args) # type: ignore if not isinstance(org_df, pd.DataFrame): raise ValueError( ( diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 97484c15c..ad1e25279 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -20,6 +20,7 @@ from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser from schematic.store.synapse import ManifestDownload, SynapseStorage +from schematic.utils.df_utils import read_csv from schematic.utils.general import create_temp_folder, entity_type_mapping from schematic.utils.schema_utils import ( DisplayLabelType, @@ -178,7 +179,7 @@ def parse_bool(str_bool): def return_as_json(manifest_local_file_path): - manifest_csv = pd.read_csv(manifest_local_file_path) + manifest_csv = read_csv(manifest_local_file_path) manifest_json = manifest_csv.to_dict(orient="records") return manifest_json diff --git a/tests/integration/test_commands.py b/tests/integration/test_commands.py index 8658f6e6b..e0edfd853 100644 --- a/tests/integration/test_commands.py +++ b/tests/integration/test_commands.py @@ -4,16 +4,16 @@ import uuid from io import BytesIO +import numpy as np import pytest import requests -from openpyxl import load_workbook from click.testing import CliRunner -import pandas as pd -import numpy as np +from openpyxl import load_workbook -from schematic.configuration.configuration import Configuration, CONFIG +from schematic.configuration.configuration import CONFIG, Configuration from schematic.manifest.commands import manifest from schematic.models.commands import model +from schematic.utils.df_utils import read_csv from tests.conftest import ConfigurationForTesting LIGHT_BLUE = "FFEAF7F9" # Required cell @@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None: # command has no (python) errors, has exit code 0 assert result.exit_code == 0 - biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv") - patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv") + biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv") + patient_df = read_csv("tests/data/example.Patient.manifest.csv") # Remove created files: finally: diff --git a/tests/integration/test_manifest_submission.py b/tests/integration/test_manifest_submission.py index 92e6911c1..92a52ebe9 100644 --- a/tests/integration/test_manifest_submission.py +++ b/tests/integration/test_manifest_submission.py @@ -12,6 +12,7 @@ from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import read_csv from tests.conftest import ConfigurationForTesting, Helpers from tests.utils import CleanupItem @@ -73,7 +74,7 @@ def validate_submitted_manifest_file( manifest_file_path = os.path.join( download_location, manifest_data["properties"]["name"] ) - manifest_submitted_df = pd.read_csv(manifest_file_path) + manifest_submitted_df = read_csv(manifest_file_path) assert "entityId" in manifest_submitted_df.columns assert "Id" in manifest_submitted_df.columns diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 2178a83b8..3d42b13b5 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -23,6 +23,7 @@ from synapseclient.models import File, Folder from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import create_temp_folder from tests.conftest import Helpers, metadata_model from tests.utils import CleanupItem @@ -531,7 +532,7 @@ def _submit_and_verify_manifest( ) manifest_table = synapse_store.syn.tableQuery( f"select * from {expected_table_id}", downloadLocation=download_dir - ).asDataFrame() + ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) # AND the columns in the manifest table should reflect the ones in the file table_columns = manifest_table.columns diff --git a/tests/test_api.py b/tests/test_api.py index 1f2d79add..842210d5b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -14,11 +14,11 @@ from flask.testing import FlaskClient from opentelemetry import trace -from schematic.configuration.configuration import Configuration +from schematic.configuration.configuration import CONFIG, Configuration from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser +from schematic.utils.df_utils import read_csv from schematic.utils.general import create_temp_folder -from schematic.configuration.configuration import CONFIG logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -838,7 +838,7 @@ def test_generate_manifest_file_based_annotations( response_google_sheet = json.loads(response.data) # open the google sheet - google_sheet_df = pd.read_csv( + google_sheet_df = read_csv( response_google_sheet[0] + "/export?gid=0&format=csv" ) @@ -894,7 +894,7 @@ def test_generate_manifest_not_file_based_with_annotations( response_google_sheet = json.loads(response.data) # open the google sheet - google_sheet_df = pd.read_csv( + google_sheet_df = read_csv( response_google_sheet[0] + "/export?gid=0&format=csv" ) diff --git a/tests/test_store.py b/tests/test_store.py index c9f32bec2..e90908876 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -28,6 +28,7 @@ from schematic.schemas.data_model_parser import DataModelParser from schematic.store.base import BaseStorage from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage +from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import check_synapse_cache_size, create_temp_folder from tests.conftest import Helpers from tests.utils import CleanupItem @@ -1244,7 +1245,7 @@ async def copy_folder_and_update_manifest( table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId) days_to_follow_up = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1281,7 +1282,7 @@ async def copy_folder_and_update_manifest( table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId) days_to_follow_up = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1343,7 +1344,7 @@ async def test_upsert_table( # Query table for DaystoFollowUp column table_query = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1384,7 +1385,7 @@ async def test_upsert_table( table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId) table_query = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2a0744439..f708e777b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -27,7 +27,7 @@ ) from schematic.schemas.data_model_parser import DataModelParser from schematic.utils import cli_utils, df_utils, general, io_utils, validate_utils -from schematic.utils.df_utils import load_df +from schematic.utils.df_utils import load_df, read_csv from schematic.utils.general import ( calculate_datetime, check_synapse_cache_size, @@ -473,7 +473,7 @@ def test_load_df(self, helpers, preserve_raw_input): test_col = "Check NA" file_path = helpers.get_data_path("mock_manifests", "Invalid_Test_Manifest.csv") - unprocessed_df = pd.read_csv(file_path, encoding="utf8") + unprocessed_df = read_csv(file_path, encoding="utf8") df = df_utils.load_df( file_path, preserve_raw_input=preserve_raw_input, data_model=False ) @@ -1100,7 +1100,7 @@ def test_convert_nan_entries_to_empty_strings( manifest_path = helpers.get_data_path(manifest) model_path = helpers.get_data_path(model) - ## Gather parmeters needed to run validate_manifest_rules + # Gather parmeters needed to run validate_manifest_rules errors = [] load_args = { "dtype": "string", diff --git a/tests/test_viz.py b/tests/test_viz.py index b94d79688..2ab78b9ce 100644 --- a/tests/test_viz.py +++ b/tests/test_viz.py @@ -1,11 +1,10 @@ import json import logging -import os from io import StringIO -import pandas as pd import pytest +from schematic.utils.df_utils import read_csv from schematic.visualization.attributes_explorer import AttributesExplorer from schematic.visualization.tangled_tree import TangledTree @@ -44,7 +43,7 @@ class TestVisualization: def test_ae(self, helpers, attributes_explorer): attributes_str = attributes_explorer.parse_attributes(save_file=False) - df = pd.read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"]) + df = read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"]) # For the attributes df define expected columns expect_col_names = [ @@ -76,7 +75,7 @@ def test_ce(self, component, attributes_explorer): component=component, save_file=False, include_index=False ) # convert to dataframe - component_attributes = pd.read_csv(StringIO(component_attributes_str)) + component_attributes = read_csv(StringIO(component_attributes_str)) # For the attributes df define expected columns expect_col_names = [ @@ -103,7 +102,7 @@ def test_text(self, helpers, tangled_tree): # Get text for tangled tree. text_str = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False) - df = pd.read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"]) + df = read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"]) # Define expected text associated with 'Patient' and 'Imaging' tree expected_patient_text = ["Biospecimen", "BulkRNA-seqAssay"] diff --git a/tests/unit/test_df_utils.py b/tests/unit/test_df_utils.py new file mode 100644 index 000000000..28db591a2 --- /dev/null +++ b/tests/unit/test_df_utils.py @@ -0,0 +1,49 @@ +from io import BytesIO + +import numpy as np +import pandas as pd +from pandas._libs.parsers import STR_NA_VALUES + +from schematic.utils.df_utils import read_csv + + +class TestReadCsv: + def test_none_in_na_values(self) -> None: + # GIVEN a pandas DF that includes a column with a None value + df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]}) + + # AND None is included in the STR_NA_VALUES + if "None" not in STR_NA_VALUES: + STR_NA_VALUES.add("None") + + # AND its CSV representation + csv_buffer = BytesIO() + df.to_csv(csv_buffer, index=False) + csv_buffer.seek(0) + + # WHEN the CSV is read + result = read_csv(csv_buffer, na_values=STR_NA_VALUES) + + # THEN the None string value is not preserved + assert not result.equals(df) + assert result["col1"][0] == "AAA" + assert result["col1"][1] == "BBB" + assert result["col1"][2] is np.nan + + def test_none_not_in_na_values(self) -> None: + # GIVEN a pandas DF that includes a column with a None value + df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]}) + + # AND its CSV representation + csv_buffer = BytesIO() + df.to_csv(csv_buffer, index=False) + csv_buffer.seek(0) + + # WHEN the CSV is read + result = read_csv(csv_buffer) + + # THEN the None string value is preserved + assert result.equals(df) + assert result["col1"][0] == "AAA" + assert result["col1"][1] == "BBB" + assert result["col1"][2] == "None" From 21421dfb1824d2c183bab3d3b2bebe37c966ca88 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 22 Nov 2024 15:28:18 -0700 Subject: [PATCH 02/23] Ignore types --- schematic/utils/df_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index cc9f5ed9b..4bf5612ef 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -12,7 +12,7 @@ import numpy as np import pandas as pd from pandarallel import pandarallel # type: ignore -from pandas._libs.parsers import STR_NA_VALUES +from pandas._libs.parsers import STR_NA_VALUES # type: ignore STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES) @@ -25,7 +25,10 @@ def read_csv( - path_or_buffer: str, keep_default_na=False, encoding="utf8", **load_args: Any + path_or_buffer: str, + keep_default_na: bool = False, + encoding: str = "utf8", + **load_args: Any, ) -> pd.DataFrame: na_values = load_args.pop( "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None From 0b221ce2445efcaac063a1fb2aeb4e3e7ba1f0c7 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 22 Nov 2024 15:36:36 -0700 Subject: [PATCH 03/23] pylint issues --- schematic/utils/df_utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 4bf5612ef..6c08dae1d 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -12,7 +12,9 @@ import numpy as np import pandas as pd from pandarallel import pandarallel # type: ignore -from pandas._libs.parsers import STR_NA_VALUES # type: ignore + +# type: ignore # pylint:disable=no-name-in-module +from pandas._libs.parsers import STR_NA_VALUES STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES) @@ -30,6 +32,18 @@ def read_csv( encoding: str = "utf8", **load_args: Any, ) -> pd.DataFrame: + """ + A wrapper around pd.read_csv that filters out "None" from the na_values list. + + Args: + path_or_buffer: The path to the file or a buffer containing the file. + keep_default_na: Whether to keep the default na_values list. + encoding: The encoding of the file. + **load_args: Additional arguments to pass to pd.read_csv. + + Returns: + pd.DataFrame: The dataframe created from the CSV file or buffer. + """ na_values = load_args.pop( "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None ) From 5af786df107ecd424ca918f8a21832724e5be30d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 22 Nov 2024 15:40:48 -0700 Subject: [PATCH 04/23] ordering of ignore --- schematic/utils/df_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 6c08dae1d..ee458ba17 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -13,8 +13,8 @@ import pandas as pd from pandarallel import pandarallel # type: ignore -# type: ignore # pylint:disable=no-name-in-module -from pandas._libs.parsers import STR_NA_VALUES +# pylint:disable=no-name-in-module +from pandas._libs.parsers import STR_NA_VALUES # type: ignore STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES) From dcb20b1913d53eabe6fbedb49bcabed002079129 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 25 Nov 2024 09:45:22 -0700 Subject: [PATCH 05/23] Add to integration test to cover none in a manifest --- tests/integration/test_manifest_submission.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_manifest_submission.py b/tests/integration/test_manifest_submission.py index 92a52ebe9..84581f7d8 100644 --- a/tests/integration/test_manifest_submission.py +++ b/tests/integration/test_manifest_submission.py @@ -4,7 +4,6 @@ import uuid from typing import Any, Callable, Dict -import pandas as pd import pytest import requests from flask.testing import FlaskClient @@ -1036,6 +1035,11 @@ def test_submit_manifest_with_hide_blanks( randomized_annotation_content = str(uuid.uuid4()) df["RandomizedAnnotation"] = randomized_annotation_content + # AND a "None" string remains in the manifest + df["NoneString"] = "None" + df["NoneString1"] = "none" + df["NoneString2"] = "NoNe" + with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file: # Write the DF to a temporary file df.to_csv(tmp_file.name, index=False) @@ -1071,6 +1075,9 @@ def test_submit_manifest_with_hide_blanks( modified_file = syn.get(df["entityId"][0], downloadFile=False) assert modified_file is not None assert modified_file["RandomizedAnnotation"][0] == randomized_annotation_content + assert modified_file["NoneString"][0] == "None" + assert modified_file["NoneString1"][0] == "none" + assert modified_file["NoneString2"][0] == "NoNe" # AND the blank annotations are not present assert "Genome Build" not in modified_file From 6eeacd5ddbfcd62aad94e301859a2d59c0b7def6 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:19:30 -0700 Subject: [PATCH 06/23] Add additional test for manifest --- tests/data/example_test_nones.model.csv | 2 +- .../mock_manifests/Valid_Test_Manifest_with_nones.csv | 10 +++++----- tests/test_utils.py | 9 +++++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/data/example_test_nones.model.csv b/tests/data/example_test_nones.model.csv index e39ec8e5d..77afc1f3c 100644 --- a/tests/data/example_test_nones.model.csv +++ b/tests/data/example_test_nones.model.csv @@ -6,7 +6,7 @@ Sex,,,"Female, Male, Other",,,TRUE,DataProperty,, Year of Birth,,,,,,FALSE,DataProperty,, Diagnosis,,,"Healthy, Cancer",,,TRUE,DataProperty,, Cancer,,,,"Cancer Type, Family History",,FALSE,ValidValue,, -Cancer Type,,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,, +Cancer Type,,,"Breast, Colorectal, Lung, Prostate, Skin, None",,,TRUE,DataProperty,, Family History,list strict,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,, Biospecimen,,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient, Sample ID,,,,,,TRUE,DataProperty,, diff --git a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv index 0bbc8c71f..6790e0a3b 100644 --- a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv +++ b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv @@ -1,5 +1,5 @@ -Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA -MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable -MockComponent,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 -MockComponent,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, -MockComponent,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file +Component,Cancer Type,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA +MockComponent,None,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable +MockComponent,None,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 +MockComponent,None,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, +MockComponent,None,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 704090648..6115c4d21 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -943,13 +943,13 @@ def test_convert_nan_entries_to_empty_strings( **load_args, ) - metadataModel = get_metadataModel(helpers, model) + get_metadataModel(helpers, model) # Instantiate Validate manifest, and run manifest validation # In this step the manifest is modified while running rule # validation so need to do this step to get the updated manfest. vm = ValidateManifest(errors, manifest, manifest_path, dmge, json_schema) - manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules( + manifest, _, _ = vm.validate_manifest_rules( manifest, dmge, restrict_rules=False, @@ -972,6 +972,11 @@ def test_convert_nan_entries_to_empty_strings( assert output["Check List"][2] == [""] assert output["Check List Like Enum"][2] == [] + assert output["Cancer Type"][0] == "None" + assert output["Cancer Type"][1] == "None" + assert output["Cancer Type"][2] == "None" + assert output["Cancer Type"][3] == "None" + def test_get_list_robustness(self, helpers): return From 28f810e229e879a45f808cabc03d2f245ac31062 Mon Sep 17 00:00:00 2001 From: Gianna Jordan <61707471+GiaJordan@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:58:30 -0700 Subject: [PATCH 07/23] [SCHEMATIC-210] Add attribute to nones data model (#1555) Update example_test_nones.model.csv component and add new invalid manifest with nones --- tests/data/example_test_nones.model.csv | 2 +- tests/data/example_test_nones.model.jsonld | 241 +++++++++++++++++- .../Invalid_Test_Manifest_with_nones.csv | 5 + 3 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv diff --git a/tests/data/example_test_nones.model.csv b/tests/data/example_test_nones.model.csv index 77afc1f3c..bb691c793 100644 --- a/tests/data/example_test_nones.model.csv +++ b/tests/data/example_test_nones.model.csv @@ -19,7 +19,7 @@ CRAM,,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,, CSV/TSV,,,,Genome Build,,FALSE,ValidValue,, Genome Build,,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,, Genome FASTA,,,,,,TRUE,DataProperty,, -MockComponent,,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,, +MockComponent,,,,"Component, Cancer Type, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,, Check List,list,,,,,FALSE,DataProperty,, Check List Enum,list,,"ab, cd, ef, gh",,,FALSE,DataProperty,, Check List Like,list like,,,,,FALSE,DataProperty,, diff --git a/tests/data/example_test_nones.model.jsonld b/tests/data/example_test_nones.model.jsonld index c47804236..07a01596b 100644 --- a/tests/data/example_test_nones.model.jsonld +++ b/tests/data/example_test_nones.model.jsonld @@ -309,6 +309,9 @@ }, { "@id": "bts:Skin" + }, + { + "@id": "bts:None" } ], "sms:displayName": "Cancer Type", @@ -468,6 +471,23 @@ "sms:required": "sms:false", "sms:validationRules": [] }, + { + "@id": "bts:None", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "None", + "rdfs:subClassOf": [ + { + "@id": "bts:CancerType" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "None", + "sms:required": "sms:false", + "sms:validationRules": [] + }, { "@id": "bts:Biospecimen", "@type": "rdfs:Class", @@ -877,12 +897,36 @@ { "@id": "bts:Component" }, + { + "@id": "bts:CancerType" + }, { "@id": "bts:CheckList" }, + { + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLike" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListStrict" + }, + { + "@id": "bts:CheckListEnumStrict" + }, { "@id": "bts:CheckRegexList" }, + { + "@id": "bts:CheckRegexListLike" + }, + { + "@id": "bts:CheckRegexListStrict" + }, { "@id": "bts:CheckRegexSingle" }, @@ -953,6 +997,25 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, + "sms:displayName": "Check List", + "sms:required": "sms:false", + "sms:validationRules": [ + "list" + ] + }, + { + "@id": "bts:CheckListEnum", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListEnum", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, "schema:rangeIncludes": [ { "@id": "bts:Ab" @@ -967,7 +1030,111 @@ "@id": "bts:Gh" } ], - "sms:displayName": "Check List", + "sms:displayName": "Check List Enum", + "sms:required": "sms:false", + "sms:validationRules": [ + "list" + ] + }, + { + "@id": "bts:CheckListLike", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListLike", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check List Like", + "sms:required": "sms:false", + "sms:validationRules": [ + "list like" + ] + }, + { + "@id": "bts:CheckListLikeEnum", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListLikeEnum", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "schema:rangeIncludes": [ + { + "@id": "bts:Ab" + }, + { + "@id": "bts:Cd" + }, + { + "@id": "bts:Ef" + }, + { + "@id": "bts:Gh" + } + ], + "sms:displayName": "Check List Like Enum", + "sms:required": "sms:false", + "sms:validationRules": [ + "list like" + ] + }, + { + "@id": "bts:CheckListStrict", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListStrict", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check List Strict", + "sms:required": "sms:false", + "sms:validationRules": [ + "list strict" + ] + }, + { + "@id": "bts:CheckListEnumStrict", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListEnumStrict", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "schema:rangeIncludes": [ + { + "@id": "bts:Ab" + }, + { + "@id": "bts:Cd" + }, + { + "@id": "bts:Ef" + }, + { + "@id": "bts:Gh" + } + ], + "sms:displayName": "Check List Enum Strict", "sms:required": "sms:false", "sms:validationRules": [ "list strict" @@ -988,6 +1155,46 @@ }, "sms:displayName": "Check Regex List", "sms:required": "sms:false", + "sms:validationRules": [ + "list", + "regex match [a-f]" + ] + }, + { + "@id": "bts:CheckRegexListLike", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckRegexListLike", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check Regex List Like", + "sms:required": "sms:false", + "sms:validationRules": [ + "list like", + "regex match [a-f]" + ] + }, + { + "@id": "bts:CheckRegexListStrict", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckRegexListStrict", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check Regex List Strict", + "sms:required": "sms:false", "sms:validationRules": [ "list strict", "regex match [a-f]" @@ -1343,7 +1550,13 @@ "rdfs:label": "Ab", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { @@ -1360,7 +1573,13 @@ "rdfs:label": "Cd", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { @@ -1377,7 +1596,13 @@ "rdfs:label": "Ef", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { @@ -1394,7 +1619,13 @@ "rdfs:label": "Gh", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { diff --git a/tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv b/tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv new file mode 100644 index 000000000..aeab9e410 --- /dev/null +++ b/tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv @@ -0,0 +1,5 @@ +Component,Cancer Type,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA +MockComponent,None,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable +MockComponent,None,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 +MockComponent,None,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, +MockComponent,InvalidValue,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file From e129361982bd53118f925ecccd3dd9d0d9cf51f2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 10 Dec 2024 09:37:09 -0800 Subject: [PATCH 08/23] first commit --- tests/integration/test_validate_manifest.py | 62 +++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 tests/integration/test_validate_manifest.py diff --git a/tests/integration/test_validate_manifest.py b/tests/integration/test_validate_manifest.py new file mode 100644 index 000000000..d302551e7 --- /dev/null +++ b/tests/integration/test_validate_manifest.py @@ -0,0 +1,62 @@ + +import pytest + +from schematic.models.validate_manifest import ValidateManifest +from schematic.schemas.data_model_json_schema import DataModelJSONSchema +from schematic.utils.df_utils import load_df + + +class TestValidateManifest: + + @pytest.mark.parametrize( + ("manifest", "model", "root_node"), + [ + ( + "mock_manifests/Valid_Test_Manifest_with_nones.csv", + "example_test_nones.model.csv", + "MockComponent", + ), + #( + # "mock_manifests/Valid_Test_Manifest_with_nones.csv", + # "example_test_nones.model2.csv", + # "MockComponent", + #), + ], + ) + def test_validate_manifest_values( + self, helpers, manifest, model, root_node + ): + # Get manifest and data model path + manifest_path = helpers.get_data_path(manifest) + model_path = helpers.get_data_path(model) + + # Gather parmeters needed to run validate_manifest_rules + errors = [] + load_args = { + "dtype": "string", + } + + dmge = helpers.get_data_model_graph_explorer(path=model) + + self.data_model_js = DataModelJSONSchema( + jsonld_path=model_path, graph=dmge.graph + ) + json_schema = self.data_model_js.get_json_validation_schema( + root_node, root_node + "_validation" + ) + + manifest = load_df( + manifest_path, + preserve_raw_input=False, + allow_na_values=True, + **load_args, + ) + + vm = ValidateManifest(errors, manifest, manifest_path, dmge, json_schema) + errors, warnings = vm.validate_manifest_values(manifest, json_schema, dmge) + import logging + assert not warnings + assert errors + for error in errors: + logging.warning(error) + assert False From 7a3bb1cc0f83950a064644bb45ecb6626d8b2661 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 10 Dec 2024 10:06:26 -0800 Subject: [PATCH 09/23] ran black --- tests/integration/test_validate_manifest.py | 54 +++++++++++++-------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/tests/integration/test_validate_manifest.py b/tests/integration/test_validate_manifest.py index d302551e7..98530a65f 100644 --- a/tests/integration/test_validate_manifest.py +++ b/tests/integration/test_validate_manifest.py @@ -1,3 +1,4 @@ +from typing import Any import pytest @@ -7,6 +8,7 @@ class TestValidateManifest: + """Tests for ValidateManifest class""" @pytest.mark.parametrize( ("manifest", "model", "root_node"), @@ -16,21 +18,29 @@ class TestValidateManifest: "example_test_nones.model.csv", "MockComponent", ), - #( - # "mock_manifests/Valid_Test_Manifest_with_nones.csv", - # "example_test_nones.model2.csv", - # "MockComponent", - #), + ( + "mock_manifests/Invalid_Test_Manifest_with_nones.csv", + "example_test_nones.model.csv", + "MockComponent", + ), ], ) def test_validate_manifest_values( - self, helpers, manifest, model, root_node + self, helpers: Any, manifest: str, model: str, root_node: str ): + """Tests for ValidateManifest.validate_manifest_values + + Args: + helpers (Any): An object with helper functions + manifest (str): A path to the manifest to be tested + model (str): A path to the model to be tested + root_node (str): The name of the component to be tested + """ # Get manifest and data model path - manifest_path = helpers.get_data_path(manifest) - model_path = helpers.get_data_path(model) + manifest_path: str = helpers.get_data_path(manifest) + model_path: str = helpers.get_data_path(model) - # Gather parmeters needed to run validate_manifest_rules + # Gather parameters needed to run validate_manifest_rules errors = [] load_args = { "dtype": "string", @@ -38,25 +48,29 @@ def test_validate_manifest_values( dmge = helpers.get_data_model_graph_explorer(path=model) - self.data_model_js = DataModelJSONSchema( - jsonld_path=model_path, graph=dmge.graph - ) - json_schema = self.data_model_js.get_json_validation_schema( + data_model_js = DataModelJSONSchema(jsonld_path=model_path, graph=dmge.graph) + json_schema = data_model_js.get_json_validation_schema( root_node, root_node + "_validation" ) - manifest = load_df( + manifest_df = load_df( manifest_path, preserve_raw_input=False, allow_na_values=True, **load_args, ) - vm = ValidateManifest(errors, manifest, manifest_path, dmge, json_schema) - errors, warnings = vm.validate_manifest_values(manifest, json_schema, dmge) - import logging + vm = ValidateManifest(errors, manifest_df, manifest_path, dmge, json_schema) + errors, warnings = vm.validate_manifest_values(manifest_df, json_schema, dmge) assert not warnings assert errors - for error in errors: - logging.warning(error) - assert False + + # Both manifests will have errors of type " is not of type 'array' + # Only the invalid manifest will have errors of type " Date: Wed, 11 Dec 2024 10:47:56 -0800 Subject: [PATCH 10/23] add test for validateModelManifest --- tests/integration/test_metadata_model.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 3d42b13b5..192e159c4 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -13,7 +13,7 @@ import tempfile import uuid from contextlib import nullcontext as does_not_raise -from typing import Callable, Optional +from typing import Callable, Optional, Any import pandas as pd import pytest @@ -22,6 +22,7 @@ from synapseclient.core import utils from synapseclient.models import File, Folder +from schematic.models.metadata import MetadataModel from schematic.store.synapse import SynapseStorage from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import create_temp_folder @@ -567,3 +568,14 @@ def _submit_and_verify_manifest( spy_upload_file_as_table.call_count == 1 spy_upload_file_as_csv.assert_not_called() spy_upload_file_combo.assert_not_called() + + def test_validate_model_manifest(self, helpers: Any) -> None: + mdm = MetadataModel( + inputMModelLocation=helpers.get_data_path("example.model.jsonld"), + data_model_labels="class_label", + inputMModelLocationType="local", + ) + errors, warnings = mdm.validateModelManifest( + manifestPath=helpers.get_data_path("mock_manifests/Valid_Test_Manifest_with_nones.csv"), + rootNode="MockComponent" + ) From 929275f4b860c6eb192b2625d0897a2f5246e414 Mon Sep 17 00:00:00 2001 From: Gianna Jordan <61707471+GiaJordan@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:48:50 -0700 Subject: [PATCH 11/23] [SCHEMATIC-214] change data model and component (#1556) * add valid values to Patient attributes * update data model * add test manifests * update test for new model * update test for new valid value --- tests/data/example.model.csv | 6 +++--- tests/data/example.model.jsonld | 20 +++++++++++++++++++ .../Invalid_none_value_test_manifest.csv | 6 ++++++ .../Valid_none_value_test_manifest.csv | 6 ++++++ tests/integration/test_commands.py | 2 +- tests/test_schemas.py | 2 +- 6 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 tests/data/mock_manifests/Invalid_none_value_test_manifest.csv create mode 100644 tests/data/mock_manifests/Valid_none_value_test_manifest.csv diff --git a/tests/data/example.model.csv b/tests/data/example.model.csv index 7438c7145..1c2923535 100644 --- a/tests/data/example.model.csv +++ b/tests/data/example.model.csv @@ -10,7 +10,7 @@ Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,, Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,, Sample ID,,,,,TRUE,DataProperty,,, -Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,, +Tissue Status,,"Healthy, Malignant, None",,,TRUE,DataProperty,,, Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,, Filename,,,,,TRUE,DataProperty,,,#MockFilename filenameExists^^ File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,, @@ -24,8 +24,8 @@ Check List,,,,,TRUE,DataProperty,,,list Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list Check List Like,,,,,TRUE,DataProperty,,,list like Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like -Check List Strict,,,,,TRUE,DataProperty,,,list strict -Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict +Check List Strict,,,,,TRUE,DataProperty,,,list strict +Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f] Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f] Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f] diff --git a/tests/data/example.model.jsonld b/tests/data/example.model.jsonld index c4279a605..7e4560d50 100644 --- a/tests/data/example.model.jsonld +++ b/tests/data/example.model.jsonld @@ -540,6 +540,9 @@ }, { "@id": "bts:Malignant" + }, + { + "@id": "bts:None" } ], "sms:displayName": "Tissue Status", @@ -563,6 +566,23 @@ "sms:required": "sms:false", "sms:validationRules": [] }, + { + "@id": "bts:None", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "None", + "rdfs:subClassOf": [ + { + "@id": "bts:TissueStatus" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "None", + "sms:required": "sms:false", + "sms:validationRules": [] + }, { "@id": "bts:BulkRNA-seqAssay", "@type": "rdfs:Class", diff --git a/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv new file mode 100644 index 000000000..8230442f3 --- /dev/null +++ b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv @@ -0,0 +1,6 @@ +Sample ID,Patient ID,Tissue Status,Component +1,1,Healthy,Biospecimen +2,2,Malignant,Biospecimen +3,3,None,Biospecimen +4,4,None,Biospecimen +5,5,InvalidValue,Biospecimen diff --git a/tests/data/mock_manifests/Valid_none_value_test_manifest.csv b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv new file mode 100644 index 000000000..b7a0666d6 --- /dev/null +++ b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv @@ -0,0 +1,6 @@ +Sample ID,Patient ID,Tissue Status,Component +1,1,Healthy,Biospecimen +2,2,Malignant,Biospecimen +3,3,None,Biospecimen +4,4,None,Biospecimen +5,5,None,Biospecimen \ No newline at end of file diff --git a/tests/integration/test_commands.py b/tests/integration/test_commands.py index 7471101ec..342736ab1 100644 --- a/tests/integration/test_commands.py +++ b/tests/integration/test_commands.py @@ -339,7 +339,7 @@ def test_generate_empty_google_sheet_manifests( assert False, f"Unexpected data validation found: {dv}" assert tissue_status_validation is not None assert tissue_status_validation.type == "list" - assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$3" + assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$4" # required fields are marked as “light blue”, while other non-required fields are marked as white. for col in ["Sample ID", "Patient ID", "Tissue Status", "Component"]: diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 409c653b0..670aa2474 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -462,7 +462,7 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels) ) # Check Edge directions - assert 4 == (len(graph.out_edges("TissueStatus"))) + assert 6 == (len(graph.out_edges("TissueStatus"))) assert 2 == (len(graph.in_edges("TissueStatus"))) From 2c67d1eb72a8f3d8781d438cf5eda6aecef46f15 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 09:43:55 -0800 Subject: [PATCH 12/23] change test to use new manifests --- tests/integration/test_metadata_model.py | 45 +++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 3277e3fad..eec8a7186 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -572,13 +572,50 @@ def _submit_and_verify_manifest( spy_upload_file_as_csv.assert_not_called() spy_upload_file_combo.assert_not_called() - def test_validate_model_manifest(self, helpers: Any) -> None: + + @pytest.mark.parametrize( + ("manifest", "model", "component"), + [ + ( + "tests/data/mock_manifests/Valid_none_value_test_manifest.csv", + "tests/data/example.model.csv", + "Biospecimen", + ), + ( + "tests/data/mock_manifests/Invalid_none_value_test_manifest.csv", + "tests/data/example.model.csv", + "Biospecimen", + ), + ], + ) + def test_validate_model_manifest(self, manifest: str, model: str, component: str) -> None: + """ + Tests for MetadataModel.validateModelManifest + + Args: + manifest (str): The path to the manifest + model (str): The path to the model + component (str): The component to be tested + """ mdm = MetadataModel( - inputMModelLocation=helpers.get_data_path("example.model.jsonld"), + inputMModelLocation=model, data_model_labels="class_label", inputMModelLocationType="local", ) errors, warnings = mdm.validateModelManifest( - manifestPath=helpers.get_data_path("mock_manifests/Valid_Test_Manifest_with_nones.csv"), - rootNode="MockComponent" + manifestPath=manifest, + rootNode=component ) + + assert not warnings + if manifest == "tests/data/mock_manifests/Valid_none_value_test_manifest.csv": + assert not errors + # The order of the valid values in the error message are random, so the test must be + # slightly complicated: + elif manifest == "tests/data/mock_manifests/Invalid_none_value_test_manifest.csv": + assert errors[0][0] == "6" + assert errors[0][1] == "Tissue Status" + assert errors[0][3] == "InvalidValue" + error_message = errors [0][2] + assert isinstance(error_message, str) + assert error_message.startswith("'InvalidValue' is not one of") From d53d9888519971ee20bbaa7ccb99590b57107569 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 09:47:13 -0800 Subject: [PATCH 13/23] remove uneeded test file --- tests/integration/test_validate_manifest.py | 76 --------------------- 1 file changed, 76 deletions(-) delete mode 100644 tests/integration/test_validate_manifest.py diff --git a/tests/integration/test_validate_manifest.py b/tests/integration/test_validate_manifest.py deleted file mode 100644 index 98530a65f..000000000 --- a/tests/integration/test_validate_manifest.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import Any - -import pytest - -from schematic.models.validate_manifest import ValidateManifest -from schematic.schemas.data_model_json_schema import DataModelJSONSchema -from schematic.utils.df_utils import load_df - - -class TestValidateManifest: - """Tests for ValidateManifest class""" - - @pytest.mark.parametrize( - ("manifest", "model", "root_node"), - [ - ( - "mock_manifests/Valid_Test_Manifest_with_nones.csv", - "example_test_nones.model.csv", - "MockComponent", - ), - ( - "mock_manifests/Invalid_Test_Manifest_with_nones.csv", - "example_test_nones.model.csv", - "MockComponent", - ), - ], - ) - def test_validate_manifest_values( - self, helpers: Any, manifest: str, model: str, root_node: str - ): - """Tests for ValidateManifest.validate_manifest_values - - Args: - helpers (Any): An object with helper functions - manifest (str): A path to the manifest to be tested - model (str): A path to the model to be tested - root_node (str): The name of the component to be tested - """ - # Get manifest and data model path - manifest_path: str = helpers.get_data_path(manifest) - model_path: str = helpers.get_data_path(model) - - # Gather parameters needed to run validate_manifest_rules - errors = [] - load_args = { - "dtype": "string", - } - - dmge = helpers.get_data_model_graph_explorer(path=model) - - data_model_js = DataModelJSONSchema(jsonld_path=model_path, graph=dmge.graph) - json_schema = data_model_js.get_json_validation_schema( - root_node, root_node + "_validation" - ) - - manifest_df = load_df( - manifest_path, - preserve_raw_input=False, - allow_na_values=True, - **load_args, - ) - - vm = ValidateManifest(errors, manifest_df, manifest_path, dmge, json_schema) - errors, warnings = vm.validate_manifest_values(manifest_df, json_schema, dmge) - assert not warnings - assert errors - - # Both manifests will have errors of type " is not of type 'array' - # Only the invalid manifest will have errors of type " Date: Thu, 12 Dec 2024 09:47:47 -0800 Subject: [PATCH 14/23] revert file --- tests/data/example_test_nones.model.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/example_test_nones.model.csv b/tests/data/example_test_nones.model.csv index bb691c793..e39ec8e5d 100644 --- a/tests/data/example_test_nones.model.csv +++ b/tests/data/example_test_nones.model.csv @@ -6,7 +6,7 @@ Sex,,,"Female, Male, Other",,,TRUE,DataProperty,, Year of Birth,,,,,,FALSE,DataProperty,, Diagnosis,,,"Healthy, Cancer",,,TRUE,DataProperty,, Cancer,,,,"Cancer Type, Family History",,FALSE,ValidValue,, -Cancer Type,,,"Breast, Colorectal, Lung, Prostate, Skin, None",,,TRUE,DataProperty,, +Cancer Type,,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,, Family History,list strict,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,, Biospecimen,,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient, Sample ID,,,,,,TRUE,DataProperty,, @@ -19,7 +19,7 @@ CRAM,,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,, CSV/TSV,,,,Genome Build,,FALSE,ValidValue,, Genome Build,,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,, Genome FASTA,,,,,,TRUE,DataProperty,, -MockComponent,,,,"Component, Cancer Type, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,, +MockComponent,,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,, Check List,list,,,,,FALSE,DataProperty,, Check List Enum,list,,"ab, cd, ef, gh",,,FALSE,DataProperty,, Check List Like,list like,,,,,FALSE,DataProperty,, From b5960a4da57866667385c466c60ea0f327124043 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 09:48:51 -0800 Subject: [PATCH 15/23] revert file --- tests/data/example_test_nones.model.jsonld | 241 +-------------------- 1 file changed, 5 insertions(+), 236 deletions(-) diff --git a/tests/data/example_test_nones.model.jsonld b/tests/data/example_test_nones.model.jsonld index 07a01596b..c47804236 100644 --- a/tests/data/example_test_nones.model.jsonld +++ b/tests/data/example_test_nones.model.jsonld @@ -309,9 +309,6 @@ }, { "@id": "bts:Skin" - }, - { - "@id": "bts:None" } ], "sms:displayName": "Cancer Type", @@ -471,23 +468,6 @@ "sms:required": "sms:false", "sms:validationRules": [] }, - { - "@id": "bts:None", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "None", - "rdfs:subClassOf": [ - { - "@id": "bts:CancerType" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "None", - "sms:required": "sms:false", - "sms:validationRules": [] - }, { "@id": "bts:Biospecimen", "@type": "rdfs:Class", @@ -897,36 +877,12 @@ { "@id": "bts:Component" }, - { - "@id": "bts:CancerType" - }, { "@id": "bts:CheckList" }, - { - "@id": "bts:CheckListEnum" - }, - { - "@id": "bts:CheckListLike" - }, - { - "@id": "bts:CheckListLikeEnum" - }, - { - "@id": "bts:CheckListStrict" - }, - { - "@id": "bts:CheckListEnumStrict" - }, { "@id": "bts:CheckRegexList" }, - { - "@id": "bts:CheckRegexListLike" - }, - { - "@id": "bts:CheckRegexListStrict" - }, { "@id": "bts:CheckRegexSingle" }, @@ -997,77 +953,6 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check List", - "sms:required": "sms:false", - "sms:validationRules": [ - "list" - ] - }, - { - "@id": "bts:CheckListEnum", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckListEnum", - "rdfs:subClassOf": [ - { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": [ - { - "@id": "bts:Ab" - }, - { - "@id": "bts:Cd" - }, - { - "@id": "bts:Ef" - }, - { - "@id": "bts:Gh" - } - ], - "sms:displayName": "Check List Enum", - "sms:required": "sms:false", - "sms:validationRules": [ - "list" - ] - }, - { - "@id": "bts:CheckListLike", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckListLike", - "rdfs:subClassOf": [ - { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "Check List Like", - "sms:required": "sms:false", - "sms:validationRules": [ - "list like" - ] - }, - { - "@id": "bts:CheckListLikeEnum", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckListLikeEnum", - "rdfs:subClassOf": [ - { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, "schema:rangeIncludes": [ { "@id": "bts:Ab" @@ -1082,59 +967,7 @@ "@id": "bts:Gh" } ], - "sms:displayName": "Check List Like Enum", - "sms:required": "sms:false", - "sms:validationRules": [ - "list like" - ] - }, - { - "@id": "bts:CheckListStrict", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckListStrict", - "rdfs:subClassOf": [ - { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "Check List Strict", - "sms:required": "sms:false", - "sms:validationRules": [ - "list strict" - ] - }, - { - "@id": "bts:CheckListEnumStrict", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckListEnumStrict", - "rdfs:subClassOf": [ - { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": [ - { - "@id": "bts:Ab" - }, - { - "@id": "bts:Cd" - }, - { - "@id": "bts:Ef" - }, - { - "@id": "bts:Gh" - } - ], - "sms:displayName": "Check List Enum Strict", + "sms:displayName": "Check List", "sms:required": "sms:false", "sms:validationRules": [ "list strict" @@ -1155,46 +988,6 @@ }, "sms:displayName": "Check Regex List", "sms:required": "sms:false", - "sms:validationRules": [ - "list", - "regex match [a-f]" - ] - }, - { - "@id": "bts:CheckRegexListLike", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckRegexListLike", - "rdfs:subClassOf": [ - { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "Check Regex List Like", - "sms:required": "sms:false", - "sms:validationRules": [ - "list like", - "regex match [a-f]" - ] - }, - { - "@id": "bts:CheckRegexListStrict", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckRegexListStrict", - "rdfs:subClassOf": [ - { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "Check Regex List Strict", - "sms:required": "sms:false", "sms:validationRules": [ "list strict", "regex match [a-f]" @@ -1550,13 +1343,7 @@ "rdfs:label": "Ab", "rdfs:subClassOf": [ { - "@id": "bts:CheckListEnum" - }, - { - "@id": "bts:CheckListLikeEnum" - }, - { - "@id": "bts:CheckListEnumStrict" + "@id": "bts:CheckList" } ], "schema:isPartOf": { @@ -1573,13 +1360,7 @@ "rdfs:label": "Cd", "rdfs:subClassOf": [ { - "@id": "bts:CheckListEnum" - }, - { - "@id": "bts:CheckListLikeEnum" - }, - { - "@id": "bts:CheckListEnumStrict" + "@id": "bts:CheckList" } ], "schema:isPartOf": { @@ -1596,13 +1377,7 @@ "rdfs:label": "Ef", "rdfs:subClassOf": [ { - "@id": "bts:CheckListEnum" - }, - { - "@id": "bts:CheckListLikeEnum" - }, - { - "@id": "bts:CheckListEnumStrict" + "@id": "bts:CheckList" } ], "schema:isPartOf": { @@ -1619,13 +1394,7 @@ "rdfs:label": "Gh", "rdfs:subClassOf": [ { - "@id": "bts:CheckListEnum" - }, - { - "@id": "bts:CheckListLikeEnum" - }, - { - "@id": "bts:CheckListEnumStrict" + "@id": "bts:CheckList" } ], "schema:isPartOf": { From a291b3214eeab8834eeb74134655179049e917ae Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 10:13:23 -0800 Subject: [PATCH 16/23] change tests to use new manifests --- tests/test_utils.py | 22 +++++++--------------- tests/test_validation.py | 6 +++--- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6115c4d21..b124eaec8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -908,9 +908,9 @@ def test_validate_property_schema(self, helpers): "Patient", ), ( - "mock_manifests/Valid_Test_Manifest_with_nones.csv", - "example_test_nones.model.csv", - "MockComponent", + "mock_manifests/Valid_none_value_test_manifest.csv", + "example.model.csv", + "Biospecimen", ), ], ) @@ -964,18 +964,10 @@ def test_convert_nan_entries_to_empty_strings( if root_node == "Patient": assert manifest["Family History"][0] == [""] assert output["Family History"][0] == [""] - elif root_node == "MockComponent": - assert manifest["Check List"][2] == [""] - assert manifest["Check List Like Enum"][2] == [] - assert type(manifest["Check NA"][2]) == type(pd.NA) - - assert output["Check List"][2] == [""] - assert output["Check List Like Enum"][2] == [] - - assert output["Cancer Type"][0] == "None" - assert output["Cancer Type"][1] == "None" - assert output["Cancer Type"][2] == "None" - assert output["Cancer Type"][3] == "None" + elif root_node == "Biospecimen": + assert output["Tissue Status"][2] == "None" + assert output["Tissue Status"][3] == "None" + assert output["Tissue Status"][4] == "None" def test_get_list_robustness(self, helpers): return diff --git a/tests/test_validation.py b/tests/test_validation.py index c16a95bb7..06330417b 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -50,9 +50,9 @@ class TestManifestValidation: "Patient", ), ( - "example_test_nones.model.csv", - "mock_manifests/Valid_Test_Manifest_with_nones.csv", - "MockComponent", + "example.model.csv", + "mock_manifests/Valid_none_value_test_manifest.csv", + "Biospecimen", ), ], ids=[ From 1708262a0644d82bfbd78cffeed6afa62f870551 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 10:14:35 -0800 Subject: [PATCH 17/23] remove uneeded manifests --- .../data/mock_manifests/Invalid_Test_Manifest_with_nones.csv | 5 ----- tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv | 5 ----- 2 files changed, 10 deletions(-) delete mode 100644 tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv delete mode 100644 tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv diff --git a/tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv b/tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv deleted file mode 100644 index aeab9e410..000000000 --- a/tests/data/mock_manifests/Invalid_Test_Manifest_with_nones.csv +++ /dev/null @@ -1,5 +0,0 @@ -Component,Cancer Type,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA -MockComponent,None,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable -MockComponent,None,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 -MockComponent,None,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, -MockComponent,InvalidValue,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file diff --git a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv deleted file mode 100644 index 6790e0a3b..000000000 --- a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv +++ /dev/null @@ -1,5 +0,0 @@ -Component,Cancer Type,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA -MockComponent,None,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable -MockComponent,None,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 -MockComponent,None,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, -MockComponent,None,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file From 2cbcae47ca3c0ebeb46bb51114b11150b1ac1b48 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 10:15:07 -0800 Subject: [PATCH 18/23] ran black --- tests/integration/test_metadata_model.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index eec8a7186..b7b120b48 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -572,7 +572,6 @@ def _submit_and_verify_manifest( spy_upload_file_as_csv.assert_not_called() spy_upload_file_combo.assert_not_called() - @pytest.mark.parametrize( ("manifest", "model", "component"), [ @@ -588,7 +587,9 @@ def _submit_and_verify_manifest( ), ], ) - def test_validate_model_manifest(self, manifest: str, model: str, component: str) -> None: + def test_validate_model_manifest( + self, manifest: str, model: str, component: str + ) -> None: """ Tests for MetadataModel.validateModelManifest @@ -603,8 +604,7 @@ def test_validate_model_manifest(self, manifest: str, model: str, component: str inputMModelLocationType="local", ) errors, warnings = mdm.validateModelManifest( - manifestPath=manifest, - rootNode=component + manifestPath=manifest, rootNode=component ) assert not warnings @@ -612,10 +612,12 @@ def test_validate_model_manifest(self, manifest: str, model: str, component: str assert not errors # The order of the valid values in the error message are random, so the test must be # slightly complicated: - elif manifest == "tests/data/mock_manifests/Invalid_none_value_test_manifest.csv": + elif ( + manifest == "tests/data/mock_manifests/Invalid_none_value_test_manifest.csv" + ): assert errors[0][0] == "6" assert errors[0][1] == "Tissue Status" assert errors[0][3] == "InvalidValue" - error_message = errors [0][2] + error_message = errors[0][2] assert isinstance(error_message, str) assert error_message.startswith("'InvalidValue' is not one of") From b89c597c7ef04298b66de9f3090487a1902d980f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 10:53:55 -0800 Subject: [PATCH 19/23] add tests back in --- .../Valid_Test_Manifest_with_nones.csv | 5 +++++ tests/test_utils.py | 12 ++++++++++++ tests/test_validation.py | 8 +++++++- 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv diff --git a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv new file mode 100644 index 000000000..2219e328d --- /dev/null +++ b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv @@ -0,0 +1,5 @@ +Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA +MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable +MockComponent,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 +MockComponent,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, +MockComponent,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index b124eaec8..4a8d30cd3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -912,6 +912,11 @@ def test_validate_property_schema(self, helpers): "example.model.csv", "Biospecimen", ), + ( + "mock_manifests/Valid_Test_Manifest_with_nones.csv", + "example_test_nones.model.csv", + "MockComponent", + ), ], ) def test_convert_nan_entries_to_empty_strings( @@ -968,6 +973,13 @@ def test_convert_nan_entries_to_empty_strings( assert output["Tissue Status"][2] == "None" assert output["Tissue Status"][3] == "None" assert output["Tissue Status"][4] == "None" + elif root_node == "MockComponent": + assert manifest["Check List"][2] == [""] + assert manifest["Check List Like Enum"][2] == [] + assert type(manifest["Check NA"][2]) == type(pd.NA) + + assert output["Check List"][2] == [""] + assert output["Check List Like Enum"][2] == [] def test_get_list_robustness(self, helpers): return diff --git a/tests/test_validation.py b/tests/test_validation.py index 06330417b..4ab24ac2e 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -49,6 +49,11 @@ class TestManifestValidation: "mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", "Patient", ), + ( + "example_test_nones.model.csv", + "mock_manifests/Valid_Test_Manifest_with_nones.csv", + "MockComponent", + ), ( "example.model.csv", "mock_manifests/Valid_none_value_test_manifest.csv", @@ -58,7 +63,8 @@ class TestManifestValidation: ids=[ "example_model", "example_with_no_entry_for_cond_required_columns", - "example_with_nones", + "example_with_nones_mock_component", + "example_with_nones_biospecimen" ], ) @pytest.mark.parametrize( From ae6f658dc406fbb3ba5875029783f1874124c79e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 10:55:03 -0800 Subject: [PATCH 20/23] ran black --- tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 4ab24ac2e..646a2e543 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -64,7 +64,7 @@ class TestManifestValidation: "example_model", "example_with_no_entry_for_cond_required_columns", "example_with_nones_mock_component", - "example_with_nones_biospecimen" + "example_with_nones_biospecimen", ], ) @pytest.mark.parametrize( From 6ffa9d61a53ec5f3bf85125f69ed351df69c72d9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 12 Dec 2024 12:54:05 -0800 Subject: [PATCH 21/23] revert manifest --- .../mock_manifests/Valid_Test_Manifest_with_nones.csv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv index 2219e328d..0bbc8c71f 100644 --- a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv +++ b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv @@ -1,5 +1,5 @@ -Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA -MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable -MockComponent,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 -MockComponent,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, +Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA +MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable +MockComponent,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 +MockComponent,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, MockComponent,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file From d0a8e1577d439eb487370d31d565f71f1ddfbe95 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 12 Dec 2024 16:25:26 -0800 Subject: [PATCH 22/23] Split up valid and errored test as separate testing functions --- tests/integration/test_metadata_model.py | 79 +++++++++++------------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index b7b120b48..c175219dc 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -1,5 +1,5 @@ """ -This script contains a test suite for verifying the submission and annotation of +This script contains a test suite for verifying the validation, submission and annotation of file-based manifests using the `TestMetadataModel` class to communicate with Synapse and verify the expected behavior of uploading annotation manifest CSVs using the metadata model. @@ -13,7 +13,7 @@ import tempfile import uuid from contextlib import nullcontext as does_not_raise -from typing import Callable, Optional, Any +from typing import Any, Callable, Optional import pandas as pd import pytest @@ -22,7 +22,7 @@ from synapseclient.core import utils from synapseclient.models import File, Folder -from schematic.models.metadata import MetadataModel +# from schematic.models.metadata import MetadataModel from schematic.store.synapse import SynapseStorage from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import create_temp_folder @@ -572,52 +572,45 @@ def _submit_and_verify_manifest( spy_upload_file_as_csv.assert_not_called() spy_upload_file_combo.assert_not_called() - @pytest.mark.parametrize( - ("manifest", "model", "component"), - [ - ( - "tests/data/mock_manifests/Valid_none_value_test_manifest.csv", - "tests/data/example.model.csv", - "Biospecimen", - ), - ( - "tests/data/mock_manifests/Invalid_none_value_test_manifest.csv", - "tests/data/example.model.csv", - "Biospecimen", - ), - ], - ) - def test_validate_model_manifest( - self, manifest: str, model: str, component: str + def test_validate_model_manifest_valid_with_none_string( + self, helpers: Helpers ) -> None: """ - Tests for MetadataModel.validateModelManifest + Tests for validateModelManifest when the manifest is valid with 'None' values Args: - manifest (str): The path to the manifest - model (str): The path to the model - component (str): The component to be tested + helpers: Test helper functions """ - mdm = MetadataModel( - inputMModelLocation=model, - data_model_labels="class_label", - inputMModelLocationType="local", - ) - errors, warnings = mdm.validateModelManifest( - manifestPath=manifest, rootNode=component + meta_data_model = metadata_model(helpers, "class_label") + + errors, warnings = meta_data_model.validateModelManifest( + manifestPath="tests/data/mock_manifests/Valid_none_value_test_manifest.csv", + rootNode="Biospecimen", ) + assert not warnings + assert not errors + + def test_validate_model_manifest_invalid(self, helpers: Helpers) -> None: + """ + Tests for validateModelManifest when the manifest requires values to be from a set of values containing 'None' + Args: + helpers: Test helper functions + """ + meta_data_model = metadata_model(helpers, "class_label") + + errors, warnings = meta_data_model.validateModelManifest( + manifestPath="tests/data/mock_manifests/Invalid_none_value_test_manifest.csv", + rootNode="Biospecimen", + ) assert not warnings - if manifest == "tests/data/mock_manifests/Valid_none_value_test_manifest.csv": - assert not errors + assert errors[0][0] == "6" + assert errors[0][1] == "Tissue Status" + assert errors[0][3] == "InvalidValue" # The order of the valid values in the error message are random, so the test must be - # slightly complicated: - elif ( - manifest == "tests/data/mock_manifests/Invalid_none_value_test_manifest.csv" - ): - assert errors[0][0] == "6" - assert errors[0][1] == "Tissue Status" - assert errors[0][3] == "InvalidValue" - error_message = errors[0][2] - assert isinstance(error_message, str) - assert error_message.startswith("'InvalidValue' is not one of") + # slightly complicated: + # 'InvalidValue' is not one of ['Malignant', 'Healthy', 'None'] + # 'InvalidValue' is not one of ['Healthy', 'Malignant', 'None'] + error_message = errors[0][2] + assert isinstance(error_message, str) + assert error_message.startswith("'InvalidValue' is not one of") From 7310340037cee2ce0ccef65829b2617f7b7c12a4 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 12 Dec 2024 16:27:31 -0800 Subject: [PATCH 23/23] Remove unused import --- tests/integration/test_metadata_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index c175219dc..21da9cfd8 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -22,7 +22,6 @@ from synapseclient.core import utils from synapseclient.models import File, Folder -# from schematic.models.metadata import MetadataModel from schematic.store.synapse import SynapseStorage from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import create_temp_folder