From 92865034e300d3429ce5416d876ba4d0700e6330 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 10 Sep 2024 11:40:41 -0700 Subject: [PATCH 1/5] add new tests --- schematic/models/validate_attribute.py | 37 +- tests/unit/test_validate_attribute.py | 483 +++++++++++++++++++------ 2 files changed, 396 insertions(+), 124 deletions(-) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 8ff1cf0ea..aaa3fe35a 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -907,14 +907,14 @@ def get_target_manifests( def list_validation( self, val_rule: str, - manifest_col: pd.core.series.Series, - ) -> tuple[list[list[str]], list[list[str]], pd.core.series.Series]: + manifest_col: pd.Series, + ) -> tuple[list[list[str]], list[list[str]], pd.Series]: """ Purpose: Determine if values for a particular attribute are comma separated. Input: - val_rule: str, Validation rule - - manifest_col: pd.core.series.Series, column for a given attribute + - manifest_col: pd.Series, column for a given attribute Returns: - manifest_col: Input values in manifest arere-formatted to a list logger.error or logger.warning. @@ -925,8 +925,8 @@ def list_validation( # For each 'list' (input as a string with a , delimiter) entered, # convert to a real list of strings, with leading and trailing # white spaces removed. - errors = [] - warnings = [] + errors:list[list[str]] = [] + warnings:list[list[str]] = [] replace_null = True csv_re = comma_separated_list_regex() @@ -947,7 +947,8 @@ def list_validation( entry=list_string, node_display_name=manifest_col.name, ) - + # Since this column has been turned into a string, it's unclear if this can ever be + # anything other than a string if not isinstance(list_string, str) and entry_has_value: list_error = "not_a_string" elif not re.fullmatch(csv_re, list_string) and entry_has_value: @@ -976,7 +977,7 @@ def list_validation( def regex_validation( self, val_rule: str, - manifest_col: pd.core.series.Series, + manifest_col: pd.Series, ) -> tuple[list[list[str]], list[list[str]]]: """ Purpose: @@ -984,7 +985,7 @@ def regex_validation( provided in val_rule. Input: - val_rule: str, Validation rule - - manifest_col: pd.core.series.Series, column for a given + - manifest_col: pd.Series, column for a given attribute in the manifest - dmge: DataModelGraphExplorer Object Using this module requres validation rules written in the following manner: @@ -998,8 +999,8 @@ def regex_validation( - This function will return errors when the user input value does not match schema specifications. logger.error or logger.warning. - Errors: list[str] Error details for further storage. - warnings: list[str] Warning details for further storage. + Errors: list[list[str]] Error details for further storage. + warnings: list[list[str]] Warning details for further storage. TODO: move validation to convert step. """ @@ -1015,13 +1016,13 @@ def regex_validation( f" They should be provided as follows ['regex', 'module name', 'regular expression']" ) - errors = [] - warnings = [] + errors:list[list[str]] = [] + warnings:list[list[str]] = [] validation_rules = self.dmge.get_node_validation_rules( node_display_name=manifest_col.name ) - + # TODO: Write test to trigger this if statemment: if validation_rules and "::" in validation_rules[0]: validation_rules = validation_rules[0].split("::") # Handle case where validating re's within a list. @@ -1089,7 +1090,7 @@ def regex_validation( def type_validation( self, val_rule: str, - manifest_col: pd.core.series.Series, + manifest_col: pd.Series, ) -> tuple[list[list[str]], list[list[str]]]: """ Purpose: @@ -1098,7 +1099,7 @@ def type_validation( Input: - val_rule: str, Validation rule, specifying input type, either 'float', 'int', 'num', 'str' - - manifest_col: pd.core.series.Series, column for a given + - manifest_col: pd.Series, column for a given attribute in the manifest Returns: -This function will return errors when the user input value @@ -1116,8 +1117,8 @@ def type_validation( "str": (str), } - errors = [] - warnings = [] + errors:list[list[str]] = [] + warnings:list[list[str]] = [] # num indicates either a float or int. if val_rule == "num": @@ -1140,6 +1141,7 @@ def type_validation( ) if vr_errors: errors.append(vr_errors) + # It seems impossible to get warnings with type rules if vr_warnings: warnings.append(vr_warnings) elif val_rule in ["int", "float", "str"]: @@ -1162,6 +1164,7 @@ def type_validation( ) if vr_errors: errors.append(vr_errors) + # It seems impossible to get warnings with type rules if vr_warnings: warnings.append(vr_warnings) return errors, warnings diff --git a/tests/unit/test_validate_attribute.py b/tests/unit/test_validate_attribute.py index 26ab9f167..7ca026331 100644 --- a/tests/unit/test_validate_attribute.py +++ b/tests/unit/test_validate_attribute.py @@ -4,6 +4,7 @@ from unittest.mock import patch import pytest +from jsonschema import ValidationError from pandas import Series, DataFrame, concat import numpy as np @@ -73,6 +74,15 @@ } ) +TEST_DF3 = DataFrame( + { + "PatientID": ["A", "A", "A", "B", "C"], + "component": ["comp1", "comp1", "comp1", "comp1", "comp1"], + "id": ["id1", "id2", "id3", "id4", "id5"], + "entityid": ["x", "x", "x", "x", "x"], + } +) + TEST_DF_MISSING_VALS = DataFrame( { "PatientID": [np.isnan, ""], @@ -108,10 +118,10 @@ def fixture_va_obj( yield ValidateAttribute(dmge) -@pytest.fixture(name="cross_val_df1") -def fixture_cross_val_df1() -> Generator[DataFrame, None, None]: +@pytest.fixture(name="test_df1") +def fixture_test_df1() -> Generator[DataFrame, None, None]: """Yields a dataframe""" - df = DataFrame( + yield DataFrame( { "PatientID": ["A", "B", "C"], "component": ["comp1", "comp1", "comp1"], @@ -119,44 +129,9 @@ def fixture_cross_val_df1() -> Generator[DataFrame, None, None]: "entityid": ["x", "x", "x"], } ) - yield df - - -@pytest.fixture(name="cross_val_df2") -def fixture_cross_val_df2(cross_val_df1: DataFrame) -> Generator[DataFrame, None, None]: - """Yields dataframe df1 with an extra row""" - df = concat( - [ - cross_val_df1, - DataFrame( - { - "PatientID": ["D"], - "component": ["comp1"], - "id": ["id4"], - "entityid": ["x"], - } - ), - ] - ) - yield df - - -@pytest.fixture(name="cross_val_df3") -def fixture_cross_val_df3() -> Generator[DataFrame, None, None]: - """Yields empty dataframe""" - df = DataFrame( - { - "PatientID": [], - "component": [], - "id": [], - "entityid": [], - } - ) - yield df - -@pytest.fixture(name="cross_val_col_names") -def fixture_cross_val_col_names() -> Generator[dict[str, str], None, None]: +@pytest.fixture(name="test_df_col_names") +def fixture_test_df_col_names() -> Generator[dict[str, str], None, None]: """ Yields: Generator[dict[str, str], None, None]: A dicitonary of column names @@ -180,10 +155,10 @@ class TestValidateAttributeObject: @pytest.mark.parametrize("series", EXACTLY_ATLEAST_PASSING_SERIES) @pytest.mark.parametrize("rule", MATCH_ATLEAST_ONE_SET_RULES) - def test_cross_validation_match_atleast_one_set_rules_passing( + def test_cross_validation_match_atleast_one_set_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -194,16 +169,16 @@ def test_cross_validation_match_atleast_one_set_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, series) == ([], []) @pytest.mark.parametrize("series", EXACTLY_ATLEAST_PASSING_SERIES) @pytest.mark.parametrize("rule", MATCH_EXACTLY_ONE_SET_RULES) - def test_cross_validation_match_exactly_one_set_rules_passing( + def test_cross_validation_match_exactly_one_set_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -214,7 +189,7 @@ def test_cross_validation_match_exactly_one_set_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, series) == ([], []) @@ -228,10 +203,10 @@ def test_cross_validation_match_exactly_one_set_rules_passing( ], ) @pytest.mark.parametrize("rule", MATCH_ATLEAST_ONE_SET_RULES) - def test_cross_validation_match_atleast_one_set_rules_errors( + def test_cross_validation_match_atleast_one_set_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -242,7 +217,7 @@ def test_cross_validation_match_atleast_one_set_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, series) if rule.endswith("error"): @@ -260,10 +235,10 @@ def test_cross_validation_match_atleast_one_set_rules_errors( ], ) @pytest.mark.parametrize("rule", MATCH_EXACTLY_ONE_SET_RULES) - def test_cross_validation_match_exactly_one_set_rules_errors( + def test_cross_validation_match_exactly_one_set_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -274,7 +249,7 @@ def test_cross_validation_match_exactly_one_set_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1, "syn2": cross_val_df1}, + return_value={"syn1": test_df1, "syn2": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, series) if rule.endswith("error"): @@ -295,10 +270,10 @@ def test_cross_validation_match_exactly_one_set_rules_errors( ], ) @pytest.mark.parametrize("rule", MATCH_NONE_SET_RULES) - def test_cross_validation_match_none_set_rules_passing( + def test_cross_validation_match_none_set_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -309,7 +284,7 @@ def test_cross_validation_match_none_set_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, series) == ([], []) @@ -323,10 +298,10 @@ def test_cross_validation_match_none_set_rules_passing( ], ) @pytest.mark.parametrize("rule", MATCH_NONE_SET_RULES) - def test_cross_validation_match_none_set_rules_errors( + def test_cross_validation_match_none_set_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -337,7 +312,7 @@ def test_cross_validation_match_none_set_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, series) if rule.endswith("error"): @@ -348,6 +323,7 @@ def test_cross_validation_match_none_set_rules_errors( assert errors == [] @pytest.mark.parametrize("rule", MATCH_ATLEAST_ONE_VALUE_RULES) + @pytest.mark.parametrize("target_manifest", [TEST_DF1, TEST_DF3]) @pytest.mark.parametrize( "tested_column", [ @@ -359,12 +335,12 @@ def test_cross_validation_match_none_set_rules_errors( (["A", "B", "C", "C"]), ], ) - def test_cross_validation_value_match_atleast_one_rules_passing( + def test_cross_validation_match_atleast_one_value_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, rule: str, tested_column: list, + target_manifest: DataFrame ): """ Tests ValidateAttribute.cross_validation @@ -373,7 +349,7 @@ def test_cross_validation_value_match_atleast_one_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": target_manifest}, ): assert va_obj.cross_validation(rule, Series(tested_column)) == ([], []) @@ -388,10 +364,10 @@ def test_cross_validation_value_match_atleast_one_rules_passing( Series([1], index=[0], name="PatientID"), ], ) - def test_cross_validation_value_match_atleast_one_rules_errors( + def test_cross_validation_match_atleast_one_value_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: Series, ): @@ -402,7 +378,7 @@ def test_cross_validation_value_match_atleast_one_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, tested_column) if rule.endswith("error"): @@ -414,31 +390,39 @@ def test_cross_validation_value_match_atleast_one_rules_errors( @pytest.mark.parametrize("rule", MATCH_EXACTLY_ONE_VALUE_RULES) @pytest.mark.parametrize( - "tested_column", + "tested_column, target_manifest", [ - ([]), - (["A"]), - (["A", "A"]), - (["A", "B"]), - (["A", "B", "C"]), - (["A", "B", "C", "C"]), + ([], TEST_DF1), + ([], TEST_DF3), + (["C"], TEST_DF1), + (["C"], TEST_DF3), + (["C", "C"], TEST_DF1), + (["C", "C"], TEST_DF3), + + (["A"], TEST_DF1), + (["A", "A"], TEST_DF1), + (["A", "B"], TEST_DF1), + (["A", "B", "C"], TEST_DF1), + (["A", "B", "C", "C"], TEST_DF1), ], ) - def test_cross_validation_match_exactly_one_value_rules_passing( + def test_cross_validation_match_exactly_one_value_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, rule: str, tested_column: list, + target_manifest: DataFrame ): """ Tests ValidateAttribute.cross_validation These tests show what columns pass for matchExactlyOne + The first group are ones that pass for TEST_DF1 and TEST_DF3 + The second group are those that pass only for test """ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": target_manifest}, ): assert va_obj.cross_validation(rule, Series(tested_column)) == ([], []) @@ -452,10 +436,10 @@ def test_cross_validation_match_exactly_one_value_rules_passing( Series([1], index=[0], name="PatientID"), ], ) - def test_cross_validation_value_match_exactly_one_rules_errors( + def test_cross_validation_match_exactly_one_value_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: Series, ): @@ -466,7 +450,7 @@ def test_cross_validation_value_match_exactly_one_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, tested_column) if rule.endswith("error"): @@ -481,10 +465,10 @@ def test_cross_validation_value_match_exactly_one_rules_errors( "tested_column", [([]), (["D"]), (["D", "D"]), (["D", "F"]), ([1]), ([np.nan])], ) - def test_cross_validation_match_none_value_rules_passing( + def test_cross_validation_match_none_value_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: list, ): @@ -495,7 +479,7 @@ def test_cross_validation_match_none_value_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, Series(tested_column)) == ([], []) @@ -508,10 +492,10 @@ def test_cross_validation_match_none_value_rules_passing( Series(["A", "A"], index=[0, 1], name="PatientID"), ], ) - def test_cross_validation_value_match_none_rules_errors( + def test_cross_validation_value_match_none_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: Series, ): @@ -522,7 +506,7 @@ def test_cross_validation_value_match_none_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, tested_column) if rule.endswith("error"): @@ -591,7 +575,7 @@ def test__run_validation_across_target_manifests_return_msg( @pytest.mark.parametrize("rule", ALL_VALUE_RULES) def test__run_validation_across_target_manifests_value_scope( - self, va_obj: ValidateAttribute, cross_val_df1: DataFrame, rule: str + self, va_obj: ValidateAttribute, test_df1: DataFrame, rule: str ) -> None: """Tests for ValidateAttribute._run_validation_across_target_manifests with value rule""" @@ -599,7 +583,7 @@ def test__run_validation_across_target_manifests_value_scope( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="value", @@ -634,7 +618,7 @@ def test__run_validation_across_target_manifests_value_scope( def test__run_validation_across_target_manifests_match_atleast_exactly_with_one_target( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -652,7 +636,7 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_one_ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -679,7 +663,7 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_one_ def test__run_validation_across_target_manifests_match_atleast_exactly_with_two_targets( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -696,7 +680,7 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_two_ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1, "syn2": cross_val_df1}, + return_value={"syn1": test_df1, "syn2": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -726,7 +710,7 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_two_ def test__run_validation_across_target_manifests_set_rules_match_none_with_one_target( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -743,7 +727,7 @@ def test__run_validation_across_target_manifests_set_rules_match_none_with_one_t with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -773,7 +757,7 @@ def test__run_validation_across_target_manifests_set_rules_match_none_with_one_t def test__run_validation_across_target_manifests_set_rules_match_none_with_two_targets( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -790,7 +774,7 @@ def test__run_validation_across_target_manifests_set_rules_match_none_with_two_t with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1, "syn2": cross_val_df1}, + return_value={"syn1": test_df1, "syn2": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -870,8 +854,8 @@ def test__run_validation_across_targets_value( def test__run_validation_across_targets_set_match_exactly_atleaset_one_no_missing_values( self, va_obj: ValidateAttribute, - cross_val_col_names: dict[str, str], - cross_val_df1: DataFrame, + test_df_col_names: dict[str, str], + test_df1: DataFrame, rule: str, tested_column: list, target_id: str, @@ -886,10 +870,10 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_no_missin """ output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule=rule, - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(tested_column), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id=target_id, missing_manifest_log={}, present_manifest_log=present_log_input.copy(), @@ -917,8 +901,8 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_no_missin def test__run_validation_across_targets_set_match_exactly_atleaset_one_missing_values( self, va_obj: ValidateAttribute, - cross_val_col_names: dict[str, str], - cross_val_df1: DataFrame, + test_df_col_names: dict[str, str], + test_df1: DataFrame, rule: str, tested_column: list, target_id: str, @@ -931,10 +915,10 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_missing_v """ output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule=rule, - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(tested_column), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id=target_id, missing_manifest_log={}, present_manifest_log=present_log_input.copy(), @@ -951,17 +935,17 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_missing_v def test__run_validation_across_targets_set_match_none( self, va_obj: ValidateAttribute, - cross_val_col_names: dict[str, str], - cross_val_df1: DataFrame, + test_df_col_names: dict[str, str], + test_df1: DataFrame, ) -> None: """Tests for ValidateAttribute._run_validation_across_targets_set for matchAtLeastOne""" output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule="matchNone, Patient.PatientID, set", - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(["A", "B", "C"]), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id="syn1", missing_manifest_log={}, present_manifest_log=[], @@ -978,10 +962,10 @@ def test__run_validation_across_targets_set_match_none( output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule="matchNone, Patient.PatientID, set", - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(["A"]), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id="syn2", missing_manifest_log={}, present_manifest_log=[], @@ -1326,3 +1310,288 @@ def test__get_column_names( ) -> None: """Tests for ValidateAttribute._get_column_names""" assert va_obj._get_column_names(DataFrame(input_dict)) == expected_dict + + ############## + # get_no_entry + ############## + + @pytest.mark.parametrize( + "input_entry, node_name, expected", + [ + ("entry", "Check NA", False), + ("entry", "Check Date", False), + ("", "Check NA", False), + ("", "Check Date", True), + ], + ) + def test_get_no_entry( + self, + va_obj: ValidateAttribute, + input_entry: str, + node_name: str, + expected: bool + ) -> None: + """ + This test shows that: + - if the entry is a normal string the result is always False(not no entry), + - if the entry is "" the result is False if the attribute has the "isNA" rule + - if the entry is "" the result is True if the attribute does not have the "isNA" rule + + """ + assert va_obj.get_no_entry(input_entry, node_name) is expected + + ##################### + # get_entry_has_value + ##################### + + @pytest.mark.parametrize( + "input_entry, node_name, expected", + [ + ("entry", "Check NA", True), + ("entry", "Check Date", True), + ("", "Check NA", True), + ("", "Check Date", False), + ], + ) + def test_get_entry_has_value( + self, + va_obj: ValidateAttribute, + input_entry: str, + node_name: str, + expected: bool + ) -> None: + """ + This test shows that: + - if the entry is a normal string the result is always True, + - if the entry is "" the result is True if the attribute has the "isNA" rule + - if the entry is "" the result is False if the attribute does not have the "isNA" rule + + """ + assert va_obj.get_entry_has_value(input_entry, node_name) is expected + + ################# + # list_validation + ################# + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["x,x,x"], name="Check List"), "list like"), + (Series(["x,x,x"], name="Check List"), "list strict"), + (Series([], name="Check List"), "list like"), + (Series([], name="Check List"), "list strict"), + + (Series(["x"], name="Check List"), "list like"), + (Series(["xxx"], name="Check List"), "list like"), + (Series(["1"], name="Check List"), "list like"), + (Series([1], name="Check List"), "list like"), + (Series([1.1], name="Check List"), "list like"), + (Series([1,1,1], name="Check List"), "list like"), + (Series([np.nan], name="Check List"), "list like"), + (Series([True], name="Check List"), "list like"), + ], + ) + def test_list_validation_passing( + self, + va_obj: ValidateAttribute, + input_column: Series, + rule: str + ) -> None: + """ + This tests ValidateAttribute.list_validation + This test shows that: + - when using list like, just about anything is validated + - when using list strict, empty columns, and comma separated strings pass + + """ + errors, warnings, _ = va_obj.list_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column", + [ + (Series(["x"], name="Check List")), + (Series(["xxxx"], name="Check List")), + (Series([1], name="Check List")), + (Series([1.1], name="Check List")), + (Series([1,1,1], name="Check List")), + (Series([np.nan], name="Check List")), + (Series([True], name="Check List")), + ], + ) + @pytest.mark.parametrize("rule", ["list strict", "list strict warning"]) + def test_list_validation_not_passing( + self, + va_obj: ValidateAttribute, + input_column: Series, + rule: str + ) -> None: + """ + This tests ValidateAttribute.list_validation + This test shows what doesn't pass when using list strict + """ + errors, warnings, _ = va_obj.list_validation(rule, input_column) + if rule.endswith("warning"): + assert len(errors) == 0 + assert len(warnings) > 0 + else: + assert len(errors) > 0 + assert len(warnings) == 0 + + ################## + # regex_validation + ################## + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["a"], name="Check List"), "regex match [a-f]"), + (Series(["a,b,"], name="Check Regex List Strict"), "regex match [a-f]"), + ], + ) + def test_regex_validation_passing( + self, + va_obj: ValidateAttribute, + input_column: Series, + rule: str + ) -> None: + """ + This tests ValidateAttribute.regex_validation + This test shows passing examples using the match rule + """ + errors, warnings = va_obj.regex_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["g"], name="Check List"), "regex match [a-f]"), + (Series(["a,b,c,g"], name="Check Regex List Strict"), "regex match [a-f]"), + ], + ) + def test_regex_validation_failing( + self, + va_obj: ValidateAttribute, + input_column: Series, + rule: str + ) -> None: + """ + This tests ValidateAttribute.regex_validation + This test shows failing examples using the match rule + """ + errors, warnings = va_obj.regex_validation(rule, input_column) + assert len(errors) == 1 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column, rule, exception", + [ + (Series(["a"]), "", ValidationError), + (Series(["a"]), "regex", ValidationError), + (Series(["a"]), "regex match", ValidationError), + + (Series(["a"]), "regex match [a-f]", ValueError), + ], + ) + def test_regex_validation_exceptions( + self, + va_obj: ValidateAttribute, + input_column: Series, + rule: str, + exception + ) -> None: + """ + This tests ValidateAttribute.regex_validation + This test shows that: + - when the rule is malformed, a ValidationError is raised + - when the input series has no name, a ValueError is raised + + """ + with pytest.raises(exception): + va_obj.regex_validation(rule, input_column) + + + ################# + # type_validation + ################# + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["a"], name="Check String"), "str"), + (Series([1], name="Check Num"), "num"), + (Series([1], name="Check Int"), "int"), + (Series([1.1], name="Check Float"), "float"), + (Series([np.nan], name="Check String"), "str"), + (Series([np.nan], name="Check Num"), "num"), + (Series([np.nan], name="Check Int"), "int"), + (Series([np.nan], name="Check Float"), "float"), + ] + ) + def test_type_validation_passing( + self, + va_obj: ValidateAttribute, + input_column: Series, + rule: str + ) -> None: + """ + This tests ValidateAttribute.type_validation + This test shows passing examples using the type rule + """ + errors, warnings = va_obj.type_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series([1], name="Check String"), "str"), + (Series(["a"], name="Check Num"), "num"), + (Series(["20"], name="Check Num"), "num"), + (Series([1.1], name="Check Int"), "int"), + (Series(["a"], name="Check Int"), "int"), + (Series([1], name="Check Float"), "float"), + (Series(["a"], name="Check Float"), "float"), + ] + ) + def test_type_validation_failing( + self, + va_obj: ValidateAttribute, + input_column: Series, + rule: str + ) -> None: + """ + This tests ValidateAttribute.type_validation + This test shows failing examples using the type rule + """ + errors, warnings = va_obj.type_validation(rule, input_column) + assert len(errors) == 1 + assert len(warnings) == 0 + + + ################ + # url_validation + ################ + + @pytest.mark.parametrize( + "input_column", + [ + (Series([], name="Check URL")), + (Series([np.nan], name="Check URL")) + ] + ) + def test_url_validation_passing( + self, + va_obj: ValidateAttribute, + input_column: Series, + ) -> None: + """ + This tests ValidateAttribute.type_validation + This test shows passing examples using the type rule + """ + errors, warnings = va_obj.url_validation("url", input_column) + assert len(errors) == 0 + assert len(warnings) == 0 From f0368f98b4f143d9803f0de44771467bf3e9f80c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Oct 2024 13:28:54 -0700 Subject: [PATCH 2/5] add unit tests --- schematic/models/validate_attribute.py | 104 +++--- tests/unit/test_validate_attribute.py | 454 ++++++++++++++++++++++--- 2 files changed, 473 insertions(+), 85 deletions(-) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 327a6013f..fe352a330 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -954,8 +954,10 @@ def list_validation( entry=list_string, node_display_name=manifest_col.name, ) - # Since this column has been turned into a string, it's unclear if this can ever be - # anything other than a string + # Because of the above line: manifest_col = manifest_col.astype(str) + # this column has been turned into a string, it's unclear if any values + # from this column can be anything other than a string, and therefore this + # if statement may not be needed if not isinstance(list_string, str) and entry_has_value: list_error = "not_a_string" elif not re.fullmatch(csv_re, list_string) and entry_has_value: @@ -1029,7 +1031,9 @@ def regex_validation( validation_rules = self.dmge.get_node_validation_rules( node_display_name=manifest_col.name ) - # TODO: Write test to trigger this if statemment: + # It seems like this statement can ever be true + # self.dmge.get_node_validation_rules never returns a list with "::" even when + # the attribute has the "list::regex" rule if validation_rules and "::" in validation_rules[0]: validation_rules = validation_rules[0].split("::") # Handle case where validating re's within a list. @@ -1289,14 +1293,14 @@ def url_validation( return errors, warnings def _parse_validation_log( - self, validation_log: dict[str, pd.core.series.Series] - ) -> tuple[[list[str], list[str], list[str]]]: + self, validation_log: dict[str, pd.Series] + ) -> tuple[list[str], list[str], list[str]]: """Parse validation log, so values can be used to raise warnings/errors Args: - validation_log, dict[str, pd.core.series.Series]: + validation_log, dict[str, pd.Series]: Returns: invalid_rows, list: invalid rows recorded in the validation log - invalid_enties, list: invalid values recorded in the validation log + invalid_entities, list: invalid values recorded in the validation log manifest_ids, list: """ # Initialize parameters @@ -1316,12 +1320,15 @@ def _parse_validation_log( return invalid_rows, invalid_entries, manifest_ids def _merge_format_invalid_rows_values( - self, series_1: pd.core.series.Series, series_2: pd.core.series.Series - ) -> tuple[[list[str], list[str]]]: - """Merge two series to identify gather all invalid values, and parse out invalid rows and entries + self, series_1: pd.Series, series_2: pd.Series + ) -> tuple[list[str], list[str]]: + """ + Merge two series to identify gather all invalid values, + and parse out invalid rows and entries + Args: - series_1, pd.core.series.Series: first set of invalid values to extract - series_2, pd.core.series.Series: second set of invalid values to extract + series_1, pd.Series: first set of invalid values to extract + series_2, pd.Series: second set of invalid values to extract Returns: invalid_rows, list: invalid rows taken from both series invalid_entry, list: invalid values taken from both series @@ -1345,12 +1352,14 @@ def _merge_format_invalid_rows_values( return invalid_rows, invalid_entry def _format_invalid_row_values( - self, invalid_values: dict[str, pd.core.series.Series] - ) -> tuple[[list[str], list[str]]]: - """Parse invalid_values dictionary, to extract invalid_rows and invalid_entry to be used later - to raise warnings or errors. + self, invalid_values: pd.Series + ) -> tuple[list[str], list[str]]: + """ + Parse invalid_values, to extract invalid_rows and invalid_entry + to be used later to raise warnings or errors. + Args: - invalid_values, dict[str, pd.core.series.Series]: + invalid_values, pd.Series: Returns: invalid_rows, list: invalid rows recorded in invalid_values invalid_entry, list: invalid values recorded in invalid_values @@ -1386,9 +1395,9 @@ def _gather_set_warnings_errors( Returns: errors, list[str]: list of errors to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) + not pass relevant cross manifest validation across the target manifest(s) warnings, list[str]: list of warnings to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) + not pass relevant cross manifest validation across the target manifest(s) """ errors: list[str] = [] warnings: list[str] = [] @@ -1443,21 +1452,28 @@ def _remove_non_entry_from_invalid_entry_list( row_num: Optional[list[str]], attribute_name: str, ) -> tuple[list[str], list[str]]: - """Helper to remove NAs from a list of invalid entries (if applicable, and allowed), remove the row - too from row_num. This will make sure errors are not rasied for NA entries unless the value is required. + """ + Helper to remove NAs from a list of invalid entries (if applicable, and allowed), + remove the row too from row_num. This will make sure errors are not raised for + NA entries unless the value is required. + Args: invalid_entry, list[str]: default=None, list of entries in the source manifest where - invalid values were located. - row_num, list[str[: default=None, list of rows in the source manifest where invalid values were located + invalid values were located. + row_num, list[str[: default=None, list of rows in the source manifest where invalid + values were located attribute_name, str: source attribute name + Returns: - invalid_entry and row_num returned with any NA and corresponding row index value removed, if applicable. + invalid_entry and row_num returned with any NA and corresponding row index value + removed, if applicable. """ idx_to_remove = [] # Check if the current attribute column is required, via the data model if invalid_entry and row_num: # Check each invalid entry and determine if it has a value and/or is required. - # If there is no entry and its not required, remove the NA value so an error is not raised. + # If there is no entry and its not required, remove the NA value so an + # error is not raised. for idx, entry in enumerate(invalid_entry): entry_has_value = self.get_entry_has_value(entry, attribute_name) # If there is no value, and is not required, recored the index @@ -1469,8 +1485,8 @@ def _remove_non_entry_from_invalid_entry_list( for idx in sorted(idx_to_remove, reverse=True): del invalid_entry[idx] del row_num[idx] - # Perform check to make sure length of invalid_entry and row_num is the same. If not that would suggest - # there was an issue recording or removing values. + # Perform check to make sure length of invalid_entry and row_num is the same. + # If not that would suggest there was an issue recording or removing values. if len(invalid_entry) != len(row_num): logger.error( f"There was an error handling and validating a non-entry." @@ -1533,17 +1549,22 @@ def _gather_value_warnings_errors( source_attribute: str, value_validation_store: tuple[pd.Series, pd.Series, pd.Series], ) -> tuple[list[str], list[str]]: - """For value rule scope, find invalid rows and entries, and generate appropriate errors and warnings + """ + For value rule scope, find invalid rows and entries, and generate + appropriate errors and warnings + Args: val_rule, str: Validation rule source_attribute, str: source manifest column name value_validation_store, tuple(pd.Series, pd.Series, pd.Series]): contains missing_values, duplicated_values, and repeat values Returns: - errors, list[str]: list of errors to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) - warnings, list[str]: list of warnings to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) + errors, list[str]: list of errors to raise, as appropriate, if values + in current manifest do not pass relevant cross manifest validation + across the target manifest(s) + warnings, list[str]: list of warnings to raise, as appropriate, + if values in current manifest do not pass relevant cross manifest + validation across the target manifest(s) """ # Initialize with empty lists errors, warnings = [], [] @@ -1583,17 +1604,22 @@ def _gather_value_warnings_errors( def _check_if_target_manifest_is_empty( self, - target_manifest: pd.core.series.Series, + target_manifest: pd.DataFrame, target_manifest_empty: list[bool], column_names: dict[str, str], ) -> list[bool]: - """If a target manifest is found with the attribute column of interest check to see if the manifest is empty. + """ + If a target manifest is found with the attribute column of interest check to see if + the manifest is empty. + Args: - target_manifest, pd.core.series.Series: Current target manifest - target_manifest_empty, list[bool]: a list of booleans recording if the target manifest are emtpy or not. + target_manifest, pd.Dataframe: Current target manifest + target_manifest_empty, list[bool]: a list of booleans recording if the target manifest + are empty or not. column_names, dict[str, str]: {stripped_col_name:original_column_name} Returns: - target_manifest_empty, list[bool]: a list of booleans recording if the target manifest are emtpy or not. + target_manifest_empty, list[bool]: a list of booleans recording if the target manifest + are empty or not. """ # Make a copy of the target manifest with only user uploaded columns target_manifest_dupe = target_manifest.drop( @@ -2043,7 +2069,7 @@ def cross_validation( def filename_validation( self, val_rule: str, - manifest: pd.core.frame.DataFrame, + manifest: pd.DataFrame, access_token: str, dataset_scope: str, project_scope: Optional[list] = None, @@ -2053,7 +2079,7 @@ def filename_validation( Validate the filenames in the manifest against the data paths in the fileview. Args: val_rule: str, Validation rule for the component - manifest: pd.core.frame.DataFrame, manifest + manifest: pd.DataFrame, manifest access_token: str, Asset Store access token dataset_scope: str, Dataset with files to validate against project_scope: Optional[list] = None: Projects to limit the scope of cross manifest validation to. diff --git a/tests/unit/test_validate_attribute.py b/tests/unit/test_validate_attribute.py index 90d42cf1e..5a9a4cb61 100644 --- a/tests/unit/test_validate_attribute.py +++ b/tests/unit/test_validate_attribute.py @@ -177,6 +177,7 @@ def fixture_test_df1() -> Generator[DataFrame, None, None]: } ) + @pytest.fixture(name="test_df_col_names") def fixture_test_df_col_names() -> Generator[dict[str, str], None, None]: """ @@ -474,7 +475,7 @@ def test_cross_validation_match_atleast_one_value_passing_one_df( va_obj: ValidateAttribute, rule: str, tested_column: list, - target_manifest: DataFrame + target_manifest: DataFrame, ): """ Tests ValidateAttribute.cross_validation @@ -532,7 +533,6 @@ def test_cross_validation_match_atleast_one_value_errors_one_df( (["C"], TEST_DF3), (["C", "C"], TEST_DF1), (["C", "C"], TEST_DF3), - (["A"], TEST_DF1), (["A", "A"], TEST_DF1), (["A", "B"], TEST_DF1), @@ -545,7 +545,7 @@ def test_cross_validation_match_exactly_one_value_passing_one_df( va_obj: ValidateAttribute, rule: str, tested_column: list, - target_manifest: DataFrame + target_manifest: DataFrame, ): """ Tests ValidateAttribute.cross_validation @@ -1577,7 +1577,7 @@ def test_get_no_entry( va_obj: ValidateAttribute, input_entry: str, node_name: str, - expected: bool + expected: bool, ) -> None: """ This test shows that: @@ -1605,7 +1605,7 @@ def test_get_entry_has_value( va_obj: ValidateAttribute, input_entry: str, node_name: str, - expected: bool + expected: bool, ) -> None: """ This test shows that: @@ -1626,22 +1626,18 @@ def test_get_entry_has_value( (Series(["x,x,x"], name="Check List"), "list strict"), (Series([], name="Check List"), "list like"), (Series([], name="Check List"), "list strict"), - (Series(["x"], name="Check List"), "list like"), (Series(["xxx"], name="Check List"), "list like"), (Series(["1"], name="Check List"), "list like"), (Series([1], name="Check List"), "list like"), (Series([1.1], name="Check List"), "list like"), - (Series([1,1,1], name="Check List"), "list like"), + (Series([1, 1, 1], name="Check List"), "list like"), (Series([np.nan], name="Check List"), "list like"), (Series([True], name="Check List"), "list like"), ], ) def test_list_validation_passing( - self, - va_obj: ValidateAttribute, - input_column: Series, - rule: str + self, va_obj: ValidateAttribute, input_column: Series, rule: str ) -> None: """ This tests ValidateAttribute.list_validation @@ -1661,17 +1657,14 @@ def test_list_validation_passing( (Series(["xxxx"], name="Check List")), (Series([1], name="Check List")), (Series([1.1], name="Check List")), - (Series([1,1,1], name="Check List")), + (Series([1, 1, 1], name="Check List")), (Series([np.nan], name="Check List")), (Series([True], name="Check List")), ], ) @pytest.mark.parametrize("rule", ["list strict", "list strict warning"]) def test_list_validation_not_passing( - self, - va_obj: ValidateAttribute, - input_column: Series, - rule: str + self, va_obj: ValidateAttribute, input_column: Series, rule: str ) -> None: """ This tests ValidateAttribute.list_validation @@ -1697,10 +1690,7 @@ def test_list_validation_not_passing( ], ) def test_regex_validation_passing( - self, - va_obj: ValidateAttribute, - input_column: Series, - rule: str + self, va_obj: ValidateAttribute, input_column: Series, rule: str ) -> None: """ This tests ValidateAttribute.regex_validation @@ -1718,10 +1708,7 @@ def test_regex_validation_passing( ], ) def test_regex_validation_failing( - self, - va_obj: ValidateAttribute, - input_column: Series, - rule: str + self, va_obj: ValidateAttribute, input_column: Series, rule: str ) -> None: """ This tests ValidateAttribute.regex_validation @@ -1737,16 +1724,11 @@ def test_regex_validation_failing( (Series(["a"]), "", ValidationError), (Series(["a"]), "regex", ValidationError), (Series(["a"]), "regex match", ValidationError), - (Series(["a"]), "regex match [a-f]", ValueError), ], ) def test_regex_validation_exceptions( - self, - va_obj: ValidateAttribute, - input_column: Series, - rule: str, - exception + self, va_obj: ValidateAttribute, input_column: Series, rule: str, exception ) -> None: """ This tests ValidateAttribute.regex_validation @@ -1758,6 +1740,23 @@ def test_regex_validation_exceptions( with pytest.raises(exception): va_obj.regex_validation(rule, input_column) + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["a,b,c"], name="Check Regex List"), "list::regex match [a-f]"), + (Series(["a,b,c", "d,e,f"], name="Check Regex List"), "list::regex match [a-f]"), + ], + ) + def test_regex_validation_with_list_column( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.regex_validation using a list column + """ + errors, warnings = va_obj.regex_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + ################# # type_validation ################# @@ -1773,13 +1772,10 @@ def test_regex_validation_exceptions( (Series([np.nan], name="Check Num"), "num"), (Series([np.nan], name="Check Int"), "int"), (Series([np.nan], name="Check Float"), "float"), - ] + ], ) def test_type_validation_passing( - self, - va_obj: ValidateAttribute, - input_column: Series, - rule: str + self, va_obj: ValidateAttribute, input_column: Series, rule: str ) -> None: """ This tests ValidateAttribute.type_validation @@ -1789,7 +1785,6 @@ def test_type_validation_passing( assert len(errors) == 0 assert len(warnings) == 0 - @pytest.mark.parametrize( "input_column, rule", [ @@ -1800,13 +1795,10 @@ def test_type_validation_passing( (Series(["a"], name="Check Int"), "int"), (Series([1], name="Check Float"), "float"), (Series(["a"], name="Check Float"), "float"), - ] + ], ) def test_type_validation_failing( - self, - va_obj: ValidateAttribute, - input_column: Series, - rule: str + self, va_obj: ValidateAttribute, input_column: Series, rule: str ) -> None: """ This tests ValidateAttribute.type_validation @@ -1816,7 +1808,6 @@ def test_type_validation_failing( assert len(errors) == 1 assert len(warnings) == 0 - ################ # url_validation ################ @@ -1825,8 +1816,13 @@ def test_type_validation_failing( "input_column", [ (Series([], name="Check URL")), - (Series([np.nan], name="Check URL")) - ] + (Series([np.nan], name="Check URL")), + ( + Series( + ["https://doi.org/10.1158/0008-5472.can-23-0128"], name="Check URL" + ) + ), + ], ) def test_url_validation_passing( self, @@ -1834,9 +1830,375 @@ def test_url_validation_passing( input_column: Series, ) -> None: """ - This tests ValidateAttribute.type_validation - This test shows passing examples using the type rule + This tests ValidateAttribute.url_validation + This test shows passing examples using the url rule """ errors, warnings = va_obj.url_validation("url", input_column) assert len(errors) == 0 assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column", + [(Series([""], name="Check URL")), (Series(["xxx"], name="Check URL"))], + ) + def test_url_validation_failing( + self, + va_obj: ValidateAttribute, + input_column: Series, + ) -> None: + """ + This tests ValidateAttribute.url_validation + This test shows failing examples using the url rule + """ + errors, warnings = va_obj.url_validation("url", input_column) + assert len(errors) > 0 + assert len(warnings) == 0 + + ####################### + # _parse_validation_log + ####################### + + @pytest.mark.parametrize( + "input_log, expected_invalid_rows, expected_invalid_entities, expected_manifest_ids", + [ + ({}, [], [], []), + ({"syn1": Series(["A"])}, ["2"], ["A"], ["syn1"]), + ({"syn1": Series(["A"], index=[1])}, ["3"], ["A"], ["syn1"]), + ({"syn1": Series(["A", "B"])}, ["2"], ["A"], ["syn1"]), + ( + {"syn1": Series(["A"]), "syn2": Series(["B"])}, + ["2"], + ["A", "B"], + ["syn1", "syn2"], + ), + ], + ) + def test__parse_validation_log( + self, + va_obj: ValidateAttribute, + input_log: dict[str, Series], + expected_invalid_rows: list[str], + expected_invalid_entities: list[str], + expected_manifest_ids: list[str], + ) -> None: + """ + This test shows that + - an empty log returns empty values + - only the first value in each series is returned as invalid entities + - the index of the invalid entity is returned incremented by 2 + - each manifest entity is returned + + """ + invalid_rows, invalid_entities, manifest_ids = va_obj._parse_validation_log( + input_log + ) + assert invalid_rows == expected_invalid_rows + assert sorted(invalid_entities) == expected_invalid_entities + assert manifest_ids == expected_manifest_ids + + ################################### + # _merge_format_invalid_rows_values + ################################### + + @pytest.mark.parametrize( + "input_series1, input_series2, expected_invalid_rows, expected_invalid_entry", + [ + (Series([], name="x"), Series([], name="x"), [], []), + (Series(["A"], name="x"), Series([], name="x"), ["2"], ["A"]), + (Series([], name="x"), Series(["B"], name="x"), ["2"], ["B"]), + (Series(["A"], name="x"), Series(["B"], name="x"), ["2"], ["A", "B"]), + (Series(["A"], name="x"), Series(["C"], name="x"), ["2"], ["A", "C"]), + ( + Series(["A", "B"], name="x"), + Series(["C"], name="x"), + ["2", "3"], + ["A", "B", "C"], + ), + ], + ) + def test__merge_format_invalid_rows_values( + self, + va_obj: ValidateAttribute, + input_series1: Series, + input_series2: Series, + expected_invalid_rows: list[str], + expected_invalid_entry: list[str], + ) -> None: + """ + This test shows that + - the names of the series must match + - the indices of both series get combined and incremented by 2 + - the values of both series are combined + """ + invalid_rows, invalid_entry = va_obj._merge_format_invalid_rows_values( + input_series1, input_series2 + ) + assert invalid_rows == expected_invalid_rows + assert invalid_entry == expected_invalid_entry + + ############################ + # _format_invalid_row_values + ############################ + + @pytest.mark.parametrize( + "input_series, expected_invalid_rows, expected_invalid_entry", + [ + (Series([]), [], []), + (Series(["A"]), ["2"], ["A"]), + (Series(["A", "B"]), ["2", "3"], ["A", "B"]), + ], + ) + def test__format_invalid_row_values( + self, + va_obj: ValidateAttribute, + input_series: Series, + expected_invalid_rows: list[str], + expected_invalid_entry: list[str], + ) -> None: + """ + This test shows that the indices of the input series is incremented by 2 + """ + invalid_rows, invalid_entry = va_obj._format_invalid_row_values(input_series) + assert invalid_rows == expected_invalid_rows + assert invalid_entry == expected_invalid_entry + + ########################################### + # _remove_non_entry_from_invalid_entry_list + ########################################### + + @pytest.mark.parametrize( + "input_entry, input_row_num, input_name, expected_invalid_entry, expected_row_num", + [ + # Cases where entry and row number remain unchanged + ([], [], "", [], []), + (None, None, "", None, None), + (["A"], None, "", ["A"], None), + (None, ["1"], "", None, ["1"]), + (["A"], ["1"], "x", ["A"], ["1"]), + (["A", "B"], ["1"], "x", ["A", "B"], ["1"]), + (["A"], ["1", "2"], "x", ["A"], ["1", "2"]), + # When there are missing values the value and the row number are removed + ([""], ["1"], "x", [], []), + (["", ""], ["1", "2"], "x", [], []), + (["", "A"], ["1", "2"], "x", ["A"], ["2"]), + # When there are more row numbers than values, and there are missing values + # then the row number that corresponds to the missing value is removed + ([""], ["1", "2"], "x", [], ["2"]), + (["", ""], ["1", "2", "3", "4"], "x", [], ["3", "4"]), + (["", "A"], ["1", "2", "3", "4"], "x", ["A"], ["2", "3", "4"]), + (["A", ""], ["1", "2", "3", "4"], "x", ["A"], ["1", "3", "4"]), + ], + ) + def test__remove_non_entry_from_invalid_entry_list( + self, + va_obj: ValidateAttribute, + input_entry: list[str], + input_row_num: list[str], + input_name: str, + expected_invalid_entry: list[str], + expected_row_num: list[str], + ) -> None: + """ + Tests for ValidateAttribute.remove_non_entry_from_invalid_entry_list + """ + invalid_entry, row_num = va_obj._remove_non_entry_from_invalid_entry_list( + input_entry, input_row_num, input_name + ) + assert invalid_entry == expected_invalid_entry + assert row_num == expected_row_num + + @pytest.mark.parametrize( + "input_entry, input_row_num, input_name, exception", + [ + # if first two inputs are not empty, an empty name string causes an IndexError + (["A"], ["1"], "", IndexError), + # if there are more invalid entries than row numbers, there is an IndexError + (["", ""], ["1"], "x", IndexError), + ], + ) + def test__remove_non_entry_from_invalid_entry_list_exceptions( + self, + va_obj: ValidateAttribute, + input_entry: list[str], + input_row_num: list[str], + input_name: str, + exception: Exception, + ) -> None: + """ + Tests for ValidateAttribute.remove_non_entry_from_invalid_entry_list that cause + exceptions + """ + with pytest.raises(exception): + va_obj._remove_non_entry_from_invalid_entry_list( + input_entry, input_row_num, input_name + ) + + #################################### + # _check_if_target_manifest_is_empty + #################################### + + @pytest.mark.parametrize( + "input_dataframe, input_bool_list, input_column_dict, output_bool_list", + [ + # Dataframes with only required columns are always considered_empty + ( + DataFrame({"component": [], "id": [], "entityid": []}), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + ( + DataFrame({"component": ["xxx"], "id": ["xxx"], "entityid": ["xxx"]}), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + # Dataframes with non-required columns whose only values are null are considered empty + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": [np.nan], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": [np.nan], + "col2": [np.nan], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + # Dataframes with non-required columns who have non-null values are not considered empty + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": ["xxx"], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [False], + ), + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": [np.nan], + "col2": ["xxx"], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [False], + ), + ], + ) + def test__check_if_target_manifest_is_empty( + self, + va_obj: ValidateAttribute, + input_dataframe: DataFrame, + input_bool_list: list[bool], + input_column_dict: dict[str, str], + output_bool_list: list[bool], + ) -> None: + """ + Tests for ValidateAttribute._check_if_target_manifest_is_empty + """ + bool_list = va_obj._check_if_target_manifest_is_empty( + input_dataframe, input_bool_list, input_column_dict + ) + assert bool_list == output_bool_list + + @pytest.mark.parametrize( + "input_dataframe, input_bool_list, input_column_dict, exception", + [ + # column name dict must have keys "component", "id", "entityid" + (DataFrame({"component": [], "id": [], "entityid": []}), [], {}, KeyError), + # dataframe must have columns "component", "id", "entityid" + ( + DataFrame(), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + KeyError, + ), + ], + ) + def test__check_if_target_manifest_is_empty_exceptions( + self, + va_obj: ValidateAttribute, + input_dataframe: DataFrame, + input_bool_list: list[bool], + input_column_dict: dict[str, str], + exception: Exception, + ) -> None: + """ + Tests for ValidateAttribute._check_if_target_manifest_is_empty that cause + exceptions + """ + with pytest.raises(exception): + va_obj._check_if_target_manifest_is_empty( + input_dataframe, input_bool_list, input_column_dict + ) + + ################# + # _get_rule_scope + ################# + + @pytest.mark.parametrize( + "input_rule, output_scope", + [ + # After splitting by spaces, the third element is returned + ("a b c", "c"), + ("a b c d", "c") + ], + ) + def test__get_rule_scope( + self, + va_obj: ValidateAttribute, + input_rule: str, + output_scope: str + ) -> None: + """ + Tests for ValidateAttribute._get_rule_scope + """ + assert va_obj._get_rule_scope(input_rule) == output_scope + + @pytest.mark.parametrize( + "input_rule, exception", + [ + # The rule must a string when split by spaces, have atleast three elements + ("", IndexError), + ("x", IndexError), + ("x x", IndexError), + ("x;x;x", IndexError) + ], + ) + def test__get_rule_scope_exceptions( + self, + va_obj: ValidateAttribute, + input_rule: str, + exception: Exception + ) -> None: + """ + Tests for ValidateAttribute._get_rule_scope that cause exceptions + """ + with pytest.raises(exception): + va_obj._get_rule_scope(input_rule) From 89c1cf03590af9341dce94c9114f22b7109f0032 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 4 Oct 2024 10:30:40 -0700 Subject: [PATCH 3/5] ran black --- schematic/models/validate_attribute.py | 12 ++++++------ tests/unit/test_validate_attribute.py | 19 ++++++++----------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index fe352a330..35e3c932c 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -932,8 +932,8 @@ def list_validation( # For each 'list' (input as a string with a , delimiter) entered, # convert to a real list of strings, with leading and trailing # white spaces removed. - errors:list[list[str]] = [] - warnings:list[list[str]] = [] + errors: list[list[str]] = [] + warnings: list[list[str]] = [] replace_null = True csv_re = comma_separated_list_regex() @@ -1025,8 +1025,8 @@ def regex_validation( f" They should be provided as follows ['regex', 'module name', 'regular expression']" ) - errors:list[list[str]] = [] - warnings:list[list[str]] = [] + errors: list[list[str]] = [] + warnings: list[list[str]] = [] validation_rules = self.dmge.get_node_validation_rules( node_display_name=manifest_col.name @@ -1128,8 +1128,8 @@ def type_validation( "str": (str), } - errors:list[list[str]] = [] - warnings:list[list[str]] = [] + errors: list[list[str]] = [] + warnings: list[list[str]] = [] # num indicates either a float or int. if val_rule == "num": diff --git a/tests/unit/test_validate_attribute.py b/tests/unit/test_validate_attribute.py index 5a9a4cb61..a24e9609c 100644 --- a/tests/unit/test_validate_attribute.py +++ b/tests/unit/test_validate_attribute.py @@ -1744,7 +1744,10 @@ def test_regex_validation_exceptions( "input_column, rule", [ (Series(["a,b,c"], name="Check Regex List"), "list::regex match [a-f]"), - (Series(["a,b,c", "d,e,f"], name="Check Regex List"), "list::regex match [a-f]"), + ( + Series(["a,b,c", "d,e,f"], name="Check Regex List"), + "list::regex match [a-f]", + ), ], ) def test_regex_validation_with_list_column( @@ -2167,14 +2170,11 @@ def test__check_if_target_manifest_is_empty_exceptions( [ # After splitting by spaces, the third element is returned ("a b c", "c"), - ("a b c d", "c") + ("a b c d", "c"), ], ) def test__get_rule_scope( - self, - va_obj: ValidateAttribute, - input_rule: str, - output_scope: str + self, va_obj: ValidateAttribute, input_rule: str, output_scope: str ) -> None: """ Tests for ValidateAttribute._get_rule_scope @@ -2188,14 +2188,11 @@ def test__get_rule_scope( ("", IndexError), ("x", IndexError), ("x x", IndexError), - ("x;x;x", IndexError) + ("x;x;x", IndexError), ], ) def test__get_rule_scope_exceptions( - self, - va_obj: ValidateAttribute, - input_rule: str, - exception: Exception + self, va_obj: ValidateAttribute, input_rule: str, exception: Exception ) -> None: """ Tests for ValidateAttribute._get_rule_scope that cause exceptions From 95d768673fab51f2dd543a2cd97382e07a1c2ec9 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Thu, 10 Oct 2024 11:57:35 -0700 Subject: [PATCH 4/5] Update schematic/models/validate_attribute.py Co-authored-by: BryanFauble <17128019+BryanFauble@users.noreply.github.com> --- schematic/models/validate_attribute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 35e3c932c..9247ac79e 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -1031,7 +1031,7 @@ def regex_validation( validation_rules = self.dmge.get_node_validation_rules( node_display_name=manifest_col.name ) - # It seems like this statement can ever be true + # It seems like this statement can never be true # self.dmge.get_node_validation_rules never returns a list with "::" even when # the attribute has the "list::regex" rule if validation_rules and "::" in validation_rules[0]: From 95a59c4ff841f5649749956a6c81f6fea0cfc222 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Oct 2024 08:41:58 -0700 Subject: [PATCH 5/5] added tests --- tests/unit/test_validate_attribute.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/unit/test_validate_attribute.py b/tests/unit/test_validate_attribute.py index a24e9609c..fa1070428 100644 --- a/tests/unit/test_validate_attribute.py +++ b/tests/unit/test_validate_attribute.py @@ -1811,6 +1811,24 @@ def test_type_validation_failing( assert len(errors) == 1 assert len(warnings) == 0 + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series([1], name="Check String"), "str error"), + (Series([1], name="Check String"), "str warning"), + ], + ) + def test_type_validation_does_not_work( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.type_validation + This test shows that the msg level parameter doesn't work + """ + errors, warnings = va_obj.type_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + ################ # url_validation ################