diff --git a/datasae/string.py b/datasae/string.py
index 4e3b1dd..ab65ad6 100644
--- a/datasae/string.py
+++ b/datasae/string.py
@@ -34,6 +34,40 @@ def __init__(self, dataFrame: pd.DataFrame):
         """
         self.dataFrame = dataFrame
 
+    @staticmethod
+    def check_is_in_contain(string_data: list, compare_data: str) -> tuple:
+        """
+        check_is_in_contain method.
+
+        Check whether a string contains at least one of the given
+        substrings.
+
+        Args:
+            string_data (list): The substrings to look for. A single
+                string is treated as a one-element list.
+            compare_data (str): The string value to be checked.
+
+        Returns:
+            tuple: A tuple containing the following elements:
+                - valid (int): The number of valid values (either 0 or 1).
+                - invalid (int): The number of invalid values (either 0 or 1).
+                - warning_data (dict): A dictionary with warning data if the
+                    value is invalid, including the warning message,
+                    the actual value, and a detailed message.
+        """
+        # A bare string would otherwise be matched character by character.
+        substrings = (
+            [string_data] if isinstance(string_data, str) else string_data
+        )
+        if any(substring in compare_data for substring in substrings):
+            return 1, 0, {}
+
+        warning_data = create_warning_data(
+            compare_data,
+            f"Value should be contain to {string_data}",
+        )
+        return 0, 1, warning_data
+
     @staticmethod
     def check_contain(string_data: str, compare_data: str) -> tuple:
         """
@@ -297,6 +331,48 @@ def check_is_capitalize_all_word(str_data: str) -> tuple:
 
         return valid, invalid, warning_data
 
+    def is_in_contain(self, str_is_in_contain: list, column: str) -> dict:
+        """
+        is_in_contain method.
+
+        Data quality check for is_in_contain: a row is valid when its
+        string value contains at least one of the given substrings.
+
+        Args:
+            str_is_in_contain: list of substrings to look for
+            column: column name that want to check
+
+        Returns:
+            dict: A dictionary containing the result of the data quality check,
+                including the number of valid and invalid values,
+                and any warning messages.
+        """
+        valid = 0
+        invalid = 0
+        warning = {}
+
+        for index, str_data in enumerate(self.dataFrame[column]):
+            if not isinstance(str_data, str):
+                # Non-string rows are counted as invalid with a type warning.
+                invalid += 1
+                warning_data = create_warning_data(
+                    str_data,
+                    WarningDataDetailMessage.STRING_DATA_TYPE,
+                    WarningDataMessage.INVALID_DATA_TYPE,
+                )
+                warning[index] = InvalidDataTypeWarning(warning_data).message
+                continue
+
+            valid_row, invalid_row, warning_data = self.check_is_in_contain(
+                str_is_in_contain, str_data
+            )
+            valid += valid_row
+            invalid += invalid_row
+            if warning_data:
+                warning[index] = InvalidDataValueWarning(warning_data).message
+
+        return self.response(valid, invalid, warning)
+
     def contain(self, str_contain, column) -> dict:
         """
         Contain method.
diff --git a/tests/test_string.py b/tests/test_string.py
index 5950235..27080d9 100644
--- a/tests/test_string.py
+++ b/tests/test_string.py
@@ -23,6 +23,76 @@ def __init__(self, methodName: str = "TestString"):
         super().__init__(methodName)
         self.maxDiff = None
 
+    def test_is_in_contain_valid(self):
+        """test_is_in_contain_valid."""
+        dummy = pd.DataFrame(
+            {
+                "column": [
+                    "Masa",
+                    "Jelas",
+                    "Gelas",
+                    "Keras",
+                    "Panas",
+                    "Masak",
+                    "Bersama-sama",
+                    "Jelas",
+                    "Pantas",
+                    "Tas",
+                ]
+            }
+        )
+
+        actual_result = String(dummy).is_in_contain(["sa", "as"], "column")
+        expected_result = {
+            "score": 1.0,
+            "valid": 10,
+            "invalid": 0,
+            "warning": {},
+        }
+
+        self.assertDictEqual(actual_result, expected_result, MESSAGE)
+
+    def test_is_in_contain_invalid(self):
+        """test_is_in_contain_invalid."""
+        dummy = pd.DataFrame(
+            {
+                "column": [
+                    "Masa",
+                    "Jelas",
+                    "Gelas",
+                    "Keras",
+                    "Panas",
+                    "Masak",
+                    "Bersama-sama",
+                    "Jelas",
+                    "Pantas",
+                    "Tas",
+                    "Laptop",
+                    10,
+                ]
+            }
+        )
+
+        actual_result = String(dummy).is_in_contain(["sa", "as"], "column")
+        expected_result = {
+            "score": 0.8333333333333334,
+            "valid": 10,
+            "invalid": 2,
+            "warning": {
+                10: create_warning_data(
+                    "Laptop",
+                    "Value should be contain to ['sa', 'as']",
+                    WarningDataMessage.INVALID_VALUE,
+                ),
+                11: create_warning_data(
+                    10,
+                    WarningDataDetailMessage.STRING_DATA_TYPE,
+                    WarningDataMessage.INVALID_DATA_TYPE,
+                ),
+            },
+        }
+        self.assertDictEqual(actual_result, expected_result, MESSAGE)
+
     def test_contain_valid(self):
         """test_contain_valid."""
         dummy = pd.DataFrame(