Skip to content

Commit

Permalink
Merge pull request #16 from jabardigitalservice/feature/string_is_in_…
Browse files Browse the repository at this point in the history
…contain

Feature/string is in contain
  • Loading branch information
pipinfitriadi authored Feb 2, 2024
2 parents 1859b8f + e09b6b4 commit ff0f9a9
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 0 deletions.
81 changes: 81 additions & 0 deletions datasae/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,42 @@ def __init__(self, dataFrame: pd.DataFrame):
"""
self.dataFrame = dataFrame

@staticmethod
def check_is_in_contain(string_data: list, compare_data: str) -> tuple:
    """
    check_is_in_contain method.

    Check whether a value contains at least one of the given substrings.

    Note the argument roles: despite its name, ``string_data`` holds the
    substrings to look for and ``compare_data`` is the value that is
    searched. This is the order in which ``is_in_contain`` passes them.

    Args:
        string_data (list): Substrings to search for. A single ``str`` is
            also accepted and is treated as one substring instead of
            being split into characters.
        compare_data (str): The value to search in.

    Returns:
        tuple: A tuple containing the following elements:
            - valid (int): 1 if any substring occurs in the value,
                otherwise 0.
            - invalid (int): 1 if no substring occurs in the value,
                otherwise 0.
            - warning_data (dict): Empty when the value is valid;
                otherwise the result of ``create_warning_data`` for the
                rejected value.
    """
    # Iterating a bare string would test its individual characters and
    # accept almost any value, so wrap it into a one-element list.
    if isinstance(string_data, str):
        patterns = [string_data]
    else:
        patterns = string_data

    if any(pattern in compare_data for pattern in patterns):
        return 1, 0, {}

    # The message keeps the caller's original argument so existing
    # warning texts stay unchanged.
    warning_data = create_warning_data(
        compare_data,
        f"Value should be contain to {string_data}",
    )
    return 0, 1, warning_data

@staticmethod
def check_contain(string_data: str, compare_data: str) -> tuple:
"""
Expand Down Expand Up @@ -297,6 +333,51 @@ def check_is_capitalize_all_word(str_data: str) -> tuple:

return valid, invalid, warning_data

def is_in_contain(self, str_is_in_contain: list, column: str) -> dict:
    """
    is_in_contain method.

    Data quality check: every value in the column must contain at least
    one of the given substrings.

    Args:
        str_is_in_contain (list): Substrings to look for in each value.
        column (str): Name of the column to check.

    Returns:
        dict: The result built by ``self.response`` from the number of
            valid and invalid values and the warnings, which are keyed
            by the row's position in the column.
    """
    valid = 0
    invalid = 0
    warning = {}

    for index, str_data in enumerate(self.dataFrame[column]):
        if not isinstance(str_data, str):
            # Non-string values are counted as invalid and reported as a
            # data type problem instead of being searched.
            invalid += 1
            warning_data = create_warning_data(
                str_data,
                WarningDataDetailMessage.STRING_DATA_TYPE,
                WarningDataMessage.INVALID_DATA_TYPE,
            )
            warning[index] = InvalidDataTypeWarning(warning_data).message
            continue

        valid_row, invalid_row, warning_data = self.check_is_in_contain(
            str_is_in_contain, str_data
        )
        valid += valid_row
        invalid += invalid_row
        if warning_data:
            warning[index] = InvalidDataValueWarning(warning_data).message

    return self.response(valid, invalid, warning)

def contain(self, str_contain, column) -> dict:
"""
Contain method.
Expand Down
70 changes: 70 additions & 0 deletions tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,76 @@ def __init__(self, methodName: str = "TestString"):
super().__init__(methodName)
self.maxDiff = None

def test_is_in_contain_valid(self):
    """Every value contains 'sa' or 'as', so all ten rows are valid."""
    values = [
        "Masa",
        "Jelas",
        "Gelas",
        "Keras",
        "Panas",
        "Masak",
        "Bersama-sama",
        "Jelas",
        "Pantas",
        "Tas",
    ]
    frame = pd.DataFrame({"column": values})

    actual_result = String(frame).is_in_contain(["sa", "as"], "column")

    self.assertDictEqual(
        actual_result,
        {"score": 1.0, "valid": 10, "invalid": 0, "warning": {}},
        MESSAGE,
    )

def test_is_in_contain_invalid(self):
    """Rows with no match or with a non-string value are reported."""
    values = [
        "Masa",
        "Jelas",
        "Gelas",
        "Keras",
        "Panas",
        "Masak",
        "Bersama-sama",
        "Jelas",
        "Pantas",
        "Tas",
        "Laptop",
        10,
    ]
    frame = pd.DataFrame({"column": values})

    actual_result = String(frame).is_in_contain(["sa", "as"], "column")

    # Row 10 is a string without any of the substrings.
    no_match_warning = create_warning_data(
        "Laptop",
        "Value should be contain to ['sa', 'as']",
        WarningDataMessage.INVALID_VALUE,
    )
    # Row 11 is not a string at all.
    wrong_type_warning = create_warning_data(
        10,
        WarningDataDetailMessage.STRING_DATA_TYPE,
        WarningDataMessage.INVALID_DATA_TYPE,
    )
    expected_result = {
        "score": 0.8333333333333334,
        "valid": 10,
        "invalid": 2,
        "warning": {10: no_match_warning, 11: wrong_type_warning},
    }

    self.assertDictEqual(actual_result, expected_result, MESSAGE)

def test_contain_valid(self):
"""test_contain_valid."""
dummy = pd.DataFrame(
Expand Down

0 comments on commit ff0f9a9

Please sign in to comment.