From 2bf538d82e081c4a5858f5ccd5e4e49b2b0a0721 Mon Sep 17 00:00:00 2001 From: Agistya Anugrah Dwiutama Date: Tue, 5 Mar 2024 13:58:52 +0700 Subject: [PATCH] fix: charachter function --- datasae/profiling/profiling.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/datasae/profiling/profiling.py b/datasae/profiling/profiling.py index 348c456..1778687 100644 --- a/datasae/profiling/profiling.py +++ b/datasae/profiling/profiling.py @@ -124,7 +124,7 @@ def check_missing_cells(data: list) -> int: return missing_cells @staticmethod - def check_characters_and_unicode(data: list) -> dict: + def check_characters(data: list) -> dict: """ Check total characters and unicode of list of data. @@ -136,21 +136,18 @@ def check_characters_and_unicode(data: list) -> dict: """ total_characters = 0 characters = "" - for row in data: - values = list(row.values()) - for value in values: - if isinstance(value, str): - value = re.sub(r"[^a-zA-Z]", "", value) - total_characters += len(value) - characters += value - - characters = "".join(set(characters)) - - return { - "characters": total_characters, - "unicode": len(characters), + for value in data: + value = re.sub(r"[^a-zA-Z]", "", value) + total_characters += len(value) + characters += value + + result = { + "total_characters": sum(len(i) for i in data), + "distinct_characters": len("".join(set(characters))), } + return result + @staticmethod def check_data_types(data: list) -> dict: """ @@ -313,7 +310,7 @@ def check_coeff_var(data: list) -> float: def profiling(self): data = self.dataFrame.to_dict(orient="records") - data2 = self.dataFrame.to_dict() + data2 = self.dataFrame.to_dict(orient="list") result = { "overview": { "number_of_observations": self.check_number_of_observations( @@ -353,5 +350,6 @@ def profiling(self): if value == "Text": result["variables"][key] = { "data_type": value, + "characters": self.check_characters(data2[key]) } return result