From 2bf538d82e081c4a5858f5ccd5e4e49b2b0a0721 Mon Sep 17 00:00:00 2001
From: Agistya Anugrah Dwiutama <agistyaanugrah@gmail.com>
Date: Tue, 5 Mar 2024 13:58:52 +0700
Subject: [PATCH] fix: character function

---
 datasae/profiling/profiling.py | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/datasae/profiling/profiling.py b/datasae/profiling/profiling.py
index 348c456..1778687 100644
--- a/datasae/profiling/profiling.py
+++ b/datasae/profiling/profiling.py
@@ -124,7 +124,7 @@ def check_missing_cells(data: list) -> int:
         return missing_cells
 
     @staticmethod
-    def check_characters_and_unicode(data: list) -> dict:
+    def check_characters(data: list) -> dict:
         """
         Check total characters and unicode of list of data.
 
@@ -136,21 +136,18 @@ def check_characters_and_unicode(data: list) -> dict:
         """
         total_characters = 0
         characters = ""
-        for row in data:
-            values = list(row.values())
-            for value in values:
-                if isinstance(value, str):
-                    value = re.sub(r"[^a-zA-Z]", "", value)
-                    total_characters += len(value)
-                    characters += value
-
-        characters = "".join(set(characters))
-
-        return {
-            "characters": total_characters,
-            "unicode": len(characters),
+        for value in data:
+            value = re.sub(r"[^a-zA-Z]", "", value)
+            total_characters += len(value)
+            characters += value
+
+        result = {
+            "total_characters": sum(len(i) for i in data),
+            "distinct_characters": len("".join(set(characters))),
         }
 
+        return result
+
     @staticmethod
     def check_data_types(data: list) -> dict:
         """
@@ -313,7 +310,7 @@ def check_coeff_var(data: list) -> float:
 
     def profiling(self):
         data = self.dataFrame.to_dict(orient="records")
-        data2 = self.dataFrame.to_dict()
+        data2 = self.dataFrame.to_dict(orient="list")
         result = {
             "overview": {
                 "number_of_observations": self.check_number_of_observations(
@@ -353,5 +350,6 @@ def profiling(self):
             if value == "Text":
                 result["variables"][key] = {
                     "data_type": value,
+                    "characters": self.check_characters(data2[key])
                 }
         return result