From e6e20cc3538a2723075d7abe329572a47e54f89f Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Mon, 5 Feb 2024 15:59:00 -0600 Subject: [PATCH 1/2] update profiler utils --- dataprofiler/profilers/profiler_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index e38e1b04..5edc4898 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -26,6 +26,7 @@ ) import numpy as np +import polars as pl import psutil import scipy from pandas import DataFrame, Series @@ -331,6 +332,7 @@ def biased_skew(df_series: Series) -> np.float64: :return: biased skewness :rtype: np.float64 """ + df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) @@ -369,6 +371,7 @@ def biased_kurt(df_series: Series) -> np.float64: :return: biased kurtosis :rtype: np.float64 """ + df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) From bcf6b26917b690e7b35befb38fca173fefe3e843 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Mon, 5 Feb 2024 16:26:12 -0600 Subject: [PATCH 2/2] finish updates --- dataprofiler/profilers/numerical_column_stats.py | 8 ++++++++ dataprofiler/profilers/profiler_utils.py | 11 ++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 74c24e21..7fe05aee 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -1924,6 +1924,10 @@ def _get_skewness( ): return + if self._greater_than_64_bit and type(df_series) is pd.Series: + df_series = df_series.to_numpy(dtype=float) + else: + df_series = pl.from_pandas(df_series, nan_to_null=False) batch_biased_skewness = profiler_utils.biased_skew(df_series) subset_properties["biased_skewness"] = batch_biased_skewness batch_count = subset_properties["match_count"] @@ -1968,6 +1972,10 @@ def _get_kurtosis( ): return + if self._greater_than_64_bit and type(df_series) is pd.Series: + df_series = df_series.to_numpy(dtype=float) + else: + df_series = pl.from_pandas(df_series, nan_to_null=False) batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) subset_properties["biased_kurtosis"] = batch_biased_kurtosis batch_count = subset_properties["match_count"] diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index 5edc4898..a81dca7a 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -29,7 +29,8 @@ import polars as pl import psutil import scipy -from pandas import DataFrame, Series +from pandas import DataFrame +from polars import Series from ..labelers.data_labelers import DataLabeler @@ -321,7 +322,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict: return merged_dict -def biased_skew(df_series: Series) -> np.float64: +def biased_skew(df_series: Series | np.ndarray) -> np.float64: """ Calculate the biased estimator for skewness of the given data. @@ -332,7 +333,6 @@ def biased_skew(df_series: Series) -> np.float64: :return: biased skewness :rtype: np.float64 """ - df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) @@ -360,7 +360,7 @@ def biased_skew(df_series: Series) -> np.float64: return skew -def biased_kurt(df_series: Series) -> np.float64: +def biased_kurt(df_series: Series | np.ndarray) -> np.float64: """ Calculate the biased estimator for kurtosis of the given data. @@ -371,7 +371,6 @@ def biased_kurt(df_series: Series) -> np.float64: :return: biased kurtosis :rtype: np.float64 """ - df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) @@ -678,6 +677,8 @@ def get_memory_size(data: list | np.ndarray | DataFrame, unit: str = "M") -> flo :type unit: string :return: memory size of the input data """ + if type(data) is DataFrame: + data = pl.from_pandas(data) unit_map: dict = collections.defaultdict(B=0, K=1, M=2, G=3) if unit not in unit_map: raise ValueError(