From 11cd49bd5c2fc4bc6a5e239448bd17dd68832b3c Mon Sep 17 00:00:00 2001 From: alexbarros Date: Tue, 3 Dec 2024 14:37:51 -0300 Subject: [PATCH] fix: add invalid dates to variable info --- .../model/pandas/describe_date_pandas.py | 20 ++++++++++++++++++- .../model/pandas/summary_pandas.py | 3 --- .../model/typeset_relations.py | 4 ++-- .../report/structure/variables/render_date.py | 10 ++++++++++ 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index 1ff64a50f..072a4671c 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -11,6 +11,13 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.typeset_relations import is_pandas_1 + + +def string_to_datetime(series: pd.Series) -> pd.Series: + if is_pandas_1(): + return pd.to_datetime(series, errors="coerce") + return pd.to_datetime(series, format="mixed", errors="coerce") @describe_date_1d.register @@ -29,6 +36,12 @@ def pandas_describe_date_1d( Returns: A dict containing calculated series description values. """ + og_series = series.dropna() + series = string_to_datetime(og_series) + invalid_values = og_series[series.isna()] + + series = series.dropna() + if summary["value_counts_without_nan"].empty: values = series.values summary.update( @@ -53,5 +66,10 @@ def pandas_describe_date_1d( if config.vars.num.chi_squared_threshold > 0.0: summary["chi_squared"] = chi_square(values) - summary.update(histogram_compute(config, values, summary["n_distinct"])) + summary.update(histogram_compute(config, values, series.nunique())) + summary.update({ + "invalid_dates": invalid_values.nunique(), + "n_invalid_dates": len(invalid_values), + "p_invalid_dates": len(invalid_values) / summary["n"], + }) return config, values, summary diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index df12dc8cf..5d15b2d3c 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -44,9 +44,6 @@ def pandas_describe_1d( and series.name in typeset.type_schema ): vtype = typeset.type_schema[series.name] - infered_vtype = typeset.infer_type(series) - if vtype == infered_vtype: - series = typeset.cast_to_inferred(series) elif config.infer_dtypes: # Infer variable types diff --git a/src/ydata_profiling/model/typeset_relations.py b/src/ydata_profiling/model/typeset_relations.py index 61614527a..beac15b51 100644 --- a/src/ydata_profiling/model/typeset_relations.py +++ b/src/ydata_profiling/model/typeset_relations.py @@ -114,8 +114,8 @@ def string_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool: def string_to_datetime(series: pd.Series, state: dict) -> pd.Series: if is_pandas_1(): - return pd.to_datetime(series, errors="coerce") - return pd.to_datetime(series, format="mixed", errors="coerce") + return pd.to_datetime(series) + return pd.to_datetime(series, format="mixed") def string_to_numeric(series: pd.Series, state: dict) -> pd.Series: diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py index c75a80a5e..1f142daae 100644 --- a/src/ydata_profiling/report/structure/variables/render_date.py +++ b/src/ydata_profiling/report/structure/variables/render_date.py @@ -62,6 +62,16 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: [ {"name": "Minimum", "value": fmt(summary["min"]), "alert": False}, {"name": "Maximum", "value": fmt(summary["max"]), "alert": False}, + { + "name": "Invalid dates", + "value": fmt(summary["n_invalid_dates"]), + "alert": False, + }, + { + "name": "Invalid dates (%)", + "value": fmt_percent(summary["p_invalid_dates"]), + "alert": False, + }, ], style=config.html.style, )