Skip to content

Commit

Permalink
fix: add invalid dates to variable info
Browse files: browse the repository at this point in the history
  • Loading branch information
alexbarros committed Dec 3, 2024
1 parent 72d282d commit 11cd49b
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 6 deletions.
20 changes: 19 additions & 1 deletion src/ydata_profiling/model/pandas/describe_date_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
series_handle_nulls,
series_hashable,
)
from ydata_profiling.model.typeset_relations import is_pandas_1


def string_to_datetime(series: pd.Series) -> pd.Series:
    """Parse ``series`` as datetimes, mapping unparseable entries to ``NaT``.

    Args:
        series: Values to hand to ``pd.to_datetime``.

    Returns:
        The result of ``pd.to_datetime`` called with ``errors="coerce"``, so
        entries that cannot be parsed become ``NaT`` instead of raising.
    """
    parse_options = {"errors": "coerce"}
    if not is_pandas_1():
        # ``format="mixed"`` is only passed when ``is_pandas_1()`` is false --
        # presumably because pandas 1.x does not accept it; confirm against
        # the helper's definition in ``typeset_relations``.
        parse_options["format"] = "mixed"
    return pd.to_datetime(series, **parse_options)


@describe_date_1d.register
Expand All @@ -29,6 +36,12 @@ def pandas_describe_date_1d(
Returns:
A dict containing calculated series description values.
"""
og_series = series.dropna()
series = string_to_datetime(og_series)
invalid_values = og_series[series.isna()]

series = series.dropna()

if summary["value_counts_without_nan"].empty:
values = series.values
summary.update(
Expand All @@ -53,5 +66,10 @@ def pandas_describe_date_1d(
if config.vars.num.chi_squared_threshold > 0.0:
summary["chi_squared"] = chi_square(values)

summary.update(histogram_compute(config, values, summary["n_distinct"]))
summary.update(histogram_compute(config, values, series.nunique()))
summary.update({
"invalid_dates": invalid_values.nunique(),
"n_invalid_dates": len(invalid_values),
"p_invalid_dates": len(invalid_values) / summary["n"],
})
return config, values, summary
3 changes: 0 additions & 3 deletions src/ydata_profiling/model/pandas/summary_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@ def pandas_describe_1d(
and series.name in typeset.type_schema
):
vtype = typeset.type_schema[series.name]
infered_vtype = typeset.infer_type(series)
if vtype == infered_vtype:
series = typeset.cast_to_inferred(series)

elif config.infer_dtypes:
# Infer variable types
Expand Down
4 changes: 2 additions & 2 deletions src/ydata_profiling/model/typeset_relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ def string_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool:

def string_to_datetime(series: pd.Series, state: dict) -> pd.Series:
if is_pandas_1():
return pd.to_datetime(series, errors="coerce")
return pd.to_datetime(series, format="mixed", errors="coerce")
return pd.to_datetime(series)
return pd.to_datetime(series, format="mixed")


def string_to_numeric(series: pd.Series, state: dict) -> pd.Series:
Expand Down
10 changes: 10 additions & 0 deletions src/ydata_profiling/report/structure/variables/render_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,16 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
[
{"name": "Minimum", "value": fmt(summary["min"]), "alert": False},
{"name": "Maximum", "value": fmt(summary["max"]), "alert": False},
{
"name": "Invalid dates",
"value": fmt(summary["n_invalid_dates"]),
"alert": False,
},
{
"name": "Invalid dates (%)",
"value": fmt_percent(summary["p_invalid_dates"]),
"alert": False,
},
],
style=config.html.style,
)
Expand Down

0 comments on commit 11cd49b

Please sign in to comment.