Skip to content

Commit

Permalink
Data Explorer: More gracefully handle summary statistics for all-null…
Browse files Browse the repository at this point in the history
… arrays for pandas and polars (#4329)

As described in #4307, for pandas we were raising a warning when
computing summary stats for all-null / NA arrays. The polars
implementation turned out to have a different bug, which is also fixed
and tested here.

### QA Notes

Follow the example reported in #4307 for both pandas and polars (for
polars, substitute a column whose values are all None)
  • Loading branch information
wesm committed Aug 14, 2024
1 parent 3701b09 commit 55429d0
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -930,42 +930,33 @@ def _pandas_datetimetz_mapper(type_name):
def _pandas_summarize_number(col: "pd.Series", options: FormatOptions):
float_format = _get_float_formatter(options)

min_val = max_val = median_val = mean_val = std_val = None
if "complex" in str(col.dtype):
min_val = max_val = median_val = mean_val = None
values = col.to_numpy()
non_null_values = values[~np_.isnan(values)]
if len(non_null_values) > 0:
median_val = float_format(np_.median(non_null_values))
mean_val = float_format(np_.mean(non_null_values))
return _box_number_stats(
min_val,
max_val,
mean_val,
median_val,
None, # std_val
)
else:
return _get_float_stats_generic(col, float_format)

non_null_values = col[col.notna()].to_numpy() # type: ignore

def _get_float_stats_generic(col, float_format):
# This works for both pandas and polars because their API for
# these functions is the same
median_val = mean_val = std_val = None
if len(non_null_values) > 0:
min_val = non_null_values.min()
max_val = non_null_values.max()

min_val = col.min()
max_val = col.max()
if not _isinf(min_val) and not _isinf(max_val):
# These stats are not defined when there is an
# inf/-inf in the data
mean_val = float_format(non_null_values.mean())
median_val = float_format(np_.median(non_null_values))
std_val = float_format(non_null_values.std(ddof=1))

if not _isinf(min_val) and not _isinf(max_val):
# These stats are not defined when there is an
# inf/-inf in the data
mean_val = float_format(col.mean())
median_val = float_format(col.median())
std_val = float_format(col.std())
min_val = float_format(min_val)
max_val = float_format(max_val)

return _box_number_stats(
float_format(min_val),
float_format(max_val),
min_val,
max_val,
mean_val,
median_val,
std_val,
Expand Down Expand Up @@ -1839,7 +1830,31 @@ def _parse_iso8601_like(x, tz=None):

def _polars_summarize_number(col: "pl.Series", options: FormatOptions):
float_format = _get_float_formatter(options)
return _get_float_stats_generic(col, float_format)
min_val = max_val = median_val = mean_val = std_val = None

is_empty = col.null_count() == len(col)

if not is_empty:
min_val = col.min()
max_val = col.max()

if not _isinf(min_val) and not _isinf(max_val): # type: ignore
# These stats are not defined when there is an
# inf/-inf in the data
mean_val = float_format(col.mean())
median_val = float_format(col.median())
std_val = float_format(col.std())

min_val = float_format(min_val)
max_val = float_format(max_val)

return _box_number_stats(
min_val,
max_val,
mean_val,
median_val,
std_val,
)


def _polars_summarize_string(col: "pl.Series", _):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2416,6 +2416,7 @@ def test_pandas_profile_summary_stats(dxf: DataExplorerFixture):
), # datetime single tz
"f7": [1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j, np.nan] * 20, # complex,
"f8": [np.nan, np.inf, -np.inf, 0, np.nan] * 20, # with infinity
"f9": [np.nan] * 100,
}
)

Expand Down Expand Up @@ -2548,6 +2549,11 @@ def test_pandas_profile_summary_stats(dxf: DataExplorerFixture):
8,
{"min_value": "-INF", "max_value": "INF"},
),
(
"df1",
9,
{},
),
(
"df_mixed_tz1",
0,
Expand Down Expand Up @@ -3381,6 +3387,7 @@ def test_polars_profile_summary_stats(dxf: DataExplorerFixture):
dtype=pl.Datetime("ms", time_zone="UTC"),
), # datetime single tz
"f7": [np.nan, np.inf, -np.inf, 0, np.nan] * 20, # with infinity
"f8": pl.Series([None] * 100, dtype=pl.Float64),
}
)

Expand Down Expand Up @@ -3468,6 +3475,11 @@ def test_polars_profile_summary_stats(dxf: DataExplorerFixture):
7,
{"min_value": "-INF", "max_value": "INF"},
),
(
"df1",
8,
{},
),
]

for table_name, col_index, ex_result in cases:
Expand Down

0 comments on commit 55429d0

Please sign in to comment.