Skip to content

Commit

Permalink
popmon working with hgr v1.0.22
Browse files Browse the repository at this point in the history
* popmon working with hgr v1.0.22
* removal of all HistogramContainer code
* all unit tests working
* fix all flake8 errors
* In readme.rst switch example and documentation sections
* bump up version to 0.3.15
  • Loading branch information
mbaak committed Mar 23, 2021
1 parent 7da807c commit e7f1122
Show file tree
Hide file tree
Showing 41 changed files with 570 additions and 3,040 deletions.
12 changes: 6 additions & 6 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,18 @@ For Spark 2.X compiled against scala 2.11, in the string above simply replace 2.

`January 29, 2021`

Documentation
=============

The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.


Examples
========

- `Flight Delays and Cancellations Kaggle data <https://crclz.com/popmon/reports/flight_delays_report.html>`_
- `Synthetic data (code example below) <https://crclz.com/popmon/reports/test_data_report.html>`_

Documentation
=============

The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.


Notebooks
=========

Expand Down
1 change: 1 addition & 0 deletions examples/flight_delays.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# flake8: noqa
import pandas as pd

import popmon
Expand Down
1 change: 1 addition & 0 deletions examples/synthetic_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# flake8: noqa
import pandas as pd

import popmon
Expand Down
2 changes: 1 addition & 1 deletion popmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from popmon import decorators

# histogram and report functions
from .hist.filling import get_bin_specs, get_time_axes, make_histograms
from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms
from .pipeline.metrics import df_stability_metrics, stability_metrics
from .pipeline.report import df_stability_report, stability_report
from .stitching import stitch_histograms
Expand Down
40 changes: 20 additions & 20 deletions popmon/analysis/apply_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,9 +283,9 @@ def apply_func(feature, selected_metrics, df, arr):

if (
"entire" in arr
and arr["entire"] is not None
and arr["entire"] is not False
and arr["entire"] != 0
and arr["entire"] is not None # noqa: W503
and arr["entire"] is not False # noqa: W503
and arr["entire"] != 0 # noqa: W503
):
obj = func(df, *args, **kwargs)
else:
Expand All @@ -302,48 +302,48 @@ def apply_func(feature, selected_metrics, df, arr):
obj = {"_".join(df.columns): obj}
elif (
isinstance(obj, (list, tuple, np.ndarray))
and isinstance(df, pd.DataFrame)
and len(df.columns) == len(obj)
and isinstance(df, pd.DataFrame) # noqa: W503
and len(df.columns) == len(obj) # noqa: W503
):
obj = {c: o for c, o in zip(df.columns, obj)}
elif (
isinstance(obj, (list, tuple, np.ndarray))
and isinstance(df, pd.Series)
and len(df.index) == len(obj)
and isinstance(df, pd.Series) # noqa: W503
and len(df.index) == len(obj) # noqa: W503
):
obj = {df.name: pd.Series(data=obj, index=df.index)}
elif (
isinstance(obj, (list, tuple, np.ndarray))
and isinstance(df, pd.DataFrame)
and len(df.index) == len(obj)
and isinstance(df, pd.DataFrame) # noqa: W503
and len(df.index) == len(obj) # noqa: W503
):
obj = {"_".join(df.columns): pd.Series(data=obj, index=df.index)}
elif (
isinstance(obj, pd.Series)
and isinstance(df, pd.Series)
and len(obj) == len(df)
and all(obj.index == df.index)
and isinstance(df, pd.Series) # noqa: W503
and len(obj) == len(df) # noqa: W503
and all(obj.index == df.index) # noqa: W503
):
obj = {df.name: obj}
elif (
isinstance(obj, pd.Series)
and isinstance(df, pd.DataFrame)
and len(obj) == len(df)
and all(obj.index == df.index)
and isinstance(df, pd.DataFrame) # noqa: W503
and len(obj) == len(df) # noqa: W503
and all(obj.index == df.index) # noqa: W503
):
obj = {"_".join(df.columns): obj}
elif (
isinstance(obj, pd.DataFrame)
and len(obj.columns) == 1
and len(obj.index) != len(df.index)
and len(obj.columns) == 1 # noqa: W503
and len(obj.index) != len(df.index) # noqa: W503
):
# e.g. output of normalized_hist_mean_cov: a dataframe with one column, actually a series
obj = obj[obj.columns[0]].to_dict()
elif (
isinstance(obj, pd.DataFrame)
and len(obj.columns) == 1
and len(obj.index) == len(df.index)
and (obj.index != df.index).any()
and len(obj.columns) == 1 # noqa: W503
and len(obj.index) == len(df.index) # noqa: W503
and (obj.index != df.index).any() # noqa: W503
):
# e.g. output of normalized_hist_mean_cov: a dataframe with one column, actually a series
obj = obj[obj.columns[0]].to_dict()
Expand Down
26 changes: 13 additions & 13 deletions popmon/analysis/comparison/hist_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
get_consistent_numpy_entries,
)
from ...base import Pipeline
from ...hist.histogram import HistogramContainer
from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2
from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric


def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
Expand Down Expand Up @@ -81,18 +81,18 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
raise RuntimeError("Need to provide two histogram column names.")

# basic histogram checks
hc1 = row[hist_name1]
hc2 = row[hist_name2]
if not all([isinstance(hc, HistogramContainer) for hc in [hc1, hc2]]):
hist1 = row[hist_name1]
hist2 = row[hist_name2]
if not all([isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]]):
return x
if not check_similar_hists([hc1, hc2]):
if not check_similar_hists([hist1, hist2]):
return x

# compare
is_num = hc1.is_num
if hc1.n_dim == 1:
is_num = is_numeric(hist1)
if hist1.n_dim == 1:
if is_num:
numpy_1dhists = get_consistent_numpy_1dhists([hc1, hc2])
numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2])
entries_list = [nphist[0] for nphist in numpy_1dhists]
# KS-test only properly defined for (ordered) 1D interval variables
ks_testscore = ks_test(*entries_list)
Expand All @@ -101,14 +101,14 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
x["ks_pvalue"] = ks_pvalue
x["ks_zscore"] = -norm.ppf(ks_pvalue)
else: # categorical
entries_list = get_consistent_numpy_entries([hc1, hc2])
entries_list = get_consistent_numpy_entries([hist1, hist2])
# check consistency of bin_labels
labels1 = hc1.hist.bin_labels()
labels2 = hc2.hist.bin_labels()
labels1 = hist1.bin_labels()
labels2 = hist2.bin_labels()
subset = set(labels1) <= set(labels2)
unknown_labels = int(not subset)
elif hc1.n_dim == 2:
numpy_2dgrids = get_consistent_numpy_2dgrids([hc1, hc2])
elif hist1.n_dim == 2:
numpy_2dgrids = get_consistent_numpy_2dgrids([hist1, hist2])
entries_list = [entry.flatten() for entry in numpy_2dgrids]

# calculate pearson coefficient
Expand Down
42 changes: 21 additions & 21 deletions popmon/analysis/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@
from scipy import linalg, stats
from scipy.stats import linregress, norm

from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric
from ..analysis.hist_numpy import (
check_similar_hists,
get_consistent_numpy_2dgrids,
get_consistent_numpy_entries,
set_2dgrid,
)
from ..hist.histogram import HistogramContainer
from ..stats.numpy import probability_distribution_mean_covariance


Expand Down Expand Up @@ -311,7 +311,7 @@ def hist_sum(x, hist_name=""):
Usage: df['hists'].apply(hist_sum) ; series.apply(hist_sum)
:param pd.Series x: pandas series to extract HistogramContainer list from.
:param pd.Series x: pandas series to extract histogram list from.
:param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
:return: sum histogram
"""
Expand All @@ -331,20 +331,21 @@ def hist_sum(x, hist_name=""):
o[hist_name] = None

# basic checks
all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
if not all_hc:
all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
if not all_hist:
return o

similar = check_similar_hists(hist_list)
if not similar:
return o

# MB FIX: when summing with np.sum, histogrammar does not initialize h_sum correctly for SparselyBin histograms (origin); the explicit zero()-seeded sum below does.
# h_sum = np.sum([hc.hist for hc in hist_list])
# h_sum = np.sum([hist for hist in hist_list])

h_sum = hist_list[0].hist.zero()
for hc in hist_list:
h_sum += hc.hist
o[hist_name] = HistogramContainer(h_sum)
h_sum = hist_list[0].zero()
for hist in hist_list:
h_sum += hist
o[hist_name] = h_sum
return o


Expand Down Expand Up @@ -386,7 +387,7 @@ def normalized_hist_mean_cov(x, hist_name=""):
Usage: df['hists'].apply(normalized_hist_mean_cov) ; series.apply(normalized_hist_mean_cov)
:param pd.Series x: pandas series to extract HistogramContainer list from.
:param pd.Series x: pandas series to extract histogram list from.
:param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
:return: mean normalized histogram, covariance probability matrix
"""
Expand All @@ -408,8 +409,8 @@ def normalized_hist_mean_cov(x, hist_name=""):
o[hist_name + "_binning"] = None

# basic checks
all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
if not all_hc:
all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
if not all_hist:
return o
similar = check_similar_hists(hist_list)
if not similar:
Expand Down Expand Up @@ -470,13 +471,13 @@ def relative_chi_squared(
if not all(r in row for r in required):
return x

hc = row[hist_name]
hist = row[hist_name]
norm_mean = row[hist_name + suffix_mean]
cov = row[hist_name + suffix_cov]
binning = row[hist_name + suffix_binning]

# basic checks
if not isinstance(hc, HistogramContainer):
if not isinstance(hist, COMMON_HIST_TYPES):
return x
if any([ho is None for ho in [norm_mean, cov, binning]]):
return x
Expand All @@ -486,23 +487,22 @@ def relative_chi_squared(
variance = np.diagonal(cov)

# get entries as numpy arrays
if hc.n_dim == 1:
if hist.n_dim == 1:
entries = (
hc.hist.bin_entries(xvalues=binning)
if hc.is_num
else hc.hist.bin_entries(labels=binning)
hist.bin_entries(xvalues=binning)
if is_numeric(hist)
else hist.bin_entries(labels=binning)
)
else:
assert len(binning) == 2
entries = set_2dgrid(hc.hist, binning[0], binning[1])
entries = set_2dgrid(hist, binning[0], binning[1])
entries = entries.flatten()

# calculation of mean normalized histogram and its covariance matrix of input histogram
single_norm, _ = probability_distribution_mean_covariance([entries])

if (
np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps
and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps
np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps
):
# check if covariance matrix is invertible
# see: https://stackoverflow.com/questions/13249108/efficient-pythonic-check-for-singular-matrix
Expand Down
Loading

0 comments on commit e7f1122

Please sign in to comment.