Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

popmon working with hgr v1.0.23 #101

Merged
merged 8 commits into from
Mar 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,18 @@ For Spark 2.X compiled against scala 2.11, in the string above simply replace 2.

`January 29, 2021`

Documentation
=============

The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.


Examples
========

- `Flight Delays and Cancellations Kaggle data <https://crclz.com/popmon/reports/flight_delays_report.html>`_
- `Synthetic data (code example below) <https://crclz.com/popmon/reports/test_data_report.html>`_

Documentation
=============

The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.


Notebooks
=========

Expand Down
10 changes: 7 additions & 3 deletions popmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,16 @@
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# flake8: noqa
# histogram and report functions
from histogrammar.dfinterface.make_histograms import (
get_bin_specs,
get_time_axes,
make_histograms,
)

# pandas/spark dataframe decorators
from popmon import decorators

# histogram and report functions
from .hist.filling import get_bin_specs, get_time_axes, make_histograms
from .pipeline.metrics import df_stability_metrics, stability_metrics
from .pipeline.report import df_stability_report, stability_report
from .stitching import stitch_histograms
Expand Down
2 changes: 1 addition & 1 deletion popmon/alerting/compute_tl_bounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def df_single_op_pull_bounds(
:param list cols: list of cols to calculate bounds of (optional)
"""
if len(df.index) == 0:
raise RuntimeError("input df has zero length")
raise ValueError("input df has zero length")
row = df.iloc[0]
return pull_bounds(
row, red_high, yellow_high, yellow_low, red_low, suffix_mean, suffix_std, cols
Expand Down
28 changes: 14 additions & 14 deletions popmon/analysis/comparison/hist_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
get_consistent_numpy_entries,
)
from ...base import Pipeline
from ...hist.histogram import HistogramContainer
from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric
from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2


Expand Down Expand Up @@ -78,21 +78,21 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
hist_name1 = cols[0]
hist_name2 = cols[1]
if not all([name in cols for name in [hist_name1, hist_name2]]):
raise RuntimeError("Need to provide two histogram column names.")
raise ValueError("Need to provide two histogram column names.")

# basic histogram checks
hc1 = row[hist_name1]
hc2 = row[hist_name2]
if not all([isinstance(hc, HistogramContainer) for hc in [hc1, hc2]]):
hist1 = row[hist_name1]
hist2 = row[hist_name2]
if not all([isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]]):
return x
if not check_similar_hists([hc1, hc2]):
if not check_similar_hists([hist1, hist2]):
return x

# compare
is_num = hc1.is_num
if hc1.n_dim == 1:
is_num = is_numeric(hist1)
if hist1.n_dim == 1:
if is_num:
numpy_1dhists = get_consistent_numpy_1dhists([hc1, hc2])
numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2])
entries_list = [nphist[0] for nphist in numpy_1dhists]
# KS-test only properly defined for (ordered) 1D interval variables
ks_testscore = ks_test(*entries_list)
Expand All @@ -101,14 +101,14 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
x["ks_pvalue"] = ks_pvalue
x["ks_zscore"] = -norm.ppf(ks_pvalue)
else: # categorical
entries_list = get_consistent_numpy_entries([hc1, hc2])
entries_list = get_consistent_numpy_entries([hist1, hist2])
# check consistency of bin_labels
labels1 = hc1.hist.bin_labels()
labels2 = hc2.hist.bin_labels()
labels1 = hist1.bin_labels()
labels2 = hist2.bin_labels()
subset = set(labels1) <= set(labels2)
unknown_labels = int(not subset)
elif hc1.n_dim == 2:
numpy_2dgrids = get_consistent_numpy_2dgrids([hc1, hc2])
elif hist1.n_dim == 2:
numpy_2dgrids = get_consistent_numpy_2dgrids([hist1, hist2])
entries_list = [entry.flatten() for entry in numpy_2dgrids]

# calculate pearson coefficient
Expand Down
43 changes: 22 additions & 21 deletions popmon/analysis/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
get_consistent_numpy_entries,
set_2dgrid,
)
from ..hist.histogram import HistogramContainer
from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric
from ..stats.numpy import probability_distribution_mean_covariance


Expand Down Expand Up @@ -311,7 +311,7 @@ def hist_sum(x, hist_name=""):

Usage: df['hists'].apply(hist_sum) ; series.apply(hist_sum)

:param pd.Series x: pandas series to extract HistogramContainer list from.
:param pd.Series x: pandas series to extract histogram list from.
:param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
:return: sum histogram
"""
Expand All @@ -324,27 +324,28 @@ def hist_sum(x, hist_name=""):
hist_name = "histogram"

if len(hist_list) == 0:
raise RuntimeError("List of input histograms is empty.")
raise ValueError("List of input histograms is empty.")

# initialize
o = pd.Series()
o[hist_name] = None

# basic checks
all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
if not all_hc:
all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
if not all_hist:
return o

similar = check_similar_hists(hist_list)
if not similar:
return o

# MB FIX: when summing with np.sum, histogrammar does not initialize h_sum correctly for SparselyBin (origin); the explicit zero()-based sum below does.
# h_sum = np.sum([hc.hist for hc in hist_list])
# h_sum = np.sum([hist for hist in hist_list])

h_sum = hist_list[0].hist.zero()
for hc in hist_list:
h_sum += hc.hist
o[hist_name] = HistogramContainer(h_sum)
h_sum = hist_list[0].zero()
for hist in hist_list:
h_sum += hist
o[hist_name] = h_sum
return o


Expand Down Expand Up @@ -386,7 +387,7 @@ def normalized_hist_mean_cov(x, hist_name=""):

Usage: df['hists'].apply(normalized_hist_mean_cov) ; series.apply(normalized_hist_mean_cov)

:param pd.Series x: pandas series to extract HistogramContainer list from.
:param pd.Series x: pandas series to extract histogram list from.
:param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
:return: mean normalized histogram, covariance probability matrix
"""
Expand All @@ -399,7 +400,7 @@ def normalized_hist_mean_cov(x, hist_name=""):
hist_name = "histogram"

if len(hist_list) == 0:
raise RuntimeError("List of input histograms is empty.")
raise ValueError("List of input histograms is empty.")

# initialize
o = pd.Series()
Expand All @@ -408,8 +409,8 @@ def normalized_hist_mean_cov(x, hist_name=""):
o[hist_name + "_binning"] = None

# basic checks
all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
if not all_hc:
all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
if not all_hist:
return o
similar = check_similar_hists(hist_list)
if not similar:
Expand Down Expand Up @@ -470,13 +471,13 @@ def relative_chi_squared(
if not all(r in row for r in required):
return x

hc = row[hist_name]
hist = row[hist_name]
norm_mean = row[hist_name + suffix_mean]
cov = row[hist_name + suffix_cov]
binning = row[hist_name + suffix_binning]

# basic checks
if not isinstance(hc, HistogramContainer):
if not isinstance(hist, COMMON_HIST_TYPES):
return x
if any([ho is None for ho in [norm_mean, cov, binning]]):
return x
Expand All @@ -486,15 +487,15 @@ def relative_chi_squared(
variance = np.diagonal(cov)

# get entries as numpy arrays
if hc.n_dim == 1:
if hist.n_dim == 1:
entries = (
hc.hist.bin_entries(xvalues=binning)
if hc.is_num
else hc.hist.bin_entries(labels=binning)
hist.bin_entries(xvalues=binning)
if is_numeric(hist)
else hist.bin_entries(labels=binning)
)
else:
assert len(binning) == 2
entries = set_2dgrid(hc.hist, binning[0], binning[1])
entries = set_2dgrid(hist, binning[0], binning[1])
entries = entries.flatten()

# calculation of mean normalized histogram and its covariance matrix of input histogram
Expand Down
Loading