popmon working with hgr v1.0.22

* popmon working with hgr v1.0.22
* removal of all HistogramContainer code
* all unit tests working
* fix all flake8 errors
* In README.rst switch example and documentation sections
* bump up version to 0.3.15
ing-bank · Mar 23, 2021 · e7f1122 · e7f1122
1 parent 7da807c
commit e7f1122
Show file tree

Hide file tree

Showing 41 changed files with 570 additions and 3,040 deletions.
diff --git a/README.rst b/README.rst
@@ -35,18 +35,18 @@ For Spark 2.X compiled against scala 2.11, in the string above simply replace 2.
 
 `January 29, 2021`
 
-Documentation
-=============
-
-The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.
-
-
 Examples
 ========
 
 - `Flight Delays and Cancellations Kaggle data <https://crclz.com/popmon/reports/flight_delays_report.html>`_
 - `Synthetic data (code example below) <https://crclz.com/popmon/reports/test_data_report.html>`_
 
+Documentation
+=============
+
+The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.
+
+
 Notebooks
 =========
 

diff --git a/examples/flight_delays.py b/examples/flight_delays.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 import pandas as pd
 
 import popmon

diff --git a/examples/synthetic_data.py b/examples/synthetic_data.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 import pandas as pd
 
 import popmon

diff --git a/popmon/__init__.py b/popmon/__init__.py
@@ -23,7 +23,7 @@
 from popmon import decorators
 
 # histogram and report functions
-from .hist.filling import get_bin_specs, get_time_axes, make_histograms
+from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms
 from .pipeline.metrics import df_stability_metrics, stability_metrics
 from .pipeline.report import df_stability_report, stability_report
 from .stitching import stitch_histograms

diff --git a/popmon/analysis/apply_func.py b/popmon/analysis/apply_func.py
@@ -283,9 +283,9 @@ def apply_func(feature, selected_metrics, df, arr):
 
     if (
         "entire" in arr
-        and arr["entire"] is not None
-        and arr["entire"] is not False
-        and arr["entire"] != 0
+        and arr["entire"] is not None  # noqa: W503
+        and arr["entire"] is not False  # noqa: W503
+        and arr["entire"] != 0  # noqa: W503
     ):
         obj = func(df, *args, **kwargs)
     else:
@@ -302,48 +302,48 @@ def apply_func(feature, selected_metrics, df, arr):
         obj = {"_".join(df.columns): obj}
     elif (
         isinstance(obj, (list, tuple, np.ndarray))
-        and isinstance(df, pd.DataFrame)
-        and len(df.columns) == len(obj)
+        and isinstance(df, pd.DataFrame)  # noqa: W503
+        and len(df.columns) == len(obj)  # noqa: W503
     ):
         obj = {c: o for c, o in zip(df.columns, obj)}
     elif (
         isinstance(obj, (list, tuple, np.ndarray))
-        and isinstance(df, pd.Series)
-        and len(df.index) == len(obj)
+        and isinstance(df, pd.Series)  # noqa: W503
+        and len(df.index) == len(obj)  # noqa: W503
     ):
         obj = {df.name: pd.Series(data=obj, index=df.index)}
     elif (
         isinstance(obj, (list, tuple, np.ndarray))
-        and isinstance(df, pd.DataFrame)
-        and len(df.index) == len(obj)
+        and isinstance(df, pd.DataFrame)  # noqa: W503
+        and len(df.index) == len(obj)  # noqa: W503
     ):
         obj = {"_".join(df.columns): pd.Series(data=obj, index=df.index)}
     elif (
         isinstance(obj, pd.Series)
-        and isinstance(df, pd.Series)
-        and len(obj) == len(df)
-        and all(obj.index == df.index)
+        and isinstance(df, pd.Series)  # noqa: W503
+        and len(obj) == len(df)  # noqa: W503
+        and all(obj.index == df.index)  # noqa: W503
     ):
         obj = {df.name: obj}
     elif (
         isinstance(obj, pd.Series)
-        and isinstance(df, pd.DataFrame)
-        and len(obj) == len(df)
-        and all(obj.index == df.index)
+        and isinstance(df, pd.DataFrame)  # noqa: W503
+        and len(obj) == len(df)  # noqa: W503
+        and all(obj.index == df.index)  # noqa: W503
     ):
         obj = {"_".join(df.columns): obj}
     elif (
         isinstance(obj, pd.DataFrame)
-        and len(obj.columns) == 1
-        and len(obj.index) != len(df.index)
+        and len(obj.columns) == 1  # noqa: W503
+        and len(obj.index) != len(df.index)  # noqa: W503
     ):
         # e.g. output of normalized_hist_mean_cov: a dataframe with one column, actually a series
         obj = obj[obj.columns[0]].to_dict()
     elif (
         isinstance(obj, pd.DataFrame)
-        and len(obj.columns) == 1
-        and len(obj.index) == len(df.index)
-        and (obj.index != df.index).any()
+        and len(obj.columns) == 1  # noqa: W503
+        and len(obj.index) == len(df.index)  # noqa: W503
+        and (obj.index != df.index).any()  # noqa: W503
     ):
         # e.g. output of normalized_hist_mean_cov: a dataframe with one column, actually a series
         obj = obj[obj.columns[0]].to_dict()

diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py
@@ -39,8 +39,8 @@
     get_consistent_numpy_entries,
 )
 from ...base import Pipeline
-from ...hist.histogram import HistogramContainer
 from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2
+from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric
 
 
 def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
@@ -81,18 +81,18 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
         raise RuntimeError("Need to provide two histogram column names.")
 
     # basic histogram checks
-    hc1 = row[hist_name1]
-    hc2 = row[hist_name2]
-    if not all([isinstance(hc, HistogramContainer) for hc in [hc1, hc2]]):
+    hist1 = row[hist_name1]
+    hist2 = row[hist_name2]
+    if not all([isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]]):
         return x
-    if not check_similar_hists([hc1, hc2]):
+    if not check_similar_hists([hist1, hist2]):
         return x
 
     # compare
-    is_num = hc1.is_num
-    if hc1.n_dim == 1:
+    is_num = is_numeric(hist1)
+    if hist1.n_dim == 1:
         if is_num:
-            numpy_1dhists = get_consistent_numpy_1dhists([hc1, hc2])
+            numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2])
             entries_list = [nphist[0] for nphist in numpy_1dhists]
             # KS-test only properly defined for (ordered) 1D interval variables
             ks_testscore = ks_test(*entries_list)
@@ -101,14 +101,14 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
             x["ks_pvalue"] = ks_pvalue
             x["ks_zscore"] = -norm.ppf(ks_pvalue)
         else:  # categorical
-            entries_list = get_consistent_numpy_entries([hc1, hc2])
+            entries_list = get_consistent_numpy_entries([hist1, hist2])
             # check consistency of bin_labels
-            labels1 = hc1.hist.bin_labels()
-            labels2 = hc2.hist.bin_labels()
+            labels1 = hist1.bin_labels()
+            labels2 = hist2.bin_labels()
             subset = set(labels1) <= set(labels2)
             unknown_labels = int(not subset)
-    elif hc1.n_dim == 2:
-        numpy_2dgrids = get_consistent_numpy_2dgrids([hc1, hc2])
+    elif hist1.n_dim == 2:
+        numpy_2dgrids = get_consistent_numpy_2dgrids([hist1, hist2])
         entries_list = [entry.flatten() for entry in numpy_2dgrids]
 
     # calculate pearson coefficient

diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py
@@ -24,13 +24,13 @@
 from scipy import linalg, stats
 from scipy.stats import linregress, norm
 
+from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric
 from ..analysis.hist_numpy import (
     check_similar_hists,
     get_consistent_numpy_2dgrids,
     get_consistent_numpy_entries,
     set_2dgrid,
 )
-from ..hist.histogram import HistogramContainer
 from ..stats.numpy import probability_distribution_mean_covariance
 
 
@@ -311,7 +311,7 @@ def hist_sum(x, hist_name=""):
 
     Usage: df['hists'].apply(hist_sum) ; series.apply(hist_sum)
 
-    :param pd.Series x: pandas series to extract HistogramContainer list from.
+    :param pd.Series x: pandas series to extract histogram list from.
     :param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
     :return: sum histogram
     """
@@ -331,20 +331,21 @@ def hist_sum(x, hist_name=""):
     o[hist_name] = None
 
     # basic checks
-    all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
-    if not all_hc:
+    all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
+    if not all_hist:
         return o
+
     similar = check_similar_hists(hist_list)
     if not similar:
         return o
 
     # MB FIX: h_sum not initialized correctly in a sum by histogrammar for sparselybin (origin); below it is.
-    # h_sum = np.sum([hc.hist for hc in hist_list])
+    # h_sum = np.sum([hist for hist in hist_list])
 
-    h_sum = hist_list[0].hist.zero()
-    for hc in hist_list:
-        h_sum += hc.hist
-    o[hist_name] = HistogramContainer(h_sum)
+    h_sum = hist_list[0].zero()
+    for hist in hist_list:
+        h_sum += hist
+    o[hist_name] = h_sum
     return o
 
 
@@ -386,7 +387,7 @@ def normalized_hist_mean_cov(x, hist_name=""):
 
     Usage: df['hists'].apply(normalized_hist_mean_cov) ; series.apply(normalized_hist_mean_cov)
 
-    :param pd.Series x: pandas series to extract HistogramContainer list from.
+    :param pd.Series x: pandas series to extract histogram list from.
     :param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
     :return: mean normalized histogram, covariance probability matrix
     """
@@ -408,8 +409,8 @@ def normalized_hist_mean_cov(x, hist_name=""):
     o[hist_name + "_binning"] = None
 
     # basic checks
-    all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
-    if not all_hc:
+    all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
+    if not all_hist:
         return o
     similar = check_similar_hists(hist_list)
     if not similar:
@@ -470,13 +471,13 @@ def relative_chi_squared(
     if not all(r in row for r in required):
         return x
 
-    hc = row[hist_name]
+    hist = row[hist_name]
     norm_mean = row[hist_name + suffix_mean]
     cov = row[hist_name + suffix_cov]
     binning = row[hist_name + suffix_binning]
 
     # basic checks
-    if not isinstance(hc, HistogramContainer):
+    if not isinstance(hist, COMMON_HIST_TYPES):
         return x
     if any([ho is None for ho in [norm_mean, cov, binning]]):
         return x
@@ -486,23 +487,22 @@ def relative_chi_squared(
     variance = np.diagonal(cov)
 
     # get entries as numpy arrays
-    if hc.n_dim == 1:
+    if hist.n_dim == 1:
         entries = (
-            hc.hist.bin_entries(xvalues=binning)
-            if hc.is_num
-            else hc.hist.bin_entries(labels=binning)
+            hist.bin_entries(xvalues=binning)
+            if is_numeric(hist)
+            else hist.bin_entries(labels=binning)
         )
     else:
         assert len(binning) == 2
-        entries = set_2dgrid(hc.hist, binning[0], binning[1])
+        entries = set_2dgrid(hist, binning[0], binning[1])
         entries = entries.flatten()
 
     # calculation of mean normalized histogram and its covariance matrix of input histogram
     single_norm, _ = probability_distribution_mean_covariance([entries])
 
     if (
-        np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps
-        and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps
+        np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps
     ):
         # check if covariance matrix is invertible
         # see: https://stackoverflow.com/questions/13249108/efficient-pythonic-check-for-singular-matrix