ing-bank · sbrugman · Mar 27, 2021 · Feb 8, 2021 · Mar 24, 2021 · Mar 27, 2021
diff --git a/README.rst b/README.rst
@@ -35,18 +35,18 @@ For Spark 2.X compiled against scala 2.11, in the string above simply replace 2.
 
 `January 29, 2021`
 
-Documentation
-=============
-
-The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.
-
-
 Examples
 ========
 
 - `Flight Delays and Cancellations Kaggle data <https://crclz.com/popmon/reports/flight_delays_report.html>`_
 - `Synthetic data (code example below) <https://crclz.com/popmon/reports/test_data_report.html>`_
 
+Documentation
+=============
+
+The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.
+
+
 Notebooks
 =========
 

diff --git a/popmon/__init__.py b/popmon/__init__.py
@@ -18,12 +18,16 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
-# flake8: noqa
+# histogram and report functions
+from histogrammar.dfinterface.make_histograms import (
+    get_bin_specs,
+    get_time_axes,
+    make_histograms,
+)
+
 # pandas/spark dataframe decorators
 from popmon import decorators
 
-# histogram and report functions
-from .hist.filling import get_bin_specs, get_time_axes, make_histograms
 from .pipeline.metrics import df_stability_metrics, stability_metrics
 from .pipeline.report import df_stability_report, stability_report
 from .stitching import stitch_histograms

diff --git a/popmon/alerting/compute_tl_bounds.py b/popmon/alerting/compute_tl_bounds.py
@@ -329,7 +329,7 @@ def df_single_op_pull_bounds(
     :param list cols: list of cols to calculate bounds of (optional)
     """
     if len(df.index) == 0:
-        raise RuntimeError("input df has zero length")
+        raise ValueError("input df has zero length")
     row = df.iloc[0]
     return pull_bounds(
         row, red_high, yellow_high, yellow_low, red_low, suffix_mean, suffix_std, cols

diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py
@@ -39,7 +39,7 @@
     get_consistent_numpy_entries,
 )
 from ...base import Pipeline
-from ...hist.histogram import HistogramContainer
+from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric
 from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2
 
 
@@ -78,21 +78,21 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
         hist_name1 = cols[0]
         hist_name2 = cols[1]
     if not all([name in cols for name in [hist_name1, hist_name2]]):
-        raise RuntimeError("Need to provide two histogram column names.")
+        raise ValueError("Need to provide two histogram column names.")
 
     # basic histogram checks
-    hc1 = row[hist_name1]
-    hc2 = row[hist_name2]
-    if not all([isinstance(hc, HistogramContainer) for hc in [hc1, hc2]]):
+    hist1 = row[hist_name1]
+    hist2 = row[hist_name2]
+    if not all([isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]]):
         return x
-    if not check_similar_hists([hc1, hc2]):
+    if not check_similar_hists([hist1, hist2]):
         return x
 
     # compare
-    is_num = hc1.is_num
-    if hc1.n_dim == 1:
+    is_num = is_numeric(hist1)
+    if hist1.n_dim == 1:
         if is_num:
-            numpy_1dhists = get_consistent_numpy_1dhists([hc1, hc2])
+            numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2])
             entries_list = [nphist[0] for nphist in numpy_1dhists]
             # KS-test only properly defined for (ordered) 1D interval variables
             ks_testscore = ks_test(*entries_list)
@@ -101,14 +101,14 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
             x["ks_pvalue"] = ks_pvalue
             x["ks_zscore"] = -norm.ppf(ks_pvalue)
         else:  # categorical
-            entries_list = get_consistent_numpy_entries([hc1, hc2])
+            entries_list = get_consistent_numpy_entries([hist1, hist2])
             # check consistency of bin_labels
-            labels1 = hc1.hist.bin_labels()
-            labels2 = hc2.hist.bin_labels()
+            labels1 = hist1.bin_labels()
+            labels2 = hist2.bin_labels()
             subset = set(labels1) <= set(labels2)
             unknown_labels = int(not subset)
-    elif hc1.n_dim == 2:
-        numpy_2dgrids = get_consistent_numpy_2dgrids([hc1, hc2])
+    elif hist1.n_dim == 2:
+        numpy_2dgrids = get_consistent_numpy_2dgrids([hist1, hist2])
         entries_list = [entry.flatten() for entry in numpy_2dgrids]
 
     # calculate pearson coefficient

diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py
@@ -30,7 +30,7 @@
     get_consistent_numpy_entries,
     set_2dgrid,
 )
-from ..hist.histogram import HistogramContainer
+from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric
 from ..stats.numpy import probability_distribution_mean_covariance
 
 
@@ -311,7 +311,7 @@ def hist_sum(x, hist_name=""):
 
     Usage: df['hists'].apply(hist_sum) ; series.apply(hist_sum)
 
-    :param pd.Series x: pandas series to extract HistogramContainer list from.
+    :param pd.Series x: pandas series to extract histogram list from.
     :param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
     :return: sum histogram
     """
@@ -324,27 +324,28 @@ def hist_sum(x, hist_name=""):
             hist_name = "histogram"
 
     if len(hist_list) == 0:
-        raise RuntimeError("List of input histograms is empty.")
+        raise ValueError("List of input histograms is empty.")
 
     # initialize
     o = pd.Series()
     o[hist_name] = None
 
     # basic checks
-    all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
-    if not all_hc:
+    all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
+    if not all_hist:
         return o
+
     similar = check_similar_hists(hist_list)
     if not similar:
         return o
 
     # MB FIX: h_sum not initialized correctly in a sum by histogrammar for sparselybin (origin); below it is.
-    # h_sum = np.sum([hc.hist for hc in hist_list])
+    # h_sum = np.sum(hist_list)
 
-    h_sum = hist_list[0].hist.zero()
-    for hc in hist_list:
-        h_sum += hc.hist
-    o[hist_name] = HistogramContainer(h_sum)
+    h_sum = hist_list[0].zero()
+    for hist in hist_list:
+        h_sum += hist
+    o[hist_name] = h_sum
     return o
 
 
@@ -386,7 +387,7 @@ def normalized_hist_mean_cov(x, hist_name=""):
 
     Usage: df['hists'].apply(normalized_hist_mean_cov) ; series.apply(normalized_hist_mean_cov)
 
-    :param pd.Series x: pandas series to extract HistogramContainer list from.
+    :param pd.Series x: pandas series to extract histogram list from.
     :param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional)
     :return: mean normalized histogram, covariance probability matrix
     """
@@ -399,7 +400,7 @@ def normalized_hist_mean_cov(x, hist_name=""):
             hist_name = "histogram"
 
     if len(hist_list) == 0:
-        raise RuntimeError("List of input histograms is empty.")
+        raise ValueError("List of input histograms is empty.")
 
     # initialize
     o = pd.Series()
@@ -408,8 +409,8 @@ def normalized_hist_mean_cov(x, hist_name=""):
     o[hist_name + "_binning"] = None
 
     # basic checks
-    all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list])
-    if not all_hc:
+    all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list])
+    if not all_hist:
         return o
     similar = check_similar_hists(hist_list)
     if not similar:
@@ -470,13 +471,13 @@ def relative_chi_squared(
     if not all(r in row for r in required):
         return x
 
-    hc = row[hist_name]
+    hist = row[hist_name]
     norm_mean = row[hist_name + suffix_mean]
     cov = row[hist_name + suffix_cov]
     binning = row[hist_name + suffix_binning]
 
     # basic checks
-    if not isinstance(hc, HistogramContainer):
+    if not isinstance(hist, COMMON_HIST_TYPES):
         return x
     if any([ho is None for ho in [norm_mean, cov, binning]]):
         return x
@@ -486,15 +487,15 @@ def relative_chi_squared(
     variance = np.diagonal(cov)
 
     # get entries as numpy arrays
-    if hc.n_dim == 1:
+    if hist.n_dim == 1:
         entries = (
-            hc.hist.bin_entries(xvalues=binning)
-            if hc.is_num
-            else hc.hist.bin_entries(labels=binning)
+            hist.bin_entries(xvalues=binning)
+            if is_numeric(hist)
+            else hist.bin_entries(labels=binning)
         )
     else:
         assert len(binning) == 2
-        entries = set_2dgrid(hc.hist, binning[0], binning[1])
+        entries = set_2dgrid(hist, binning[0], binning[1])
         entries = entries.flatten()
 
     # calculation of mean normalized histogram and its covariance matrix of input histogram