From e2a08a599a05a6b0587d99fe2102decec06f3339 Mon Sep 17 00:00:00 2001 From: Max Baak Date: Mon, 8 Feb 2021 12:19:46 +0100 Subject: [PATCH] popmon working with hgr v1.0.22 * popmon working with hgr v1.0.22 * removal of all HistogramContainer code * all unit tests working * fix all flake8 errors * In readme.rst switch example and documentation sections * bump up version to 0.3.15 bump up histogrammar version to 1.0.23 [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.rst | 12 +- examples/flight_delays.py | 1 + examples/synthetic_data.py | 1 + popmon/__init__.py | 9 +- popmon/analysis/apply_func.py | 40 +- popmon/analysis/comparison/hist_comparer.py | 26 +- popmon/analysis/functions.py | 39 +- popmon/analysis/hist_numpy.py | 95 ++-- popmon/analysis/profiling/hist_profiler.py | 44 +- popmon/config.py | 6 +- popmon/decorators/pandas.py | 5 +- popmon/decorators/spark.py | 3 +- popmon/hist/filling/__init__.py | 10 +- popmon/hist/filling/histogram_filler_base.py | 495 ------------------ popmon/hist/filling/make_histograms.py | 299 ----------- popmon/hist/filling/numpy_histogrammar.py | 107 ---- popmon/hist/filling/pandas_histogrammar.py | 264 ---------- popmon/hist/filling/spark_histogrammar.py | 251 --------- popmon/hist/filling/utils.py | 222 -------- popmon/hist/hist_splitter.py | 26 +- popmon/hist/hist_utils.py | 313 +++++++++++ popmon/hist/histogram.py | 360 ------------- popmon/hist/patched_histogrammer.py | 128 ----- .../notebooks/popmon_tutorial_advanced.ipynb | 4 +- popmon/pipeline/metrics.py | 6 +- popmon/pipeline/report.py | 6 +- popmon/stitching/hist_stitcher.py | 19 +- popmon/version.py | 4 +- popmon/visualization/histogram_section.py | 2 +- requirements.txt | 2 +- setup.py | 2 +- .../analysis/profiling/test_hist_profiler.py | 10 +- tests/popmon/analysis/test_functions.py | 4 +- tests/popmon/analysis/test_hist_numpy.py | 112 ++-- tests/popmon/hist/test_histogram.py | 87 ++- 
tests/popmon/hist/test_numpy_histogrammar.py | 93 ---- tests/popmon/hist/test_pandas_histogrammar.py | 231 -------- tests/popmon/hist/test_spark_histogrammar.py | 255 --------- tests/popmon/pipeline/test_report.py | 14 +- tests/popmon/stats/test_numpy.py | 10 +- .../visualization/test_report_generator.py | 2 +- 41 files changed, 582 insertions(+), 3037 deletions(-) delete mode 100644 popmon/hist/filling/histogram_filler_base.py delete mode 100644 popmon/hist/filling/make_histograms.py delete mode 100644 popmon/hist/filling/numpy_histogrammar.py delete mode 100644 popmon/hist/filling/pandas_histogrammar.py delete mode 100644 popmon/hist/filling/spark_histogrammar.py delete mode 100644 popmon/hist/filling/utils.py create mode 100644 popmon/hist/hist_utils.py delete mode 100644 popmon/hist/histogram.py delete mode 100644 popmon/hist/patched_histogrammer.py delete mode 100644 tests/popmon/hist/test_numpy_histogrammar.py delete mode 100644 tests/popmon/hist/test_pandas_histogrammar.py delete mode 100644 tests/popmon/hist/test_spark_histogrammar.py diff --git a/README.rst b/README.rst index 57a7271d..de927f57 100644 --- a/README.rst +++ b/README.rst @@ -35,18 +35,18 @@ For Spark 2.X compiled against scala 2.11, in the string above simply replace 2. `January 29, 2021` -Documentation -============= - -The entire `popmon` documentation including tutorials can be found at `read-the-docs `_. - - Examples ======== - `Flight Delays and Cancellations Kaggle data `_ - `Synthetic data (code example below) `_ +Documentation +============= + +The entire `popmon` documentation including tutorials can be found at `read-the-docs `_. 
+ + Notebooks ========= diff --git a/examples/flight_delays.py b/examples/flight_delays.py index 657cff06..df628983 100644 --- a/examples/flight_delays.py +++ b/examples/flight_delays.py @@ -1,3 +1,4 @@ +# flake8: noqa import pandas as pd import popmon diff --git a/examples/synthetic_data.py b/examples/synthetic_data.py index b219a40b..d2f95974 100644 --- a/examples/synthetic_data.py +++ b/examples/synthetic_data.py @@ -1,3 +1,4 @@ +# flake8: noqa import pandas as pd import popmon diff --git a/popmon/__init__.py b/popmon/__init__.py index e73f9560..6427670d 100644 --- a/popmon/__init__.py +++ b/popmon/__init__.py @@ -18,12 +18,17 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# histogram and report functions +from histogrammar.dfinterface.make_histograms import ( + get_bin_specs, + get_time_axes, + make_histograms, +) + # flake8: noqa # pandas/spark dataframe decorators from popmon import decorators -# histogram and report functions -from .hist.filling import get_bin_specs, get_time_axes, make_histograms from .pipeline.metrics import df_stability_metrics, stability_metrics from .pipeline.report import df_stability_report, stability_report from .stitching import stitch_histograms diff --git a/popmon/analysis/apply_func.py b/popmon/analysis/apply_func.py index 339e5032..d2535bca 100644 --- a/popmon/analysis/apply_func.py +++ b/popmon/analysis/apply_func.py @@ -283,9 +283,9 @@ def apply_func(feature, selected_metrics, df, arr): if ( "entire" in arr - and arr["entire"] is not None - and arr["entire"] is not False - and arr["entire"] != 0 + and arr["entire"] is not None # noqa: W503 + and arr["entire"] is not False # noqa: W503 + and arr["entire"] != 0 # noqa: W503 ): obj = func(df, *args, **kwargs) else: @@ -302,48 +302,48 @@ def apply_func(feature, selected_metrics, df, arr): obj = {"_".join(df.columns): obj} elif ( isinstance(obj, (list, tuple, np.ndarray)) - and isinstance(df, pd.DataFrame) - and len(df.columns) == len(obj) + and 
isinstance(df, pd.DataFrame) # noqa: W503 + and len(df.columns) == len(obj) # noqa: W503 ): obj = {c: o for c, o in zip(df.columns, obj)} elif ( isinstance(obj, (list, tuple, np.ndarray)) - and isinstance(df, pd.Series) - and len(df.index) == len(obj) + and isinstance(df, pd.Series) # noqa: W503 + and len(df.index) == len(obj) # noqa: W503 ): obj = {df.name: pd.Series(data=obj, index=df.index)} elif ( isinstance(obj, (list, tuple, np.ndarray)) - and isinstance(df, pd.DataFrame) - and len(df.index) == len(obj) + and isinstance(df, pd.DataFrame) # noqa: W503 + and len(df.index) == len(obj) # noqa: W503 ): obj = {"_".join(df.columns): pd.Series(data=obj, index=df.index)} elif ( isinstance(obj, pd.Series) - and isinstance(df, pd.Series) - and len(obj) == len(df) - and all(obj.index == df.index) + and isinstance(df, pd.Series) # noqa: W503 + and len(obj) == len(df) # noqa: W503 + and all(obj.index == df.index) # noqa: W503 ): obj = {df.name: obj} elif ( isinstance(obj, pd.Series) - and isinstance(df, pd.DataFrame) - and len(obj) == len(df) - and all(obj.index == df.index) + and isinstance(df, pd.DataFrame) # noqa: W503 + and len(obj) == len(df) # noqa: W503 + and all(obj.index == df.index) # noqa: W503 ): obj = {"_".join(df.columns): obj} elif ( isinstance(obj, pd.DataFrame) - and len(obj.columns) == 1 - and len(obj.index) != len(df.index) + and len(obj.columns) == 1 # noqa: W503 + and len(obj.index) != len(df.index) # noqa: W503 ): # e.g. output of normalized_hist_mean_cov: a dataframe with one column, actually a series obj = obj[obj.columns[0]].to_dict() elif ( isinstance(obj, pd.DataFrame) - and len(obj.columns) == 1 - and len(obj.index) == len(df.index) - and (obj.index != df.index).any() + and len(obj.columns) == 1 # noqa: W503 + and len(obj.index) == len(df.index) # noqa: W503 + and (obj.index != df.index).any() # noqa: W503 ): # e.g. 
output of normalized_hist_mean_cov: a dataframe with one column, actually a series obj = obj[obj.columns[0]].to_dict() diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 3cdcfc5f..db005a64 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -39,7 +39,7 @@ get_consistent_numpy_entries, ) from ...base import Pipeline -from ...hist.histogram import HistogramContainer +from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2 @@ -81,18 +81,18 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): raise RuntimeError("Need to provide two histogram column names.") # basic histogram checks - hc1 = row[hist_name1] - hc2 = row[hist_name2] - if not all([isinstance(hc, HistogramContainer) for hc in [hc1, hc2]]): + hist1 = row[hist_name1] + hist2 = row[hist_name2] + if not all([isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]]): return x - if not check_similar_hists([hc1, hc2]): + if not check_similar_hists([hist1, hist2]): return x # compare - is_num = hc1.is_num - if hc1.n_dim == 1: + is_num = is_numeric(hist1) + if hist1.n_dim == 1: if is_num: - numpy_1dhists = get_consistent_numpy_1dhists([hc1, hc2]) + numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2]) entries_list = [nphist[0] for nphist in numpy_1dhists] # KS-test only properly defined for (ordered) 1D interval variables ks_testscore = ks_test(*entries_list) @@ -101,14 +101,14 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): x["ks_pvalue"] = ks_pvalue x["ks_zscore"] = -norm.ppf(ks_pvalue) else: # categorical - entries_list = get_consistent_numpy_entries([hc1, hc2]) + entries_list = get_consistent_numpy_entries([hist1, hist2]) # check consistency of bin_labels - labels1 = hc1.hist.bin_labels() - labels2 = hc2.hist.bin_labels() + labels1 = hist1.bin_labels() + labels2 = 
hist2.bin_labels() subset = set(labels1) <= set(labels2) unknown_labels = int(not subset) - elif hc1.n_dim == 2: - numpy_2dgrids = get_consistent_numpy_2dgrids([hc1, hc2]) + elif hist1.n_dim == 2: + numpy_2dgrids = get_consistent_numpy_2dgrids([hist1, hist2]) entries_list = [entry.flatten() for entry in numpy_2dgrids] # calculate pearson coefficient diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py index a6ce140c..131140e5 100644 --- a/popmon/analysis/functions.py +++ b/popmon/analysis/functions.py @@ -30,7 +30,7 @@ get_consistent_numpy_entries, set_2dgrid, ) -from ..hist.histogram import HistogramContainer +from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric from ..stats.numpy import probability_distribution_mean_covariance @@ -311,7 +311,7 @@ def hist_sum(x, hist_name=""): Usage: df['hists'].apply(hist_sum) ; series.apply(hist_sum) - :param pd.Series x: pandas series to extract HistogramContainer list from. + :param pd.Series x: pandas series to extract histogram list from. :param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional) :return: sum histogram """ @@ -331,20 +331,21 @@ def hist_sum(x, hist_name=""): o[hist_name] = None # basic checks - all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list]) - if not all_hc: + all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list]) + if not all_hist: return o + similar = check_similar_hists(hist_list) if not similar: return o # MB FIX: h_sum not initialized correctly in a sum by histogrammar for sparselybin (origin); below it is. 
- # h_sum = np.sum([hc.hist for hc in hist_list]) + # h_sum = np.sum([hist for hist in hist_list]) - h_sum = hist_list[0].hist.zero() - for hc in hist_list: - h_sum += hc.hist - o[hist_name] = HistogramContainer(h_sum) + h_sum = hist_list[0].zero() + for hist in hist_list: + h_sum += hist + o[hist_name] = h_sum return o @@ -386,7 +387,7 @@ def normalized_hist_mean_cov(x, hist_name=""): Usage: df['hists'].apply(normalized_hist_mean_cov) ; series.apply(normalized_hist_mean_cov) - :param pd.Series x: pandas series to extract HistogramContainer list from. + :param pd.Series x: pandas series to extract histogram list from. :param str hist_name: name of column to extract histograms from. needs to be set with axis=1 (optional) :return: mean normalized histogram, covariance probability matrix """ @@ -408,8 +409,8 @@ def normalized_hist_mean_cov(x, hist_name=""): o[hist_name + "_binning"] = None # basic checks - all_hc = all([isinstance(hc, HistogramContainer) for hc in hist_list]) - if not all_hc: + all_hist = all([isinstance(hist, COMMON_HIST_TYPES) for hist in hist_list]) + if not all_hist: return o similar = check_similar_hists(hist_list) if not similar: @@ -470,13 +471,13 @@ def relative_chi_squared( if not all(r in row for r in required): return x - hc = row[hist_name] + hist = row[hist_name] norm_mean = row[hist_name + suffix_mean] cov = row[hist_name + suffix_cov] binning = row[hist_name + suffix_binning] # basic checks - if not isinstance(hc, HistogramContainer): + if not isinstance(hist, COMMON_HIST_TYPES): return x if any([ho is None for ho in [norm_mean, cov, binning]]): return x @@ -486,15 +487,15 @@ def relative_chi_squared( variance = np.diagonal(cov) # get entries as numpy arrays - if hc.n_dim == 1: + if hist.n_dim == 1: entries = ( - hc.hist.bin_entries(xvalues=binning) - if hc.is_num - else hc.hist.bin_entries(labels=binning) + hist.bin_entries(xvalues=binning) + if is_numeric(hist) + else hist.bin_entries(labels=binning) ) else: assert len(binning) == 2 - 
entries = set_2dgrid(hc.hist, binning[0], binning[1]) + entries = set_2dgrid(hist, binning[0], binning[1]) entries = entries.flatten() # calculation of mean normalized histogram and its covariance matrix of input histogram diff --git a/popmon/analysis/hist_numpy.py b/popmon/analysis/hist_numpy.py index 5f674698..f3e88f29 100644 --- a/popmon/analysis/hist_numpy.py +++ b/popmon/analysis/hist_numpy.py @@ -20,10 +20,11 @@ import warnings +import histogrammar import numpy as np +from histogrammar.util import get_hist_props -from ..hist.histogram import HistogramContainer, get_hist_props -from ..hist.patched_histogrammer import histogrammar +from ..hist.hist_utils import is_numeric used_hist_types = (histogrammar.Bin, histogrammar.SparselyBin, histogrammar.Categorize) @@ -92,10 +93,10 @@ def set_2dgrid(hist, xkeys, ykeys): continue i = xkeys.index(k) if hasattr(h, "bins"): - for l, g in h.bins.items(): - if l not in ykeys: + for ll, g in h.bins.items(): + if ll not in ykeys: continue - j = ykeys.index(l) + j = ykeys.index(ll) grid[j, i] = g.entries # sum_entries(g) elif hasattr(h, "values"): for j, g in enumerate(h.values): @@ -104,10 +105,10 @@ def set_2dgrid(hist, xkeys, ykeys): elif hasattr(hist, "values"): for i, h in enumerate(hist.values): if hasattr(h, "bins"): - for l, g in h.bins.items(): - if l not in ykeys: + for ll, g in h.bins.items(): + if ll not in ykeys: continue - j = ykeys.index(l) + j = ykeys.index(ll) grid[j, i] = g.entries elif hasattr(h, "values"): for j, g in enumerate(h.values): @@ -140,21 +141,18 @@ def get_2dgrid(hist, get_bin_labels=False): return grid -def get_consistent_numpy_2dgrids(hc_list=[], get_bin_labels=False): +def get_consistent_numpy_2dgrids(hist_list=[], get_bin_labels=False): """Get list of consistent x,y grids of first two dimensions of (sparse) input histograms - :param list hc_list: list of input histogrammar histograms + :param list hist_list: list of input histogrammar histograms :param bool get_bin_labels: if true, return 
x-keys and y-keys describing binnings of 2d-grid. :return: list of consistent x,y grids of first two dimensions of each input histogram in list """ # --- basic checks - if len(hc_list) == 0: + if len(hist_list) == 0: raise ValueError("Input histogram list has zero length.") - assert_similar_hists(hc_list) + assert_similar_hists(hist_list) - hist_list = [ - hc.hist if isinstance(hc, HistogramContainer) else hc for hc in hc_list - ] xkeys = set() ykeys = set() for hist in hist_list: @@ -180,22 +178,19 @@ def get_consistent_numpy_2dgrids(hc_list=[], get_bin_labels=False): return grid2d_list -def get_consistent_numpy_1dhists(hc_list, get_bin_labels=False): +def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False): """Get list of consistent numpy hists for list of sparse input histograms Note: a numpy histogram is a union of lists of bin_edges and number of entries - :param list hc_list: list of input HistogramContainer objects + :param list hist_list: list of input histogram objects :return: list of consistent 1d numpy hists for list of sparse input histograms """ # --- basic checks - if len(hc_list) == 0: + if len(hist_list) == 0: raise RuntimeError("Input histogram list has zero length.") - assert_similar_hists(hc_list) + assert_similar_hists(hist_list) - hist_list = [ - hc.hist if isinstance(hc, HistogramContainer) else hc for hc in hc_list - ] low_arr = [hist.low for hist in hist_list if hist.low is not None] high_arr = [hist.high for hist in hist_list if hist.high is not None] @@ -231,50 +226,48 @@ def get_consistent_numpy_1dhists(hc_list, get_bin_labels=False): return nphist_list -def get_consistent_numpy_entries(hc_list, get_bin_labels=False): +def get_consistent_numpy_entries(hist_list, get_bin_labels=False): """Get list of consistent numpy bin_entries for list of 1d input histograms :param list hist_list: list of input histogrammar histograms :return: list of consistent 1d numpy arrays with bin_entries for list of input histograms """ # --- basic 
checks - if len(hc_list) == 0: + if len(hist_list) == 0: raise RuntimeError("Input histogram list has zero length.") - assert_similar_hists(hc_list) + assert_similar_hists(hist_list) # datatype check is_num_arr = [] - for hc in hc_list: - is_num_arr.append(hc.is_num) + for hist in hist_list: + is_num_arr.append(is_numeric(hist)) all_num = all(is_num_arr) all_cat = not any(is_num_arr) if not (all_num or all_cat): raise TypeError( - "Input histograms are mixture of Bin/SparselyBin and Categorize types.".format( - n=hc_list[0].hist.n_dim - ) + "Input histograms are mixture of Bin/SparselyBin and Categorize types." ) # union of all labels encountered labels = set() - for hc in hc_list: - bin_labels = hc.hist.bin_centers() if all_num else hc.hist.bin_labels() + for hist in hist_list: + bin_labels = hist.bin_centers() if all_num else hist.bin_labels() labels = labels.union(bin_labels) labels = sorted(labels) # PATCH: deal with boolean labels, which get bin_labels() returns as strings cat_labels = labels - props = get_hist_props(hc_list[0]) + props = get_hist_props(hist_list[0]) if props["is_bool"]: cat_labels = [lab == "True" for lab in cat_labels] # collect list of consistent bin_entries entries_list = [] - for hc in hc_list: + for hist in hist_list: entries = ( - hc.hist.bin_entries(xvalues=labels) + hist.bin_entries(xvalues=labels) if all_num - else hc.hist.bin_entries(labels=cat_labels) + else hist.bin_entries(labels=cat_labels) ) entries_list.append(entries) @@ -301,18 +294,15 @@ def get_contentType(hist): return "Count" -def check_similar_hists(hc_list, check_type=True, assert_type=used_hist_types): +def check_similar_hists(hist_list, check_type=True, assert_type=used_hist_types): """Check consistent list of input histograms Check that type and dimension of all histograms in input list are the same. 
- :param list hc_list: list of input HistogramContainer objects to check on consistency + :param list hist_list: list of input histogram objects to check on consistency :param bool check_type: if true, also check type consistency of histograms (besides n-dim and datatype). :return: bool indicating if lists are similar """ - hist_list = [ - hc.hist if isinstance(hc, HistogramContainer) else hc for hc in hc_list - ] if len(hist_list) < 1: return True for hist in hist_list: @@ -414,37 +404,36 @@ def check_similar_hists(hc_list, check_type=True, assert_type=used_hist_types): if hist.num > 0: sub_hist_list.append(hist.values[0]) # iterate down - sub_hc_list = [HistogramContainer(h) for h in sub_hist_list] - if not check_similar_hists(sub_hc_list): + if not check_similar_hists(sub_hist_list): return False return True -def assert_similar_hists(hc_list, check_type=True, assert_type=used_hist_types): +def assert_similar_hists(hist_list, check_type=True, assert_type=used_hist_types): """Assert consistent list of input histograms Assert that type and dimension of all histograms in input list are the same. - :param list hc_list: list of input HistogramContainer objects to check on consistency + :param list hist_list: list of input histogram objects to check on consistency :param bool assert_type: if true, also assert type consistency of histograms (besides n-dim and datatype). 
""" similar = check_similar_hists( - hc_list, check_type=check_type, assert_type=assert_type + hist_list, check_type=check_type, assert_type=assert_type ) if not similar: raise ValueError("Input histograms are not all similar.") -def check_same_hists(hc1, hc2): +def check_same_hists(hist1, hist2): """Check if two hists are the same - :param hc1: input histogram container 1 - :param hc2: input histogram container 2 + :param hist1: input histogram 1 + :param hist2: input histogram 2 :return: boolean, true if two histograms are the same """ - same = check_similar_hists([hc1, hc2]) - same &= hc1.hist.entries == hc2.hist.entries - same &= hc1.hist.n_bins == hc2.hist.n_bins - same &= hc1.hist.quantity.name == hc2.hist.quantity.name + same = check_similar_hists([hist1, hist2]) + same &= hist1.entries == hist2.entries + same &= hist1.n_bins == hist2.n_bins + same &= hist1.quantity.name == hist2.quantity.name return same diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index d09e5689..86cbc3eb 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -26,7 +26,7 @@ from ...analysis.hist_numpy import get_2dgrid from ...base import Module -from ...hist.histogram import sum_entries +from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp, sum_entries DEFAULT_STATS = { "mean": pm_np.mean, @@ -97,12 +97,12 @@ def __init__( f"No stats function dict is provided. 
{self.stats_functions.keys()} is set as default" ) - def _profile_1d_histogram(self, name, hc): - is_num = hc.is_num - is_ts = hc.is_ts or name in self.var_timestamp + def _profile_1d_histogram(self, name, hist): + is_num = is_numeric(hist) + is_ts = is_timestamp(hist) or name in self.var_timestamp - bin_labels = np.array(hc.get_bin_centers()[0]) - bin_counts = np.array([v.entries for v in hc.get_bin_centers()[1]]) + bin_labels = np.array(get_bin_centers(hist)[0]) + bin_counts = np.array([v.entries for v in get_bin_centers(hist)[1]]) if len(bin_counts) == 0: self.logger.warning(f'Histogram "{name}" is empty; skipping.') @@ -114,12 +114,10 @@ def _profile_1d_histogram(self, name, hc): profile = dict() profile["filled"] = bin_counts.sum() - profile["nan"] = hc.hist.nanflow.entries if hasattr(hc.hist, "nanflow") else 0 - profile["overflow"] = ( - hc.hist.overflow.entries if hasattr(hc.hist, "overflow") else 0 - ) + profile["nan"] = hist.nanflow.entries if hasattr(hist, "nanflow") else 0 + profile["overflow"] = hist.overflow.entries if hasattr(hist, "overflow") else 0 profile["underflow"] = ( - hc.hist.underflow.entries if hasattr(hc.hist, "underflow") else 0 + hist.underflow.entries if hasattr(hist, "underflow") else 0 ) profile["count"] = profile["filled"] + profile["nan"] profile["distinct"] = len(np.unique(bin_labels)) @@ -147,19 +145,19 @@ def _profile_1d_histogram(self, name, hc): return profile - def _profile_2d_histogram(self, name, hc): - if hc.n_dim < 2: + def _profile_2d_histogram(self, name, hist): + if hist.n_dim < 2: self.logger.warning( - f"Histogram {name} has {hc.n_dim} dimensions (<2); cannot profile. Returning empty." + f"Histogram {name} has {hist.n_dim} dimensions (<2); cannot profile. Returning empty." 
) return [] try: - grid = get_2dgrid(hc.hist) + grid = get_2dgrid(hist) except Exception as e: raise e # calc some basic 2d-histogram statistics - sume = int(sum_entries(hc.hist)) + sume = int(sum_entries(hist)) # calculate phik correlation try: @@ -180,7 +178,7 @@ def _profile_hist(self, split, hist_name): hist0 = split[0][self.hist_col] dimension = hist0.n_dim - is_num = hist0.is_num + is_num = is_numeric(hist0) # these are the profiled quantities we will monitor fields = [] @@ -197,14 +195,14 @@ def _profile_hist(self, split, hist_name): # now loop over split-axis, e.g. time index, and profile each sub-hist x:y profile_list = [] for hist_dict in split: - index, hc = hist_dict[self.index_col], hist_dict[self.hist_col] + index, hist = hist_dict[self.index_col], hist_dict[self.hist_col] - profile = {self.index_col: index, self.hist_col: hc} + profile = {self.index_col: index, self.hist_col: hist} if dimension == 1: - profile.update(self._profile_1d_histogram(hist_name, hc)) + profile.update(self._profile_1d_histogram(hist_name, hist)) elif dimension == 2: - profile.update(self._profile_2d_histogram(hist_name, hc)) + profile.update(self._profile_2d_histogram(hist_name, hist)) if sorted(profile.keys()) != sorted( fields + [self.index_col, self.hist_col] @@ -228,10 +226,10 @@ def transform(self, datastore): for feature in features[:]: df = self.get_datastore_object(data, feature, dtype=pd.DataFrame) - hc_split_list = df.reset_index().to_dict("records") + hist_split_list = df.reset_index().to_dict("records") self.logger.debug(f'Profiling histogram "{feature}".') - profile_list = self._profile_hist(split=hc_split_list, hist_name=feature) + profile_list = self._profile_hist(split=hist_split_list, hist_name=feature) if len(profile_list) > 0: profiled[feature] = pd.DataFrame(profile_list).set_index( [self.index_col] diff --git a/popmon/config.py b/popmon/config.py index 17a7d480..8a24857f 100644 --- a/popmon/config.py +++ b/popmon/config.py @@ -47,11 +47,11 @@ 
"chi2_pvalue": "p-value of the chi-squared statistic, comparing each time slot with {ref}", "chi2_zscore": "Z-score of the chi-squared statistic, comparing each time slot with {ref}", "chi2_max_residual": "The largest absolute normalized residual (|chi|) observed in all bin pairs " - + "(one histogram in a time slot and one in {ref})", # noqa: W504 + + "(one histogram in a time slot and one in {ref})", # noqa: W503 "chi2_spike_count": "The number of normalized residuals of all bin pairs (one histogram in a time" - + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).", # noqa: W504 + + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).", # noqa: W503 "max_prob_diff": "The largest absolute difference between all bin pairs of two normalized " - + "histograms (one histogram in a time slot and one in {ref})", # noqa: W504 + + "histograms (one histogram in a time slot and one in {ref})", # noqa: W503 "unknown_labels": "Are categories observed in a given time slot that are not present in {ref}?", } diff --git a/popmon/decorators/pandas.py b/popmon/decorators/pandas.py index e766a62c..dc88ed30 100644 --- a/popmon/decorators/pandas.py +++ b/popmon/decorators/pandas.py @@ -18,13 +18,14 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from histogrammar.dfinterface.make_histograms import make_histograms from pandas import DataFrame -from ..hist.filling import make_histograms from ..pipeline.metrics import df_stability_metrics from ..pipeline.report import df_stability_report -# add function to create histogrammar histograms +# add function to create histogrammar histograms. +# pm_make_histograms is kept for bkw compatibility. 
DataFrame.pm_make_histograms = make_histograms # add function to create stability report diff --git a/popmon/decorators/spark.py b/popmon/decorators/spark.py index 95eff7ba..52a9cd8c 100644 --- a/popmon/decorators/spark.py +++ b/popmon/decorators/spark.py @@ -18,7 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from popmon.hist.filling import make_histograms +from histogrammar.dfinterface.make_histograms import make_histograms + from popmon.pipeline.metrics import df_stability_metrics from popmon.pipeline.report import df_stability_report diff --git a/popmon/hist/filling/__init__.py b/popmon/hist/filling/__init__.py index e23a95f5..333df22b 100644 --- a/popmon/hist/filling/__init__.py +++ b/popmon/hist/filling/__init__.py @@ -17,22 +17,22 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# MB 20210323: histogramming code has been moved to histogrammar v1.0.20+ +# these imports are kept for backwards compatibility. 
-from ...hist.filling.make_histograms import ( +from histogrammar.dfinterface.make_histograms import ( get_bin_specs, get_one_time_axis, get_time_axes, has_one_time_axis, make_histograms, ) -from ...hist.filling.numpy_histogrammar import NumpyHistogrammar -from ...hist.filling.pandas_histogrammar import PandasHistogrammar -from ...hist.filling.spark_histogrammar import SparkHistogrammar +from histogrammar.dfinterface.pandas_histogrammar import PandasHistogrammar +from histogrammar.dfinterface.spark_histogrammar import SparkHistogrammar __all__ = [ "PandasHistogrammar", "SparkHistogrammar", - "NumpyHistogrammar", "make_histograms", "get_time_axes", "get_one_time_axis", diff --git a/popmon/hist/filling/histogram_filler_base.py b/popmon/hist/filling/histogram_filler_base.py deleted file mode 100644 index 4a4fe570..00000000 --- a/popmon/hist/filling/histogram_filler_base.py +++ /dev/null @@ -1,495 +0,0 @@ -""" -Copyright Eskapade: -License Apache-2: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE -Reference link: -https://github.com/KaveIO/Eskapade/blob/master/python/eskapade/analysis/histogram_filling.py -All modifications copyright ING WBAA. -""" - -import copy -import logging -from collections import defaultdict - -import histogrammar as hg -import numpy as np -import pandas as pd - -from ...base import Module -from ...hist.filling.utils import check_column, check_dtype - - -class HistogramFillerBase(Module): - """Base class link to fill histograms. - - Timestamp features are - converted to nanoseconds before the binning is applied. - Semi-clever auto-binning is applied in case no bin specifications are provided. - Final histograms are stored in the datastore. - """ - - def __init__( - self, - features=None, - binning="unit", - bin_specs=None, - time_axis="", - var_dtype=None, - read_key=None, - store_key=None, - nbins_1d=40, - nbins_2d=20, - nbins_3d=10, - max_nunique=500, - ): - """Initialize module instance. 
- - Store and do basic check on the attributes HistogramFillerBase. - - :param list features: colums to pick up from input data. (default is all features) - For multi-dimensional histograms, separate the column names with a ":" - Example features list is: - - .. code-block:: python - - features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] - - :param str binning: default binning to revert to in case bin_specs not supplied. options are: - "unit" or "auto", default is "unit". When using "auto", semi-clever binning is automatically done. - :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features - Example bin_specs dictionary is: - - .. code-block:: python - - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} - - In the bin specs for x:y, x reverts to the 1-dim setting. - - :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed. - If time_axis is set, if no features given, features becomes: ['date:x', 'date:y', 'date:z'] etc. - :param dict var_dtype: dictionary with specified datatype per feature (optional) - :param str read_key: key of input histogram-dict to read from data store . - (only required when calling transform(datastore) as module) - :param str store_key: key of output data to store in data store - (only required when calling transform(datastore) as module) - :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40. - :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20. - :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10. - :param int max_nunique: auto-binning threshold for unique categorical values. default is 500. 
- """ - super().__init__() - - features = features or [] - self.features = [check_column(c) for c in features] - if not any([binning == opt for opt in ["auto", "unit"]]): - raise TypeError('binning should be "auto" or "unit".') - self.binning = binning - self.bin_specs = bin_specs or {} - self.time_axis = time_axis - var_dtype = var_dtype or {} - self.var_dtype = {k: check_dtype(v) for k, v in var_dtype.items()} - self.read_key = read_key - self.store_key = store_key - - # several default unit values - self._unit_bin_specs = {"bin_width": 1.0, "bin_offset": 0.0} - self._unit_timestamp_specs = { - "bin_width": pd.Timedelta(days=30).value, - "bin_offset": pd.Timestamp("2010-01-04").value, - } - self._auto_n_bins_1d = nbins_1d - self._auto_n_bins_2d = nbins_2d - self._auto_n_bins_3d = nbins_3d - self._nunique_threshold = max_nunique - - # these get filled during execution - self._hists = {} - - def assert_dataframe(self, df): - """assert dataframe datatype""" - raise NotImplementedError("assert_dataframe not implemented!") - - def get_features(self, df): - raise NotImplementedError("get_features not implemented!") - - def get_quantiles(self, df, quantiles, columns): - """return dict with quantiles for given columns""" - raise NotImplementedError("get_quantiles not implemented!") - - def get_nunique(self, df, columns): - """return dict with number of unique entries for given columns""" - raise NotImplementedError("get_nunique not implemented!") - - def process_features(self, df, cols_by_type): - raise NotImplementedError("process_features not implemented!") - - def fill_histograms(self, idf): - raise NotImplementedError("fill_histograms not implemented!") - - def construct_empty_hist(self, features): - raise NotImplementedError("construct_empty_hist not implemented!") - - def _auto_n_bins(self, c): - """Return number of bins for this histogram - - :param list c: list of columns for this histogram - :return: number of bins to use for this histogram - """ - if 
isinstance(c, str): - c = [c] - if len(self.time_axis) > 0 and c[0] == self.time_axis: - # in case of time-axis, use fine-grained binning - # do this by removing first element, decreasing size of c. - # note that affects original input c, so copy first - c = copy.copy(c) - del c[0] - if len(c) <= 1: - return self._auto_n_bins_1d - elif len(c) == 2: - return self._auto_n_bins_2d - elif len(c) == 3: - return self._auto_n_bins_3d - else: - return self._auto_n_bins_3d - - def _execute(self, df): - """ - _execute() does five things: - - * check presence and data type of requested features - * timestamp variables are converted to nanosec (integers) - * clever auto-binning is done in case no bin-specs have been provided - * do the actual value counting based on categories and created indices - * then convert to histograms - """ - df = self.assert_dataframe(df) - - # 1. check presence and data type of requested features - # sort features into numerical, timestamp and category based - cols_by_type = self.categorize_features(df) - - # 2. assign features to make histograms of (if not already provided) - # and figure out time-axis if provided - # check if all features are present in dataframe - self.assign_and_check_features(df, cols_by_type) - - # 3. timestamp variables are converted to ns here - idf = self.process_features(df, cols_by_type) - - # 4. complete bin-specs that have not been provided in case of 'auto' binning option - if self.binning == "auto": - self.auto_complete_bin_specs(idf, cols_by_type) - - # 5. do the actual histogram/counter filling - self.logger.info( - f"Filling {len(self.features)} specified histograms. {self.binning}-binning." 
- ) - self.fill_histograms(idf) - - return self._hists - - def assign_and_check_features(self, df, cols_by_type): - """auto assign feature to make histograms of and do basic checks on them - - :param df: input dateframe - :param cols_by_type: dict of columns classified by type - """ - # user leaves feature selection up to us - no_initial_features = len(self.features) == 0 - - all_cols = ( - list(cols_by_type["num"]) - + list(cols_by_type["dt"]) - + list(cols_by_type["str"]) - ) - - # 1. assign / figure out a time axis - if isinstance(self.time_axis, str) and len(self.time_axis) > 0: - # a) specified time axis - if self.time_axis not in all_cols: - raise RuntimeError( - f'Specified time-axis "{self.time_axis}" not found in dataframe.' - ) - elif isinstance(self.time_axis, bool) and self.time_axis: - # b) try to figure out time axis - self.time_axis = "" - num = len(cols_by_type["dt"]) - if num == 1: - # the obvious choice - self.time_axis = list(cols_by_type["dt"])[0] - self.logger.info(f'Time-axis automatically set to "{self.time_axis}"') - elif num == 0: - self.logger.warning( - "No obvious time-axes found to choose from. So not used." - ) - else: - self.logger.warning( - f'Found {num} time-axes: {cols_by_type["dt"]}. Set *one* time_axis manually! Now NOT used.' - ) - else: - # c) no time axis - self.time_axis = "" - - # 2. assign all features to make histograms of, in case not provided by user - if no_initial_features: - if len(self.time_axis) > 0: - # time-axis is selected: make histograms of all columns in dataframe vs time-axis - self.features = [ - [self.time_axis, c] - for c in sorted(self.get_features(df)) - if c != self.time_axis - ] - else: - # make histograms of all columns in dataframe - self.features = [[c] for c in sorted(self.get_features(df))] - - # 3. 
check presence of all features (in case provided by user) - all_selected_cols = np.unique([j for i in self.features for j in i]) - for c in all_selected_cols: - if c not in self.get_features(df): - raise RuntimeError(f"Requested feature {c} not in dataframe.") - - # 4. check number of unique entries for categorical features - # this can be an expensive call, so avoid if possible. do run however when debugging. - if no_initial_features or self.logger.level == logging.DEBUG: - str_cols = [c for c in all_selected_cols if c in cols_by_type["str"]] - nuniq = self.get_nunique(df, str_cols) - huge_cats = [] - for c in str_cols: - if nuniq[c] < self._nunique_threshold: - continue - if no_initial_features: - # we're the boss. we're not going to histogram this ... - huge_cats.append(c) - else: # debug mode - self.logger.warning( - f"Column {c} has {nuniq[c]} unique entries (large). Really histogram it?" - ) - # scrub self.features of huge categories. - self.features = [ - cols - for cols in self.features - if not any([c in huge_cats for c in cols]) - ] - - def auto_complete_bin_specs(self, df, cols_by_type): - """auto complete the bin-specs that have not been provided - - :param df: input dataframe - :param cols_by_type: dict of columns classified by type - """ - # auto-determine binning of numerical and time features for which no bin_specs exist - # do this based on range of 5-95% quantiles, so extreme outliers are binned separately - # otherwise, the idea is to always reuse 1-dim binning for high n-dim, if those exist. 
- bs_keys = list(self.bin_specs.keys()) # create initial unchanging list of keys - all_selected_cols = np.unique([j for i in self.features for j in i]) - cols = list(cols_by_type["num"]) + list(cols_by_type["dt"]) - num_cols = [c for c in all_selected_cols if c in cols and c not in bs_keys] - - # quantiles for bin specs - int_cols = [c for c in num_cols if c in cols_by_type["int"]] - quantiles_i = self.get_quantiles(df, quantiles=[0.0, 1.0], columns=int_cols) - float_cols = [c for c in num_cols if c not in cols_by_type["int"]] - quantiles_f = self.get_quantiles(df, quantiles=[0.05, 0.95], columns=float_cols) - - for cols in self.features: - n = ":".join(cols) - if len(cols) == 1 and n not in num_cols: - continue - if n in bs_keys: - # already provided; will pick that one up - continue - # get default number of bins for n-dim histogram - n_bins = self._auto_n_bins(cols) - specs = [] - for idx, c in enumerate(cols): - if c not in num_cols or c in bs_keys: - # skip categorical; revert to what is already provided by user at 1dim-level - specs.append({}) - continue - - if c in float_cols: - q = quantiles_f[c] - # by default, n_bins covers range 5-95% quantiles + we add 10% - # basically this gives a nice plot when plotted - # specs for Bin and Sparselybin histograms - if q[1] == q[0]: - # in case of highly imbalanced data it can happen that q05=q95. If so use min and max instead. 
- q = (self.get_quantiles(df, quantiles=[0.0, 1.0], columns=[c]))[ - c - ] - qdiff = (q[1] - q[0]) * (1.0 / 0.9) if q[1] > q[0] else 1.0 - bin_width = qdiff / float(n_bins) - bin_offset = q[0] - qdiff * 0.05 - low = q[0] - qdiff * 0.05 - high = q[1] + qdiff * 0.05 - elif c in int_cols: - # for ints use bins around integer values - low = quantiles_i[c][0] - high = quantiles_i[c][1] - bin_width = np.max((np.round((high - low) / float(n_bins)), 1.0)) - bin_offset = low = np.floor(low - 0.5) + 0.5 - n_bins = int((high - low) // bin_width) + int( - (high - low) % bin_width > 0.0 - ) - high = low + n_bins * bin_width - - if c == self.time_axis and idx == 0: - # time axis is always sparselybin (unbound) - specs.append({"bin_width": bin_width, "bin_offset": bin_offset}) - elif len(cols) >= 3: - # always binned histogram for high n-dim histograms, avoid potentially exploding histograms - specs.append({"num": n_bins, "low": low, "high": high}) - else: - # sparse allowed for low dimensional histograms (1 and 2 dim) - specs.append({"bin_width": bin_width, "bin_offset": bin_offset}) - if len(cols) == 1: - specs = specs[0] - self.bin_specs[n] = specs - - def get_data_type(self, df, col): - """Get data type of dataframe column. - - :param df: input data frame - :param str col: column - """ - if col not in self.get_features(df): - raise KeyError(f'column "{col:s}" not in input dataframe') - return df[col].dtype - - def categorize_features(self, df): - """Categorize features of dataframe by data type. 
- - :param df: input (pandas) data frame - """ - # check presence and data type of requested features - # sort features into numerical, timestamp and category based - cols_by_type = defaultdict(set) - - features = ( - self.features if self.features else [[c] for c in self.get_features(df)] - ) - - for col_list in features: - for col in col_list: - - dt = check_dtype(self.get_data_type(df, col)) - - if col not in self.var_dtype: - self.var_dtype[col] = dt - - if np.issubdtype(dt, np.integer): - colset = cols_by_type["int"] - if col not in colset: - colset.add(col) - if np.issubdtype(dt, np.number): - colset = cols_by_type["num"] - if col not in colset: - colset.add(col) - elif np.issubdtype(dt, np.datetime64): - colset = cols_by_type["dt"] - if col not in colset: - colset.add(col) - else: - colset = cols_by_type["str"] - if col not in colset: - colset.add(col) - - self.logger.debug( - 'Data type of column "{col}" is "{type}".'.format( - col=col, type=self.var_dtype[col] - ) - ) - return cols_by_type - - def var_bin_specs(self, c, idx=0): - """Determine bin_specs to use for variable c. - - :param list c: list of variables, or string variable - :param int idx: index of the variable in c, for which to return the bin specs. default is 0. - :return: selected bin_specs of variable - """ - if isinstance(c, str): - c = [c] - n = ":".join(c) - - # determine default bin specs - dt = np.dtype(self.var_dtype[c[idx]]) - is_timestamp = isinstance(dt.type(), np.datetime64) - default = ( - self._unit_bin_specs if not is_timestamp else self._unit_timestamp_specs - ) - - # get bin specs - if n in self.bin_specs and len(c) > 1 and len(c) == len(self.bin_specs[n]): - result = self.bin_specs[n][idx] - if not result: - result = self.bin_specs.get(c[idx], default) - else: - result = self.bin_specs.get(c[idx], default) - return result - - def get_histograms(self, input_df): - """Handy function to directly get dict of histograms corresponding to input dataframe. 
- - :param input_df: spark/pandas input dataframe - :return: dict of histograms - """ - return self._execute(input_df) - - def get_features_specs(self): - """Return bin specifications used to generate histograms - - Can then be passed on to other histogram filler to get identical histograms. - """ - features = [":".join(c) for c in self.features] # rejoin substrings - return features, self.bin_specs, self.var_dtype, self.time_axis - - def transform(self, datastore): - """Transform function called when used as module in a pipeline - - :param dict datastore: input datastore - :return: datastore - """ - if not isinstance(self.read_key, str) and len(self.read_key) > 0: - raise ValueError("read_key has not been properly set.") - if not isinstance(self.store_key, str) and len(self.store_key) > 0: - raise ValueError("store_key has not been properly set.") - if self.read_key not in datastore: - raise KeyError("read_key not found in datastore") - - df = datastore[self.read_key] - hists = self.get_histograms(df) - datastore[self.store_key] = hists - return datastore - - def get_hist_bin(self, hist, features, quant, col, dt): - is_number = np.issubdtype(dt, np.number) - is_timestamp = np.issubdtype(dt, np.datetime64) - - if is_number or is_timestamp: - # numbers and timestamps are put in a sparse binned histogram - specs = self.var_bin_specs(features, features.index(col)) - if "bin_width" in specs: - hist = hg.SparselyBin( - binWidth=specs["bin_width"], - origin=specs.get("bin_offset", 0), - quantity=quant, - value=hist, - ) - elif "num" in specs and "low" in specs and "high" in specs: - hist = hg.Bin( - num=specs["num"], - low=specs["low"], - high=specs["high"], - quantity=quant, - value=hist, - ) - else: - raise RuntimeError("Do not know how to interpret bin specifications.") - else: - # string and booleans are treated as categories - hist = hg.Categorize(quantity=quant, value=hist) - - return hist diff --git a/popmon/hist/filling/make_histograms.py 
b/popmon/hist/filling/make_histograms.py deleted file mode 100644 index 650d7d7f..00000000 --- a/popmon/hist/filling/make_histograms.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2020 ING Wholesale Banking Advanced Analytics -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -import copy -import logging - -import histogrammar -import numpy as np -import pandas as pd - -from ...hist.filling.pandas_histogrammar import PandasHistogrammar -from ...hist.filling.spark_histogrammar import SparkHistogrammar -from ...hist.filling.utils import check_dtype - -logger = logging.getLogger() - - -def make_histograms( - df, - features=None, - binning="auto", - bin_specs=None, - time_axis="", - time_width=None, - time_offset=0, - var_dtype=None, - ret_specs=False, - nbins_1d=40, - nbins_2d=20, - nbins_3d=10, - max_nunique=500, -): - """Create histograms from pandas or spark dataframe. - - :param df: input pandas or spark dataframe to create histograms of. 
- :param list features: columns to pick up from input data. (default is all features) - For multi-dimensional histograms, separate the column names with a ":". An example features list is: - - .. code-block:: python - - features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] - - :param str binning: default binning to revert to in case bin_specs not supplied. options are: - "unit" or "auto", default is "auto". When using "auto", semi-clever binning is automatically done. - :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features. An example bin_specs - dictionary is: - - .. code-block:: python - - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} - - In the bin specs for x:y, x is not provided (here) and reverts to the 1-dim setting. The 'bin_width', - 'bin_offset' notation makes an open-ended histogram (for that feature) with given bin width and offset. - The notation 'num', 'low', 'high' gives a fixed range histogram from 'low' to 'high' with 'num' - number of bins. - :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed. - If time_axis is set, if no features given, features becomes: ['date:x', 'date:y', 'date:z'] etc. - :param time_width: bin width of time_axis. str or number (ns). note: bin_specs takes precedence. (optional) - - .. code-block:: text - - Examples: '1w', 3600e9 (number of ns), - anything understood by pd.Timedelta(time_width).value - - :param time_offset: bin offset of time_axis. str or number (ns). note: bin_specs takes precedence. (optional) - - .. 
code-block:: text - - Examples: '1-1-2020', 0 (number of ns since 1-1-1970), - anything parsed by pd.Timestamp(time_offset).value - - :param dict var_dtype: dictionary with specified datatype per feature (optional) - :param bool ret_specs: if true, also return features, bin_specs, var_dtype, time_axis used for filling histograms. - :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40. - :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20. - :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10. - :param int max_nunique: auto-binning threshold for unique categorical values. default is 500. - :return: dict of created histogrammar histograms - """ - # basic checks on presence of time_axis - if (not isinstance(time_axis, (str, bool))) or ( - isinstance(time_axis, bool) and not time_axis - ): - raise TypeError("time_axis needs to be a string, or a bool set to True") - if ( - isinstance(time_axis, str) - and len(time_axis) > 0 - and time_axis not in df.columns - ): - raise ValueError(f'time_axis "{time_axis}" not found in columns of dataframe.') - if isinstance(time_axis, bool): - time_axes = get_time_axes(df) - num = len(time_axes) - if num == 1: - time_axis = time_axes[0] - logger.info(f'Time-axis automatically set to "{time_axis}"') - elif num == 0: - raise RuntimeError( - "No obvious time-axes found. Cannot generate stability report." - ) - else: - raise RuntimeError( - f"Found {num} time-axes: {time_axes}. Set *one* time_axis manually!" 
- ) - - # if time_axis present, interpret time_width and time_offset - if ( - isinstance(time_axis, str) - and len(time_axis) > 0 - and isinstance(time_width, (str, int, float)) - and isinstance(time_offset, (str, int, float)) - ): - if not isinstance(bin_specs, (type(None), dict)): - raise RuntimeError("bin_specs object is not a dictionary") - bin_specs = copy.copy(bin_specs) if isinstance(bin_specs, dict) else {} - if time_axis in bin_specs: - raise RuntimeError( - f'time-axis "{time_axis}" already found in binning specifications.' - ) - # convert time width and offset to nanoseconds - time_specs = { - "bin_width": float(pd.Timedelta(time_width).value), - "bin_offset": float(pd.Timestamp(time_offset).value), - } - bin_specs[time_axis] = time_specs - - cls = PandasHistogrammar if isinstance(df, pd.DataFrame) else SparkHistogrammar - hist_filler = cls( - features=features, - binning=binning, - bin_specs=bin_specs, - time_axis=time_axis, - var_dtype=var_dtype, - nbins_1d=nbins_1d, - nbins_2d=nbins_2d, - nbins_3d=nbins_3d, - max_nunique=max_nunique, - ) - hists = hist_filler.get_histograms(df) - - if ret_specs: - features, binning, var_dtype, time_axis = hist_filler.get_features_specs() - return hists, features, binning, time_axis, var_dtype - - return hists - - -def get_data_type(df, col): - """Get data type of a column of pandas or spark dataframe. 
- - :param df: input data frame (pandas or spark) - :param str col: column - """ - if col not in df.columns: - raise KeyError(f'Column "{col:s}" not in input dataframe.') - dt = dict(df.dtypes)[col] - - if hasattr(dt, "type"): - # convert pandas types, such as pd.Int64, into numpy types - dt = type(dt.type()) - - try: - # spark conversions to numpy or python equivalent - if dt == "string": - dt = "str" - elif dt == "timestamp": - dt = np.datetime64 - elif dt == "boolean": - dt = bool - elif dt == "bigint": - dt = np.int64 - except TypeError: - pass - - return np.dtype(dt) - - -def get_time_axes(df): - """Return all time-axis columns of a dataframe - - :param df: input dataframe (pandas or spark) - :return: list of time-axis columns - """ - return [ - c - for c in df.columns - if np.issubdtype(check_dtype(get_data_type(df, c)), np.datetime64) - ] - - -def has_one_time_axis(df): - """Return boolean if one time-axis column in dataframe - - :param df: input dataframe (pandas or spark) - :return: boolean if one time-axis column - """ - dt_cols = get_time_axes(df) - return len(dt_cols) == 1 - - -def get_one_time_axis(df): - """Return time-axis if one time-axis column in dataframe - - :param df: input dataframe (pandas or spark) - :return: one time-axis column, else empty string - """ - dt_cols = get_time_axes(df) - return dt_cols[0] if len(dt_cols) == 1 else "" - - -def _get_bin_specs(h): - """Get histogram bin specifications - - :param h: input histogrammar histogram - :return: list with bin_specs of all dimensions of the histogram - :rtype: list - """ - bin_specs = [] - if isinstance(h, histogrammar.Count): - return bin_specs - - if isinstance(h, histogrammar.Categorize): - bin_specs.append({}) - elif isinstance(h, histogrammar.Bin): - bin_specs.append(dict(num=h.num, low=h.low, high=h.high)) - elif isinstance(h, histogrammar.SparselyBin): - bin_specs.append(dict(bin_width=h.binWidth, bin_offset=h.origin)) - - # histogram may have a sub-histogram. 
Extract it and recurse - if hasattr(h, "bins"): - hist = list(h.bins.values())[0] if h.bins else histogrammar.Count() - elif hasattr(h, "values"): - hist = h.values[0] if h.values else histogrammar.Count() - else: - hist = histogrammar.Count() - return bin_specs + _get_bin_specs(hist) - - -def _match_first_key(skip_first_axis=None, feature=""): - """Helper function to match and remove skip_first_axis from feature - - :param skip_first_axis: True or string. if set, ignore first axis of input histogram(s) - :param feature: input feature - :return: match and (rest of) feature - """ - assert isinstance(feature, str) - karr = feature.split(":") - begin = karr[0] - rest_key = ":".join(karr[1:]) - if isinstance(skip_first_axis, bool): - return skip_first_axis, rest_key if skip_first_axis else feature - elif isinstance(skip_first_axis, str) and len(skip_first_axis) > 0: - match = begin == skip_first_axis - return match, rest_key if match else feature - return False, feature - - -def get_bin_specs(hd, skip_first_axis=False): - """Get histogram bin specifications - - :param hd: input histogrammar histogram (or dict of input histograms) - :param skip_first_axis: bool or string of first axis. 
if set, ignore first axis of input histogram(s) - :return: list (or dict with lists) with bin_specs of all dimensions of the histogram - :rtype: list (or dict) - """ - if isinstance(hd, dict): - bin_specs = {} - for key, h in hd.items(): - bs = _get_bin_specs(h) - match, rest_key = _match_first_key(skip_first_axis, key) - bs = bs[1:] if match else bs - bs = bs[0] if len(bs) == 1 else bs - bin_specs[rest_key] = bs - else: - bs = _get_bin_specs(hd) - match, _ = _match_first_key(skip_first_axis) - bs = bs[1:] if match else bs - bs = bs[0] if len(bs) == 1 else bs - bin_specs = bs - return bin_specs diff --git a/popmon/hist/filling/numpy_histogrammar.py b/popmon/hist/filling/numpy_histogrammar.py deleted file mode 100644 index ad21e77a..00000000 --- a/popmon/hist/filling/numpy_histogrammar.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2020 ING Wholesale Banking Advanced Analytics -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -import numpy as np -import pandas as pd - -from ...hist.filling.pandas_histogrammar import PandasHistogrammar - - -class NumpyHistogrammar(PandasHistogrammar): - """Fill histogrammar histograms. - - Algorithm to fill histogrammar style bin, sparse-bin and category histograms. - - Timestamp features are converted to nanoseconds before - the binning is applied. Final histograms are stored in the datastore. - """ - - def __init__( - self, - features=None, - binning="unit", - bin_specs=None, - time_axis="", - var_dtype=None, - read_key=None, - store_key=None, - nbins_1d=40, - nbins_2d=20, - nbins_3d=10, - max_nunique=500, - ): - """Initialize module instance. - - Store and do basic check on the attributes HistogramFillerBase. - - :param list features: colums to pick up from input data. (default is all features) - For multi-dimensional histograms, separate the column names with a : - - Example features list is: - - .. code-block:: python - - features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] - - :param str binning: default binning to revert to in case bin_specs not supplied. options are: - "unit" or "auto", default is "unit". When using "auto", semi-clever binning is automatically done. - :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features - - Example bin_specs dictionary is: - - .. code-block:: python - - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} - - In the bin specs for x:y, x reverts to the 1-dim setting. - - :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed. - If time_axis is set, if no features given, features becomes: ['date:x', 'date:y', 'date:z'] etc. - :param dict var_dtype: dictionary with specified datatype per feature (optional) - :param str read_key: key of input histogram-dict to read from data store . 
- (only required when calling transform(datastore) as module) - :param str store_key: key of output data to store in data store - (only required when calling transform(datastore) as module) - :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40. - :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20. - :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10. - :param int max_nunique: auto-binning threshold for unique categorical values. default is 500. - """ - PandasHistogrammar.__init__( - self, - features, - binning, - bin_specs, - time_axis, - var_dtype, - read_key, - store_key, - nbins_1d, - nbins_2d, - nbins_3d, - max_nunique, - ) - - def _execute(self, df): - if not isinstance(df, np.ndarray): - raise TypeError("retrieved object not of type np.ndarray") - return super()._execute(pd.DataFrame(df)) diff --git a/popmon/hist/filling/pandas_histogrammar.py b/popmon/hist/filling/pandas_histogrammar.py deleted file mode 100644 index 0a2718ea..00000000 --- a/popmon/hist/filling/pandas_histogrammar.py +++ /dev/null @@ -1,264 +0,0 @@ -""" -Copyright Eskapade: -License Apache-2: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE -Reference link: -https://github.com/KaveIO/Eskapade/blob/master/python/eskapade/analysis/links/hist_filler.py -All modifications copyright ING WBAA. -""" - -import contextlib -import multiprocessing - -import histogrammar as hg -import joblib -import numpy as np -import pandas as pd -from joblib import Parallel, delayed -from tqdm import tqdm - -from ...hist.filling import utils -from ...hist.filling.histogram_filler_base import HistogramFillerBase - - -class PandasHistogrammar(HistogramFillerBase): - """Fill histogrammar histograms. - - Algorithm to fill histogrammar style bin, sparse-bin and category histograms. - Timestamp features are converted to nanoseconds before - the binning is applied. Final histograms are stored in the datastore. 
- """ - - def __init__( - self, - features=None, - binning="unit", - bin_specs=None, - time_axis="", - var_dtype=None, - read_key=None, - store_key=None, - nbins_1d=40, - nbins_2d=20, - nbins_3d=10, - max_nunique=500, - ): - """Initialize module instance. - - Store and do basic check on the attributes HistogramFillerBase. - - :param list features: columns to pick up from input data. (default is all features) - For multi-dimensional histograms, separate the column names with a : - - Example features list is: - - .. code-block:: python - - features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] - - :param str binning: default binning to revert to in case bin_specs not supplied. options are: - "unit" or "auto", default is "unit". When using "auto", semi-clever binning is automatically done. - :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features - - Example bin_specs dictionary is: - - .. code-block:: python - - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} - - In the bin specs for x:y, x reverts to the 1-dim setting. - - :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed. - :param dict var_dtype: dictionary with specified datatype per feature (optional) - :param str read_key: key of input histogram-dict to read from data store . - (only required when calling transform(datastore) as module) - :param str store_key: key of output data to store in data store - (only required when calling transform(datastore) as module) - :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40. - :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20. - :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10. - :param int max_nunique: auto-binning threshold for unique categorical values. default is 500. 
- """ - HistogramFillerBase.__init__( - self, - features, - binning, - bin_specs, - time_axis, - var_dtype, - read_key, - store_key, - nbins_1d, - nbins_2d, - nbins_3d, - max_nunique, - ) - - def assert_dataframe(self, df): - """Check that input data is a filled pandas data frame. - - :param df: input (pandas) data frame - """ - if not isinstance(df, pd.DataFrame): - raise TypeError(f"retrieved object not of type {pd.DataFrame}") - if df.shape[0] == 0: - raise RuntimeError("data is empty") - return df - - def get_features(self, df): - """Get columns of (pandas) dataframe - - :param df: input pandas dataframe - """ - return df.columns.tolist() - - def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]): - """return dict with quantiles for given columns - - :param df: input pandas data frame - :param quantiles: list of quantiles. default is [0.05, 0.95] - :param columns: columns to select. default is all. - """ - if len(columns) == 0: - return {} - qdf = df[columns].quantile(quantiles) - qd = {c: qdf[c].values.tolist() for c in columns} - return qd - - def get_nunique(self, df, columns=[]): - """return dict with number of unique entries for given columns - - :param df: input pandas data frame - :param columns: columns to select (optional) - """ - if not columns: - columns = df.columns - return df[columns].nunique().to_dict() - - def process_features(self, df, cols_by_type): - """Process features before histogram filling. 
- - Specifically, convert timestamp features to integers - - :param df: input (pandas) data frame - :param cols_by_type: dictionary of column sets for each type - :returns: output (pandas) data frame with converted timestamp features - :rtype: pandas DataFrame - """ - # timestamp variables are converted to ns here - # make temp df for value counting (used below) - idf = df[list(cols_by_type["num"]) + list(cols_by_type["str"])].copy() - for col in cols_by_type["dt"]: - self.logger.debug( - 'Converting column "{col}" of type "{type}" to nanosec.'.format( - col=col, type=self.var_dtype[col] - ) - ) - idf[col] = df[col].apply(utils.to_ns) - return idf - - def fill_histograms(self, idf): - """Fill the histograms - - :param idf: converted input dataframe - """ - # construct empty histograms if needed - for cols in self.features: - name = ":".join(cols) - if name not in self._hists: - # create an (empty) histogram of right type - self._hists[name] = self.construct_empty_hist(cols) - - # parallel histogram filling with working progress bar - num_cores = multiprocessing.cpu_count() - with tqdm_joblib( - tqdm(total=len(self.features), ncols=100) - ) as progress_bar: # noqa: F841 - res = Parallel(n_jobs=num_cores)( - delayed(_fill_histogram)( - idf=idf[c], hist=self._hists[":".join(c)], features=c - ) - for c in self.features - ) - # update dictionary - for name, hist in res: - self._hists[name] = hist - - def construct_empty_hist(self, features): - """Create an (empty) histogram of right type. - - Create a multi-dim histogram by iterating through the features in - reverse order and passing a single-dim hist as input to the next - column. 
- - :param list features: histogram features - :return: created histogram - :rtype: histogrammar.Count - """ - hist = hg.Count() - - # create a multi-dim histogram by iterating through the features - # in reverse order and passing a single-dim hist as input - # to the next column - revcols = list(reversed(features)) - for idx, col in enumerate(revcols): - # histogram type depends on the data type - dt = self.var_dtype[col] - - # processing function, e.g. only accept boolians during filling - f = utils.QUANTITY[dt] - if len(features) == 1: - # df[col] is a pd.series - quant = lambda x, fnc=f: fnc(x) # noqa - else: - # df[features] is a pd.Dataframe - # fix column to col - quant = lambda x, fnc=f, clm=col: fnc(x[clm]) # noqa - - hist = self.get_hist_bin(hist, features, quant, col, dt) - - return hist - - -def _fill_histogram(idf, hist, features): - """Fill input histogram with column(s) of input dataframe. - - Separate function call for parallellization. - - :param idf: input data frame used for filling histogram - :param hist: empty histogrammar histogram about to be filled - :param list features: histogram column(s) - """ - name = ":".join(features) - clm = features[0] if len(features) == 1 else features - # do the actual filling - hist.fill.numpy(idf[clm]) - return name, hist - - -# tqdm working with joblib -@contextlib.contextmanager -def tqdm_joblib(tqdm_object): - """Context manager to patch joblib to report into tqdm progress bar given as argument - - From: https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution?rq=1 - """ - - class TqdmBatchCompletionCallback: - def __init__(self, time, index, parallel): - self.index = index - self.parallel = parallel - - def __call__(self, index): - tqdm_object.update() - if self.parallel._original_iterator is not None: - self.parallel.dispatch_next() - - old_batch_callback = joblib.parallel.BatchCompletionCallBack - joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback - 
try: - yield tqdm_object - finally: - joblib.parallel.BatchCompletionCallBack = old_batch_callback - tqdm_object.close() diff --git a/popmon/hist/filling/spark_histogrammar.py b/popmon/hist/filling/spark_histogrammar.py deleted file mode 100644 index ad13add6..00000000 --- a/popmon/hist/filling/spark_histogrammar.py +++ /dev/null @@ -1,251 +0,0 @@ -""" -Copyright Eskapade: -License Apache-2: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE -Reference link: -https://github.com/KaveIO/Eskapade-Spark/blob/master/python/eskapadespark/links/spark_histogrammar_filler.py -All modifications copyright ING WBAA. -""" - -import histogrammar as hg -import numpy as np -from tqdm import tqdm - -from ...hist.filling.histogram_filler_base import HistogramFillerBase - -try: - from pyspark.sql import DataFrame - from pyspark.sql.functions import approxCountDistinct - from pyspark.sql.functions import col as sparkcol -except (ModuleNotFoundError, AttributeError): - pass - - -class SparkHistogrammar(HistogramFillerBase): - """Fill histogrammar histograms with Spark. - - Algorithm to fill histogrammar style bin, sparse-bin and category histograms - with Spark. Timestamp features are converted to nanoseconds before the binning - is applied. Final histograms are stored in the datastore. - """ - - def __init__( - self, - features=None, - binning="unit", - bin_specs=None, - time_axis="", - var_dtype=None, - read_key=None, - store_key=None, - nbins_1d=40, - nbins_2d=20, - nbins_3d=10, - max_nunique=500, - ): - """Initialize module instance. - - Store and do basic check on the attributes HistogramFillerBase. - - :param list features: colums to pick up from input data. (default is all features) - For multi-dimensional histograms, separate the column names with a : - - Example features list is: - - .. code-block:: python - - features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] - - :param str binning: default binning to revert to in case bin_specs not supplied. 
options are: - "unit" or "auto", default is "unit". When using "auto", semi-clever binning is automatically done. - :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features - - Example bin_specs dictionary is: - - .. code-block:: python - - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} - - In the bin specs for x:y, x reverts to the 1-dim setting. - - :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed. - If time_axis is set, if no features given, features becomes: ['date:x', 'date:y', 'date:z'] etc. - :param dict var_dtype: dictionary with specified datatype per feature (optional) - :param str read_key: key of input histogram-dict to read from data store . - (only required when calling transform(datastore) as module) - :param str store_key: key of output data to store in data store - (only required when calling transform(datastore) as module) - :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40. - :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20. - :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10. - :param int max_nunique: auto-binning threshold for unique categorical values. default is 500. - """ - HistogramFillerBase.__init__( - self, - features, - binning, - bin_specs, - time_axis, - var_dtype, - read_key, - store_key, - nbins_1d, - nbins_2d, - nbins_3d, - max_nunique, - ) - self._unit_timestamp_specs = { - k: float(self._unit_timestamp_specs[k]) - for i, k in enumerate(self._unit_timestamp_specs) - } - - def assert_dataframe(self, df): - """Check that input data is a filled spark data frame. 
- - :param df: input (spark) data frame - """ - if not isinstance(df, DataFrame): - raise TypeError("retrieved object not of type Spark DataFrame") - assert not len(df.head(1)) == 0, "input dataframe is empty" - return df - - def get_features(self, df): - """Get columns of dataframe - - :param df: input spark dataframe - """ - return df.columns - - def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]): - """return dict with quantiles for given columns - - :param df: input (spark) data frame - :param quantiles: list of quantiles. default is [0.05, 0.95] - :param columns: columns to select. default is all. - """ - if len(columns) == 0: - return {} - qsl = df.approxQuantile(columns, quantiles, 0.25) - qd = {c: qs for c, qs in zip(columns, qsl)} - return qd - - def get_nunique(self, df, columns=[]): - """return dict with number of unique entries for given columns - - :param df: input (spark) data frame - :param columns: columns to select (optional) - """ - if not columns: - columns = df.columns - qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns)) - return qdf.toPandas().T[0].to_dict() - - def get_data_type(self, df, col): - """Get data type of dataframe column. - - :param df: input data frame - :param str col: column - """ - if col not in df.columns: - raise KeyError(f'Column "{col:s}" not in input dataframe.') - dt = dict(df.dtypes)[col] - # spark conversions to numpy or python equivalent - if dt == "string": - dt = "str" - elif dt in ["timestamp", "date"]: - dt = np.datetime64 - elif dt == "boolean": - dt = bool - elif dt == "bigint": - dt = np.int64 - - return np.dtype(dt) - - def process_features(self, df, cols_by_type): - """Process features before histogram filling. 
- - Specifically, in this case convert timestamp features to nanoseconds - - :param df: input data frame - :return: output data frame with converted timestamp features - :rtype: DataFrame - """ - # make alias df for value counting (used below) - idf = df.alias("") - - # timestamp variables are converted here to ns since 1970-1-1 - # histogrammar does not yet support long integers, so convert timestamps to float - # epoch = (sparkcol("ts").cast("bigint") * 1000000000).cast("bigint") - for col in cols_by_type["dt"]: - self.logger.debug( - 'Converting column "{col}" of type "{type}" to nanosec.'.format( - col=col, type=self.var_dtype[col] - ) - ) - - # first cast to timestamp (in case column is stored as date) - to_ns = sparkcol(col).cast("timestamp").cast("float") * 1e9 - idf = idf.withColumn(col, to_ns) - - return idf - - def construct_empty_hist(self, df, features): - """Create an (empty) histogram of right type. - - Create a multi-dim histogram by iterating through the features in - reverse order and passing a single-dim hist as input to the next - column. 
- - :param df: input dataframe - :param list features: histogram features - :return: created histogram - :rtype: histogrammar.Count - """ - hist = hg.Count() - - # create a multi-dim histogram by iterating through - # the features in reverse order and passing a single-dim hist - # as input to the next column - revcols = list(reversed(features)) - for idx, col in enumerate(revcols): - # histogram type depends on the data type - dt = self.var_dtype[col] - quant = df[col] - - hist = self.get_hist_bin(hist, features, quant, col, dt) - - return hist - - def fill_histograms(self, idf): - """Fill the histograms - - :param idf: input data frame used for filling histogram - """ - for cols in tqdm(self.features, ncols=100): - self.logger.debug( - 'Processing feature "{cols}".'.format(cols=":".join(cols)) - ) - self.fill_histogram(idf, cols) - - def fill_histogram(self, idf, features): - """Fill input histogram with column(s) of input dataframe. - - :param idf: input data frame used for filling histogram - :param list features: histogram column(s) - """ - name = ":".join(features) - if name not in self._hists: - # create an (empty) histogram of right type - self._hists[name] = self.construct_empty_hist(idf, features) - hist = self._hists[name] - - # do the actual filling - hist.fill.sparksql(idf) - self._hists[name] = hist - - def _execute(self, df): - df.persist() - hists = super()._execute(df) - df.unpersist() - return hists diff --git a/popmon/hist/filling/utils.py b/popmon/hist/filling/utils.py deleted file mode 100644 index 4cc5eeef..00000000 --- a/popmon/hist/filling/utils.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020 ING Wholesale Banking Advanced Analytics -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -import numpy as np -import pandas as pd - -NUM_NS_DAY = 24 * 3600 * int(1e9) - - -def check_column(col, sep=":"): - """Convert input column string to list of columns - - :param col: input string - :param sep: default ":" - :return: list of columns - """ - if isinstance(col, str): - col = col.split(sep) - elif not isinstance(col, list): - raise TypeError(f'Columns "{col}" needs to be a string or list of strings') - return col - - -def check_dtype(dtype): - """Convert datatype to consistent numpy datatype - - :param dtype: input datatype - :rtype: numpy.dtype.type - """ - try: - if hasattr(dtype, "type"): - # this converts pandas types, such as pd.Int64, into numpy types - dtype = type(dtype.type()) - dtype = np.dtype(dtype).type - if dtype in {np.str_, np.string_, np.object_}: - dtype = np.dtype(str).type - except BaseException: - raise RuntimeError(f'unknown assigned datatype "{dtype}"') - return dtype - - -def to_ns(x): - """Convert input timestamps to nanoseconds (integers). 
- - :param x: value to be converted - :returns: converted value - :rtype: int - """ - if pd.isnull(x): - return 0 - try: - return pd.to_datetime(x).value - except Exception: - if hasattr(x, "__str__"): - return pd.to_datetime(str(x)).value - return 0 - - -def to_str(val): - """Convert input to (array of) string(s). - - :param val: value to be converted - :returns: converted value - :rtype: str or np.ndarray - """ - if isinstance(val, str): - return val - elif hasattr(val, "__iter__"): - return np.asarray( - list( - map( - lambda s: s - if isinstance(s, str) - else str(s) - if hasattr(s, "__str__") - else "", - val, - ) - ) - ) - - elif hasattr(val, "__str__"): - return str(val) - - return "" - - -def only_str(val): - """Pass input value or array only if it is a string. - - :param val: value to be evaluated - :returns: evaluated value - :rtype: str or np.ndarray - """ - if isinstance(val, str): - return val - elif hasattr(val, "__iter__"): - return np.asarray([s if isinstance(s, str) else "None" for s in val]) - return "None" - - -def only_bool(val): - """Pass input value or array only if it is a bool. - - :param val: value to be evaluated - :returns: evaluated value - :rtype: np.bool or np.ndarray - """ - if isinstance(val, (np.bool_, bool)): - return val - elif hasattr(val, "__iter__") and not isinstance(val, str): - return np.asarray( - [s if isinstance(s, (np.bool_, bool)) else np.nan for s in val] - ) - return np.nan - - -def only_int(val): - """Pass input val value or array only if it is an integer. - - :param val: value to be evaluated - :returns: evaluated value - :rtype: np.int64 or np.ndarray - """ - if isinstance(val, (np.int64, int)): - return val - elif hasattr(val, "__iter__") and not isinstance(val, str): - return np.asarray( - [s if isinstance(s, (np.int64, int)) else np.nan for s in val] - ) - return np.nan - - -def only_float(val): - """Pass input val value or array only if it is a float. 
- - :param val: value to be evaluated - :returns: evaluated value - :rtype: np.float64 or np.ndarray - """ - if isinstance(val, (np.float64, float)): - return val - elif hasattr(val, "__iter__") and not isinstance(val, str): - return np.asarray( - [s if isinstance(s, (np.float64, float)) else np.nan for s in val] - ) - return np.nan - - -QUANTITY = { - str: only_str, - np.str_: only_str, - int: only_int, - np.int64: only_int, - np.int32: only_int, - bool: only_bool, - np.bool_: only_bool, - float: only_float, - np.float64: only_float, - np.datetime64: only_int, -} - - -def value_to_bin_index(val, **kwargs): - """Convert value to bin index. - - Convert a numeric or timestamp column to an integer bin index. - - :param bin_width: bin_width value needed to convert column - to an integer bin index - :param bin_offset: bin_offset value needed to convert column - to an integer bin index - """ - try: - # NOTE this notation also works for timestamps - bin_width = kwargs.get("bin_width", 1) - bin_offset = kwargs.get("bin_offset", 0) - bin_index = int(np.floor((val - bin_offset) / bin_width)) - return bin_index - except BaseException: - pass - return val - - -def value_to_bin_center(val, **kwargs): - """Convert value to bin center. - - Convert a numeric or timestamp column to a common bin center value. 
- - :param bin_width: bin_width value needed to convert column - to a common bin center value - :param bin_offset: bin_offset value needed to convert column - to a common bin center value - """ - try: - # NOTE this notation also works for timestamps, and does not change the - # unit - bin_width = kwargs.get("bin_width", 1) - bin_offset = kwargs.get("bin_offset", 0) - bin_index = int(np.floor((val - bin_offset) / bin_width)) - obj_type = type(bin_width) - return bin_offset + obj_type((bin_index + 0.5) * bin_width) - except BaseException: - pass - return val diff --git a/popmon/hist/hist_splitter.py b/popmon/hist/hist_splitter.py index cab521d5..663eb305 100644 --- a/popmon/hist/hist_splitter.py +++ b/popmon/hist/hist_splitter.py @@ -21,7 +21,11 @@ import pandas as pd from ..base import Module -from ..hist.histogram import HistogramContainer +from ..hist.hist_utils import ( + get_histogram, + is_timestamp, + split_hist_along_first_dimension, +) class HistSplitter(Module): @@ -87,8 +91,7 @@ def update_divided(self, divided, split, yname): divided.update(split) else: divided[yname] = [ - {self.index_col: k, self.hist_col: HistogramContainer(h)} - for k, h in split.items() + {self.index_col: k, self.hist_col: h} for k, h in split.items() ] return divided @@ -106,32 +109,31 @@ def transform(self, datastore): # if so requested split selected histograms along first axis, and then divide for feature in features[:]: self.logger.debug(f'Now splitting histogram "{feature}"') - hc = HistogramContainer(data[feature]) - if hc.n_dim <= 1: + hist = get_histogram(data[feature]) + if hist.n_dim <= 1: self.logger.debug( f'Histogram "{feature}" does not have two or more dimensions, nothing to split; skipping.' ) continue cols = feature.split(":") - if len(cols) != hc.n_dim: + if len(cols) != hist.n_dim: self.logger.error( - f'Dimension of histogram "{feature}" not consistent: {hc.n_dim} vs {len(cols)}; skipping.' 
+ f'Dimension of histogram "{feature}" not consistent: {hist.n_dim} vs {len(cols)}; skipping.' ) continue xname, yname = cols[0], ":".join(cols[1:]) # 'time:x:y' -> 'time', 'x:y' if yname in divided: - self.logger.debug( - f'HistogramContainer "{yname}" already divided; skipping.' - ) + self.logger.debug(f'Histogram "{yname}" already divided; skipping.') continue # if requested split selected histograms along first axis. e.g. time:x:y is split along time # then check if sub-hists of x:y can be further projected. eg. x:y is projected on x and y as well. # datatype properties - is_ts = hc.is_ts or xname in self.var_timestamp - split = hc.split_hist_along_first_dimension( + is_ts = is_timestamp(hist) or xname in self.var_timestamp + split = split_hist_along_first_dimension( + hist=hist, short_keys=self.short_keys, convert_time_index=is_ts, xname=xname, diff --git a/popmon/hist/hist_utils.py b/popmon/hist/hist_utils.py new file mode 100644 index 00000000..2ab97ccf --- /dev/null +++ b/popmon/hist/hist_utils.py @@ -0,0 +1,313 @@ +# Copyright (c) 2020 ING Wholesale Banking Advanced Analytics +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +import histogrammar +import numpy as np +import pandas as pd +from histogrammar.util import get_hist_props + +COMMON_HIST_TYPES = ( + histogrammar.Categorize, + histogrammar.Bin, + histogrammar.SparselyBin, + histogrammar.specialized.CategorizeHistogramMethods, + histogrammar.specialized.HistogramMethods, + histogrammar.specialized.SparselyHistogramMethods, + histogrammar.specialized.CategorizeHistogramMethods, + histogrammar.specialized.TwoDimensionallyHistogramMethods, + histogrammar.specialized.SparselyTwoDimensionallyHistogramMethods, +) + +HG_FACTORY = histogrammar.Factory() + + +def sum_entries(hist, default=True): + """Recursively get sum of entries of histogram + + Sometimes hist.entries gives zero as answer? This function always works though. + + :param hist: input histogrammar histogram + :param bool default: if false, do not use default HG method for evaluating entries, but exclude nans, of, uf. 
+ :return: total sum of entries of histogram + :rtype: int + """ + if default: + entries = hist.entries + if entries > 0: + return entries + + # double check number of entries, sometimes not well set + sume = 0 + if hasattr(hist, "bins"): + # loop over all counters and integrate over y (=j) + for i in hist.bins: + bi = hist.bins[i] + sume += sum_entries(bi) + elif hasattr(hist, "values"): + # loop over all counters and integrate over y (=j) + for i, bi in enumerate(hist.values): + sume += sum_entries(bi) + elif hasattr(hist, "entries"): + # only count histogrammar.Count() objects + sume += hist.entries + return sume + + +def project_on_x(hist): + """Project n-dim histogram onto x-axis + + :param hist: input histogrammar histogram + :return: on x-axis projected histogram (1d) + """ + # basic check: projecting on itself + if hasattr(hist, "n_dim") and hist.n_dim <= 1: + return hist + # basic checks on contents + if hasattr(hist, "bins"): + if len(hist.bins) == 0: + return hist + elif hasattr(hist, "values"): + if len(hist.values) == 0: + return hist + else: + return hist + + # make empty clone + # note: cannot do: h_x = hist.zero(), b/c it copies n-dim structure, which screws up hist.toJsonString() + if isinstance(hist, histogrammar.Bin): + h_x = histogrammar.Bin( + num=hist.num, + low=hist.low, + high=hist.high, + quantity=hist.quantity, + ) + elif isinstance(hist, histogrammar.SparselyBin): + h_x = histogrammar.SparselyBin( + binWidth=hist.binWidth, + origin=hist.origin, + quantity=hist.quantity, + ) + elif isinstance(hist, histogrammar.Categorize): + h_x = histogrammar.Categorize(quantity=hist.quantity) + else: + raise RuntimeError("unknown historgram type. 
cannot get zero copy.") + + if hasattr(hist, "bins"): + for key, bi in hist.bins.items(): + h_x.bins[key] = histogrammar.Count.ed(sum_entries(bi)) + elif hasattr(hist, "values"): + for i, bi in enumerate(hist.values): + h_x.values[i] = histogrammar.Count.ed(sum_entries(bi)) + + return h_x + + +def sum_over_x(hist): + """Integrate histogram over first dimension + + :param hist: input histogrammar histogram + :return: integrated histogram + """ + # basic check: nothing to do? + if hasattr(hist, "n_dim") and hist.n_dim == 0: + return hist + if hasattr(hist, "n_dim") and hist.n_dim == 1: + return histogrammar.Count.ed(sum_entries(hist)) + + # n_dim >= 2 from now on + # basic checks on contents + if hasattr(hist, "bins"): + if len(hist.bins) == 0: + return hist + elif hasattr(hist, "values"): + if len(hist.values) == 0: + return hist + else: + return hist + + # n_dim >= 2 and we have contents; here we sum over it. + h_proj = None + if hasattr(hist, "bins"): + h_proj = list(hist.bins.values())[0].zero() + # loop over all counters and integrate over x (=i) + for bi in hist.bins.values(): + h_proj += bi + elif hasattr(hist, "values"): + h_proj = hist.values[0].zero() + # loop over all counters and integrate + for bi in hist.values: + h_proj += bi + + return h_proj + + +def project_split2dhist_on_axis(splitdict, axis="x"): + """Project a split 2d-histogram onto one axis + + Project a 2d hist that's been split with function split_hist_along_first_dimension + onto x or y axis. + + :param dict splitdict: input split histogram to be projected. + :param str axis: name of axis to project on, should be x or y. default is x. 
+ + :return: sorted dictionary of sub-histograms, with as keys the x-axis name and bin-number + :rtype: SortedDict + """ + if not isinstance(splitdict, dict): + raise TypeError( + "splitdict: {wt}, type should be a dictionary.".format(wt=type(splitdict)) + ) + if axis not in ["x", "y"]: + raise ValueError(f"axis: {axis}, can only be x or y.") + + hdict = dict() + + for key, hxy in splitdict.items(): + h = project_on_x(hxy) if axis == "x" else sum_over_x(hxy) + hdict[key] = h + + return hdict + + +def get_histogram(hist_obj): + """ + Parse input and convert to histogrammar object + + :param hist_obj: input histogrammar object. Can also be a corresponding json object or str. + :return: histogrammar histogram + """ + hist = None + if isinstance(hist_obj, COMMON_HIST_TYPES): + hist = hist_obj + elif isinstance(hist_obj, str): + hist = HG_FACTORY.fromJsonString(hist_obj) + elif isinstance(hist_obj, dict): + hist = HG_FACTORY.fromJson(hist_obj) + if hist is None: + raise ValueError("Please provide histogram object as input.") + return hist + + +def is_timestamp(hist): + props = get_hist_props(hist) + return props["is_ts"] + + +def is_numeric(hist): + props = get_hist_props(hist) + return props["is_num"] + + +def sparse_bin_centers_x(hist): + """Get x-axis bin centers of sparse histogram""" + keys = sorted(hist.bins.keys()) + if hist.minBin is None or hist.maxBin is None: + # number of bins is set to 1. 
+ centers = np.array([hist.origin + 0.5 * hist.binWidth]) + else: + centers = np.array([hist.origin + (i + 0.5) * hist.binWidth for i in keys]) + + values = [hist.bins[key] for key in keys] + return centers, values + + +def get_bin_centers(hist): + """Get bin centers or labels of histogram""" + if isinstance(hist, histogrammar.Bin): # Bin + centers, values = hist.bin_centers(), hist.values + elif isinstance(hist, histogrammar.SparselyBin): + centers, values = sparse_bin_centers_x(hist) + else: # categorize + centers, values = hist.bin_labels(), hist.values + return centers, values + + +def split_hist_along_first_dimension( + hist, + xname="x", + yname="y", + short_keys=True, + convert_time_index=True, + filter_empty_split_hists=True, +): + """Split (multi-dimensional) hist into sub-hists along x-axis + + Function to split a (multi-dimensional) histogram into sub-histograms + along the first dimension encountered. + + :param str xname: name of x-axis. default is x. + :param str yname: name of y-axis. default is y. + :param bool short_keys: if false, use long descriptive dict keys. + :param bool convert_time_index: if first dimension is a datetime, convert to pandas timestamp. default is true. + :param bool filter_empty_split_hists: filter out empty sub-histograms after splitting. default is True. + :returns: sorted dictionary of sub-histograms, with as keys the x-axis name and bin-number + :rtype: SortedDict + """ + hdict = dict() + + # nothing special to do + if hist.n_dim == 0: + hdict["dummy"] = hist + return hdict + + centers, values = get_bin_centers(hist) + + # MB 20191004: this happens rarely, but, in Histogrammar, if a multi-dim histogram contains *only* + # nans, overflows, or underflows for x, its sub-dimensional histograms (y, z, etc) do not get filled + # and/or are created. 
For sparselybin histograms this screws up the event-count, and evaluation of n-dim and + # datatype, so that the comparison of split-histograms along the x-axis gives inconsistent histograms. + # In this step we filter out any such empty sub-histograms, to ensure that + # all left-over sub-histograms are consistent with each other. + if filter_empty_split_hists: + centers, values = _filter_empty_split_hists(centers, values) + + for name, val in zip(centers, values): + name = _edit_name(hist, name, xname, yname, convert_time_index, short_keys) + hdict[name] = val + + return hdict + + +def _filter_empty_split_hists(centers, values): + """Filter empty split histograms from input centers and values + + :param list centers: input center values list + :param list values: input values list + :return: filtered centers and values lists + """ + cc = [] + vv = [] + for c, v in zip(centers, values): + # ignore nan, overflow and underflow counters in total event count + entries = sum_entries(v, default=False) + if entries > 0: + cc.append(c) + vv.append(v) + return cc, vv + + +def _edit_name(hist, axis_name, xname, yname, convert_time_index, short_keys): + if convert_time_index and is_timestamp(hist): + axis_name = pd.Timestamp(axis_name) + if not short_keys: + axis_name = f"{xname}={axis_name}" + if hist.n_dim >= 2: + axis_name = f"{yname}[{axis_name}]" + return axis_name diff --git a/popmon/hist/histogram.py b/popmon/hist/histogram.py deleted file mode 100644 index d612c84f..00000000 --- a/popmon/hist/histogram.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright (c) 2020 ING Wholesale Banking Advanced Analytics -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to 
whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -import numpy as np -import pandas as pd - -from ..hist.patched_histogrammer import COMMON_HIST_TYPES, histogrammar - -HG_FACTORY = histogrammar.Factory() - - -def sum_entries(hist_data, default=True): - """Recursively get sum of entries of histogram - - Sometimes hist.entries gives zero as answer? This function always works though. - - :param hist_data: input histogrammar histogram - :param bool default: if false, do not use default HG method for evaluating entries, but exclude nans, of, uf. 
- :return: total sum of entries of histogram - :rtype: int - """ - if default: - entries = hist_data.entries - if entries > 0: - return entries - - # double check number of entries, sometimes not well set - sume = 0 - if hasattr(hist_data, "bins"): - # loop over all counters and integrate over y (=j) - for i in hist_data.bins: - bi = hist_data.bins[i] - sume += sum_entries(bi) - elif hasattr(hist_data, "values"): - # loop over all counters and integrate over y (=j) - for i, bi in enumerate(hist_data.values): - sume += sum_entries(bi) - elif hasattr(hist_data, "entries"): - # only count histogrammar.Count() objects - sume += hist_data.entries - return sume - - -def project_on_x(hist_data): - """Project n-dim histogram onto x-axis - - :param hist_data: input histogrammar histogram - :return: on x-axis projected histogram (1d) - """ - # basic check: projecting on itself - if hasattr(hist_data, "n_dim") and hist_data.n_dim <= 1: - return hist_data - # basic checks on contents - if hasattr(hist_data, "bins"): - if len(hist_data.bins) == 0: - return hist_data - elif hasattr(hist_data, "values"): - if len(hist_data.values) == 0: - return hist_data - else: - return hist_data - - # make empty clone - # note: cannot do: h_x = hist.zero(), b/c it copies n-dim structure, which screws up hist.toJsonString() - if isinstance(hist_data, histogrammar.Bin): - h_x = histogrammar.Bin( - num=hist_data.num, - low=hist_data.low, - high=hist_data.high, - quantity=hist_data.quantity, - ) - elif isinstance(hist_data, histogrammar.SparselyBin): - h_x = histogrammar.SparselyBin( - binWidth=hist_data.binWidth, - origin=hist_data.origin, - quantity=hist_data.quantity, - ) - elif isinstance(hist_data, histogrammar.Categorize): - h_x = histogrammar.Categorize(quantity=hist_data.quantity) - else: - raise RuntimeError("unknown historgram type. 
cannot get zero copy.") - - if hasattr(hist_data, "bins"): - for key, bi in hist_data.bins.items(): - h_x.bins[key] = histogrammar.Count.ed(sum_entries(bi)) - elif hasattr(hist_data, "values"): - for i, bi in enumerate(hist_data.values): - h_x.values[i] = histogrammar.Count.ed(sum_entries(bi)) - - return h_x - - -def sum_over_x(hist_data): - """Integrate histogram over first dimension - - :param hist_data: input histogrammar histogram - :return: integrated histogram - """ - # basic check: nothing to do? - if hasattr(hist_data, "n_dim") and hist_data.n_dim == 0: - return hist_data - if hasattr(hist_data, "n_dim") and hist_data.n_dim == 1: - return histogrammar.Count.ed(sum_entries(hist_data)) - - # n_dim >= 2 from now on - # basic checks on contents - if hasattr(hist_data, "bins"): - if len(hist_data.bins) == 0: - return hist_data - elif hasattr(hist_data, "values"): - if len(hist_data.values) == 0: - return hist_data - else: - return hist_data - - # n_dim >= 2 and we have contents; here we sum over it. - h_proj = None - if hasattr(hist_data, "bins"): - h_proj = list(hist_data.bins.values())[0].zero() - # loop over all counters and integrate over x (=i) - for bi in hist_data.bins.values(): - h_proj += bi - elif hasattr(hist_data, "values"): - h_proj = hist_data.values[0].zero() - # loop over all counters and integrate - for bi in hist_data.values: - h_proj += bi - - return h_proj - - -def project_split2dhist_on_axis(splitdict, axis="x"): - """Project a split 2d-histogram onto one axis - - Project a 2d hist that's been split with function split_hist_along_first_dimension - onto x or y axis. - - :param dict splitdict: input split histogram to be projected. - :param str axis: name of axis to project on, should be x or y. default is x. 
- - :return: sorted dictionary of sub-histograms, with as keys the x-axis name and bin-number - :rtype: SortedDict - """ - if not isinstance(splitdict, dict): - raise TypeError( - "splitdict: {wt}, type should be a dictionary.".format(wt=type(splitdict)) - ) - if axis not in ["x", "y"]: - raise ValueError(f"axis: {axis}, can only be x or y.") - - hdict = dict() - - for key, hxy in splitdict.items(): - h = project_on_x(hxy) if axis == "x" else sum_over_x(hxy) - hdict[key] = h - - return hdict - - -class HistogramContainer: - """Wrapper class around histogrammar histograms with several utility functions.""" - - def __init__(self, hist_obj): - """Initialization - - :param hist_obj: input histogrammar object. Can also be a corresponding json object or str. - """ - self.hist = None - if isinstance(hist_obj, HistogramContainer): - self.hist = hist_obj.hist - elif isinstance(hist_obj, COMMON_HIST_TYPES): - self.hist = hist_obj - elif isinstance(hist_obj, str): - self.hist = HG_FACTORY.fromJsonString(hist_obj) - elif isinstance(hist_obj, dict): - self.hist = HG_FACTORY.fromJson(hist_obj) - if self.hist is None: - raise ValueError( - "Please provide histogram or histogram container as input." 
- ) - - self.is_list = isinstance(self.hist.datatype, list) - var_type = self.hist.datatype if not self.is_list else self.hist.datatype[0] - self.npdtype = np.dtype(var_type) - - # determine data-type categories - self.is_int = np.issubdtype(self.npdtype, np.integer) - self.is_ts = np.issubdtype(self.npdtype, np.datetime64) - self.is_num = self.is_ts or np.issubdtype(self.npdtype, np.number) - self.n_dim = self.hist.n_dim - self.entries = self.hist.entries - - def __repr__(self): - return f"HistogramContainer(dtype={self.npdtype}, n_dims={self.n_dim})" - - def __str__(self): - return repr(self) - - def _edit_name(self, axis_name, xname, yname, convert_time_index, short_keys): - if convert_time_index and self.is_ts: - axis_name = pd.Timestamp(axis_name) - if not short_keys: - axis_name = f"{xname}={axis_name}" - if self.n_dim >= 2: - axis_name = f"{yname}[{axis_name}]" - return axis_name - - def sparse_bin_centers_x(self): - """Get x-axis bin centers of sparse histogram""" - keys = sorted(self.hist.bins.keys()) - if self.hist.minBin is None or self.hist.maxBin is None: - # number of bins is set to 1. 
- centers = np.array([self.hist.origin + 0.5 * self.hist.binWidth]) - else: - centers = np.array( - [self.hist.origin + (i + 0.5) * self.hist.binWidth for i in keys] - ) - - values = [self.hist.bins[key] for key in keys] - return centers, values - - def get_bin_centers(self): - """Get bin centers or labels of histogram""" - if isinstance(self.hist, histogrammar.Bin): # Bin - centers, values = self.hist.bin_centers(), self.hist.values - elif isinstance(self.hist, histogrammar.SparselyBin): - centers, values = self.sparse_bin_centers_x() - else: # categorize - centers, values = self.hist.bin_labels(), self.hist.values - return centers, values - - def split_hist_along_first_dimension( - self, - xname="x", - yname="y", - short_keys=True, - convert_time_index=True, - filter_empty_split_hists=True, - ): - """Split (multi-dimensional) hist into sub-hists along x-axis - - Function to split a (multi-dimensional) histogram into sub-histograms - along the first dimension encountered. - - :param str xname: name of x-axis. default is x. - :param str yname: name of y-axis. default is y. - :param bool short_keys: if false, use long descriptive dict keys. - :param bool convert_time_index: if first dimension is a datetime, convert to pandas timestamp. default is true. - :param bool filter_empty_split_hists: filter out empty sub-histograms after splitting. default is True. - :returns: sorted dictionary of sub-histograms, with as keys the x-axis name and bin-number - :rtype: SortedDict - """ - hdict = dict() - - # nothing special to do - if self.n_dim == 0: - hdict["dummy"] = self.hist - return hdict - - centers, values = self.get_bin_centers() - - # MB 20191004: this happens rarely, but, in Histogrammar, if a multi-dim histogram contains *only* - # nans, overflows, or underflows for x, its sub-dimensional histograms (y, z, etc) do not get filled - # and/or are created. 
For sparselybin histograms this screws up the event-count, and evaluation of n-dim and - # datatype, so that the comparison of split-histograms along the x-axis gives inconsistent histograms. - # In this step we filter out any such empty sub-histograms, to ensure that - # all left-over sub-histograms are consistent with each other. - if filter_empty_split_hists: - centers, values = self._filter_empty_split_hists(centers, values) - - for name, val in zip(centers, values): - name = self._edit_name(name, xname, yname, convert_time_index, short_keys) - hdict[name] = val - - return hdict - - def _filter_empty_split_hists(self, centers, values): - """Filter empty split histograms from input centers and values - - :param list centers: input center values list - :param list values: input values list - :return: filtered centers and values lists - """ - cc = [] - vv = [] - for c, v in zip(centers, values): - # ignore nan, overflow and underflow counters in total event count - entries = sum_entries(v, default=False) - if entries > 0: - cc.append(c) - vv.append(v) - return cc, vv - - -def get_hist_props(hist): - """Get histogram datatype properties. - - :param hist: input histogram - :returns dict: Column properties - """ - hist = hist.hist if isinstance(hist, HistogramContainer) else hist - - var_type = ( - hist.datatype if not isinstance(hist.datatype, list) else hist.datatype[0] - ) - npdtype = np.dtype(var_type) - - # determine data-type categories - is_int = isinstance(npdtype.type(), np.integer) - is_ts = isinstance(npdtype.type(), np.datetime64) - is_num = is_ts or isinstance(npdtype.type(), np.number) - is_bool = isinstance(npdtype.type(), np.bool_) - - return dict( - dtype=npdtype, is_num=is_num, is_int=is_int, is_ts=is_ts, is_bool=is_bool - ) - - -def dumper(obj): - """Utility function to convert objects to json - - From: https://stackoverflow.com/questions/3768895/how-to-make-a-class-json-serializable - E.g. 
use to convert dict of histogrammar objects to json - - Use as: - - .. code-block:: python - - js = json.dumps(hists, default=dumper) - with open(filename, 'w') as f: - json.dump(hists, f, default=dumper) - - :param obj: input object - :return: output json object - """ - if hasattr(obj, "toJSON"): - return obj.toJSON() - elif hasattr(obj, "toJson"): - return obj.toJson() - elif hasattr(obj, "__dict__"): - return obj.__dict__ - else: - raise RuntimeError(f"Do not know how to serialize object type {type(obj)}") diff --git a/popmon/hist/patched_histogrammer.py b/popmon/hist/patched_histogrammer.py deleted file mode 100644 index 5d9eb002..00000000 --- a/popmon/hist/patched_histogrammer.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 ING Wholesale Banking Advanced Analytics -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -import histogrammar -import numpy as np - -# large numbers (time in ns since 1970) used to determine if float corresponds to a timestamp -DATE_LOW = 5e16 # 1971-08-02 16:53:20 in nanosec -DATE_HIGH = 9.9e18 # 2260-1-1 in nanosec - -COMMON_HIST_TYPES = ( - histogrammar.Categorize, - histogrammar.Bin, - histogrammar.SparselyBin, - histogrammar.specialized.CategorizeHistogramMethods, - histogrammar.specialized.HistogramMethods, - histogrammar.specialized.SparselyHistogramMethods, - histogrammar.specialized.CategorizeHistogramMethods, - histogrammar.specialized.TwoDimensionallyHistogramMethods, - histogrammar.specialized.SparselyTwoDimensionallyHistogramMethods, -) - - -def get_datatype(cls): - """Get histogrammar histogram datatype(s) of its axes - - Return data type of the variable represented by the histogram. If not - already set, will determine datatype automatically. - - :returns: list with datatypes of all dimenensions of the histogram - :rtype: list - """ - datatype = [] - if isinstance(cls, histogrammar.Count): - return datatype - if isinstance(cls, histogrammar.Categorize): - if len(cls.bins) > 0: - dt = type(list(cls.bins.keys())[0]) - dt = np.dtype(dt).type - if (dt is np.str_) or (dt is np.string_) or (dt is np.object_): - dt = str - datatype = [dt] - elif isinstance(cls, (histogrammar.Bin, histogrammar.SparselyBin)): - datatype = [np.number] - bin_centers = cls.bin_centers() - if len(bin_centers) > 0: - dt = type(bin_centers[-1]) - dt = np.dtype(dt).type - datatype = [dt] - # HACK: making an educated guess for timestamp - # timestamp is in ns since 1970, so a huge number. - is_ts = DATE_LOW < bin_centers[-1] < DATE_HIGH - if is_ts: - datatype = [np.datetime64] - # histogram may have a subhistogram. 
Extract it and recurse - if hasattr(cls, "bins"): - hist = list(cls.bins.values())[0] if cls.bins else histogrammar.Count() - elif hasattr(cls, "values"): - hist = cls.values[0] if cls.values else histogrammar.Count() - else: - hist = histogrammar.Count() - return datatype + get_datatype(hist) - - -@property -def datatype(self): # noqa - """Data type of histogram variable. - - Return data type of the variable represented by the histogram. If not - already set, will determine datatype automatically. - - :returns: data type - :rtype: type or list(type) - """ - # making an educated guess to determine data-type categories - if not hasattr(self, "_datatype"): - datatype = get_datatype(self) - if isinstance(datatype, list): - if len(datatype) == 1: - return datatype[0] - elif len(datatype) == 0: - return type(None) - return datatype - - if isinstance(self._datatype, list): - if len(self._datatype) == 1: - return self._datatype[0] - elif len(self._datatype) == 0: - return type(None) - return self._datatype - - -@datatype.setter -def datatype(self, dt): - """Set data type of histogram variable. - - Set data type of the variable represented by the histogram. 
- - :param type dt: type of the variable represented by the histogram - :raises RunTimeError: if datatype has already been set, it will not overwritten - """ - if hasattr(self, "_datatype"): - raise RuntimeError("datatype already set") - self._datatype = dt - - -# --- we decorate here -histogrammar.Bin.datatype = datatype -histogrammar.SparselyBin.datatype = datatype -histogrammar.Categorize.datatype = datatype -histogrammar.Count.datatype = datatype diff --git a/popmon/notebooks/popmon_tutorial_advanced.ipynb b/popmon/notebooks/popmon_tutorial_advanced.ipynb index 315af09f..d8457de6 100644 --- a/popmon/notebooks/popmon_tutorial_advanced.ipynb +++ b/popmon/notebooks/popmon_tutorial_advanced.ipynb @@ -290,7 +290,7 @@ "outputs": [], "source": [ "split_hist = split_hists.query(\"date == '2015-07-05 12:00:00'\")\n", - "split_hist.histogram[0].hist.plot.matplotlib()" + "split_hist.histogram[0].plot.matplotlib()" ] }, { @@ -306,7 +306,7 @@ "metadata": {}, "outputs": [], "source": [ - "split_hist.histogram_ref[0].hist.plot.matplotlib()" + "split_hist.histogram_ref[0].plot.matplotlib()" ] }, { diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py index 51abb3e4..61590864 100644 --- a/popmon/pipeline/metrics.py +++ b/popmon/pipeline/metrics.py @@ -21,8 +21,12 @@ import logging import pandas as pd +from histogrammar.dfinterface.make_histograms import ( + get_bin_specs, + get_time_axes, + make_histograms, +) -from ..hist.filling.make_histograms import get_bin_specs, get_time_axes, make_histograms from ..pipeline.metrics_pipelines import ( metrics_expanding_reference, metrics_external_reference, diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index aeec0e12..4d4e16f2 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -21,10 +21,14 @@ import logging import pandas as pd +from histogrammar.dfinterface.make_histograms import ( + get_bin_specs, + get_time_axes, + make_histograms, +) from ..base import Module from ..config 
import config -from ..hist.filling.make_histograms import get_bin_specs, get_time_axes, make_histograms from ..pipeline.report_pipelines import ( ReportPipe, expanding_reference, diff --git a/popmon/stitching/hist_stitcher.py b/popmon/stitching/hist_stitcher.py index 2548f794..77d88d11 100644 --- a/popmon/stitching/hist_stitcher.py +++ b/popmon/stitching/hist_stitcher.py @@ -23,7 +23,6 @@ from ..analysis.hist_numpy import assert_similar_hists from ..base import Module -from ..hist.histogram import HistogramContainer class HistStitcher(Module): @@ -233,7 +232,7 @@ def stitch_histograms( if feature not in features_basis: continue self.logger.debug(f'Now inserting into histogram "{feature}"') - hist_list = [HistogramContainer(hd[key]) for hd in hists_delta] + hist_list = [hd[key] for hd in hists_delta] stitched[feature] = self._insert_hists( hists_basis[feature], hist_list, time_bin_idx, mode ) @@ -258,7 +257,7 @@ def stitch_histograms( return hists_basis for feature in features_overlap: self.logger.debug(f'Now stitching histograms "{feature}"') - hist_list = [HistogramContainer(hd[feature]) for hd in hists_list] + hist_list = [hd[feature] for hd in hists_list] stitched[feature] = self._stitch_by_update(mode, hist_list) # add basis hists without any overlap for feature in features_basis: @@ -279,10 +278,7 @@ def _find_max_time_bin_index(self, hists_basis, features_basis, time_axis): assert len(features_basis) > 0 assert all([f.startswith(time_axis) for f in features_basis]) - hist_list = [ - h.hist if isinstance(h, HistogramContainer) else h - for h in hists_basis.values() - ] + hist_list = list(hists_basis.values()) all_sparse = all([isinstance(h, hg.SparselyBin) for h in hist_list]) all_cat = ( @@ -341,14 +337,10 @@ def _insert_hists(self, hbasis, hdelta_list, time_bin_idx, mode): raise TypeError("time_bin_idxs should be an (ordered) string or integer.") # consistency checks on histogram definitions - hbasis = hbasis.hist if isinstance(hbasis, HistogramContainer) 
else hbasis if not hasattr(hbasis, "bins"): raise RuntimeError( "basis histogram does not have bins attribute. cannot insert." ) - hdelta_list = [ - hd.hist if isinstance(hd, HistogramContainer) else hd for hd in hdelta_list - ] if len(hbasis.bins) > 0: hbk0 = list(hbasis.bins.values())[0] assert_similar_hists([hbk0] + hdelta_list) @@ -396,7 +388,6 @@ def _create_hist_with_time_axis(self, hist, time_bin_idx): raise TypeError( "time_bin_idx not set. should be an (ordered) string or integer." ) - hist = hist.hist if isinstance(hist, HistogramContainer) else hist ht = ( hg.SparselyBin(binWidth=1.0, origin=0.0, quantity=lambda x: x) @@ -419,10 +410,6 @@ def _stitch_by_update(self, mode, hist_list): :param list hist_list: list of input histogrammar histograms :return: list of consistent 1d numpy arrays with bin_entries for list of input histograms """ - hist_list = [ - hc.hist if isinstance(hc, HistogramContainer) else hc for hc in hist_list - ] - # --- basic checks if len(hist_list) == 0: raise RuntimeError("Input histogram list has zero length.") diff --git a/popmon/version.py b/popmon/version.py index 17e7c99f..eee51c0b 100644 --- a/popmon/version.py +++ b/popmon/version.py @@ -1,6 +1,6 @@ """THIS FILE IS AUTO-GENERATED BY SETUP.PY.""" name = "popmon" -version = "0.3.14" -full_version = "0.3.14" +version = "0.3.15" +full_version = "0.3.15" release = True diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index c710f824..5d5e60ef 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -21,6 +21,7 @@ import multiprocessing import pandas as pd +from histogrammar.util import get_hist_props from joblib import Parallel, delayed from tqdm import tqdm @@ -31,7 +32,6 @@ ) from ..base import Module from ..config import get_stat_description -from ..hist.histogram import get_hist_props from ..visualization.utils import plot_overlay_1d_histogram_b64 diff --git a/requirements.txt 
b/requirements.txt index 786ad260..fae04f6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy>=1.18.0 pandas>=0.25.1 scipy>=1.5.2 -histogrammar==1.0.12 +histogrammar>=1.0.23 phik jinja2 tqdm diff --git a/setup.py b/setup.py index af573b38..983945f6 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ MAJOR = 0 REVISION = 3 -PATCH = 14 +PATCH = 15 DEV = False # NOTE: also update version at: README.rst diff --git a/tests/popmon/analysis/profiling/test_hist_profiler.py b/tests/popmon/analysis/profiling/test_hist_profiler.py index 659b2ae8..3e3f6c0f 100644 --- a/tests/popmon/analysis/profiling/test_hist_profiler.py +++ b/tests/popmon/analysis/profiling/test_hist_profiler.py @@ -3,7 +3,7 @@ import pandas as pd from popmon.analysis.profiling.hist_profiler import HistProfiler -from popmon.hist.histogram import HistogramContainer +from popmon.hist.hist_utils import get_bin_centers def test_profile_hist1d(): @@ -17,9 +17,7 @@ def test_profile_hist1d(): for i in range(split_len): h = hg.Bin(num_bins, 0, 1, lambda x: x) h.fill.numpy(np.random.uniform(0, 1, num_entries)) - split.append( - {"date": pd.Timestamp("2019 - 1 - 1"), hist_name: HistogramContainer(h)} - ) + split.append({"date": pd.Timestamp("2019 - 1 - 1"), hist_name: h}) hp = HistProfiler( read_key="dummy_input", @@ -32,5 +30,5 @@ def test_profile_hist1d(): assert len(profiles) == split_len assert "p95" in profiles[0] - assert profiles[1]["max"] == np.max(split[1][hist_name].get_bin_centers()[0]) - assert len(profiles[0][hist_name].hist.bin_entries()) == num_bins + assert profiles[1]["max"] == np.max(get_bin_centers(split[1][hist_name])[0]) + assert len(profiles[0][hist_name].bin_entries()) == num_bins diff --git a/tests/popmon/analysis/test_functions.py b/tests/popmon/analysis/test_functions.py index 75eaaade..71ff0b7d 100644 --- a/tests/popmon/analysis/test_functions.py +++ b/tests/popmon/analysis/test_functions.py @@ -78,7 +78,7 @@ def test_expanding_hist(): df = 
datastore["output_hist"]["num_employees"] h = df["histogram_sum"].values[-1] - bin_entries = h.hist.bin_entries() + bin_entries = h.bin_entries() check = np.array( [ @@ -238,7 +238,7 @@ def test_rolling_hist(): df = datastore["output_hist"]["num_employees"] h = df["histogram_sum"].values[-2] - bin_entries = h.hist.bin_entries() + bin_entries = h.bin_entries() check = np.array( [ diff --git a/tests/popmon/analysis/test_hist_numpy.py b/tests/popmon/analysis/test_hist_numpy.py index 4a113406..ba929151 100644 --- a/tests/popmon/analysis/test_hist_numpy.py +++ b/tests/popmon/analysis/test_hist_numpy.py @@ -1,3 +1,4 @@ +import histogrammar as hg import numpy as np import pandas as pd import pytest @@ -13,8 +14,6 @@ prepare_2dgrid, set_2dgrid, ) -from popmon.hist.histogram import HistogramContainer -from popmon.hist.patched_histogrammer import histogrammar as hg def to_ns(x): @@ -50,11 +49,7 @@ def get_test_histograms1(): hist2.fill.numpy(df) hist3.fill.numpy(df) - hc1 = HistogramContainer(hist1) - hc2 = HistogramContainer(hist2) - hc3 = HistogramContainer(hist3) - - return df, hc1, hc2, hc3 + return df, hist1, hist2, hist3 def get_test_histograms2(): @@ -75,20 +70,12 @@ def get_test_histograms2(): hist3.fill.numpy(df) hist4.fill.numpy(df) - hc1 = HistogramContainer(hist1) - hc2 = HistogramContainer(hist2) - hc3 = HistogramContainer(hist3) - hc4 = HistogramContainer(hist4) - - return df, hc1, hc2, hc3, hc4 + return df, hist1, hist2, hist3, hist4 def test_histogram(): """Test the dummy histogram we're working with below""" - df, hc1, hc2, hc3 = get_test_histograms1() - hist1 = hc1.hist - hist2 = hc2.hist - hist3 = hc3.hist + df, hist1, hist2, hist3 = get_test_histograms1() assert hist1.entries == 5 assert hist1.n_dim == 1 @@ -105,10 +92,7 @@ def test_histogram(): def test_get_contentType(): """Test getting type of a histogram""" - df, hc1, hc2, hc3 = get_test_histograms1() - hist1 = hc1.hist - hist2 = hc2.hist - hist3 = hc3.hist + df, hist1, hist2, hist3 = 
get_test_histograms1() assert get_contentType(hist1) == "Categorize" assert get_contentType(hist2) == "Bin" @@ -149,10 +133,7 @@ def test_prepare_2dgrid(): @pytest.mark.filterwarnings("ignore:Input histogram only has") def test_set_2dgrid(): """Test setting the grid for extraction of number of entries for 2d hists""" - df, hc1, hc2, hc3 = get_test_histograms1() - hist1 = hc1.hist - hist2 = hc2.hist - hist3 = hc3.hist + df, hist1, hist2, hist3 = get_test_histograms1() xkeys1, ykeys1 = prepare_2dgrid(hist1) xkeys2, ykeys2 = prepare_2dgrid(hist2) @@ -180,10 +161,7 @@ def test_set_2dgrid(): @pytest.mark.filterwarnings("ignore:Input histogram only has") def test_get_2dgrid(): """Test extraction of number of entries for 2d hists""" - df, hc1, hc2, hc3 = get_test_histograms1() - hist1 = hc1.hist - hist2 = hc2.hist - hist3 = hc3.hist + df, hist1, hist2, hist3 = get_test_histograms1() grid1 = get_2dgrid(hist1) grid2 = get_2dgrid(hist2) @@ -232,17 +210,13 @@ def test_get_consistent_numpy_2dgrids(): hist1.fill.numpy(df1) hist2.fill.numpy(df2) - hc0 = HistogramContainer(hist0) - hc1 = HistogramContainer(hist1) - hc2 = HistogramContainer(hist2) - args = [""] try: - get_consistent_numpy_2dgrids([hc0, hc0]) + get_consistent_numpy_2dgrids([hist0, hist0]) except ValueError as e: args = e.args - grid2d_list = get_consistent_numpy_2dgrids([hc1, hc2]) + grid2d_list = get_consistent_numpy_2dgrids([hist1, hist2]) g1 = np.asarray( [ @@ -297,11 +271,12 @@ def test_get_consistent_numpy_1dhists(): hist1.fill.numpy(df1) hist2.fill.numpy(df2) - hc1 = HistogramContainer(hist1) - hc2 = HistogramContainer(hist2) - - nphist1, nphist2 = get_consistent_numpy_1dhists([hc1, hc2], get_bin_labels=False) - nphist_list, centers = get_consistent_numpy_1dhists([hc1, hc2], get_bin_labels=True) + nphist1, nphist2 = get_consistent_numpy_1dhists( + [hist1, hist2], get_bin_labels=False + ) + nphist_list, centers = get_consistent_numpy_1dhists( + [hist1, hist2], get_bin_labels=True + ) entries1 = [1.0, 4.0, 2.0, 
2.0, 1.0, 0.0, 0.0, 0.0, 0.0] entries2 = [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0] @@ -339,18 +314,14 @@ def test_get_consistent_numpy_entries(): ) # building 1d-, 2d-, and 3d-histogram (iteratively) - hist0 = HistogramContainer(hg.Categorize(unit("C"))) - hist1 = HistogramContainer(hg.Categorize(unit("C"))) - hist2 = HistogramContainer( - hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A")) - ) - hist3 = HistogramContainer( - hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A")) - ) + hist0 = hg.Categorize(unit("C")) + hist1 = hg.Categorize(unit("C")) + hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A")) + hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A")) # fill them for hist, df in zip([hist0, hist1, hist2, hist3], [df1, df2, df1, df2]): - hist.hist.fill.numpy(df) + hist.fill.numpy(df) e0, e1 = get_consistent_numpy_entries([hist0, hist1], get_bin_labels=False) _, labels01 = get_consistent_numpy_entries([hist0, hist1], get_bin_labels=True) @@ -407,19 +378,12 @@ def test_check_similar_hists(): for hist in [hist0, hist1, hist2, hist3, hist4, hist5]: hist.fill.numpy(df) - hc0 = HistogramContainer(hist0) - hc1 = HistogramContainer(hist1) - hc2 = HistogramContainer(hist2) - hc3 = HistogramContainer(hist3) - hc4 = HistogramContainer(hist4) - hc5 = HistogramContainer(hist5) - - for hc in [hc0, hc1, hc2, hc3, hc4, hc5]: - assert check_similar_hists([hc, hc]) + for hist in [hist0, hist1, hist2, hist3, hist4, hist5]: + assert check_similar_hists([hist, hist]) - assert not check_similar_hists([hc0, hc1]) - assert not check_similar_hists([hc2, hc3]) - assert not check_similar_hists([hc4, hc5]) + assert not check_similar_hists([hist0, hist1]) + assert not check_similar_hists([hist2, hist3]) + assert not check_similar_hists([hist4, hist5]) @pytest.mark.filterwarnings("ignore:Input histograms have inconsistent") @@ -455,32 +419,25 @@ def test_assert_similar_hists(): for hist in [hist0, hist1, hist2, hist3, hist4, hist5]: 
hist.fill.numpy(df) - hc0 = HistogramContainer(hist0) - hc1 = HistogramContainer(hist1) - hc2 = HistogramContainer(hist2) - hc3 = HistogramContainer(hist3) - hc4 = HistogramContainer(hist4) - hc5 = HistogramContainer(hist5) - - for hc in [hc0, hc1, hc2, hc3, hc4, hc5]: - assert check_similar_hists([hc, hc]) + for hist in [hist0, hist1, hist2, hist3, hist4, hist5]: + assert check_similar_hists([hist, hist]) args01 = [""] args23 = [""] args45 = [""] try: - assert_similar_hists([hc0, hc1]) + assert_similar_hists([hist0, hist1]) except ValueError as e: args01 = e.args try: - assert_similar_hists([hc2, hc3]) + assert_similar_hists([hist2, hist3]) except ValueError as e: args23 = e.args try: - assert_similar_hists([hc4, hc5]) + assert_similar_hists([hist4, hist5]) except ValueError as e: args45 = e.args @@ -491,11 +448,8 @@ def test_assert_similar_hists(): def test_datatype(): """Test datatypes assigned to histograms""" - df, hc1, hc2, hc3 = get_test_histograms1() - hist1 = hc1.hist - hist2 = hc2.hist - hist3 = hc3.hist + df, hist1, hist2, hist3 = get_test_histograms1() assert hist1.datatype == str - np.testing.assert_array_equal(hist2.datatype, [np.float64, str]) - np.testing.assert_array_equal(hist3.datatype, [np.datetime64, np.float64, str]) + np.testing.assert_array_equal(hist2.datatype, [np.number, str]) + np.testing.assert_array_equal(hist3.datatype, [np.datetime64, np.number, str]) diff --git a/tests/popmon/hist/test_histogram.py b/tests/popmon/hist/test_histogram.py index 0d674771..213b4f8d 100644 --- a/tests/popmon/hist/test_histogram.py +++ b/tests/popmon/hist/test_histogram.py @@ -1,14 +1,17 @@ +import histogrammar as hg import numpy as np import pandas as pd -from popmon.hist.histogram import ( - HistogramContainer, +from popmon.hist.hist_utils import ( + is_numeric, + is_timestamp, project_on_x, project_split2dhist_on_axis, + sparse_bin_centers_x, + split_hist_along_first_dimension, sum_entries, sum_over_x, ) -from popmon.hist.patched_histogrammer import 
histogrammar as hg def get_test_data(): @@ -58,23 +61,18 @@ def test_histogrammar(): def test_histogram_attributes(): hist1, hist2, hist3 = get_histograms() - hist_obj1 = HistogramContainer(hist1) - hist_obj2 = HistogramContainer(hist2) - hist_obj3 = HistogramContainer(hist3) - - assert hist_obj1.is_num is False - assert hist_obj1.is_ts is False - assert hist_obj2.is_num is True - assert hist_obj2.is_ts is False - assert hist_obj3.is_num is True - assert hist_obj3.is_ts is True + assert is_numeric(hist1) is False + assert is_timestamp(hist1) is False + assert is_numeric(hist2) is True + assert is_timestamp(hist2) is False + assert is_numeric(hist3) is True + assert is_timestamp(hist3) is True def test_sparse_bin_centers_x(): hist1, hist2, hist3 = get_histograms() - hist_obj3 = HistogramContainer(hist3) - centers3, values3 = hist_obj3.sparse_bin_centers_x() + centers3, values3 = sparse_bin_centers_x(hist3) np.testing.assert_array_equal( centers3, [1.2308112e18, 1.2308976e18, 1.2311568e18, 1.2312432e18, 1.2313296e18] @@ -83,18 +81,15 @@ def test_sparse_bin_centers_x(): def test_split_hist_along_first_dimension(): hist1, hist2, hist3 = get_histograms() - hist_obj1 = HistogramContainer(hist1) - hist_obj2 = HistogramContainer(hist2) - hist_obj3 = HistogramContainer(hist3) - split3a = hist_obj3.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=True + split3a = split_hist_along_first_dimension( + hist=hist3, xname="x", yname="y", short_keys=True, convert_time_index=True ) - split3b = hist_obj3.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=False + split3b = split_hist_along_first_dimension( + hist=hist3, xname="x", yname="y", short_keys=True, convert_time_index=False ) - split3c = hist_obj3.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=False, convert_time_index=True + split3c = split_hist_along_first_dimension( + hist=hist3, xname="x", yname="y", 
short_keys=False, convert_time_index=True ) keys3a = list(split3a.keys()) @@ -121,14 +116,14 @@ def test_split_hist_along_first_dimension(): np.testing.assert_array_equal(keys3b, check3b) np.testing.assert_array_equal(keys3c, check3c) - split2a = hist_obj2.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=True + split2a = split_hist_along_first_dimension( + hist=hist2, xname="x", yname="y", short_keys=True, convert_time_index=True ) - split2b = hist_obj2.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=False + split2b = split_hist_along_first_dimension( + hist=hist2, xname="x", yname="y", short_keys=True, convert_time_index=False ) - split2c = hist_obj2.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=False, convert_time_index=False + split2c = split_hist_along_first_dimension( + hist=hist2, xname="x", yname="y", short_keys=False, convert_time_index=False ) keys2a = list(split2a.keys()) @@ -143,14 +138,14 @@ def test_split_hist_along_first_dimension(): np.testing.assert_array_equal(keys2b, check2b) np.testing.assert_array_equal(keys2c, check2c) - split1a = hist_obj1.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=True + split1a = split_hist_along_first_dimension( + hist=hist1, xname="x", yname="y", short_keys=True, convert_time_index=True ) - split1b = hist_obj1.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=False + split1b = split_hist_along_first_dimension( + hist=hist1, xname="x", yname="y", short_keys=True, convert_time_index=False ) - split1c = hist_obj1.split_hist_along_first_dimension( - xname="x", yname="y", short_keys=False, convert_time_index=False + split1c = split_hist_along_first_dimension( + hist=hist1, xname="x", yname="y", short_keys=False, convert_time_index=False ) keys1a = list(split1a.keys()) @@ -284,17 +279,17 @@ def 
test_project_split2dhist_on_axis(): hist.fill.numpy(df) # split along date axis - splitAC = HistogramContainer(histDAC).split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=True + splitAC = split_hist_along_first_dimension( + hist=histDAC, xname="x", yname="y", short_keys=True, convert_time_index=True ) - splitCA = HistogramContainer(histDCA).split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=True + splitCA = split_hist_along_first_dimension( + hist=histDCA, xname="x", yname="y", short_keys=True, convert_time_index=True ) - splitA0 = HistogramContainer(histDA).split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=True + splitA0 = split_hist_along_first_dimension( + hist=histDA, xname="x", yname="y", short_keys=True, convert_time_index=True ) - splitC0 = HistogramContainer(histDC).split_hist_along_first_dimension( - xname="x", yname="y", short_keys=True, convert_time_index=True + splitC0 = split_hist_along_first_dimension( + hist=histDC, xname="x", yname="y", short_keys=True, convert_time_index=True ) splitA1 = project_split2dhist_on_axis(splitAC, "x") @@ -348,5 +343,5 @@ def test_datatype(): assert isinstance(None, hist0.datatype) assert hist1.datatype == str - np.testing.assert_array_equal(hist2.datatype, [np.float64, str]) - np.testing.assert_array_equal(hist3.datatype, [np.datetime64, np.float64, str]) + np.testing.assert_array_equal(hist2.datatype, [np.number, str]) + np.testing.assert_array_equal(hist3.datatype, [np.datetime64, np.number, str]) diff --git a/tests/popmon/hist/test_numpy_histogrammar.py b/tests/popmon/hist/test_numpy_histogrammar.py deleted file mode 100644 index 5e4409c3..00000000 --- a/tests/popmon/hist/test_numpy_histogrammar.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 - -import pytest - -from popmon.base import Pipeline -from popmon.hist.filling import NumpyHistogrammar - - -def test_assert_dataframe(): - 
pandas_filler = NumpyHistogrammar( - features=["age", "fruit", "latitude", ["longitude", "active"]] - ) - with pytest.raises(TypeError): - pandas_filler.assert_dataframe("coconut") - - -def test_get_histograms(): - - np_array = pytest.test_df.to_records(index=False) - - np_filler = NumpyHistogrammar( - features=[ - "date", - "isActive", - "age", - "eyeColor", - "gender", - "company", - "latitude", - "longitude", - ["isActive", "age"], - ["latitude", "longitude"], - ], - bin_specs={ - "longitude": {"bin_width": 5, "bin_offset": 0}, - "latitude": {"bin_width": 5, "bin_offset": 0}, - }, - ) - current_hists = np_filler.get_histograms(np_array) - - assert current_hists["age"].toJson() == pytest.age - assert current_hists["company"].toJson() == pytest.company - assert current_hists["date"].toJson() == pytest.date - assert current_hists["eyeColor"].toJson() == pytest.eyesColor - assert current_hists["gender"].toJson() == pytest.gender - assert current_hists["isActive"].toJson() == pytest.isActive - assert current_hists["isActive:age"].toJson() == pytest.isActive_age - assert current_hists["latitude"].toJson() == pytest.latitude - assert current_hists["longitude"].toJson() == pytest.longitude - assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude - - -def test_get_histograms_module(): - - np_filler = NumpyHistogrammar( - features=[ - "date", - "isActive", - "age", - "eyeColor", - "gender", - "company", - "latitude", - "longitude", - ["isActive", "age"], - ["latitude", "longitude"], - ], - bin_specs={ - "longitude": {"bin_width": 5, "bin_offset": 0}, - "latitude": {"bin_width": 5, "bin_offset": 0}, - }, - read_key="input", - store_key="output", - ) - - pipeline = Pipeline(modules=[np_filler]) - datastore = pipeline.transform( - datastore={"input": pytest.test_df.to_records(index=False)} - ) - - assert "output" in datastore - current_hists = datastore["output"] - - assert current_hists["age"].toJson() == pytest.age - assert 
current_hists["company"].toJson() == pytest.company - assert current_hists["date"].toJson() == pytest.date - assert current_hists["eyeColor"].toJson() == pytest.eyesColor - assert current_hists["gender"].toJson() == pytest.gender - assert current_hists["isActive"].toJson() == pytest.isActive - assert current_hists["isActive:age"].toJson() == pytest.isActive_age - assert current_hists["latitude"].toJson() == pytest.latitude - assert current_hists["longitude"].toJson() == pytest.longitude - assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude diff --git a/tests/popmon/hist/test_pandas_histogrammar.py b/tests/popmon/hist/test_pandas_histogrammar.py deleted file mode 100644 index eaa7f28d..00000000 --- a/tests/popmon/hist/test_pandas_histogrammar.py +++ /dev/null @@ -1,231 +0,0 @@ -#!/usr/bin/env python3 - -import numpy as np -import pytest - -from popmon.base import Pipeline -from popmon.hist.filling import ( - PandasHistogrammar, - get_bin_specs, - get_time_axes, - make_histograms, -) - - -def test_get_histograms(): - - pandas_filler = PandasHistogrammar( - features=[ - "date", - "isActive", - "age", - "eyeColor", - "gender", - "company", - "latitude", - "longitude", - ["isActive", "age"], - ["latitude", "longitude"], - ], - bin_specs={ - "longitude": {"bin_width": 5, "bin_offset": 0}, - "latitude": {"bin_width": 5, "bin_offset": 0}, - }, - ) - current_hists = pandas_filler.get_histograms(pytest.test_df) - - assert current_hists["age"].toJson() == pytest.age - assert current_hists["company"].toJson() == pytest.company - assert current_hists["date"].toJson() == pytest.date - assert current_hists["eyeColor"].toJson() == pytest.eyesColor - assert current_hists["gender"].toJson() == pytest.gender - assert current_hists["isActive"].toJson() == pytest.isActive - assert current_hists["isActive:age"].toJson() == pytest.isActive_age - assert current_hists["latitude"].toJson() == pytest.latitude - assert current_hists["longitude"].toJson() == 
pytest.longitude - assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude - - -def test_make_histograms(): - - features = [ - "date", - "isActive", - "age", - "eyeColor", - "gender", - "company", - "latitude", - "longitude", - ["isActive", "age"], - ["latitude", "longitude"], - "transaction", - ] - bin_specs = { - "transaction": {"num": 100, "low": -2000, "high": 2000}, - "longitude": {"bin_width": 5, "bin_offset": 0}, - "latitude": {"bin_width": 5, "bin_offset": 0}, - } - - current_hists = make_histograms( - pytest.test_df, features=features, binning="unit", bin_specs=bin_specs - ) - - assert current_hists["age"].toJson() == pytest.age - assert current_hists["company"].toJson() == pytest.company - assert current_hists["date"].toJson() == pytest.date - assert current_hists["eyeColor"].toJson() == pytest.eyesColor - assert current_hists["gender"].toJson() == pytest.gender - assert current_hists["isActive"].toJson() == pytest.isActive - assert current_hists["isActive:age"].toJson() == pytest.isActive_age - assert current_hists["latitude"].toJson() == pytest.latitude - assert current_hists["longitude"].toJson() == pytest.longitude - assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude - assert current_hists["transaction"].toJson() == pytest.transaction - - -def test_make_histograms_no_time_axis(): - - hists, features, bin_specs, time_axis, var_dtype = make_histograms( - pytest.test_df, time_axis="", ret_specs=True - ) - - assert len(hists) == 21 - assert len(features) == 21 - assert len(bin_specs) == 6 - assert len(var_dtype) == 21 - assert time_axis == "" - assert "date" in hists - h = hists["date"] - assert h.binWidth == 751582381944448.0 - for cols in features: - cols = cols.split(":") - assert len(cols) == 1 - for f, bs in bin_specs.items(): - assert isinstance(bs, dict) - assert "age" in bin_specs - dateage = bin_specs["age"] - assert dateage["bin_width"] == 2.0 - assert dateage["bin_offset"] == 9.5 - - -def 
test_make_histograms_with_time_axis(): - - hists, features, bin_specs, time_axis, var_dtype = make_histograms( - pytest.test_df, time_axis=True, ret_specs=True - ) - - assert len(hists) == 20 - assert len(features) == 20 - assert len(bin_specs) == 20 - assert len(var_dtype) == 21 - assert time_axis == "date" - assert "date:age" in hists - h = hists["date:age"] - assert h.binWidth == 751582381944448.0 - for cols in features: - cols = cols.split(":") - assert len(cols) == 2 and cols[0] == "date" - for f, bs in bin_specs.items(): - assert len(bs) == 2 - assert "date:age" in bin_specs - dateage = bin_specs["date:age"] - assert dateage[0]["bin_width"] == 751582381944448.0 - assert dateage[1]["bin_width"] == 2.0 - assert dateage[1]["bin_offset"] == 9.5 - - # test get_bin_specs 1 - bin_specs = get_bin_specs(hists) - assert "date:age" in bin_specs - dateage = bin_specs["date:age"] - assert dateage[0]["bin_width"] == 751582381944448.0 - assert dateage[1]["bin_width"] == 2.0 - assert dateage[1]["bin_offset"] == 9.5 - - # test get_bin_specs 2 - bin_specs = get_bin_specs(hists, skip_first_axis=True) - assert "age" in bin_specs - age = bin_specs["age"] - assert age["bin_width"] == 2.0 - assert age["bin_offset"] == 9.5 - - # test get_bin_specs 3 - bin_specs = get_bin_specs(hists["date:age"]) - assert bin_specs[0]["bin_width"] == 751582381944448.0 - assert bin_specs[1]["bin_width"] == 2.0 - assert bin_specs[1]["bin_offset"] == 9.5 - - # test get_bin_specs 4 - bin_specs = get_bin_specs(hists["date:age"], skip_first_axis=True) - assert bin_specs["bin_width"] == 2.0 - assert bin_specs["bin_offset"] == 9.5 - - -def test_make_histograms_unit_binning(): - - hists, features, bin_specs, time_axis, var_dtype = make_histograms( - pytest.test_df, binning="unit", time_axis="", ret_specs=True - ) - - assert len(hists) == 21 - assert len(features) == 21 - assert len(bin_specs) == 0 - assert len(var_dtype) == 21 - assert time_axis == "" - assert "date" in hists - h = hists["date"] - assert 
h.binWidth == 2592000000000000 - for cols in features: - cols = cols.split(":") - assert len(cols) == 1 - for f, bs in bin_specs.items(): - assert isinstance(bs, dict) - assert "age" in hists - h = hists["age"] - assert h.binWidth == 1.0 - assert h.origin == 0.0 - - -def test_get_histograms_module(): - - pandas_filler = PandasHistogrammar( - features=[ - "date", - "isActive", - "age", - "eyeColor", - "gender", - "company", - "latitude", - "longitude", - ["isActive", "age"], - ["latitude", "longitude"], - ], - bin_specs={ - "longitude": {"bin_width": 5, "bin_offset": 0}, - "latitude": {"bin_width": 5, "bin_offset": 0}, - }, - read_key="input", - store_key="output", - ) - - pipeline = Pipeline(modules=[pandas_filler]) - datastore = pipeline.transform(datastore={"input": pytest.test_df}) - - assert "output" in datastore - current_hists = datastore["output"] - assert current_hists["age"].toJson() == pytest.age - assert current_hists["company"].toJson() == pytest.company - assert current_hists["date"].toJson() == pytest.date - assert current_hists["eyeColor"].toJson() == pytest.eyesColor - assert current_hists["gender"].toJson() == pytest.gender - assert current_hists["isActive"].toJson() == pytest.isActive - assert current_hists["isActive:age"].toJson() == pytest.isActive_age - assert current_hists["latitude"].toJson() == pytest.latitude - assert current_hists["longitude"].toJson() == pytest.longitude - assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude - - -def test_get_time_axes(): - time_axes = get_time_axes(pytest.test_df) - np.testing.assert_array_equal(time_axes, ["date"]) diff --git a/tests/popmon/hist/test_spark_histogrammar.py b/tests/popmon/hist/test_spark_histogrammar.py deleted file mode 100644 index 714362f7..00000000 --- a/tests/popmon/hist/test_spark_histogrammar.py +++ /dev/null @@ -1,255 +0,0 @@ -from os.path import abspath, dirname, join - -import pandas as pd -import pytest - -# from popmon.hist.filling import 
make_histograms -from popmon.base import Pipeline -from popmon.hist.filling import SparkHistogrammar - -try: - from pyspark.sql import SparkSession - - spark_found = True -except (ModuleNotFoundError, AttributeError): - spark_found = False - - -def get_spark(): - if not spark_found: - return None - - current_path = dirname(abspath(__file__)) - - hist_spark_jar = join(current_path, "jars/histogrammar-sparksql_2.11-1.0.11.jar") - hist_jar = join(current_path, "jars/histogrammar_2.11-1.0.11.jar") - - spark = ( - SparkSession.builder.master("local") - .appName("popmon-pytest") - .config("spark.jars", f"{hist_spark_jar},{hist_jar}") - .config("spark.sql.execution.arrow.enabled", "false") - .config("spark.sql.session.timeZone", "GMT") - .getOrCreate() - ) - return spark - - -@pytest.fixture -def spark_co(): - """ - :return: Spark configuration - """ - spark = get_spark() - return spark - - -@pytest.mark.spark -@pytest.mark.skipif(not spark_found, reason="spark not found") -@pytest.mark.filterwarnings( - "ignore:createDataFrame attempted Arrow optimization because" -) -def test_get_histograms(spark_co): - pytest.age["data"]["name"] = "b'age'" - pytest.company["data"]["name"] = "b'company'" - pytest.eyesColor["data"]["name"] = "b'eyeColor'" - pytest.gender["data"]["name"] = "b'gender'" - pytest.isActive["data"]["name"] = "b'isActive'" - pytest.latitude["data"]["name"] = "b'latitude'" - pytest.longitude["data"]["name"] = "b'longitude'" - pytest.transaction["data"]["name"] = "b'transaction'" - - pytest.latitude_longitude["data"]["name"] = "b'latitude:longitude'" - pytest.latitude_longitude["data"]["bins:name"] = "unit_func" - - spark = spark_co - - spark_df = spark.createDataFrame(pytest.test_df) - - spark_filler = SparkHistogrammar( - features=[ - "date", - "isActive", - "age", - "eyeColor", - "gender", - "company", - "latitude", - "longitude", - ["isActive", "age"], - ["latitude", "longitude"], - "transaction", - ], - bin_specs={ - "transaction": {"num": 100, "low": -2000, 
"high": 2000}, - "longitude": {"bin_width": 5.0, "bin_offset": 0.0}, - "latitude": {"bin_width": 5.0, "bin_offset": 0.0}, - }, - read_key="input", - store_key="output", - ) - - # test get_histograms() function call - current_hists = spark_filler.get_histograms(spark_df) - # current_hists = make_histograms(spark_df, features, bin_specs) - assert current_hists["age"].toJson() == pytest.age - assert current_hists["company"].toJson() == pytest.company - assert current_hists["eyeColor"].toJson() == pytest.eyesColor - assert current_hists["gender"].toJson() == pytest.gender - assert current_hists["latitude"].toJson() == pytest.latitude - assert current_hists["longitude"].toJson() == pytest.longitude - assert current_hists["transaction"].toJson() == pytest.transaction - - # import json - # with open('tests/popmon/hist/resource/transaction.json', 'w') as outfile: - # json.dump(current_hists["transaction"].toJson(), outfile, indent=4) - - -@pytest.mark.spark -@pytest.mark.skipif(not spark_found, reason="spark not found") -@pytest.mark.filterwarnings( - "ignore:createDataFrame attempted Arrow optimization because" -) -def test_get_histograms_module(spark_co): - pytest.age["data"]["name"] = "b'age'" - pytest.company["data"]["name"] = "b'company'" - pytest.eyesColor["data"]["name"] = "b'eyeColor'" - pytest.gender["data"]["name"] = "b'gender'" - pytest.isActive["data"]["name"] = "b'isActive'" - pytest.latitude["data"]["name"] = "b'latitude'" - pytest.longitude["data"]["name"] = "b'longitude'" - - pytest.latitude_longitude["data"]["name"] = "b'latitude:longitude'" - pytest.latitude_longitude["data"]["bins:name"] = "unit_func" - - spark = spark_co - - spark_df = spark.createDataFrame(pytest.test_df) - - spark_filler = SparkHistogrammar( - features=[ - "date", - "isActive", - "age", - "eyeColor", - "gender", - "company", - "latitude", - "longitude", - ["isActive", "age"], - ["latitude", "longitude"], - ], - bin_specs={ - "longitude": {"bin_width": 5.0, "bin_offset": 0.0}, - 
"latitude": {"bin_width": 5.0, "bin_offset": 0.0}, - }, - read_key="input", - store_key="output", - ) - - # test transform() function call - pipeline = Pipeline(modules=[spark_filler]) - datastore = pipeline.transform(datastore={"input": spark_df}) - - assert "output" in datastore - current_hists = datastore["output"] - assert current_hists["age"].toJson() == pytest.age - assert current_hists["company"].toJson() == pytest.company - assert current_hists["eyeColor"].toJson() == pytest.eyesColor - assert current_hists["gender"].toJson() == pytest.gender - assert current_hists["latitude"].toJson() == pytest.latitude - assert current_hists["longitude"].toJson() == pytest.longitude - # assert current_hists['date'].toJson() == pytest.date - # assert current_hists['isActive'].toJson() == pytest.isActive - # assert current_hists['isActive:age'].toJson() == pytest.isActive_age - # assert current_hists['latitude:longitude'].toJson() == pytest.latitude_longitude - - -@pytest.mark.spark -@pytest.mark.skipif(not spark_found, reason="spark not found") -@pytest.mark.filterwarnings( - "ignore:createDataFrame attempted Arrow optimization because" -) -def test_get_histograms_timestamp(spark_co): - from pyspark.sql.functions import to_timestamp - - spark = spark_co - - data_date = [ - "2018-12-10 00:00:00", - "2018-12-10 00:00:00", - "2018-12-10 00:00:00", - "2018-12-10 00:00:00", - "2018-12-10 00:00:00", - "2018-12-17 00:00:00", - "2018-12-17 00:00:00", - "2018-12-17 00:00:00", - "2018-12-17 00:00:00", - "2018-12-19 00:00:00", - ] - - df = pd.DataFrame(data_date, columns=["dt"]) - sdf = spark.createDataFrame(df).withColumn( - "dt", to_timestamp("dt", "yyyy-MM-dd HH:mm:ss") - ) - expected = { - "data": { - "binWidth": 2592000000000000.0, - "bins": {"108": 9.0, "109": 1.0}, - "bins:type": "Count", - "entries": 10.0, - "name": "b'dt'", - "nanflow": 0.0, - "nanflow:type": "Count", - "origin": 1.2625632e18, - }, - "type": "SparselyBin", - "version": "1.0", - } - filler = 
SparkHistogrammar(features=["dt"]) - current_hists = filler.get_histograms(sdf) - assert current_hists["dt"].toJson() == expected - - -@pytest.mark.spark -@pytest.mark.skipif(not spark_found, reason="spark not found") -@pytest.mark.filterwarnings( - "ignore:createDataFrame attempted Arrow optimization because" -) -def test_get_histograms_date(spark_co): - from pyspark.sql.functions import to_date - - spark = spark_co - - data_date = [ - "2018-12-10", - "2018-12-10", - "2018-12-10", - "2018-12-10", - "2018-12-10", - "2018-12-17", - "2018-12-17", - "2018-12-17", - "2018-12-17", - "2018-12-19", - ] - - df = pd.DataFrame(data_date, columns=["dt"]) - sdf = spark.createDataFrame(df).withColumn("dt", to_date("dt", "yyyy-MM-dd")) - expected = { - "data": { - "binWidth": 2592000000000000.0, - "bins": {"108": 9.0, "109": 1.0}, - "bins:type": "Count", - "entries": 10.0, - "name": "b'dt'", - "nanflow": 0.0, - "nanflow:type": "Count", - "origin": 1.2625632e18, - }, - "type": "SparselyBin", - "version": "1.0", - } - filler = SparkHistogrammar(features=["dt"]) - current_hists = filler.get_histograms(sdf) - assert current_hists["dt"].toJson() == expected diff --git a/tests/popmon/pipeline/test_report.py b/tests/popmon/pipeline/test_report.py index c033d558..a2efe8d8 100644 --- a/tests/popmon/pipeline/test_report.py +++ b/tests/popmon/pipeline/test_report.py @@ -3,7 +3,7 @@ from popmon import resources from popmon.base import Pipeline -from popmon.hist.filling.make_histograms import get_bin_specs +from popmon.hist.filling import get_bin_specs from popmon.io import JsonReader from popmon.pipeline.report import df_stability_report, stability_report @@ -74,14 +74,10 @@ def test_df_stability_report_self(): hists = datastore["hists"] bin_specs = get_bin_specs(hists) - assert pd.Timedelta(time_width).value == bin_specs["date:eyeColor"][0]["bin_width"] - assert ( - pd.Timestamp(time_offset).value == bin_specs["date:eyeColor"][0]["bin_offset"] - ) - assert pd.Timedelta(time_width).value == 
bin_specs["date:latitude"][0]["bin_width"] - assert ( - pd.Timestamp(time_offset).value == bin_specs["date:latitude"][0]["bin_offset"] - ) + assert pd.Timedelta(time_width).value == bin_specs["date:eyeColor"][0]["binWidth"] + assert pd.Timestamp(time_offset).value == bin_specs["date:eyeColor"][0]["origin"] + assert pd.Timedelta(time_width).value == bin_specs["date:latitude"][0]["binWidth"] + assert pd.Timestamp(time_offset).value == bin_specs["date:latitude"][0]["origin"] def test_df_stability_report_external(): diff --git a/tests/popmon/stats/test_numpy.py b/tests/popmon/stats/test_numpy.py index ca92a2ba..3cf15ff7 100644 --- a/tests/popmon/stats/test_numpy.py +++ b/tests/popmon/stats/test_numpy.py @@ -197,15 +197,15 @@ def test_statistics_1(): def get_quantiles(q): _quantiles = np.zeros((3, 6)) for i in range(a.shape[0]): - for l in range(a.shape[3]): - isort = np.argsort(_values[i, l]) - v = _values[i, l][isort] - u = _weights[i, l][isort] + for ll in range(a.shape[3]): + isort = np.argsort(_values[i, ll]) + v = _values[i, ll][isort] + u = _weights[i, ll][isort] U = u.cumsum() r = (U - 0.5 * u) / U[-1] for m in range(1, len(u)): if r[m - 1] <= q and r[m] > q: - _quantiles[i, l] = v[m - 1] + (q - r[m - 1]) / ( + _quantiles[i, ll] = v[m - 1] + (q - r[m - 1]) / ( r[m] - r[m - 1] ) * (v[m] - v[m - 1]) break diff --git a/tests/popmon/visualization/test_report_generator.py b/tests/popmon/visualization/test_report_generator.py index 75ffa93e..ddbd4d34 100644 --- a/tests/popmon/visualization/test_report_generator.py +++ b/tests/popmon/visualization/test_report_generator.py @@ -53,5 +53,5 @@ def test_report_generator(): assert "final_report" in datastore assert ( isinstance(datastore["final_report"], str) - and len(datastore["final_report"]) > 0 + and len(datastore["final_report"]) > 0 # noqa: W503 )