Skip to content

Commit

Permalink
Merge branch 'popmon_hgr_migration' of https://github.com/ing-bank/popmon into popmon_hgr_migration
Browse files Browse the repository at this point in the history
  • Loading branch information
mbaak committed Mar 23, 2021
2 parents 3aaf347 + 02189f2 commit 1bbaac1
Show file tree
Hide file tree
Showing 18 changed files with 67 additions and 52 deletions.
9 changes: 7 additions & 2 deletions popmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,17 @@
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# histogram and report functions
from histogrammar.dfinterface.make_histograms import (
get_bin_specs,
get_time_axes,
make_histograms,
)

# flake8: noqa
# pandas/spark dataframe decorators
from popmon import decorators

# histogram and report functions
from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms
from .pipeline.metrics import df_stability_metrics, stability_metrics
from .pipeline.report import df_stability_report, stability_report
from .stitching import stitch_histograms
Expand Down
2 changes: 1 addition & 1 deletion popmon/analysis/comparison/hist_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
get_consistent_numpy_entries,
)
from ...base import Pipeline
from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2
from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric
from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2


def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
Expand Down
5 changes: 3 additions & 2 deletions popmon/analysis/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@
from scipy import linalg, stats
from scipy.stats import linregress, norm

from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric
from ..analysis.hist_numpy import (
check_similar_hists,
get_consistent_numpy_2dgrids,
get_consistent_numpy_entries,
set_2dgrid,
)
from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric
from ..stats.numpy import probability_distribution_mean_covariance


Expand Down Expand Up @@ -502,7 +502,8 @@ def relative_chi_squared(
single_norm, _ = probability_distribution_mean_covariance([entries])

if (
np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps
np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps
and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps
):
# check if covariance matrix is invertible
# see: https://stackoverflow.com/questions/13249108/efficient-pythonic-check-for-singular-matrix
Expand Down
4 changes: 2 additions & 2 deletions popmon/analysis/hist_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@

import warnings

import numpy as np

import histogrammar
import numpy as np
from histogrammar.util import get_hist_props

from ..hist.hist_utils import is_numeric

used_hist_types = (histogrammar.Bin, histogrammar.SparselyBin, histogrammar.Categorize)
Expand Down
6 changes: 2 additions & 4 deletions popmon/analysis/profiling/hist_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from ...analysis.hist_numpy import get_2dgrid
from ...base import Module
from ...hist.hist_utils import sum_entries, is_numeric, is_timestamp, get_bin_centers
from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp, sum_entries

DEFAULT_STATS = {
"mean": pm_np.mean,
Expand Down Expand Up @@ -115,9 +115,7 @@ def _profile_1d_histogram(self, name, hist):
profile = dict()
profile["filled"] = bin_counts.sum()
profile["nan"] = hist.nanflow.entries if hasattr(hist, "nanflow") else 0
profile["overflow"] = (
hist.overflow.entries if hasattr(hist, "overflow") else 0
)
profile["overflow"] = hist.overflow.entries if hasattr(hist, "overflow") else 0
profile["underflow"] = (
hist.underflow.entries if hasattr(hist, "underflow") else 0
)
Expand Down
2 changes: 1 addition & 1 deletion popmon/decorators/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


from pandas import DataFrame
from histogrammar.dfinterface.make_histograms import make_histograms
from pandas import DataFrame

from ..pipeline.metrics import df_stability_metrics
from ..pipeline.report import df_stability_report
Expand Down
1 change: 1 addition & 0 deletions popmon/decorators/spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


from histogrammar.dfinterface.make_histograms import make_histograms

from popmon.pipeline.metrics import df_stability_metrics
from popmon.pipeline.report import df_stability_report

Expand Down
13 changes: 7 additions & 6 deletions popmon/hist/hist_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@
import pandas as pd

from ..base import Module
from ..hist.hist_utils import is_timestamp, split_hist_along_first_dimension, get_histogram
from ..hist.hist_utils import (
get_histogram,
is_timestamp,
split_hist_along_first_dimension,
)


class HistSplitter(Module):
Expand Down Expand Up @@ -87,8 +91,7 @@ def update_divided(self, divided, split, yname):
divided.update(split)
else:
divided[yname] = [
{self.index_col: k, self.hist_col: h}
for k, h in split.items()
{self.index_col: k, self.hist_col: h} for k, h in split.items()
]
return divided

Expand Down Expand Up @@ -122,9 +125,7 @@ def transform(self, datastore):

xname, yname = cols[0], ":".join(cols[1:]) # 'time:x:y' -> 'time', 'x:y'
if yname in divided:
self.logger.debug(
f'Histogram "{yname}" already divided; skipping.'
)
self.logger.debug(f'Histogram "{yname}" already divided; skipping.')
continue

# if requested split selected histograms along first axis. e.g. time:x:y is split along time
Expand Down
15 changes: 5 additions & 10 deletions popmon/hist/hist_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import histogrammar
import numpy as np
import pandas as pd
import histogrammar
from histogrammar.util import get_hist_props


COMMON_HIST_TYPES = (
histogrammar.Categorize,
histogrammar.Bin,
Expand Down Expand Up @@ -202,20 +201,18 @@ def get_histogram(hist_obj):
elif isinstance(hist_obj, dict):
hist = HG_FACTORY.fromJson(hist_obj)
if hist is None:
raise ValueError(
"Please provide histogram object as input."
)
raise ValueError("Please provide histogram object as input.")
return hist


def is_timestamp(hist):
props = get_hist_props(hist)
return props['is_ts']
return props["is_ts"]


def is_numeric(hist):
props = get_hist_props(hist)
return props['is_num']
return props["is_num"]


def sparse_bin_centers_x(hist):
Expand All @@ -225,9 +222,7 @@ def sparse_bin_centers_x(hist):
# number of bins is set to 1.
centers = np.array([hist.origin + 0.5 * hist.binWidth])
else:
centers = np.array(
[hist.origin + (i + 0.5) * hist.binWidth for i in keys]
)
centers = np.array([hist.origin + (i + 0.5) * hist.binWidth for i in keys])

values = [hist.bins[key] for key in keys]
return centers, values
Expand Down
6 changes: 5 additions & 1 deletion popmon/pipeline/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@
import logging

import pandas as pd
from histogrammar.dfinterface.make_histograms import (
get_bin_specs,
get_time_axes,
make_histograms,
)

from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms
from ..pipeline.metrics_pipelines import (
metrics_expanding_reference,
metrics_external_reference,
Expand Down
6 changes: 5 additions & 1 deletion popmon/pipeline/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@
import logging

import pandas as pd
from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms
from histogrammar.dfinterface.make_histograms import (
get_bin_specs,
get_time_axes,
make_histograms,
)

from ..base import Module
from ..config import config
Expand Down
8 changes: 5 additions & 3 deletions popmon/stats/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def quantile(a, q, weights=None, axis=None, keepdims: bool = False):

# Reshape into a 2D-array, with the first axis the dimensions
# that are not reduced, and the second the dimensions that are reduced
shape = (-1, np.prod(a_moved.shape[-len(axis):]))
shape = (-1, np.prod(a_moved.shape[-len(axis) :]))
a_shaped = a_moved.reshape(shape)

w = np.moveaxis(weights, source=axis, destination=destination).reshape(shape)
Expand Down Expand Up @@ -366,7 +366,8 @@ def probability_distribution_mean_covariance(entries_list):
# Normalize the histograms along the bin axis, so that histograms with different number of entries
# are still comparable
normed_list = entries_list / (
np.sum(entries_list, axis=1, dtype=np.float)[:, np.newaxis] + np.finfo(np.float).eps
np.sum(entries_list, axis=1, dtype=np.float)[:, np.newaxis]
+ np.finfo(np.float).eps
)

# Determine the mean histogram (unbiased)
Expand All @@ -380,7 +381,8 @@ def probability_distribution_mean_covariance(entries_list):
# Determine the unbiased covariance matrices between bins for all the histograms.
# note: use one degree of freedom less because we're using the evaluated mean as input
norm_hist_cov = (
sum2_cross_entries - norm_hist_mean[:, np.newaxis] * norm_hist_mean[np.newaxis, :]
sum2_cross_entries
- norm_hist_mean[:, np.newaxis] * norm_hist_mean[np.newaxis, :]
) * (n_histos / (n_histos - 1))

return norm_hist_mean, norm_hist_cov
Expand Down
10 changes: 8 additions & 2 deletions popmon/stitching/hist_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,10 @@ def stitch_histograms(

# 1. if there are no basis hists starting with "time_axis:", assume that this is the very first batch.
if (
len(features_basis) == 0 and time_axis and len(hists_basis) > 0 and time_axis
len(features_basis) == 0
and time_axis
and len(hists_basis) > 0
and time_axis
):
if time_bin_idx is None:
self.logger.info(
Expand Down Expand Up @@ -204,7 +207,10 @@ def stitch_histograms(
list(delta_keys)
) # delta keys that start with time_axis
if (
len(features_basis) > 0 and len(features_delta) == 0 and len(delta_keys) > 0 and time_axis
len(features_basis) > 0
and len(features_delta) == 0
and len(delta_keys) > 0
and time_axis
):
if time_bin_idx is None or len(time_bin_idx) == 0:
time_bin_idx = self._generate_time_bin_idx(
Expand Down
2 changes: 1 addition & 1 deletion popmon/visualization/histogram_section.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import multiprocessing

import pandas as pd
from histogrammar.util import get_hist_props
from joblib import Parallel, delayed
from tqdm import tqdm

Expand All @@ -31,7 +32,6 @@
)
from ..base import Module
from ..config import get_stat_description
from histogrammar.util import get_hist_props
from ..visualization.utils import plot_overlay_1d_histogram_b64


Expand Down
4 changes: 1 addition & 3 deletions tests/popmon/analysis/profiling/test_hist_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@ def test_profile_hist1d():
for i in range(split_len):
h = hg.Bin(num_bins, 0, 1, lambda x: x)
h.fill.numpy(np.random.uniform(0, 1, num_entries))
split.append(
{"date": pd.Timestamp("2019 - 1 - 1"), hist_name: h}
)
split.append({"date": pd.Timestamp("2019 - 1 - 1"), hist_name: h})

hp = HistProfiler(
read_key="dummy_input",
Expand Down
10 changes: 7 additions & 3 deletions tests/popmon/analysis/test_hist_numpy.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import histogrammar as hg
import numpy as np
import pandas as pd
import pytest
Expand All @@ -13,7 +14,6 @@
prepare_2dgrid,
set_2dgrid,
)
import histogrammar as hg


def to_ns(x):
Expand Down Expand Up @@ -271,8 +271,12 @@ def test_get_consistent_numpy_1dhists():
hist1.fill.numpy(df1)
hist2.fill.numpy(df2)

nphist1, nphist2 = get_consistent_numpy_1dhists([hist1, hist2], get_bin_labels=False)
nphist_list, centers = get_consistent_numpy_1dhists([hist1, hist2], get_bin_labels=True)
nphist1, nphist2 = get_consistent_numpy_1dhists(
[hist1, hist2], get_bin_labels=False
)
nphist_list, centers = get_consistent_numpy_1dhists(
[hist1, hist2], get_bin_labels=True
)

entries1 = [1.0, 4.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0]
entries2 = [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0]
Expand Down
8 changes: 4 additions & 4 deletions tests/popmon/hist/test_histogram.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import histogrammar as hg
import numpy as np
import pandas as pd

from popmon.hist.hist_utils import (
is_numeric,
is_timestamp,
project_on_x,
split_hist_along_first_dimension,
project_split2dhist_on_axis,
sparse_bin_centers_x,
split_hist_along_first_dimension,
sum_entries,
sum_over_x,
is_numeric,
is_timestamp,
)
import histogrammar as hg


def get_test_data():
Expand Down
8 changes: 2 additions & 6 deletions tests/popmon/pipeline/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,9 @@ def test_df_stability_report_self():
bin_specs = get_bin_specs(hists)

assert pd.Timedelta(time_width).value == bin_specs["date:eyeColor"][0]["binWidth"]
assert (
pd.Timestamp(time_offset).value == bin_specs["date:eyeColor"][0]["origin"]
)
assert pd.Timestamp(time_offset).value == bin_specs["date:eyeColor"][0]["origin"]
assert pd.Timedelta(time_width).value == bin_specs["date:latitude"][0]["binWidth"]
assert (
pd.Timestamp(time_offset).value == bin_specs["date:latitude"][0]["origin"]
)
assert pd.Timestamp(time_offset).value == bin_specs["date:latitude"][0]["origin"]


def test_df_stability_report_external():
Expand Down

0 comments on commit 1bbaac1

Please sign in to comment.