From 02189f2f2ca6718cc853f295008ba9630bbb4446 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Mar 2021 15:18:51 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- popmon/__init__.py | 9 +++++++-- popmon/analysis/comparison/hist_comparer.py | 2 +- popmon/analysis/functions.py | 5 +++-- popmon/analysis/hist_numpy.py | 4 ++-- popmon/analysis/profiling/hist_profiler.py | 6 ++---- popmon/decorators/pandas.py | 2 +- popmon/decorators/spark.py | 1 + popmon/hist/hist_splitter.py | 13 +++++++------ popmon/hist/hist_utils.py | 15 +++++---------- popmon/pipeline/metrics.py | 6 +++++- popmon/pipeline/report.py | 6 +++++- popmon/stats/numpy.py | 8 +++++--- popmon/stitching/hist_stitcher.py | 10 ++++++++-- popmon/visualization/histogram_section.py | 2 +- .../analysis/profiling/test_hist_profiler.py | 4 +--- tests/popmon/analysis/test_hist_numpy.py | 10 +++++++--- tests/popmon/hist/test_histogram.py | 8 ++++---- tests/popmon/pipeline/test_report.py | 8 ++------ 18 files changed, 67 insertions(+), 52 deletions(-) diff --git a/popmon/__init__.py b/popmon/__init__.py index afa85ade..6427670d 100644 --- a/popmon/__init__.py +++ b/popmon/__init__.py @@ -18,12 +18,17 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# histogram and report functions +from histogrammar.dfinterface.make_histograms import ( + get_bin_specs, + get_time_axes, + make_histograms, +) + # flake8: noqa # pandas/spark dataframe decorators from popmon import decorators -# histogram and report functions -from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms from .pipeline.metrics import df_stability_metrics, stability_metrics from .pipeline.report import df_stability_report, stability_report from .stitching import stitch_histograms diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 2b160ebb..db005a64 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -39,8 +39,8 @@ get_consistent_numpy_entries, ) from ...base import Pipeline -from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2 from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric +from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2 def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py index aa09499f..131140e5 100644 --- a/popmon/analysis/functions.py +++ b/popmon/analysis/functions.py @@ -24,13 +24,13 @@ from scipy import linalg, stats from scipy.stats import linregress, norm -from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric from ..analysis.hist_numpy import ( check_similar_hists, get_consistent_numpy_2dgrids, get_consistent_numpy_entries, set_2dgrid, ) +from ..hist.hist_utils import COMMON_HIST_TYPES, is_numeric from ..stats.numpy import probability_distribution_mean_covariance @@ -502,7 +502,8 @@ def relative_chi_squared( single_norm, _ = probability_distribution_mean_covariance([entries]) if ( - np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps + np.linalg.cond(cov) < 0.1 / np.finfo(cov.dtype).eps + and np.abs(np.linalg.det(cov)) > np.finfo(cov.dtype).eps ): # check if covariance matrix is invertible # see: https://stackoverflow.com/questions/13249108/efficient-pythonic-check-for-singular-matrix diff --git a/popmon/analysis/hist_numpy.py b/popmon/analysis/hist_numpy.py index bb76f2b9..f3e88f29 100644 --- a/popmon/analysis/hist_numpy.py +++ b/popmon/analysis/hist_numpy.py @@ -20,10 +20,10 @@ import warnings -import numpy as np - import histogrammar +import numpy as np from histogrammar.util import get_hist_props + from ..hist.hist_utils import is_numeric used_hist_types = (histogrammar.Bin, histogrammar.SparselyBin, histogrammar.Categorize) diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index 9d9a40e9..86cbc3eb 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -26,7 +26,7 @@ from ...analysis.hist_numpy import get_2dgrid from ...base import Module -from ...hist.hist_utils import sum_entries, is_numeric, is_timestamp, get_bin_centers +from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp, sum_entries DEFAULT_STATS = { "mean": pm_np.mean, @@ -115,9 +115,7 @@ def _profile_1d_histogram(self, name, hist): profile = dict() profile["filled"] = bin_counts.sum() profile["nan"] = hist.nanflow.entries if hasattr(hist, "nanflow") else 0 - profile["overflow"] = ( - hist.overflow.entries if hasattr(hist, "overflow") else 0 - ) + profile["overflow"] = hist.overflow.entries if hasattr(hist, "overflow") else 0 profile["underflow"] = ( hist.underflow.entries if hasattr(hist, "underflow") else 0 ) diff --git a/popmon/decorators/pandas.py b/popmon/decorators/pandas.py index e25feabb..dc88ed30 100644 --- a/popmon/decorators/pandas.py +++ b/popmon/decorators/pandas.py @@ -18,8 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from pandas import DataFrame from histogrammar.dfinterface.make_histograms import make_histograms +from pandas import DataFrame from ..pipeline.metrics import df_stability_metrics from ..pipeline.report import df_stability_report diff --git a/popmon/decorators/spark.py b/popmon/decorators/spark.py index d9e8533b..52a9cd8c 100644 --- a/popmon/decorators/spark.py +++ b/popmon/decorators/spark.py @@ -19,6 +19,7 @@ from histogrammar.dfinterface.make_histograms import make_histograms + from popmon.pipeline.metrics import df_stability_metrics from popmon.pipeline.report import df_stability_report diff --git a/popmon/hist/hist_splitter.py b/popmon/hist/hist_splitter.py index b2a8bcac..663eb305 100644 --- a/popmon/hist/hist_splitter.py +++ b/popmon/hist/hist_splitter.py @@ -21,7 +21,11 @@ import pandas as pd from ..base import Module -from ..hist.hist_utils import is_timestamp, split_hist_along_first_dimension, get_histogram +from ..hist.hist_utils import ( + get_histogram, + is_timestamp, + split_hist_along_first_dimension, +) class HistSplitter(Module): @@ -87,8 +91,7 @@ def update_divided(self, divided, split, yname): divided.update(split) else: divided[yname] = [ - {self.index_col: k, self.hist_col: h} - for k, h in split.items() + {self.index_col: k, self.hist_col: h} for k, h in split.items() ] return divided @@ -122,9 +125,7 @@ def transform(self, datastore): xname, yname = cols[0], ":".join(cols[1:]) # 'time:x:y' -> 'time', 'x:y' if yname in divided: - self.logger.debug( - f'Histogram "{yname}" already divided; skipping.' - ) + self.logger.debug(f'Histogram "{yname}" already divided; skipping.') continue # if requested split selected histograms along first axis. e.g. time:x:y is split along time diff --git a/popmon/hist/hist_utils.py b/popmon/hist/hist_utils.py index daf95a6b..2ab97ccf 100644 --- a/popmon/hist/hist_utils.py +++ b/popmon/hist/hist_utils.py @@ -18,12 +18,11 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import histogrammar import numpy as np import pandas as pd -import histogrammar from histogrammar.util import get_hist_props - COMMON_HIST_TYPES = ( histogrammar.Categorize, histogrammar.Bin, @@ -202,20 +201,18 @@ def get_histogram(hist_obj): elif isinstance(hist_obj, dict): hist = HG_FACTORY.fromJson(hist_obj) if hist is None: - raise ValueError( - "Please provide histogram object as input." - ) + raise ValueError("Please provide histogram object as input.") return hist def is_timestamp(hist): props = get_hist_props(hist) - return props['is_ts'] + return props["is_ts"] def is_numeric(hist): props = get_hist_props(hist) - return props['is_num'] + return props["is_num"] def sparse_bin_centers_x(hist): @@ -225,9 +222,7 @@ def sparse_bin_centers_x(hist): # number of bins is set to 1. centers = np.array([hist.origin + 0.5 * hist.binWidth]) else: - centers = np.array( - [hist.origin + (i + 0.5) * hist.binWidth for i in keys] - ) + centers = np.array([hist.origin + (i + 0.5) * hist.binWidth for i in keys]) values = [hist.bins[key] for key in keys] return centers, values diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py index a5a63194..61590864 100644 --- a/popmon/pipeline/metrics.py +++ b/popmon/pipeline/metrics.py @@ -21,8 +21,12 @@ import logging import pandas as pd +from histogrammar.dfinterface.make_histograms import ( + get_bin_specs, + get_time_axes, + make_histograms, +) -from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms from ..pipeline.metrics_pipelines import ( metrics_expanding_reference, metrics_external_reference, diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index 8d762a84..4d4e16f2 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -21,7 +21,11 @@ import logging import pandas as pd -from histogrammar.dfinterface.make_histograms import get_bin_specs, get_time_axes, make_histograms +from histogrammar.dfinterface.make_histograms import ( + get_bin_specs, + get_time_axes, + make_histograms, +) from ..base import Module from ..config import config diff --git a/popmon/stats/numpy.py b/popmon/stats/numpy.py index 1b4ac106..b7ad7cb8 100644 --- a/popmon/stats/numpy.py +++ b/popmon/stats/numpy.py @@ -176,7 +176,7 @@ def quantile(a, q, weights=None, axis=None, keepdims: bool = False): # Reshape into a 2D-array, with the first axis the dimensions # that are not reduced, and the second the dimensions that are reduced - shape = (-1, np.prod(a_moved.shape[-len(axis):])) + shape = (-1, np.prod(a_moved.shape[-len(axis) :])) a_shaped = a_moved.reshape(shape) w = np.moveaxis(weights, source=axis, destination=destination).reshape(shape) @@ -366,7 +366,8 @@ def probability_distribution_mean_covariance(entries_list): # Normalize the histograms along the bin axis, so that histograms with different number of entries # are still comparable normed_list = entries_list / ( - np.sum(entries_list, axis=1, dtype=np.float)[:, np.newaxis] + np.finfo(np.float).eps + np.sum(entries_list, axis=1, dtype=np.float)[:, np.newaxis] + + np.finfo(np.float).eps ) # Determine the mean histogram (unbiased) @@ -380,7 +381,8 @@ def probability_distribution_mean_covariance(entries_list): # Determine the unbiased covariance matrices between bins for all the histograms. # note: use one degree of freedom less because of we're using the evaluated mean as input norm_hist_cov = ( - sum2_cross_entries - norm_hist_mean[:, np.newaxis] * norm_hist_mean[np.newaxis, :] + sum2_cross_entries + - norm_hist_mean[:, np.newaxis] * norm_hist_mean[np.newaxis, :] ) * (n_histos / (n_histos - 1)) return norm_hist_mean, norm_hist_cov diff --git a/popmon/stitching/hist_stitcher.py b/popmon/stitching/hist_stitcher.py index 01d23b32..77d88d11 100644 --- a/popmon/stitching/hist_stitcher.py +++ b/popmon/stitching/hist_stitcher.py @@ -166,7 +166,10 @@ def stitch_histograms( # 1. if there are no basis hists starting with "time_axis:", assume that this the very first batch. if ( - len(features_basis) == 0 and time_axis and len(hists_basis) > 0 and time_axis + len(features_basis) == 0 + and time_axis + and len(hists_basis) > 0 + and time_axis ): if time_bin_idx is None: self.logger.info( @@ -204,7 +207,10 @@ def stitch_histograms( list(delta_keys) ) # delta keys that start with time_axis if ( - len(features_basis) > 0 and len(features_delta) == 0 and len(delta_keys) > 0 and time_axis + len(features_basis) > 0 + and len(features_delta) == 0 + and len(delta_keys) > 0 + and time_axis ): if time_bin_idx is None or len(time_bin_idx) == 0: time_bin_idx = self._generate_time_bin_idx( diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index 3e34f94c..5d5e60ef 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -21,6 +21,7 @@ import multiprocessing import pandas as pd +from histogrammar.util import get_hist_props from joblib import Parallel, delayed from tqdm import tqdm @@ -31,7 +32,6 @@ ) from ..base import Module from ..config import get_stat_description -from histogrammar.util import get_hist_props from ..visualization.utils import plot_overlay_1d_histogram_b64 diff --git a/tests/popmon/analysis/profiling/test_hist_profiler.py b/tests/popmon/analysis/profiling/test_hist_profiler.py index bdf32eb2..3e3f6c0f 100644 --- a/tests/popmon/analysis/profiling/test_hist_profiler.py +++ b/tests/popmon/analysis/profiling/test_hist_profiler.py @@ -17,9 +17,7 @@ def test_profile_hist1d(): for i in range(split_len): h = hg.Bin(num_bins, 0, 1, lambda x: x) h.fill.numpy(np.random.uniform(0, 1, num_entries)) - split.append( - {"date": pd.Timestamp("2019 - 1 - 1"), hist_name: h} - ) + split.append({"date": pd.Timestamp("2019 - 1 - 1"), hist_name: h}) hp = HistProfiler( read_key="dummy_input", diff --git a/tests/popmon/analysis/test_hist_numpy.py b/tests/popmon/analysis/test_hist_numpy.py index 739a705c..ba929151 100644 --- a/tests/popmon/analysis/test_hist_numpy.py +++ b/tests/popmon/analysis/test_hist_numpy.py @@ -1,3 +1,4 @@ +import histogrammar as hg import numpy as np import pandas as pd import pytest @@ -13,7 +14,6 @@ prepare_2dgrid, set_2dgrid, ) -import histogrammar as hg def to_ns(x): @@ -271,8 +271,12 @@ def test_get_consistent_numpy_1dhists(): hist1.fill.numpy(df1) hist2.fill.numpy(df2) - nphist1, nphist2 = get_consistent_numpy_1dhists([hist1, hist2], get_bin_labels=False) - nphist_list, centers = get_consistent_numpy_1dhists([hist1, hist2], get_bin_labels=True) + nphist1, nphist2 = get_consistent_numpy_1dhists( + [hist1, hist2], get_bin_labels=False + ) + nphist_list, centers = get_consistent_numpy_1dhists( + [hist1, hist2], get_bin_labels=True + ) entries1 = [1.0, 4.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0] entries2 = [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0] diff --git a/tests/popmon/hist/test_histogram.py b/tests/popmon/hist/test_histogram.py index 637e27e8..213b4f8d 100644 --- a/tests/popmon/hist/test_histogram.py +++ b/tests/popmon/hist/test_histogram.py @@ -1,17 +1,17 @@ +import histogrammar as hg import numpy as np import pandas as pd from popmon.hist.hist_utils import ( + is_numeric, + is_timestamp, project_on_x, - split_hist_along_first_dimension, project_split2dhist_on_axis, sparse_bin_centers_x, + split_hist_along_first_dimension, sum_entries, sum_over_x, - is_numeric, - is_timestamp, ) -import histogrammar as hg def get_test_data(): diff --git a/tests/popmon/pipeline/test_report.py b/tests/popmon/pipeline/test_report.py index c3fe3874..a2efe8d8 100644 --- a/tests/popmon/pipeline/test_report.py +++ b/tests/popmon/pipeline/test_report.py @@ -75,13 +75,9 @@ def test_df_stability_report_self(): bin_specs = get_bin_specs(hists) assert pd.Timedelta(time_width).value == bin_specs["date:eyeColor"][0]["binWidth"] - assert ( - pd.Timestamp(time_offset).value == bin_specs["date:eyeColor"][0]["origin"] - ) + assert pd.Timestamp(time_offset).value == bin_specs["date:eyeColor"][0]["origin"] assert pd.Timedelta(time_width).value == bin_specs["date:latitude"][0]["binWidth"] - assert ( - pd.Timestamp(time_offset).value == bin_specs["date:latitude"][0]["origin"] - ) + assert pd.Timestamp(time_offset).value == bin_specs["date:latitude"][0]["origin"] def test_df_stability_report_external():