diff --git a/src/pytesmo/temporal_matching.py b/src/pytesmo/temporal_matching.py index 9347eb6b..aa255230 100644 --- a/src/pytesmo/temporal_matching.py +++ b/src/pytesmo/temporal_matching.py @@ -30,8 +30,9 @@ def df_match(reference, *args, **kwds): dropduplicates : boolean Drop duplicated temporal matched (default: False) asym_window: string, optional - ``<=`` stands for using a smaller and equal only for the left/smaller side of the window comparison - ``>=`` stands for using a larger and equal only for the right/larger side of the window comparison + ``<=`` stands for using a smaller and equal only for the left/smaller + side of the window comparison, ``>=`` stands for using a larger and + equal only for the right/larger side of the window comparison. The default is to use <= and >= for both sides of the search window Returns @@ -99,14 +100,18 @@ def df_match(reference, *args, **kwds): if asym_window == "<=": # this means that only distance in the interval [distance[ are # taken - valid_dist = ((arg_matched['distance'] >= 0.0) & (arg_matched['distance'] <= window)) | ( - (arg_matched['distance'] <= 0.0) & (arg_matched['distance'] > -window)) + valid_dist = (((arg_matched['distance'] >= 0.0) + & (arg_matched['distance'] <= window)) + | ((arg_matched['distance'] <= 0.0) + & (arg_matched['distance'] > -window))) invalid_dist = ~valid_dist if asym_window == ">=": # this means that only distance in the interval ]distance] are # taken - valid_dist = ((arg_matched['distance'] >= 0.0) & (arg_matched['distance'] < window)) | ( - (arg_matched['distance'] <= 0.0) & (arg_matched['distance'] >= -window)) + valid_dist = (((arg_matched['distance'] >= 0.0) + & (arg_matched['distance'] < window)) + | ((arg_matched['distance'] <= 0.0) + & (arg_matched['distance'] >= -window))) invalid_dist = ~valid_dist arg_matched.loc[invalid_dist] = np.nan @@ -147,8 +152,8 @@ def matching(reference, *args, **kwargs): Returns ------- temporal_match : pandas.DataFrame - containing the index of the reference Series and a column for each of the - other input Series + containing the index of the reference Series and a column for each of + the other input Series """ warnings.warn( "'pytesmo.temporal_matching.matching' is deprecated. Use" @@ -172,8 +177,8 @@ def matching(reference, *args, **kwargs): def temporal_collocation(reference, other, window, method="nearest", return_index=False, return_distance=False, - dropduplicates=False, dropna=False, flag=None, - use_invalid=False): + dropduplicates=False, dropna=False, checkna=False, + flag=None, use_invalid=False): """ Temporally collocates values to reference. @@ -213,7 +218,11 @@ def temporal_collocation(reference, other, window, method="nearest", dropna : bool, optional Whether to drop NaNs from the resulting dataframe (arising for example from duplicates with ``duplicates_nan=True`` or from missing values). - Default is ``False`` + Default is ``False``. + checkna: bool, optional + Whether to check if only NaNs are returned (i.e. no match has been + found). If set to ``True``, raises a ``UserWarning`` in case no match + has been found. Default is ``False``. flag : np.ndarray, str or None, optional Flag column as array or name of the flag column in `other`. If this is given, the column will be interpreted as validity indicator. Any @@ -256,7 +265,6 @@ def temporal_collocation(reference, other, window, method="nearest", else: has_invalid = False - # preprocessing # ------------ if ref_dr.tz is None: @@ -303,6 +311,9 @@ def collocate(df): # postprocessing # -------------- + if checkna: + if np.any(collocated.isnull().apply(np.all)): + warnings.warn("No match has been found") if dropna: collocated.dropna(inplace=True) diff --git a/tests/test_temporal_matching.py b/tests/test_temporal_matching.py index e5767ac0..9e5b6105 100644 --- a/tests/test_temporal_matching.py +++ b/tests/test_temporal_matching.py @@ -26,10 +26,10 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -''' +""" Tests for the temporal matching module Created on Wed Jul 8 19:37:14 2015 -''' +""" from copy import deepcopy @@ -48,18 +48,25 @@ def test_df_match_borders(): See issue #51 """ - ref_df = pd.DataFrame({"data": np.arange(5)}, index=pd.date_range(datetime(2007, 1, 1, 0), - "2007-01-05", freq="D")) - match_df = pd.DataFrame({"matched_data": np.arange(5)}, - index=[datetime(2007, 1, 1, 9), - datetime(2007, 1, 2, 9), - datetime(2007, 1, 3, 9), - datetime(2007, 1, 4, 9), - datetime(2007, 1, 5, 9)]) + ref_df = pd.DataFrame( + {"data": np.arange(5)}, + index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"), + ) + match_df = pd.DataFrame( + {"matched_data": np.arange(5)}, + index=[ + datetime(2007, 1, 1, 9), + datetime(2007, 1, 2, 9), + datetime(2007, 1, 3, 9), + datetime(2007, 1, 4, 9), + datetime(2007, 1, 5, 9), + ], + ) matched = tmatching.df_match(ref_df, match_df) nptest.assert_allclose( - np.array([0.375, 0.375, 0.375, 0.375, 0.375]), matched.distance.values) + np.array([0.375, 0.375, 0.375, 0.375, 0.375]), matched.distance.values + ) nptest.assert_allclose(np.arange(5), matched.matched_data) @@ -68,32 +75,43 @@ def test_df_match_match_on_window_border(): test matching if a value lies exactly on the window border. """ - ref_df = pd.DataFrame({"data": np.arange(5)}, index=pd.date_range(datetime(2007, 1, 1, 0), - "2007-01-05", freq="D")) - match_df = pd.DataFrame({"matched_data": np.arange(4)}, - index=[datetime(2007, 1, 1, 9), - datetime(2007, 1, 2, 9), - datetime(2007, 1, 3, 12), - datetime(2007, 1, 5, 9)]) + ref_df = pd.DataFrame( + {"data": np.arange(5)}, + index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"), + ) + match_df = pd.DataFrame( + {"matched_data": np.arange(4)}, + index=[ + datetime(2007, 1, 1, 9), + datetime(2007, 1, 2, 9), + datetime(2007, 1, 3, 12), + datetime(2007, 1, 5, 9), + ], + ) matched = tmatching.df_match(ref_df, match_df, window=0.5) nptest.assert_allclose( - np.array([0.375, 0.375, 0.5, -0.5, 0.375]), matched.distance.values) + np.array([0.375, 0.375, 0.5, -0.5, 0.375]), matched.distance.values + ) nptest.assert_allclose([0, 1, 2, 2, 3], matched.matched_data) # test asym_window keyword matched = tmatching.df_match( - ref_df, match_df, window=0.5, asym_window="<=") + ref_df, match_df, window=0.5, asym_window="<=" + ) nptest.assert_allclose( - np.array([0.375, 0.375, 0.5, np.nan, 0.375]), matched.distance.values) + np.array([0.375, 0.375, 0.5, np.nan, 0.375]), matched.distance.values + ) nptest.assert_allclose([0, 1, 2, np.nan, 3], matched.matched_data) matched = tmatching.df_match( - ref_df, match_df, window=0.5, asym_window=">=") + ref_df, match_df, window=0.5, asym_window=">=" + ) nptest.assert_allclose( - np.array([0.375, 0.375, np.nan, -0.5, 0.375]), matched.distance.values) + np.array([0.375, 0.375, np.nan, -0.5, 0.375]), matched.distance.values + ) nptest.assert_allclose([0, 1, np.nan, 2, 3], matched.matched_data) @@ -104,17 +122,24 @@ def test_df_match_borders_unequal_query_points(): See issue #51 """ - ref_df = pd.DataFrame({"data": np.arange(5)}, index=pd.date_range(datetime(2007, 1, 1, 0), - "2007-01-05", freq="D")) - match_df = pd.DataFrame({"matched_data": np.arange(4)}, - index=[datetime(2007, 1, 1, 9), - datetime(2007, 1, 2, 9), - datetime(2007, 1, 4, 9), - datetime(2007, 1, 5, 9)]) + ref_df = pd.DataFrame( + {"data": np.arange(5)}, + index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"), + ) + match_df = pd.DataFrame( + {"matched_data": np.arange(4)}, + index=[ + datetime(2007, 1, 1, 9), + datetime(2007, 1, 2, 9), + datetime(2007, 1, 4, 9), + datetime(2007, 1, 5, 9), + ], + ) matched = tmatching.df_match(ref_df, match_df) nptest.assert_allclose( - np.array([0.375, 0.375, -0.625, 0.375, 0.375]), matched.distance.values) + np.array([0.375, 0.375, -0.625, 0.375, 0.375]), matched.distance.values + ) nptest.assert_allclose(np.array([0, 1, 1, 2, 3]), matched.matched_data) @@ -125,14 +150,20 @@ def test_matching(): data = np.arange(5.0) data[3] = np.nan - ref_df = pd.DataFrame({"data": data}, index=pd.date_range(datetime(2007, 1, 1, 0), - "2007-01-05", freq="D")) - match_df = pd.DataFrame({"matched_data": np.arange(5)}, - index=[datetime(2007, 1, 1, 9), - datetime(2007, 1, 2, 9), - datetime(2007, 1, 3, 9), - datetime(2007, 1, 4, 9), - datetime(2007, 1, 5, 9)]) + ref_df = pd.DataFrame( + {"data": data}, + index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"), + ) + match_df = pd.DataFrame( + {"matched_data": np.arange(5)}, + index=[ + datetime(2007, 1, 1, 9), + datetime(2007, 1, 2, 9), + datetime(2007, 1, 3, 9), + datetime(2007, 1, 4, 9), + datetime(2007, 1, 5, 9), + ], + ) matched = tmatching.matching(ref_df, match_df) nptest.assert_allclose(np.array([0, 1, 2, 4]), matched.matched_data) @@ -146,15 +177,21 @@ def test_matching_series(): data = np.arange(5.0) data[3] = np.nan - ref_ser = pd.Series(data, index=pd.date_range(datetime(2007, 1, 1, 0), - "2007-01-05", freq="D")) - match_ser = pd.Series(np.arange(5), - index=[datetime(2007, 1, 1, 9), - datetime(2007, 1, 2, 9), - datetime(2007, 1, 3, 9), - datetime(2007, 1, 4, 9), - datetime(2007, 1, 5, 9)], - name='matched_data') + ref_ser = pd.Series( + data, + index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"), + ) + match_ser = pd.Series( + np.arange(5), + index=[ + datetime(2007, 1, 1, 9), + datetime(2007, 1, 2, 9), + datetime(2007, 1, 3, 9), + datetime(2007, 1, 4, 9), + datetime(2007, 1, 5, 9), + ], + name="matched_data", + ) matched = tmatching.matching(ref_ser, match_ser) @@ -195,13 +232,13 @@ def test_data(): indicating where NaNs are expected (i.e. no matching was taking place) """ # the reference date range - ref_dr = pd.date_range('1970', '2020', freq='D', tz="UTC") + ref_dr = pd.date_range("1970", "2020", freq="D", tz="UTC") test_dr = {} test_dr["shifted_3"] = ref_dr + pd.Timedelta(3, "H") test_dr["shifted_7"] = ref_dr + pd.Timedelta(7, "H") - test_dr["shifted_3_asia"] = ( - test_dr["shifted_3"].tz_convert("Asia/Yekaterinburg") + test_dr["shifted_3_asia"] = test_dr["shifted_3"].tz_convert( + "Asia/Yekaterinburg" ) test_dr["shifted_7_us"] = test_dr["shifted_7"].tz_convert("US/Eastern") @@ -212,7 +249,7 @@ def test_data(): test_dr["random_shift"] = dr_random_shift # missing data - drop_mask = np.zeros(len(ref_dr), dtype=np.bool) + drop_mask = np.zeros(len(ref_dr), dtype=bool) drop_mask[100:200] = True dr_random_shift = dr_random_shift[~drop_mask] test_dr["missing"] = dr_random_shift @@ -220,20 +257,24 @@ def test_data(): # with duplicates test_dr["duplicates"] = deepcopy(test_dr["shifted_3"]) - duplicates_mask = np.zeros(len(ref_dr), dtype=np.bool) - for idx in np.random.randint(0, len(test_dr["duplicates"])-1, 5): - test_dr["duplicates"].values[idx] = test_dr["duplicates"].values[idx+1] + duplicates_mask = np.zeros(len(ref_dr), dtype=bool) + for idx in np.random.randint(0, len(test_dr["duplicates"]) - 1, 5): + test_dr["duplicates"].values[idx] = test_dr["duplicates"].values[ + idx + 1 + ] duplicates_mask[idx] = True # setting up dataframes test_frames = { - key: pd.DataFrame(np.random.randn(len(test_dr[key]), 3), - index=test_dr[key]) for key in test_dr + key: pd.DataFrame( + np.random.randn(len(test_dr[key]), 3), index=test_dr[key] + ) + for key in test_dr } ref_frame = pd.DataFrame(np.random.randn(len(ref_dr), 3), index=ref_dr) # mask for where we expect nans in the output - all_nan = np.ones(len(ref_dr), dtype=np.bool) + all_nan = np.ones(len(ref_dr), dtype=bool) expected_nan = { "shifted_3": ~all_nan, "shifted_7": all_nan, @@ -250,9 +291,11 @@ def setup_data(data, key): """Returns only relevant data of test_data for given key""" return data[0], data[1][key], data[2][key] + def compare_with_nan(a, b): return (a == b) | (np.isnan(a) & np.isnan(b)) + def assert_equal_except_nan(res, ref, nan_mask, index_shifted=False): expected_nan_idx = nan_mask.nonzero()[0] expected_nonan_idx = (~nan_mask).nonzero()[0] @@ -269,8 +312,16 @@ def assert_equal_except_nan(res, ref, nan_mask, index_shifted=False): ) -@pytest.mark.parametrize("key", ["shifted_3", "shifted_7", "shifted_7_us", - "shifted_3_asia", "random_shift"]) +@pytest.mark.parametrize( + "key", + [ + "shifted_3", + "shifted_7", + "shifted_7_us", + "shifted_3_asia", + "random_shift", + ], +) def test_collocation_nearest_neighbour(test_data, key): ref_frame, test_frame, expected_nan = setup_data(test_data, key) res = tmatching.temporal_collocation( @@ -283,7 +334,9 @@ def test_collocation_nearest_neighbour(test_data, key): def test_collocation_missing_duplicates(test_data, key): ref_frame, test_frame, expected_nan = setup_data(test_data, key) res = tmatching.temporal_collocation( - ref_frame, test_frame, pd.Timedelta(6, "H"), + ref_frame, + test_frame, + pd.Timedelta(6, "H"), ) # indices of test_frame are shifted w.r.t expected_nan, therefore we can't # compare values @@ -294,7 +347,7 @@ def test_collocation_missing_duplicates(test_data, key): def test_collocation_window(test_data, key): ref_frame, test_frame, expected_nan = setup_data(test_data, key) res = tmatching.temporal_collocation( - ref_frame, test_frame, 6/24, dropduplicates=True + ref_frame, test_frame, 6 / 24, dropduplicates=True ) assert_equal_except_nan(res, test_frame, expected_nan, index_shifted=True) @@ -303,7 +356,7 @@ def test_collocation_window(test_data, key): def test_collocation_input(test_data, key): ref_frame, test_frame, expected_nan = setup_data(test_data, key) - no_timezone = pd.date_range('1970', '2020', freq='D') + no_timezone = pd.date_range("1970", "2020", freq="D") # test with series and index: for ref in [ref_frame[0], ref_frame.index, no_timezone]: res = tmatching.temporal_collocation( @@ -312,8 +365,16 @@ def test_collocation_input(test_data, key): assert_equal_except_nan(res, test_frame, expected_nan) -@pytest.mark.parametrize("key", ["shifted_3", "shifted_7", "shifted_7_us", - "shifted_3_asia", "random_shift"]) +@pytest.mark.parametrize( + "key", + [ + "shifted_3", + "shifted_7", + "shifted_7_us", + "shifted_3_asia", + "random_shift", + ], +) def test_collocation_dropna(test_data, key): ref_frame, test_frame, expected_nan = setup_data(test_data, key) res = tmatching.temporal_collocation( @@ -323,44 +384,57 @@ def test_collocation_dropna(test_data, key): assert np.all(test_frame.iloc[expected_nonan_idx, :].values == res.values) -@pytest.mark.parametrize("key", ["shifted_3", "shifted_7", "shifted_7_us", - "shifted_3_asia", "random_shift"]) +@pytest.mark.parametrize( + "key", + [ + "shifted_3", + "shifted_7", + "shifted_7_us", + "shifted_3_asia", + "random_shift", + ], +) def test_collocation_flag(test_data, key): ref_frame, test_frame, expected_nan = setup_data(test_data, key) flag = np.random.choice([True, False], len(ref_frame)) # with array res = tmatching.temporal_collocation( - ref_frame, test_frame, pd.Timedelta(6, "H"), flag=flag, + ref_frame, + test_frame, + pd.Timedelta(6, "H"), + flag=flag, ) compare_with_nan( - res.iloc[:, 0].values[~flag], - test_frame.iloc[:, 0].values[~flag] - ) + res.iloc[:, 0].values[~flag], test_frame.iloc[:, 0].values[~flag] + ) assert np.all(np.isnan(res.values[:, 0][flag])) # with array, using invalid as replacement res = tmatching.temporal_collocation( - ref_frame, test_frame, pd.Timedelta(6, "H"), flag=flag, - use_invalid=True + ref_frame, + test_frame, + pd.Timedelta(6, "H"), + flag=flag, + use_invalid=True, ) - compare_with_nan( - res.iloc[:, 0].values, - test_frame.iloc[:, 0].values - ) + compare_with_nan(res.iloc[:, 0].values, test_frame.iloc[:, 0].values) # with dataframe - test_frame['flag'] = flag + test_frame["flag"] = flag res = tmatching.temporal_collocation( - ref_frame, test_frame, pd.Timedelta(6, "H"), flag='flag', + ref_frame, + test_frame, + pd.Timedelta(6, "H"), + flag="flag", ) compare_with_nan( - res.iloc[:, 0].values[~flag], - test_frame.iloc[:, 0].values[~flag] - ) + res.iloc[:, 0].values[~flag], test_frame.iloc[:, 0].values[~flag] + ) assert np.all(np.isnan(res.iloc[:, 0].values[flag])) + # using only shifted_3, because comparison won't work when there are nans @pytest.mark.parametrize("key", ["shifted_3"]) def test_return_index(test_data, key): @@ -371,6 +445,7 @@ def test_return_index(test_data, key): assert_equal_except_nan(res, test_frame, expected_nan) assert np.all(test_frame.index.values == res["index_other"].values) + # using only shifted_3, because comparison won't work when there are nans @pytest.mark.parametrize("key", ["shifted_3", "shifted_7"]) def test_return_distance(test_data, key): @@ -383,3 +458,42 @@ def test_return_distance(test_data, key): assert np.all(res["distance_other"] == pd.Timedelta(3, "H")) if key == "shifted_7": assert np.all(np.isnan(res["distance_other"])) + + +def test_timezone_handling(): + # Issue #150 + data = np.arange(5.0) + data[3] = np.nan + + match_df = pd.DataFrame( + {"matched_data": data}, + index=pd.date_range( + datetime(2007, 1, 1, 0), "2007-01-05", freq="D", tz="UTC" + ), + ) + index = pd.DatetimeIndex([ + datetime(2007, 1, 1, 9), + datetime(2007, 1, 2, 9), + datetime(2007, 1, 3, 9), + datetime(2007, 1, 4, 9), + datetime(2007, 1, 5, 9), + ]).tz_localize("utc") + ref_df = pd.DataFrame({"data": np.arange(5)}, index=index) + matched = tmatching.temporal_collocation( + ref_df, match_df, pd.Timedelta(12, "H"), dropna=True, + ) + + nptest.assert_allclose(np.array([0, 1, 2, 4]), matched.matched_data) + assert len(matched) == 4 + + +def test_warning_on_no_match(test_data): + # Issue #152 + ref_frame, test_frame, expected_nan = setup_data(test_data, "shifted_7") + with pytest.warns(UserWarning): + tmatching.temporal_collocation( + ref_frame, test_frame, pd.Timedelta(6, "H"), checkna=True + ) + + +