From 608828ba96dd211817d36ce4101c2f9cc241d2ab Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 20 Apr 2023 08:52:45 -0400 Subject: [PATCH 1/9] Add failing test for empty site --- tests/test_ish.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_ish.py b/tests/test_ish.py index 525d0af0..e2976630 100644 --- a/tests/test_ish.py +++ b/tests/test_ish.py @@ -79,6 +79,13 @@ def test_ish_no_resample(): assert sum(col.endswith("_quality") for col in df.columns) == 8 +def test_ish_one_site_empty(): + dates = pd.date_range("2020-09-01", "2020-09-02") + site = "99816999999" + + ish.add_data(dates, site=site) + + def test_ish_resample(): dates = pd.date_range("2020-09-01", "2020-09-02") site = "72224400358" # "College Park AP" From 349e5a427045af1e4c225bc0a2fd09787525e8ff Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 20 Apr 2023 09:02:36 -0400 Subject: [PATCH 2/9] Decode bytes cols no fail if df empty one site case still fails in the resample --- monetio/obs/ish.py | 2 ++ tests/test_ish.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/monetio/obs/ish.py b/monetio/obs/ish.py index 22c26e86..751473ea 100644 --- a/monetio/obs/ish.py +++ b/monetio/obs/ish.py @@ -182,6 +182,8 @@ def _clean(frame): @staticmethod def _decode_bytes(df): + if df.empty: + return df bytes_cols = [col for col in df.columns if type(df[col][0]) == bytes] with pd.option_context("mode.chained_assignment", None): df.loc[:, bytes_cols] = df[bytes_cols].apply( diff --git a/tests/test_ish.py b/tests/test_ish.py index e2976630..01a46ee9 100644 --- a/tests/test_ish.py +++ b/tests/test_ish.py @@ -79,6 +79,13 @@ def test_ish_no_resample(): assert sum(col.endswith("_quality") for col in df.columns) == 8 +def test_ish_one_state_partially_empty(): + dates = pd.date_range("2020-09-01", "2020-09-02") + state = "DE" + + ish.add_data(dates, state=state) + + def test_ish_one_site_empty(): dates = pd.date_range("2020-09-01", "2020-09-02") site = "99816999999" From b2f103437c06036c292340d2075b5292010fb972 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 20 Apr 2023 09:09:52 -0400 Subject: [PATCH 3/9] Don't try to resample if empty one site test passes now --- monetio/obs/ish.py | 2 +- tests/test_ish.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/monetio/obs/ish.py b/monetio/obs/ish.py index 751473ea..6a62f6b2 100644 --- a/monetio/obs/ish.py +++ b/monetio/obs/ish.py @@ -352,7 +352,7 @@ def add_data( print(f"Aggregating {len(urls.name)} URLs...") self.df = self.aggregrate_files(urls, n_procs=n_procs) - if resample: + if resample and not self.df.empty: if verbose: print("Resampling to every " + window) self.df.index = self.df.time diff --git a/tests/test_ish.py b/tests/test_ish.py index 01a46ee9..15be93fe 100644 --- a/tests/test_ish.py +++ b/tests/test_ish.py @@ -81,9 +81,10 @@ def test_ish_no_resample(): def test_ish_one_state_partially_empty(): dates = pd.date_range("2020-09-01", "2020-09-02") + # TODO: one with fewer sites if possible? state = "DE" - ish.add_data(dates, state=state) + ish.add_data(dates, state=state, n_procs=2) def test_ish_one_site_empty(): From 78b37be2ed8ac4e0fb50df1247d99fff87e35d99 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 20 Apr 2023 09:21:44 -0400 Subject: [PATCH 4/9] Keep all site meta cols and clean up the output df a bit --- monetio/obs/ish.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/monetio/obs/ish.py b/monetio/obs/ish.py index 6a62f6b2..70c9ba2c 100644 --- a/monetio/obs/ish.py +++ b/monetio/obs/ish.py @@ -358,23 +358,10 @@ def add_data( self.df.index = self.df.time self.df = self.df.groupby("station_id").resample(window).mean().reset_index() - self.df = self.df.merge( - dfloc[ - [ - "station_id", - "latitude", - "longitude", - "station name", - "ctry", - "state", - "usaf", - "wban", - ] - ], - on=["station_id"], - how="left", + self.df = self.df.merge(dfloc, on="station_id", how="left") + self.df = self.df.rename(columns={"station_id": "siteid", "ctry": "country"}).drop( + columns=["fname"] ) - self.df = self.df.rename(columns={"station_id": "siteid", "ctry": "country"}) return self.df From 8cbfaba422cc6daaec1dbf3f83c9457e0884faac Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 3 May 2023 08:22:20 -0500 Subject: [PATCH 5/9] Ensure 1-d records array when constructing df seems like site(s) with one record load with genfromtxt as 0-d arrays (no len) --- monetio/obs/ish.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/obs/ish.py b/monetio/obs/ish.py index 70c9ba2c..4310a8d3 100644 --- a/monetio/obs/ish.py +++ b/monetio/obs/ish.py @@ -210,7 +210,7 @@ def read_data_frame(self, url_or_file): else: frame_as_array = np.genfromtxt(url_or_file, delimiter=self.WIDTHS, dtype=self.DTYPES) - frame = pd.DataFrame.from_records(frame_as_array) + frame = pd.DataFrame.from_records(np.atleast_1d(frame_as_array)) df = self._clean(frame) df.drop(["latitude", "longitude"], axis=1, inplace=True) # df.latitude = self.history.groupby('station_id').get_group( From 2738aaebe37d861e7f87c5d807929b3280f66496 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 8 May 2023 15:32:06 -0600 Subject: [PATCH 6/9] Update ISH history file dupe WBAN/USAF checks there were two dupe USAF when set this up, now there are three --- monetio/obs/ish.py | 4 ++-- monetio/obs/ish_lite.py | 4 ++-- tests/test_ish.py | 10 ++++++++-- tests/test_ish_lite.py | 10 ++++++++-- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/monetio/obs/ish.py b/monetio/obs/ish.py index 4310a8d3..9f515c9f 100644 --- a/monetio/obs/ish.py +++ b/monetio/obs/ish.py @@ -238,8 +238,8 @@ def read_ish_history(self, dates=None): The constructed 'station_id' column is a combination of the USAF and WBAN columns. This is done since USAF and WBAN alone are not unique in the history file. - For example, USAF 725244 and 722158 appear twice, as do - WBAN 24267, 41420, 23176, 13752, and 41231. + For example, USAF 720481, 722158, and 725244 appear twice, as do + WBAN 13752, 23176, 24267, 41231, and 41420. Additionally, there are many cases of unset (999999 for USAF or 99999 for WBAN), though more so for WBAN than USAF. However, combining USAF and WBAN does give a unique station ID. diff --git a/monetio/obs/ish_lite.py b/monetio/obs/ish_lite.py index 0f0b5cfe..94bc55b5 100644 --- a/monetio/obs/ish_lite.py +++ b/monetio/obs/ish_lite.py @@ -86,8 +86,8 @@ def read_ish_history(self, dates=None): The constructed 'station_id' column is a combination of the USAF and WBAN columns. This is done since USAF and WBAN alone are not unique in the history file. - For example, USAF 725244 and 722158 appear twice, as do - WBAN 24267, 41420, 23176, 13752, and 41231. + For example, USAF 720481, 722158, and 725244 appear twice, as do + WBAN 13752, 23176, 24267, 41231, and 41420. Additionally, there are many cases of unset (999999 for USAF or 99999 for WBAN), though more so for WBAN than USAF. However, combining USAF and WBAN does give a unique station ID. diff --git a/tests/test_ish.py b/tests/test_ish.py index 15be93fe..9d3c91d3 100644 --- a/tests/test_ish.py +++ b/tests/test_ish.py @@ -22,8 +22,14 @@ def test_ish_read_history(): assert (df[col].dt.hour == 0).all() assert df.station_id.nunique() == len(df), "unique ID for station" - assert (df.usaf.value_counts() == 2).sum() == 2 - assert (df.wban.value_counts() == 2).sum() == 5 + + # Ensure docstring info matches this + x = df.usaf.value_counts() + assert sorted(x[x == 2].index) == ["720481", "722158", "725244"] + assert x[x.index != "999999"].max() == 2 + x = df.wban.value_counts() + assert sorted(x[x == 2].index) == ["13752", "23176", "24267", "41231", "41420"] + assert x[x.index != "99999"].max() == 2 assert (df.usaf == "999999").sum() > 100 assert (df.wban == "99999").sum() > 10_000 diff --git a/tests/test_ish_lite.py b/tests/test_ish_lite.py index 5443629a..f93d34e5 100644 --- a/tests/test_ish_lite.py +++ b/tests/test_ish_lite.py @@ -19,8 +19,14 @@ def test_ish_read_history(): assert (df[col].dt.hour == 0).all() assert df.station_id.nunique() == len(df), "unique ID for station" - assert (df.usaf.value_counts() == 2).sum() == 2 - assert (df.wban.value_counts() == 2).sum() == 5 + + # Ensure docstring info matches this + x = df.usaf.value_counts() + assert sorted(x[x == 2].index) == ["720481", "722158", "725244"] + assert x[x.index != "999999"].max() == 2 + x = df.wban.value_counts() + assert sorted(x[x == 2].index) == ["13752", "23176", "24267", "41231", "41420"] + assert x[x.index != "99999"].max() == 2 assert (df.usaf == "999999").sum() > 100 assert (df.wban == "99999").sum() > 10_000 From f766a51627e5cb1bfeb33f815199a7f33b4d088f Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 8 May 2023 15:57:56 -0600 Subject: [PATCH 7/9] test emptiness --- tests/test_ish.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/test_ish.py b/tests/test_ish.py index 9d3c91d3..800b6c65 100644 --- a/tests/test_ish.py +++ b/tests/test_ish.py @@ -87,17 +87,28 @@ def test_ish_no_resample(): def test_ish_one_state_partially_empty(): dates = pd.date_range("2020-09-01", "2020-09-02") - # TODO: one with fewer sites if possible? state = "DE" - ish.add_data(dates, state=state, n_procs=2) + ish_ = ish.ISH() + ish_.dates = dates + ish_.read_ish_history() + meta = ish_.history + all_sites = sorted(meta.query("state == @state").station_id) # 8 + + df = ish.add_data(dates, state=state, n_procs=2) + assert len(df) >= 1 + sites = sorted(df.siteid.unique()) + assert set(all_sites) - set(sites) == { + "99816999999" + }, "one empty site not included in state results" def test_ish_one_site_empty(): dates = pd.date_range("2020-09-01", "2020-09-02") site = "99816999999" - ish.add_data(dates, site=site) + df = ish.add_data(dates, site=site) + assert df.empty def test_ish_resample(): From 8a3c4a1181b681794997861a407176e55839cb88 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 8 May 2023 16:05:12 -0600 Subject: [PATCH 8/9] notes --- tests/test_ish.py | 6 +++--- tests/test_ish_lite.py | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/test_ish.py b/tests/test_ish.py index 800b6c65..9fbeb3b4 100644 --- a/tests/test_ish.py +++ b/tests/test_ish.py @@ -93,19 +93,19 @@ def test_ish_one_state_partially_empty(): ish_.dates = dates ish_.read_ish_history() meta = ish_.history - all_sites = sorted(meta.query("state == @state").station_id) # 8 + all_sites = sorted(meta.query("state == @state").station_id) # 8 sites df = ish.add_data(dates, state=state, n_procs=2) assert len(df) >= 1 sites = sorted(df.siteid.unique()) assert set(all_sites) - set(sites) == { - "99816999999" + "99816999999" # "Delaware Reserve" }, "one empty site not included in state results" def test_ish_one_site_empty(): dates = pd.date_range("2020-09-01", "2020-09-02") - site = "99816999999" + site = "99816999999" # "Delaware Reserve" df = ish.add_data(dates, site=site) assert df.empty diff --git a/tests/test_ish_lite.py b/tests/test_ish_lite.py index f93d34e5..6667f03d 100644 --- a/tests/test_ish_lite.py +++ b/tests/test_ish_lite.py @@ -64,6 +64,14 @@ def test_ish_lite_one_site(): assert (df.temp < 100).all(), "temp in degC" +def test_ish_lite_one_site_empty(): + dates = pd.date_range("2020-09-01", "2020-09-02") + site = "99816999999" # "Delaware Reserve" + + df = ish_lite.add_data(dates, site=site) + assert df.empty + + def test_ish_lite_resample(): dates = pd.date_range("2020-09-01", "2020-09-02") site = "72224400358" # "College Park AP" From bde1db688d9213fa01dd8f3e35dd8b7b0921fb97 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 9 May 2023 10:59:39 -0600 Subject: [PATCH 9/9] Test resample on empty ISH doesn't cause error --- monetio/obs/ish_lite.py | 2 +- tests/test_ish.py | 5 +++-- tests/test_ish_lite.py | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/monetio/obs/ish_lite.py b/monetio/obs/ish_lite.py index 94bc55b5..7df9f52a 100644 --- a/monetio/obs/ish_lite.py +++ b/monetio/obs/ish_lite.py @@ -315,7 +315,7 @@ def add_data( df = df.loc[(df.time >= self.dates.min()) & (df.time <= self.dates.max())] df = df.replace(-999.9, np.NaN) - if resample: + if resample and not df.empty: print("Resampling to every " + window) df = df.set_index("time").groupby("siteid").resample(window).mean().reset_index() diff --git a/tests/test_ish.py b/tests/test_ish.py index 9fbeb3b4..081ef369 100644 --- a/tests/test_ish.py +++ b/tests/test_ish.py @@ -103,11 +103,12 @@ def test_ish_one_state_partially_empty(): }, "one empty site not included in state results" -def test_ish_one_site_empty(): +@pytest.mark.parametrize("resample", [False, True]) +def test_ish_one_site_empty(resample): dates = pd.date_range("2020-09-01", "2020-09-02") site = "99816999999" # "Delaware Reserve" - df = ish.add_data(dates, site=site) + df = ish.add_data(dates, site=site, resample=resample) assert df.empty diff --git a/tests/test_ish_lite.py b/tests/test_ish_lite.py index 6667f03d..94e9e75f 100644 --- a/tests/test_ish_lite.py +++ b/tests/test_ish_lite.py @@ -64,11 +64,12 @@ def test_ish_lite_one_site(): assert (df.temp < 100).all(), "temp in degC" -def test_ish_lite_one_site_empty(): +@pytest.mark.parametrize("resample", [False, True]) +def test_ish_lite_one_site_empty(resample): dates = pd.date_range("2020-09-01", "2020-09-02") site = "99816999999" # "Delaware Reserve" - df = ish_lite.add_data(dates, site=site) + df = ish_lite.add_data(dates, site=site, resample=resample) assert df.empty