Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ISH for empty sites #109

Merged
merged 11 commits into from
May 9, 2023
29 changes: 9 additions & 20 deletions monetio/obs/ish.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ def _clean(frame):

@staticmethod
def _decode_bytes(df):
if df.empty:
return df
bytes_cols = [col for col in df.columns if type(df[col][0]) == bytes]
with pd.option_context("mode.chained_assignment", None):
df.loc[:, bytes_cols] = df[bytes_cols].apply(
Expand All @@ -208,7 +210,7 @@ def read_data_frame(self, url_or_file):
else:
frame_as_array = np.genfromtxt(url_or_file, delimiter=self.WIDTHS, dtype=self.DTYPES)

frame = pd.DataFrame.from_records(frame_as_array)
frame = pd.DataFrame.from_records(np.atleast_1d(frame_as_array))
df = self._clean(frame)
df.drop(["latitude", "longitude"], axis=1, inplace=True)
# df.latitude = self.history.groupby('station_id').get_group(
Expand Down Expand Up @@ -236,8 +238,8 @@ def read_ish_history(self, dates=None):

The constructed 'station_id' column is a combination of the USAF and WBAN columns.
This is done since USAF and WBAN alone are not unique in the history file.
For example, USAF 725244 and 722158 appear twice, as do
WBAN 24267, 41420, 23176, 13752, and 41231.
For example, USAF 720481, 722158, and 725244 appear twice, as do
WBAN 13752, 23176, 24267, 41231, and 41420.
Additionally, there are many cases of unset (999999 for USAF or 99999 for WBAN),
though more so for WBAN than USAF.
However, combining USAF and WBAN does give a unique station ID.
Expand Down Expand Up @@ -350,29 +352,16 @@ def add_data(
print(f"Aggregating {len(urls.name)} URLs...")
self.df = self.aggregrate_files(urls, n_procs=n_procs)

if resample:
if resample and not self.df.empty:
if verbose:
print("Resampling to every " + window)
self.df.index = self.df.time
self.df = self.df.groupby("station_id").resample(window).mean().reset_index()

self.df = self.df.merge(
dfloc[
[
"station_id",
"latitude",
"longitude",
"station name",
"ctry",
"state",
"usaf",
"wban",
]
],
on=["station_id"],
how="left",
self.df = self.df.merge(dfloc, on="station_id", how="left")
self.df = self.df.rename(columns={"station_id": "siteid", "ctry": "country"}).drop(
columns=["fname"]
)
self.df = self.df.rename(columns={"station_id": "siteid", "ctry": "country"})

return self.df

Expand Down
6 changes: 3 additions & 3 deletions monetio/obs/ish_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ def read_ish_history(self, dates=None):

The constructed 'station_id' column is a combination of the USAF and WBAN columns.
This is done since USAF and WBAN alone are not unique in the history file.
For example, USAF 725244 and 722158 appear twice, as do
WBAN 24267, 41420, 23176, 13752, and 41231.
For example, USAF 720481, 722158, and 725244 appear twice, as do
WBAN 13752, 23176, 24267, 41231, and 41420.
Additionally, there are many cases of unset (999999 for USAF or 99999 for WBAN),
though more so for WBAN than USAF.
However, combining USAF and WBAN does give a unique station ID.
Expand Down Expand Up @@ -315,7 +315,7 @@ def add_data(
df = df.loc[(df.time >= self.dates.min()) & (df.time <= self.dates.max())]
df = df.replace(-999.9, np.NaN)

if resample:
if resample and not df.empty:
print("Resampling to every " + window)
df = df.set_index("time").groupby("siteid").resample(window).mean().reset_index()

Expand Down
37 changes: 35 additions & 2 deletions tests/test_ish.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ def test_ish_read_history():
assert (df[col].dt.hour == 0).all()

assert df.station_id.nunique() == len(df), "unique ID for station"
assert (df.usaf.value_counts() == 2).sum() == 2
assert (df.wban.value_counts() == 2).sum() == 5

# Ensure docstring info matches this
x = df.usaf.value_counts()
assert sorted(x[x == 2].index) == ["720481", "722158", "725244"]
assert x[x.index != "999999"].max() == 2
x = df.wban.value_counts()
assert sorted(x[x == 2].index) == ["13752", "23176", "24267", "41231", "41420"]
assert x[x.index != "99999"].max() == 2
assert (df.usaf == "999999").sum() > 100
assert (df.wban == "99999").sum() > 10_000

Expand Down Expand Up @@ -79,6 +85,33 @@ def test_ish_no_resample():
assert sum(col.endswith("_quality") for col in df.columns) == 8


def test_ish_one_state_partially_empty():
    """Fetching a whole state works even when one of its sites has no data;
    the empty site is simply absent from the combined result."""
    dates = pd.date_range("2020-09-01", "2020-09-02")
    state = "DE"

    # Gather every station ID registered for the state from the history file.
    reader = ish.ISH()
    reader.dates = dates
    reader.read_ish_history()
    registered = set(reader.history.query("state == @state").station_id)  # 8 sites

    df = ish.add_data(dates, state=state, n_procs=2)
    assert len(df) >= 1
    returned = set(df.siteid.unique())
    # "Delaware Reserve" has no observations for this period.
    absent = registered - returned
    assert absent == {"99816999999"}, "one empty site not included in state results"


@pytest.mark.parametrize("resample", [False, True])
def test_ish_one_site_empty(resample):
    """Requesting a single site with no observations returns an empty frame,
    both with and without resampling."""
    period = pd.date_range("2020-09-01", "2020-09-02")
    empty_site = "99816999999"  # "Delaware Reserve"

    result = ish.add_data(period, site=empty_site, resample=resample)
    assert result.empty


def test_ish_resample():
dates = pd.date_range("2020-09-01", "2020-09-02")
site = "72224400358" # "College Park AP"
Expand Down
19 changes: 17 additions & 2 deletions tests/test_ish_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,14 @@ def test_ish_read_history():
assert (df[col].dt.hour == 0).all()

assert df.station_id.nunique() == len(df), "unique ID for station"
assert (df.usaf.value_counts() == 2).sum() == 2
assert (df.wban.value_counts() == 2).sum() == 5

# Ensure docstring info matches this
x = df.usaf.value_counts()
assert sorted(x[x == 2].index) == ["720481", "722158", "725244"]
assert x[x.index != "999999"].max() == 2
x = df.wban.value_counts()
assert sorted(x[x == 2].index) == ["13752", "23176", "24267", "41231", "41420"]
assert x[x.index != "99999"].max() == 2
assert (df.usaf == "999999").sum() > 100
assert (df.wban == "99999").sum() > 10_000

Expand Down Expand Up @@ -58,6 +64,15 @@ def test_ish_lite_one_site():
assert (df.temp < 100).all(), "temp in degC"


@pytest.mark.parametrize("resample", [False, True])
def test_ish_lite_one_site_empty(resample):
    """ISH-Lite: a site with no observations yields an empty frame,
    whether or not resampling is requested."""
    period = pd.date_range("2020-09-01", "2020-09-02")
    empty_site = "99816999999"  # "Delaware Reserve"

    result = ish_lite.add_data(period, site=empty_site, resample=resample)
    assert result.empty


def test_ish_lite_resample():
dates = pd.date_range("2020-09-01", "2020-09-02")
site = "72224400358" # "College Park AP"
Expand Down