Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ISH for empty sites #109

Merged
merged 11 commits into from
May 9, 2023
29 changes: 9 additions & 20 deletions monetio/obs/ish.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ def _clean(frame):

@staticmethod
def _decode_bytes(df):
if df.empty:
return df
bytes_cols = [col for col in df.columns if type(df[col][0]) == bytes]
with pd.option_context("mode.chained_assignment", None):
df.loc[:, bytes_cols] = df[bytes_cols].apply(
Expand All @@ -208,7 +210,7 @@ def read_data_frame(self, url_or_file):
else:
frame_as_array = np.genfromtxt(url_or_file, delimiter=self.WIDTHS, dtype=self.DTYPES)

frame = pd.DataFrame.from_records(frame_as_array)
frame = pd.DataFrame.from_records(np.atleast_1d(frame_as_array))
df = self._clean(frame)
df.drop(["latitude", "longitude"], axis=1, inplace=True)
# df.latitude = self.history.groupby('station_id').get_group(
Expand Down Expand Up @@ -236,8 +238,8 @@ def read_ish_history(self, dates=None):

The constructed 'station_id' column is a combination of the USAF and WBAN columns.
This is done since USAF and WBAN alone are not unique in the history file.
For example, USAF 725244 and 722158 appear twice, as do
WBAN 24267, 41420, 23176, 13752, and 41231.
For example, USAF 720481, 722158, and 725244 appear twice, as do
WBAN 13752, 23176, 24267, 41231, and 41420.
Additionally, there are many cases of unset (999999 for USAF or 99999 for WBAN),
though more so for WBAN than USAF.
However, combining USAF and WBAN does give a unique station ID.
Expand Down Expand Up @@ -350,29 +352,16 @@ def add_data(
print(f"Aggregating {len(urls.name)} URLs...")
self.df = self.aggregrate_files(urls, n_procs=n_procs)

if resample:
if resample and not self.df.empty:
if verbose:
print("Resampling to every " + window)
self.df.index = self.df.time
self.df = self.df.groupby("station_id").resample(window).mean().reset_index()

self.df = self.df.merge(
dfloc[
[
"station_id",
"latitude",
"longitude",
"station name",
"ctry",
"state",
"usaf",
"wban",
]
],
on=["station_id"],
how="left",
self.df = self.df.merge(dfloc, on="station_id", how="left")
self.df = self.df.rename(columns={"station_id": "siteid", "ctry": "country"}).drop(
columns=["fname"]
)
self.df = self.df.rename(columns={"station_id": "siteid", "ctry": "country"})

return self.df

Expand Down
6 changes: 3 additions & 3 deletions monetio/obs/ish_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ def read_ish_history(self, dates=None):

The constructed 'station_id' column is a combination of the USAF and WBAN columns.
This is done since USAF and WBAN alone are not unique in the history file.
For example, USAF 725244 and 722158 appear twice, as do
WBAN 24267, 41420, 23176, 13752, and 41231.
For example, USAF 720481, 722158, and 725244 appear twice, as do
WBAN 13752, 23176, 24267, 41231, and 41420.
Additionally, there are many cases of unset (999999 for USAF or 99999 for WBAN),
though more so for WBAN than USAF.
However, combining USAF and WBAN does give a unique station ID.
Expand Down Expand Up @@ -315,7 +315,7 @@ def add_data(
df = df.loc[(df.time >= self.dates.min()) & (df.time <= self.dates.max())]
df = df.replace(-999.9, np.NaN)

if resample:
if resample and not df.empty:
print("Resampling to every " + window)
df = df.set_index("time").groupby("siteid").resample(window).mean().reset_index()

Expand Down
37 changes: 35 additions & 2 deletions tests/test_ish.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ def test_ish_read_history():
assert (df[col].dt.hour == 0).all()

assert df.station_id.nunique() == len(df), "unique ID for station"
assert (df.usaf.value_counts() == 2).sum() == 2
assert (df.wban.value_counts() == 2).sum() == 5

# Ensure docstring info matches this
x = df.usaf.value_counts()
assert sorted(x[x == 2].index) == ["720481", "722158", "725244"]
assert x[x.index != "999999"].max() == 2
x = df.wban.value_counts()
assert sorted(x[x == 2].index) == ["13752", "23176", "24267", "41231", "41420"]
assert x[x.index != "99999"].max() == 2
assert (df.usaf == "999999").sum() > 100
assert (df.wban == "99999").sum() > 10_000

Expand Down Expand Up @@ -79,6 +85,33 @@ def test_ish_no_resample():
assert sum(col.endswith("_quality") for col in df.columns) == 8


def test_ish_one_state_partially_empty():
    """Fetching a whole state works even when one of its sites has no data;
    the empty site is simply absent from the combined result."""
    dates = pd.date_range("2020-09-01", "2020-09-02")
    state = "DE"

    # Gather every station ID registered for the state from the history file.
    reader = ish.ISH()
    reader.dates = dates
    reader.read_ish_history()
    registered = set(reader.history.query("state == @state").station_id)  # 8 sites

    df = ish.add_data(dates, state=state, n_procs=2)
    assert len(df) >= 1
    returned = set(df.siteid.unique())
    # "Delaware Reserve" has no observations for this period.
    absent = registered - returned
    assert absent == {"99816999999"}, "one empty site not included in state results"


@pytest.mark.parametrize("resample", [False, True])
def test_ish_one_site_empty(resample):
    """Requesting a single site with no observations returns an empty frame,
    both with and without resampling."""
    period = pd.date_range("2020-09-01", "2020-09-02")
    empty_site = "99816999999"  # "Delaware Reserve"

    result = ish.add_data(period, site=empty_site, resample=resample)
    assert result.empty


def test_ish_resample():
dates = pd.date_range("2020-09-01", "2020-09-02")
site = "72224400358" # "College Park AP"
Expand Down
19 changes: 17 additions & 2 deletions tests/test_ish_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,14 @@ def test_ish_read_history():
assert (df[col].dt.hour == 0).all()

assert df.station_id.nunique() == len(df), "unique ID for station"
assert (df.usaf.value_counts() == 2).sum() == 2
assert (df.wban.value_counts() == 2).sum() == 5

# Ensure docstring info matches this
x = df.usaf.value_counts()
assert sorted(x[x == 2].index) == ["720481", "722158", "725244"]
assert x[x.index != "999999"].max() == 2
x = df.wban.value_counts()
assert sorted(x[x == 2].index) == ["13752", "23176", "24267", "41231", "41420"]
assert x[x.index != "99999"].max() == 2
assert (df.usaf == "999999").sum() > 100
assert (df.wban == "99999").sum() > 10_000

Expand Down Expand Up @@ -58,6 +64,15 @@ def test_ish_lite_one_site():
assert (df.temp < 100).all(), "temp in degC"


@pytest.mark.parametrize("resample", [False, True])
def test_ish_lite_one_site_empty(resample):
    """ISH-Lite: a site with no observations yields an empty frame,
    whether or not resampling is requested."""
    period = pd.date_range("2020-09-01", "2020-09-02")
    empty_site = "99816999999"  # "Delaware Reserve"

    result = ish_lite.add_data(period, site=empty_site, resample=resample)
    assert result.empty


def test_ish_lite_resample():
dates = pd.date_range("2020-09-01", "2020-09-02")
site = "72224400358" # "College Park AP"
Expand Down