Commit 2751b13

place -> location
fancier sounding

not using 'siteid' since one folder's data can have multiple
unique 'station' values (akin to 'siteid')
zmoon committed Apr 11, 2024
1 parent 26250bd commit 2751b13
Showing 2 changed files with 43 additions and 41 deletions.
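The renamed keyword in use, as a minimal sketch (not part of the commit; the date range is hypothetical, mirroring the tests below, and the import path assumes the monetio/profile package layout):

    import pandas as pd

    from monetio.profile import gml_ozonesonde

    # 'location' selects one or more data folders on the GML server
    dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
    df = gml_ozonesonde.add_data(dates, location="Boulder, Colorado")

    # One location folder can still contain multiple unique 'station' values
    # (the reason 'siteid' was avoided), so inspect the output column:
    print(df["station"].unique())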
64 changes: 33 additions & 31 deletions monetio/profile/gml_ozonesonde.py
@@ -40,7 +40,7 @@ def wrapper(*args, **kwargs):
     return wrapper


-PLACES = [
+LOCATIONS = [
     "Boulder, Colorado",
     "Hilo, Hawaii",
     "Huntsville, Alabama",
@@ -54,37 +54,37 @@ def wrapper(*args, **kwargs):
 ]


-_FILES_L100_CACHE = {place: None for place in PLACES}
+_FILES_L100_CACHE = {location: None for location in LOCATIONS}


-def discover_files(place=None, *, n_threads=3, cache=True):
+def discover_files(location=None, *, n_threads=3, cache=True):
     import itertools
     from multiprocessing.pool import ThreadPool

     base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde"

-    if place is None:
-        places = PLACES
-    elif isinstance(place, str):
-        places = [place]
+    if location is None:
+        locations = LOCATIONS
+    elif isinstance(location, str):
+        locations = [location]
     else:
-        places = place
+        locations = location

-    invalid = set(places) - set(PLACES)
+    invalid = set(locations) - set(LOCATIONS)
     if invalid:
-        raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.")
+        raise ValueError(f"Invalid location(s): {invalid}. Valid options: {LOCATIONS}.")

     @retry
-    def get_files(place):
-        cached = _FILES_L100_CACHE[place]
+    def get_files(location):
+        cached = _FILES_L100_CACHE[location]
         if cached is not None:
             return cached

-        if place == "South Pole, Antarctica":
-            url_place = "South Pole, Antartica"  # note sp
+        if location == "South Pole, Antarctica":
+            url_location = "South Pole, Antartica"  # note sp
         else:
-            url_place = place
-        url = f"{base}/{url_place}/100 Meter Average Files/".replace(" ", "%20")
+            url_location = location
+        url = f"{base}/{url_location}/100 Meter Average Files/".replace(" ", "%20")
         print(url)

         r = requests.get(url, timeout=10)
@@ -103,40 +103,40 @@ def get_files(place):
             except ValueError:
                 warnings.warn(f"Failed to parse file name {fn!r} for time.")
                 t = np.nan
-            data.append((place, t, fn, f"{url}{fn}"))
+            data.append((location, t, fn, f"{url}{fn}"))

         if not data:
-            warnings.warn(f"No files detected for place {place!r}.")
+            warnings.warn(f"No files detected for location {location!r}.")

         return data

-    with ThreadPool(processes=min(n_threads, len(places))) as pool:
-        data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, places)))
+    with ThreadPool(processes=min(n_threads, len(locations))) as pool:
+        data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, locations)))

-    df = pd.DataFrame(data, columns=["place", "time", "fn", "url"])
+    df = pd.DataFrame(data, columns=["location", "time", "fn", "url"])

     if cache:
-        for place in places:
-            _FILES_L100_CACHE[place] = list(
-                df[df["place"] == place].itertuples(index=False, name=None)
+        for location in locations:
+            _FILES_L100_CACHE[location] = list(
+                df[df["location"] == location].itertuples(index=False, name=None)
             )

     return df


-def add_data(dates, *, place=None, n_procs=1, errors="raise"):
+def add_data(dates, *, location=None, n_procs=1, errors="raise"):
     """Retrieve and load GML ozonesonde data as a DataFrame.

     Parameters
     ----------
     dates : sequence of datetime-like
         The period between the min and max (both inclusive)
         will be used to select the files to load.
-    place : str or sequence of str, optional
+    location : str or sequence of str, optional
         For example 'Boulder, Colorado'.
-        If not provided, all places will be used.
+        If not provided, all locations will be used.
         Valid options correspond to the directories in https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
-        and may include data from more than one unique site ('station').
+        and may include data from more than one unique site (output column 'station').
     n_procs : int
         For Dask.
     errors : {'raise', 'warn', 'skip'}
@@ -152,13 +152,15 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"):
         raise ValueError(f"Invalid errors setting: {errors!r}.")

     print("Discovering files...")
-    df_urls = discover_files(place=place)
+    df_urls = discover_files(location=location)
     print(f"Discovered {len(df_urls)} 100-m files.")

     urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist()

     if not urls:
-        raise RuntimeError(f"No files found for dates {dates_min} to {dates_max}, place={place}.")
+        raise RuntimeError(
+            f"No files found for dates {dates_min} to {dates_max}, location={location!r}."
+        )

     def func(fp_or_url):
         try:
@@ -208,7 +210,7 @@ def func(fp_or_url):
         "South Pole": "South Pole, Antarctica",
         "Trinidad Head, CA": "Trinidad Head, California",
     }
-    assert set(repl.values()) <= set(PLACES)
+    assert set(repl.values()) <= set(LOCATIONS)
     df["station"] = df["station"].replace(repl)

     # Add metadata
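And a quick hypothetical check of the renamed discover_files output (column names per the diff above; both locations shown are valid LOCATIONS entries):

    from monetio.profile import gml_ozonesonde

    # the returned frame now has columns ["location", "time", "fn", "url"]
    files = gml_ozonesonde.discover_files(location=["Boulder, Colorado", "Hilo, Hawaii"])
    print(files.groupby("location").size())  # 100-m file count per location folder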
20 changes: 10 additions & 10 deletions tests/test_gml_ozonesonde.py
@@ -7,7 +7,7 @@
 def test_discover_files():
     files = gml_ozonesonde.discover_files()
     assert len(files) > 0
-    assert set(files["place"].unique()) == set(gml_ozonesonde.PLACES)
+    assert set(files["location"].unique()) == set(gml_ozonesonde.LOCATIONS)


 def test_read_100m():
@@ -74,37 +74,37 @@ def test_add_data():
     assert df["station"].nunique() == latlon.nunique()


-def test_add_data_place_sel():
+def test_add_data_location_sel():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     df = gml_ozonesonde.add_data(
         dates,
-        place=["Boulder, Colorado", "South Pole, Antarctica"],
+        location=["Boulder, Colorado", "South Pole, Antarctica"],
         n_procs=2,
     )
     assert len(df) > 0

     latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str)
-    assert latlon.nunique() == 2, "selected two places"
+    assert latlon.nunique() == 2, "selected two locations"


 @pytest.mark.parametrize(
-    "place",
+    "location",
     ["asdf", ["asdf", "blah"], ("asdf", "blah")],
 )
-def test_add_data_invalid_place(place):
+def test_add_data_invalid_location(location):
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
-    with pytest.raises(ValueError, match="Invalid place"):
-        _ = gml_ozonesonde.add_data(dates, place=place)
+    with pytest.raises(ValueError, match="Invalid location"):
+        _ = gml_ozonesonde.add_data(dates, location=location)


-def test_same_place_and_launch_time():
+def test_same_location_and_launch_time():
     # Two files with same file time and launch time:
     # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl774_2003_03_10_20.l100
     # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl775_2003_03_10_20.l100
     # File time: 2003-03-10 20
     # Launch time: 2003-03-10 20:41:11
     dates = ["2003-03-10 20", "2003-03-10 21"]
-    df = gml_ozonesonde.add_data(dates, place="Boulder, Colorado", n_procs=2)
+    df = gml_ozonesonde.add_data(dates, location="Boulder, Colorado", n_procs=2)
     assert len(df) > 0

     # Only one launch time
