From 102113523c3ecea4b428346d71dc42e9cf8190e2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 12 Jan 2024 11:33:20 -0700 Subject: [PATCH 01/51] Testing GML ozonesonde --- t3.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 t3.py diff --git a/t3.py b/t3.py new file mode 100644 index 00000000..8d82800d --- /dev/null +++ b/t3.py @@ -0,0 +1,88 @@ +""" +Testing loading GML ozonesondes +""" +# import re +from io import StringIO + +import pandas as pd +import requests + +# from tempfile import NamedTemporaryFile + + +# from monetio import icartt + +# 100-m +url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100" + +r = requests.get(url) +r.raise_for_status() + +# # ICARTT parser doesn't seem to work for it +# with NamedTemporaryFile(delete=False) as f: +# f.write(r.content) +# f.seek(0) +# ic = icartt.add_data(f.name) + +blocks = r.text.replace("\r", "").split("\n\n") +assert len(blocks) == 5 + +# Metadata +meta = {} +todo = blocks[3].splitlines() +blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "] +for line in todo: + key, val = line.split(":", 1) + # maybes = re.split(r"\s{2,}", val.strip()) + # if len(maybes) == 1: + # meta[key] = val + # else: + # meta[key] = maybes[0] + # todo.extend(maybes[1:]) + # continue + for key_ish in blah: + if key_ish in val: + i = val.index(key_ish) + meta[key.strip()] = val[:i].strip() + todo.append(val[i:]) + break + else: + meta[key.strip()] = val.strip() + # TODO: replace multi space in val with single + +col_info = [ + # name, units, na + ("lev", "", None), + ("press", "hPa", None), + ("alt", "km", None), + ("theta", "K", None), # "Pottp", pretty sure this potential temperature + ("temp", "degC", None), + ("ftempv", "degC", "999.9"), # TODO: what is? + ("rh", "%", "999"), + ("press_o3", "mPa", "99.90"), + ("o3", "ppmv", "99.999"), + ("o3_tot", "atm-cm", "99.9990"), # 1 DU = 0.001 atm-cm + ("pumptemp", "degC", "999.9"), # "Ptemp", I think this is the pump temperature + ("o3_num", "10^11 cm-3", "999.999"), + ("o3_res", "DU", "9999"), + ("o3_uncert", "%", "99999.000"), +] + +assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14 + +names = [c[0] for c in col_info] +dtype = {c[0]: float for c in col_info} +dtype["lev"] = int +na_values = {c[0]: c[2] for c in col_info if c[2] is not None} + +df = pd.read_csv( + StringIO(blocks[4]), + skiprows=2, + header=None, + delimiter=r"\s+", + names=names, + dtype=dtype, + na_values=na_values, +) + +theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # close to "Pottp" From eaef72bb563a6a3ccdbf4245b9824583065d8bce Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 12 Jan 2024 12:16:03 -0700 Subject: [PATCH 02/51] Move to profile group --- t3.py => monetio/profile/gml_ozonesonde.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename t3.py => monetio/profile/gml_ozonesonde.py (100%) diff --git a/t3.py b/monetio/profile/gml_ozonesonde.py similarity index 100% rename from t3.py rename to monetio/profile/gml_ozonesonde.py From 22f1b42346f5bc5a2a585fb0f3603c144768315f Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 12 Jan 2024 12:24:44 -0700 Subject: [PATCH 03/51] Check meta keys; clean up --- monetio/profile/gml_ozonesonde.py | 42 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 8d82800d..59adb8a2 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -1,45 +1,27 @@ """ Testing loading GML ozonesondes """ -# import re from io import StringIO import pandas as pd import requests -# from tempfile import NamedTemporaryFile - - -# from monetio import icartt - # 100-m url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100" r = requests.get(url) r.raise_for_status() -# # ICARTT parser doesn't seem to work for it -# with NamedTemporaryFile(delete=False) as f: -# f.write(r.content) -# f.seek(0) -# ic = icartt.add_data(f.name) - blocks = r.text.replace("\r", "").split("\n\n") assert len(blocks) == 5 # Metadata meta = {} -todo = blocks[3].splitlines() +todo = blocks[3].splitlines()[::-1] blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "] -for line in todo: +while todo: + line = todo.pop() key, val = line.split(":", 1) - # maybes = re.split(r"\s{2,}", val.strip()) - # if len(maybes) == 1: - # meta[key] = val - # else: - # meta[key] = maybes[0] - # todo.extend(maybes[1:]) - # continue for key_ish in blah: if key_ish in val: i = val.index(key_ish) @@ -50,6 +32,24 @@ meta[key.strip()] = val.strip() # TODO: replace multi space in val with single +assert list(meta) == [ + "Station", + "Station Height", + "Latitude", + "Longitude", + "Flight Number", + "Launch Date", + "Launch Time", + "Radiosonde Type", + "Radiosonde Num", + "O3 Sonde ID", + "Background", + "Flowrate", + "RH Corr", + "Sonde Total O3", + "Sonde Total O3 (SBUV)", +] + col_info = [ # name, units, na ("lev", "", None), From c76bacecb78643a23f8f97b254829cc66fe78f08 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 12 Jan 2024 12:27:42 -0700 Subject: [PATCH 04/51] Eliminate extra spaces in meta values --- monetio/profile/gml_ozonesonde.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 59adb8a2..a5160f02 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -1,6 +1,7 @@ """ Testing loading GML ozonesondes """ +import re from io import StringIO import pandas as pd @@ -30,7 +31,9 @@ break else: meta[key.strip()] = val.strip() - # TODO: replace multi space in val with single + +for k, v in meta.items(): + meta[k] = re.sub(r"\s{2,}", " ", v) assert list(meta) == [ "Station", From cf86140757f4e2b15a7679034178db7e2375bf68 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 12 Jan 2024 12:39:13 -0700 Subject: [PATCH 05/51] Add time; adjust variable info --- monetio/profile/gml_ozonesonde.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index a5160f02..3f6f8438 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -56,18 +56,20 @@ col_info = [ # name, units, na ("lev", "", None), - ("press", "hPa", None), - ("alt", "km", None), - ("theta", "K", None), # "Pottp", pretty sure this potential temperature - ("temp", "degC", None), - ("ftempv", "degC", "999.9"), # TODO: what is? + ("press", "hPa", "9999.9"), + ("alt", "km", "999.999"), # TODO: not sure about this na val + ("theta", "K", "9999.9"), # "Pottp", pretty sure this potential temperature + ("temp", "degC", "999.9"), + ("ftempv", "degC", "999.9"), # TODO: what is this? ("rh", "%", "999"), - ("press_o3", "mPa", "99.90"), + ("o3_press", "mPa", "99.90"), ("o3", "ppmv", "99.999"), - ("o3_tot", "atm-cm", "99.9990"), # 1 DU = 0.001 atm-cm + ("o3_cm", "atm-cm", "99.9990"), + # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below? ("pumptemp", "degC", "999.9"), # "Ptemp", I think this is the pump temperature - ("o3_num", "10^11 cm-3", "999.999"), - ("o3_res", "DU", "9999"), + ("o3_nd", "10^11 cm-3", "999.999"), + ("o3_col", "DU", "9999"), + # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above ("o3_uncert", "%", "99999.000"), ] @@ -89,3 +91,6 @@ ) theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # close to "Pottp" +time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") + +df["time"] = time.tz_localize(None) From 501e218c47f4404dfc5910cdb3490444d00a7dac Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 12 Jan 2024 12:40:23 -0700 Subject: [PATCH 06/51] Add lat/lon --- monetio/profile/gml_ozonesonde.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 3f6f8438..222cddd0 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -94,3 +94,5 @@ time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") df["time"] = time.tz_localize(None) +df["latitude"] = float(meta["Latitude"]) +df["longitude"] = float(meta["Longitude"]) From 5bd8bd240547f804583e05070e10bb3eab1758fc Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 12 Jan 2024 12:42:02 -0700 Subject: [PATCH 07/51] "altitude" --- monetio/profile/gml_ozonesonde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 222cddd0..dcae4495 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -57,7 +57,7 @@ # name, units, na ("lev", "", None), ("press", "hPa", "9999.9"), - ("alt", "km", "999.999"), # TODO: not sure about this na val + ("altitude", "km", "999.999"), # TODO: not sure about this na val ("theta", "K", "9999.9"), # "Pottp", pretty sure this potential temperature ("temp", "degC", "999.9"), ("ftempv", "degC", "999.9"), # TODO: what is this? From 70926797baf59056979a745454b477307932bccb Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 14:41:09 -0700 Subject: [PATCH 08/51] Move to read func and initial test --- monetio/__init__.py | 3 +- monetio/profile/__init__.py | 4 +- monetio/profile/gml_ozonesonde.py | 189 ++++++++++++++++-------------- tests/test_gml_ozonesonde.py | 7 ++ 4 files changed, 110 insertions(+), 93 deletions(-) create mode 100644 tests/test_gml_ozonesonde.py diff --git a/monetio/__init__.py b/monetio/__init__.py index 9c60c152..e40494ed 100644 --- a/monetio/__init__.py +++ b/monetio/__init__.py @@ -1,7 +1,7 @@ from . import grids from .models import camx, cmaq, fv3chem, hysplit, hytraj, ncep_grib, pardump, prepchem, raqms from .obs import aeronet, airnow, aqs, cems, crn, improve, ish, ish_lite, nadp, openaq, pams -from .profile import geoms, icartt, tolnet +from .profile import geoms, gml_ozonesonde, icartt, tolnet from .sat import goes __version__ = "0.2.5" @@ -33,6 +33,7 @@ # # profile obs "geoms", + "gml_ozonesonde", "icartt", "tolnet", # diff --git a/monetio/profile/__init__.py b/monetio/profile/__init__.py index aa328999..b60f8841 100644 --- a/monetio/profile/__init__.py +++ b/monetio/profile/__init__.py @@ -1,5 +1,5 @@ -from . import geoms, icartt, tolnet +from . import geoms, gml_ozonesonde, icartt, tolnet -__all__ = ["tolnet", "icartt", "geoms"] +__all__ = ["tolnet", "icartt", "geoms", "gml_ozonesonde"] __name__ = "profile" diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index dcae4495..9d81ff70 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -5,94 +5,103 @@ from io import StringIO import pandas as pd -import requests - -# 100-m -url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100" - -r = requests.get(url) -r.raise_for_status() - -blocks = r.text.replace("\r", "").split("\n\n") -assert len(blocks) == 5 - -# Metadata -meta = {} -todo = blocks[3].splitlines()[::-1] -blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "] -while todo: - line = todo.pop() - key, val = line.split(":", 1) - for key_ish in blah: - if key_ish in val: - i = val.index(key_ish) - meta[key.strip()] = val[:i].strip() - todo.append(val[i:]) - break + + +def read_100m(fp_or_url): + import requests + + if fp_or_url.startswith("http"): + r = requests.get(fp_or_url, timeout=10) + r.raise_for_status() + text = r.text else: - meta[key.strip()] = val.strip() - -for k, v in meta.items(): - meta[k] = re.sub(r"\s{2,}", " ", v) - -assert list(meta) == [ - "Station", - "Station Height", - "Latitude", - "Longitude", - "Flight Number", - "Launch Date", - "Launch Time", - "Radiosonde Type", - "Radiosonde Num", - "O3 Sonde ID", - "Background", - "Flowrate", - "RH Corr", - "Sonde Total O3", - "Sonde Total O3 (SBUV)", -] - -col_info = [ - # name, units, na - ("lev", "", None), - ("press", "hPa", "9999.9"), - ("altitude", "km", "999.999"), # TODO: not sure about this na val - ("theta", "K", "9999.9"), # "Pottp", pretty sure this potential temperature - ("temp", "degC", "999.9"), - ("ftempv", "degC", "999.9"), # TODO: what is this? - ("rh", "%", "999"), - ("o3_press", "mPa", "99.90"), - ("o3", "ppmv", "99.999"), - ("o3_cm", "atm-cm", "99.9990"), - # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below? - ("pumptemp", "degC", "999.9"), # "Ptemp", I think this is the pump temperature - ("o3_nd", "10^11 cm-3", "999.999"), - ("o3_col", "DU", "9999"), - # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above - ("o3_uncert", "%", "99999.000"), -] - -assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14 - -names = [c[0] for c in col_info] -dtype = {c[0]: float for c in col_info} -dtype["lev"] = int -na_values = {c[0]: c[2] for c in col_info if c[2] is not None} - -df = pd.read_csv( - StringIO(blocks[4]), - skiprows=2, - header=None, - delimiter=r"\s+", - names=names, - dtype=dtype, - na_values=na_values, -) - -theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # close to "Pottp" -time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") - -df["time"] = time.tz_localize(None) -df["latitude"] = float(meta["Latitude"]) -df["longitude"] = float(meta["Longitude"]) + with open(fp_or_url) as f: + text = f.read() + + blocks = text.replace("\r", "").split("\n\n") + assert len(blocks) == 5 + + # Metadata + meta = {} + todo = blocks[3].splitlines()[::-1] + blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "] + while todo: + line = todo.pop() + key, val = line.split(":", 1) + for key_ish in blah: + if key_ish in val: + i = val.index(key_ish) + meta[key.strip()] = val[:i].strip() + todo.append(val[i:]) + break + else: + meta[key.strip()] = val.strip() + + for k, v in meta.items(): + meta[k] = re.sub(r"\s{2,}", " ", v) + + assert list(meta) == [ + "Station", + "Station Height", + "Latitude", + "Longitude", + "Flight Number", + "Launch Date", + "Launch Time", + "Radiosonde Type", + "Radiosonde Num", + "O3 Sonde ID", + "Background", + "Flowrate", + "RH Corr", + "Sonde Total O3", + "Sonde Total O3 (SBUV)", + ] + + col_info = [ + # name, units, na + ("lev", "", None), + ("press", "hPa", "9999.9"), + ("altitude", "km", "999.999"), # TODO: not sure about this na val + ("theta", "K", "9999.9"), # "Pottp", pretty sure this potential temperature + ("temp", "degC", "999.9"), + ("ftempv", "degC", "999.9"), # TODO: what is this? + ("rh", "%", "999"), + ("o3_press", "mPa", "99.90"), + ("o3", "ppmv", "99.999"), + ("o3_cm", "atm-cm", "99.9990"), + # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below? + ("pumptemp", "degC", "999.9"), # "Ptemp", I think this is the pump temperature + ("o3_nd", "10^11 cm-3", "999.999"), + ("o3_col", "DU", "9999"), + # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above + ("o3_uncert", "%", "99999.000"), + ] + + assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14 + + names = [c[0] for c in col_info] + dtype = {c[0]: float for c in col_info} + dtype["lev"] = int + na_values = {c[0]: c[2] for c in col_info if c[2] is not None} + + df = pd.read_csv( + StringIO(blocks[4]), + skiprows=2, + header=None, + delimiter=r"\s+", + names=names, + dtype=dtype, + na_values=na_values, + ) + + # This close to "Pottp" but not exactly the same + theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # noqa: F841 + + time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") + + df["time"] = time.tz_localize(None) + df["latitude"] = float(meta["Latitude"]) + df["longitude"] = float(meta["Longitude"]) + + return df diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py new file mode 100644 index 00000000..86c568f0 --- /dev/null +++ b/tests/test_gml_ozonesonde.py @@ -0,0 +1,7 @@ +from monetio import gml_ozonesonde + + +def test_read_100m(): + url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100" + df = gml_ozonesonde.read_100m(url) + assert len(df) > 0 From 4111dbecdcabb7e438e693abcd5aeb259c37e006 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 14:42:42 -0700 Subject: [PATCH 09/51] Little more robust --- monetio/profile/gml_ozonesonde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 9d81ff70..2923673b 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -10,7 +10,7 @@ def read_100m(fp_or_url): import requests - if fp_or_url.startswith("http"): + if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")): r = requests.get(fp_or_url, timeout=10) r.raise_for_status() text = r.text From e4bf0e92027c7f67170713ad4167862203eb4608 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 15:19:51 -0700 Subject: [PATCH 10/51] Discover 100-m files --- monetio/profile/gml_ozonesonde.py | 51 +++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 2923673b..12be9710 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -1,14 +1,59 @@ """ -Testing loading GML ozonesondes +Load GML ozonesondes from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/ """ import re -from io import StringIO +import numpy as np import pandas as pd +import requests + +PLACES = [ + "Boulder, Colorado", + "Hilo, Hawaii", + "Huntsville, Alabama", + "Narragansett, Rhode Island", + "Pago Pago, American Samoa", + "San Cristobal, Galapagos", + "South Pole, Antartica", # note sp + "Summit, Greenland", + "Suva, Fiji", + "Trinidad Head, California", +] + + +def discover_files(): + base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde" + data = [] + for place in PLACES: + url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20") + print(url) + r = requests.get(url, timeout=10) + r.raise_for_status() + for m in re.finditer(r'href="([a-z0-9_]+\.l100)"', r.text): + fn = m.group(1) + if fn.startswith("san_cristobal_"): + a, b = 3, -1 + else: + a, b = 1, -1 + t_str = "".join(re.split(r"[_\.]", fn)[a:b]) + try: + t = pd.to_datetime(t_str, format=r"%Y%m%d%H") + except ValueError: + print(f"warning: Failed to parse {fn} for time") + t = np.nan + data.append((place, t, fn, f"{url}{fn}")) + + df = pd.DataFrame(data, columns=["place", "time", "fn", "url"]) + + missing = set(PLACES) - set(df["place"].unique()) + if missing: + print(f"warning: No files detected for these places: {missing}") + + return df def read_100m(fp_or_url): - import requests + from io import StringIO if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")): r = requests.get(fp_or_url, timeout=10) From d9fbefbb1d1b65e06f18bf392e023ac70c836e0e Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 15:49:28 -0700 Subject: [PATCH 11/51] Initial `add_data` --- monetio/profile/gml_ozonesonde.py | 25 ++++++++++++++++++++++++- tests/test_gml_ozonesonde.py | 11 +++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 12be9710..3d2345af 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -24,7 +24,7 @@ def discover_files(): base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde" data = [] - for place in PLACES: + for place in PLACES: # TODO: multithread? url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20") print(url) r = requests.get(url, timeout=10) @@ -52,6 +52,29 @@ def discover_files(): return df +def add_data(dates, *, n_procs=1): + import dask + import dask.dataframe as dd + + dates = pd.DatetimeIndex(dates) + dates_min, dates_max = dates.min(), dates.max() + + print("Discovering files...") + df_urls = discover_files() + + urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist() + + print("Aggregating files...") + dfs = [dask.delayed(read_100m)(f) for f in urls] + dff = dd.from_delayed(dfs) + df = dff.compute(num_workers=n_procs).reset_index() + + # Time subset again in case of times in files extending + df = df[df["time"].between(dates_min, dates_max, inclusive="both")] + + return df + + def read_100m(fp_or_url): from io import StringIO diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 86c568f0..2e5ffd38 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -1,3 +1,5 @@ +import pandas as pd + from monetio import gml_ozonesonde @@ -5,3 +7,12 @@ def test_read_100m(): url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100" df = gml_ozonesonde.read_100m(url) assert len(df) > 0 + + +def test_add_data(): + dates = pd.date_range("2023-01-01", "2023-02-01")[:-1] + df = gml_ozonesonde.add_data(dates, n_procs=2) + assert len(df) > 0 + + latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str) + assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile" From 19eab98ab9c7b72655f76622332b594001303da0 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 15:58:09 -0700 Subject: [PATCH 12/51] Multithread discovering files --- monetio/profile/gml_ozonesonde.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 3d2345af..51a3af1d 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -21,14 +21,18 @@ ] -def discover_files(): +def discover_files(*, n_threads=3): + import itertools + from multiprocessing.pool import ThreadPool + base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde" - data = [] - for place in PLACES: # TODO: multithread? + + def get_files(place): url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20") print(url) r = requests.get(url, timeout=10) r.raise_for_status() + data = [] for m in re.finditer(r'href="([a-z0-9_]+\.l100)"', r.text): fn = m.group(1) if fn.startswith("san_cristobal_"): @@ -42,6 +46,12 @@ def discover_files(): print(f"warning: Failed to parse {fn} for time") t = np.nan data.append((place, t, fn, f"{url}{fn}")) + if not data: + print(f"warning: No files detected for pace {place!r}.") + return data + + with ThreadPool(processes=n_threads) as pool: + data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, PLACES))) df = pd.DataFrame(data, columns=["place", "time", "fn", "url"]) From b7481e8c73bbdb47c3b93ff803b590291a9ba883 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 16:17:04 -0700 Subject: [PATCH 13/51] Add place selection --- monetio/profile/gml_ozonesonde.py | 39 +++++++++++++++++++++++-------- tests/test_gml_ozonesonde.py | 26 ++++++++++++++++++++- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 51a3af1d..5c56bc84 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -2,6 +2,7 @@ Load GML ozonesondes from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/ """ import re +import warnings import numpy as np import pandas as pd @@ -21,12 +22,23 @@ ] -def discover_files(*, n_threads=3): +def discover_files(place=None, *, n_threads=3): import itertools from multiprocessing.pool import ThreadPool base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde" + if place is None: + places = PLACES + elif isinstance(place, str): + places = [place] + else: + places = place + + invalid = set(places) - set(PLACES) + if invalid: + raise ValueError(f"Invalid place(s): {invalid}.") + def get_files(place): url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20") print(url) @@ -43,26 +55,33 @@ def get_files(place): try: t = pd.to_datetime(t_str, format=r"%Y%m%d%H") except ValueError: - print(f"warning: Failed to parse {fn} for time") + warnings.warn(f"Failed to parse file name {fn!r} for time.") t = np.nan data.append((place, t, fn, f"{url}{fn}")) if not data: - print(f"warning: No files detected for pace {place!r}.") + warnings.warn(f"No files detected for place {place!r}.") return data with ThreadPool(processes=n_threads) as pool: - data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, PLACES))) + data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, places))) df = pd.DataFrame(data, columns=["place", "time", "fn", "url"]) - missing = set(PLACES) - set(df["place"].unique()) - if missing: - print(f"warning: No files detected for these places: {missing}") - return df -def add_data(dates, *, n_procs=1): +def add_data(dates, *, place=None, n_procs=1): + """ + + Parameters + ---------- + dates : sequence of datetime-like + place : str or sequence of str, optional + For example 'Boulder, Colorado'. + If not provided, all places will be used. + n_procs : int + For Dask. + """ import dask import dask.dataframe as dd @@ -70,7 +89,7 @@ def add_data(dates, *, n_procs=1): dates_min, dates_max = dates.min(), dates.max() print("Discovering files...") - df_urls = discover_files() + df_urls = discover_files(place=place) urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist() diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 2e5ffd38..fa398a30 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest from monetio import gml_ozonesonde @@ -10,9 +11,32 @@ def test_read_100m(): def test_add_data(): - dates = pd.date_range("2023-01-01", "2023-02-01")[:-1] + dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") df = gml_ozonesonde.add_data(dates, n_procs=2) assert len(df) > 0 latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str) assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile" + + +def test_add_data_place_sel(): + dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") + df = gml_ozonesonde.add_data( + dates, + place=["Boulder, Colorado", "South Pole, Antartica"], + n_procs=2, + ) + assert len(df) > 0 + + latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str) + assert latlon.nunique() == 2, "selected two places" + + +@pytest.mark.parametrize( + "place", + ["asdf", ["asdf", "blah"], ("asdf", "blah")], +) +def test_add_data_invalid_place(place): + dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") + with pytest.raises(ValueError, match="Invalid place"): + _ = gml_ozonesonde.add_data(dates, place=place) From c02c677e6200ba8fe22a9a78742e1c0899ca03e1 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 16:21:23 -0700 Subject: [PATCH 14/51] Include valid places in error msg; smaller threadpool if appropriate --- monetio/profile/gml_ozonesonde.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 5c56bc84..3286310f 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -37,7 +37,7 @@ def discover_files(place=None, *, n_threads=3): invalid = set(places) - set(PLACES) if invalid: - raise ValueError(f"Invalid place(s): {invalid}.") + raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.") def get_files(place): url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20") @@ -62,7 +62,7 @@ def get_files(place): warnings.warn(f"No files detected for place {place!r}.") return data - with ThreadPool(processes=n_threads) as pool: + with ThreadPool(processes=min(n_threads, len(places))) as pool: data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, places))) df = pd.DataFrame(data, columns=["place", "time", "fn", "url"]) From 1bb5d1928066ae560fe4b67347fad6c7c931a7cc Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 16:28:23 -0700 Subject: [PATCH 15/51] Error if no files since otherwise Dask does --- monetio/profile/gml_ozonesonde.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 3286310f..d5a22273 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -71,7 +71,7 @@ def get_files(place): def add_data(dates, *, place=None, n_procs=1): - """ + """Retrieve and load GML ozonesonde data as a DataFrame. Parameters ---------- @@ -90,10 +90,14 @@ def add_data(dates, *, place=None, n_procs=1): print("Discovering files...") df_urls = discover_files(place=place) + print(f"Discovered {len(df_urls)} 100-m files.") urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist() - print("Aggregating files...") + if not urls: + raise RuntimeError(f"No files found for dates {dates_min} to {dates_max}, place={place}.") + + print(f"Aggregating {len(urls)} files...") dfs = [dask.delayed(read_100m)(f) for f in urls] dff = dd.from_delayed(dfs) df = dff.compute(num_workers=n_procs).reset_index() From 6a1fead1caf8c9d1c42831912c37488f3058ad3d Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 16:32:54 -0700 Subject: [PATCH 16/51] notes --- monetio/profile/gml_ozonesonde.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index d5a22273..ac901454 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -1,5 +1,8 @@ """ -Load GML ozonesondes from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/ +Load NOAA Global Monitoring Laboratory (GML) ozonesondes +from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/ + +More info: https://gml.noaa.gov/ozwv/ozsondes/ """ import re import warnings @@ -109,6 +112,12 @@ def add_data(dates, *, place=None, n_procs=1): def read_100m(fp_or_url): + """Read a GML ozonesonde 100-m file (``.l100``). + + Notes + ----- + Close to ICARTT format, but not quite conformant enough to use the ICARTT reader. + """ from io import StringIO if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")): From f86fafa49cecac8aea1bde66db7224a81401d198 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 6 Feb 2024 16:39:33 -0700 Subject: [PATCH 17/51] Add attrs though they don't survive the agg maybe better to extract col_info so it can be used for the agg result --- monetio/profile/gml_ozonesonde.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index ac901454..31c88792 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -170,6 +170,7 @@ def read_100m(fp_or_url): col_info = [ # name, units, na + # TODO: long_name? ("lev", "", None), ("press", "hPa", "9999.9"), ("altitude", "km", "999.999"), # TODO: not sure about this na val @@ -205,7 +206,7 @@ def read_100m(fp_or_url): na_values=na_values, ) - # This close to "Pottp" but not exactly the same + # This is close to "Pottp" but not exactly the same theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # noqa: F841 time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") @@ -214,4 +215,8 @@ def read_100m(fp_or_url): df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) + if hasattr(df, "attrs"): + df.attrs["ds_attrs"] = meta + df.attrs["var_attrs"] = {name: {"units": units} for name, units, _ in col_info} + return df From 0ab42d46a702df1892ec0b3760fc3722e1053730 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 08:39:43 -0700 Subject: [PATCH 18/51] Add initial long names with the help of the doc still some questions though, as the variables seem a bit different compared to the fle ones --- monetio/profile/gml_ozonesonde.py | 77 +++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 20 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 31c88792..03c91286 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -169,32 +169,63 @@ def read_100m(fp_or_url): ] col_info = [ - # name, units, na - # TODO: long_name? - ("lev", "", None), - ("press", "hPa", "9999.9"), - ("altitude", "km", "999.999"), # TODO: not sure about this na val - ("theta", "K", "9999.9"), # "Pottp", pretty sure this potential temperature - ("temp", "degC", "999.9"), - ("ftempv", "degC", "999.9"), # TODO: what is this? - ("rh", "%", "999"), - ("o3_press", "mPa", "99.90"), - ("o3", "ppmv", "99.999"), - ("o3_cm", "atm-cm", "99.9990"), - # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below? - ("pumptemp", "degC", "999.9"), # "Ptemp", I think this is the pump temperature - ("o3_nd", "10^11 cm-3", "999.999"), - ("o3_col", "DU", "9999"), - # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above - ("o3_uncert", "%", "99999.000"), + # name, long name, units, na val + # + # "Level" (just a counter, should never be nan) + ("lev", "level", "", None), + # + # "Press" + ("press", "radiosonde corrected pressure", "hPa", "9999.9"), + # + # "Alt" + # TODO: not sure about this na val + ("altitude", "altitude", "km", "999.999"), + # + # "Pottp" + ("theta", "potential temperature", "K", "9999.9"), + # + # "Temp" + ("temp", "radiosonde corrected temperature", "degC", "999.9"), + # + # "FtempV" + ("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"), + # + # "Hum" + ("rh", "radiosonde corrected relative humidity", "%", "999"), + # + # "Ozone" + ("o3_press", "ozone partial pressure", "mPa", "99.90"), + # + # "Ozone" + ("o3", "ozone mixing ratio", "ppmv", "99.999"), + # + # "Ozone" + # note 1 DU = 0.001 atm-cm + # TODO: goes up with height so could be ozone below? + ("o3_cm", "total ozone", "atm-cm", "99.9990"), + # + # "Ptemp" + ("ptemp", "pump temperature", "degC", "999.9"), + # + # "O3 # DN" + ("o3_nd", "ozone number density", "10^11 cm-3", "999.999"), + # + # "O3 Res" + # TODO: goes down with height so could be total ozone above? + ("o3_col", "total column ozone above", "DU", "9999"), + # + # "O3 Uncert" + # TODO: uncertainty in which ozone value? + ("o3_uncert", "uncertainty in ozone", "%", "99999.000"), ] + assert all(len(c) == 4 for c in col_info) assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14 names = [c[0] for c in col_info] dtype = {c[0]: float for c in col_info} dtype["lev"] = int - na_values = {c[0]: c[2] for c in col_info if c[2] is not None} + na_values = {c[0]: c[-1] for c in col_info if c[-1] is not None} df = pd.read_csv( StringIO(blocks[4]), @@ -217,6 +248,12 @@ def read_100m(fp_or_url): if hasattr(df, "attrs"): df.attrs["ds_attrs"] = meta - df.attrs["var_attrs"] = {name: {"units": units} for name, units, _ in col_info} + df.attrs["var_attrs"] = { + name: { + "long_name": long_name, + "units": units, + } + for name, long_name, units, _ in col_info + } return df From 2e849f4b4668838ccfa9f5a023f89bf09670cc88 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 08:48:00 -0700 Subject: [PATCH 19/51] Extract col info --- monetio/profile/gml_ozonesonde.py | 128 ++++++++++++++++-------------- 1 file changed, 68 insertions(+), 60 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 03c91286..a117b40d 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -6,6 +6,7 @@ """ import re import warnings +from typing import NamedTuple, Optional import numpy as np import pandas as pd @@ -111,6 +112,65 @@ def add_data(dates, *, place=None, n_procs=1): return df +class ColInfo(NamedTuple): + name: str + long_name: str + units: str + na_val: Optional[str] + + +COL_INFO_100m = [ + # name, long name, units, na val + # + # "Level" (just a counter, should never be nan) + ColInfo("lev", "level", "", None), + # + # "Press" + ColInfo("press", "radiosonde corrected pressure", "hPa", "9999.9"), + # + # "Alt" + # TODO: not sure about this na val + ColInfo("altitude", "altitude", "km", "999.999"), + # + # "Pottp" + ColInfo("theta", "potential temperature", "K", "9999.9"), + # + # "Temp" + ColInfo("temp", "radiosonde corrected temperature", "degC", "999.9"), + # + # "FtempV" + ColInfo("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"), + # + # "Hum" + ColInfo("rh", "radiosonde corrected relative humidity", "%", "999"), + # + # "Ozone" + ColInfo("o3_press", "ozone partial pressure", "mPa", "99.90"), + # + # "Ozone" + ColInfo("o3", "ozone mixing ratio", "ppmv", "99.999"), + # + # "Ozone" + # note 1 DU = 0.001 atm-cm + # TODO: goes up with height so could be ozone below? + ColInfo("o3_cm", "total ozone", "atm-cm", "99.9990"), + # + # "Ptemp" + ColInfo("ptemp", "pump temperature", "degC", "999.9"), + # + # "O3 # DN" + ColInfo("o3_nd", "ozone number density", "10^11 cm-3", "999.999"), + # + # "O3 Res" + # TODO: goes down with height so could be total ozone above? + ColInfo("o3_col", "total column ozone above", "DU", "9999"), + # + # "O3 Uncert" + # TODO: uncertainty in which ozone value? + ColInfo("o3_uncert", "uncertainty in ozone", "%", "99999.000"), +] + + def read_100m(fp_or_url): """Read a GML ozonesonde 100-m file (``.l100``). @@ -168,64 +228,12 @@ def read_100m(fp_or_url): "Sonde Total O3 (SBUV)", ] - col_info = [ - # name, long name, units, na val - # - # "Level" (just a counter, should never be nan) - ("lev", "level", "", None), - # - # "Press" - ("press", "radiosonde corrected pressure", "hPa", "9999.9"), - # - # "Alt" - # TODO: not sure about this na val - ("altitude", "altitude", "km", "999.999"), - # - # "Pottp" - ("theta", "potential temperature", "K", "9999.9"), - # - # "Temp" - ("temp", "radiosonde corrected temperature", "degC", "999.9"), - # - # "FtempV" - ("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"), - # - # "Hum" - ("rh", "radiosonde corrected relative humidity", "%", "999"), - # - # "Ozone" - ("o3_press", "ozone partial pressure", "mPa", "99.90"), - # - # "Ozone" - ("o3", "ozone mixing ratio", "ppmv", "99.999"), - # - # "Ozone" - # note 1 DU = 0.001 atm-cm - # TODO: goes up with height so could be ozone below? - ("o3_cm", "total ozone", "atm-cm", "99.9990"), - # - # "Ptemp" - ("ptemp", "pump temperature", "degC", "999.9"), - # - # "O3 # DN" - ("o3_nd", "ozone number density", "10^11 cm-3", "999.999"), - # - # "O3 Res" - # TODO: goes down with height so could be total ozone above? - ("o3_col", "total column ozone above", "DU", "9999"), - # - # "O3 Uncert" - # TODO: uncertainty in which ozone value? - ("o3_uncert", "uncertainty in ozone", "%", "99999.000"), - ] - - assert all(len(c) == 4 for c in col_info) - assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14 + assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_100m) == 14 - names = [c[0] for c in col_info] - dtype = {c[0]: float for c in col_info} + names = [c.name for c in COL_INFO_100m] + dtype = {c.name: float for c in COL_INFO_100m} dtype["lev"] = int - na_values = {c[0]: c[-1] for c in col_info if c[-1] is not None} + na_values = {c.name: c.na_val for c in COL_INFO_100m if c.na_val is not None} df = pd.read_csv( StringIO(blocks[4]), @@ -249,11 +257,11 @@ def read_100m(fp_or_url): if hasattr(df, "attrs"): df.attrs["ds_attrs"] = meta df.attrs["var_attrs"] = { - name: { - "long_name": long_name, - "units": units, + c.name: { + "long_name": c.long_name, + "units": c.units, } - for name, long_name, units, _ in col_info + for c in COL_INFO_100m } return df From 04c3dd290e9dd3fc7617c964c115cebdcf5660c3 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 08:51:39 -0700 Subject: [PATCH 20/51] Add attrs to add_data result --- monetio/profile/gml_ozonesonde.py | 14 +++++++++++++- tests/test_gml_ozonesonde.py | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index a117b40d..10931662 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -109,6 +109,17 @@ def add_data(dates, *, place=None, n_procs=1): # Time subset again in case of times in files extending df = df[df["time"].between(dates_min, dates_max, inclusive="both")] + # Add metadata + if hasattr(df, "attrs"): + df.attrs["ds_attrs"] = {"urls": urls} + df.attrs["var_attrs"] = { + c.name: { + "long_name": c.long_name, + "units": c.units, + } + for c in COL_INFO_100m + } + return df @@ -248,12 +259,13 @@ def read_100m(fp_or_url): # This is close to "Pottp" but not exactly the same theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # noqa: F841 + # Add some variables from header time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") - df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) + # Add metadata if hasattr(df, "attrs"): df.attrs["ds_attrs"] = meta df.attrs["var_attrs"] = { diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index fa398a30..330afb38 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -15,6 +15,8 @@ def test_add_data(): df = gml_ozonesonde.add_data(dates, n_procs=2) assert len(df) > 0 + assert df.attrs["var_attrs"]["o3"]["units"] == "ppmv" + latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str) assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile" From 0d8a6cb969189e27b6159a15c07a9d682a593ad5 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 09:09:06 -0700 Subject: [PATCH 21/51] Cache file discovery --- monetio/profile/gml_ozonesonde.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 10931662..62add3a8 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -26,7 +26,10 @@ ] -def discover_files(place=None, *, n_threads=3): +_FILES_L100_CACHE = {place: None for place in PLACES} + + +def discover_files(place=None, *, n_threads=3, cache=True): import itertools from multiprocessing.pool import ThreadPool @@ -44,6 +47,10 @@ def discover_files(place=None, *, n_threads=3): raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.") def get_files(place): + cached = _FILES_L100_CACHE[place] + if cached is not None: + return cached + url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20") print(url) r = requests.get(url, timeout=10) @@ -62,8 +69,10 @@ def get_files(place): warnings.warn(f"Failed to parse file name {fn!r} for time.") t = np.nan data.append((place, t, fn, f"{url}{fn}")) + if not data: warnings.warn(f"No files detected for place {place!r}.") + return data with ThreadPool(processes=min(n_threads, len(places))) as pool: @@ -71,6 +80,12 @@ def get_files(place): df = pd.DataFrame(data, columns=["place", "time", "fn", "url"]) + if cache: + for place in places: + _FILES_L100_CACHE[place] = list( + df[df["place"] == place].itertuples(index=False, name=None) + ) + return df @@ -117,7 +132,7 @@ def add_data(dates, *, place=None, n_procs=1): "long_name": c.long_name, "units": c.units, } - for c in COL_INFO_100m + for c in COL_INFO_L100 } return df @@ -130,7 +145,7 @@ class ColInfo(NamedTuple): na_val: Optional[str] -COL_INFO_100m = [ +COL_INFO_L100 = [ # name, long name, units, na val # # "Level" (just a counter, should never be nan) @@ -239,12 +254,12 @@ def read_100m(fp_or_url): "Sonde Total O3 (SBUV)", ] - assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_100m) == 14 + assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_L100) == 14 - names = [c.name for c in COL_INFO_100m] - dtype = {c.name: float for c in COL_INFO_100m} + names = [c.name for c in COL_INFO_L100] + dtype = {c.name: float for c in COL_INFO_L100} dtype["lev"] = int - na_values = {c.name: c.na_val for c in COL_INFO_100m if c.na_val is not None} + na_values = {c.name: c.na_val for c in COL_INFO_L100 if c.na_val is not None} df = pd.read_csv( StringIO(blocks[4]), @@ -273,7 +288,7 @@ def read_100m(fp_or_url): "long_name": c.long_name, "units": c.units, } - for c in COL_INFO_100m + for c in COL_INFO_L100 } return df From 384130cca8986682bcd23e75a8d094d5731d553a Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 09:11:38 -0700 Subject: [PATCH 22/51] Antarctica --- monetio/profile/gml_ozonesonde.py | 10 ++++++++-- tests/test_gml_ozonesonde.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 62add3a8..8d943cd7 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -19,7 +19,7 @@ "Narragansett, Rhode Island", "Pago Pago, American Samoa", "San Cristobal, Galapagos", - "South Pole, Antartica", # note sp + "South Pole, Antarctica", "Summit, Greenland", "Suva, Fiji", "Trinidad Head, California", @@ -51,10 +51,16 @@ def get_files(place): if cached is not None: return cached - url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20") + if place == "South Pole, Antarctica": + url_place = "South Pole, Antartica" # note sp + else: + url_place = place + url = f"{base}/{url_place}/100 Meter Average Files/".replace(" ", "%20") print(url) + r = requests.get(url, timeout=10) r.raise_for_status() + data = [] for m in re.finditer(r'href="([a-z0-9_]+\.l100)"', r.text): fn = m.group(1) diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 330afb38..03a0acfa 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -25,7 +25,7 @@ def test_add_data_place_sel(): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") df = gml_ozonesonde.add_data( dates, - place=["Boulder, Colorado", "South Pole, Antartica"], + place=["Boulder, Colorado", "South Pole, Antarctica"], n_procs=2, ) assert len(df) > 0 From dbd7e30f2ad81ec6c6c11010eb4f3ee6ba1dde2e Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 09:27:07 -0700 Subject: [PATCH 23/51] Add station name and height to frame --- monetio/profile/gml_ozonesonde.py | 6 ++++-- tests/test_gml_ozonesonde.py | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 8d943cd7..89e2fe8b 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -277,14 +277,16 @@ def read_100m(fp_or_url): na_values=na_values, ) - # This is close to "Pottp" but not exactly the same - theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # noqa: F841 + # Note: This is close to "Pottp" but not exactly the same + # theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) # Add some variables from header time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) + df["station"] = meta["Station"] + df["station_height"] = float(meta["Station Height"]) # Add metadata if hasattr(df, "attrs"): diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 03a0acfa..e774bd50 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -20,6 +20,9 @@ def test_add_data(): latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str) assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile" + # NOTE: Similar to the place folder names, but not all the same + assert df["station"].nunique() == latlon.nunique() + def test_add_data_place_sel(): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") From 2fa6d48c8fd4d2c2ee6eb8d8335450caf5ec0036 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 09:48:57 -0700 Subject: [PATCH 24/51] Add retry for requests did get timeout once this morn --- monetio/profile/gml_ozonesonde.py | 51 ++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 89e2fe8b..994a9077 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -12,6 +12,35 @@ import pandas as pd import requests + +def retry(func): + import time + from functools import wraps + from random import random as rand + + n = 3 + + @wraps(func) + def wrapper(*args, **kwargs): + for i in range(n): + try: + res = func(*args, **kwargs) + except ( + requests.exceptions.ReadTimeout, + requests.exceptions.ConnectionError, + ) as e: + print(f"Failed: {e}") + time.sleep(0.5 * i + rand() * 0.1) + else: + break + else: + raise RuntimeError(f"failed after {n} tries") + + return res + + return wrapper + + PLACES = [ "Boulder, Colorado", "Hilo, Hawaii", @@ -46,6 +75,7 @@ def discover_files(place=None, *, n_threads=3, cache=True): if invalid: raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.") + @retry def get_files(place): cached = _FILES_L100_CACHE[place] if cached is not None: @@ -213,14 +243,21 @@ def read_100m(fp_or_url): from io import StringIO if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")): - r = requests.get(fp_or_url, timeout=10) - r.raise_for_status() - text = r.text + + @retry + def get_text(): + r = requests.get(fp_or_url, timeout=10) + r.raise_for_status() + return r.text + else: - with open(fp_or_url) as f: - text = f.read() - blocks = text.replace("\r", "").split("\n\n") + def get_text(): + with open(fp_or_url) as f: + text = f.read() + return text + + blocks = get_text().replace("\r", "").split("\n\n") assert len(blocks) == 5 # Metadata @@ -286,7 +323,7 @@ def read_100m(fp_or_url): df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) df["station"] = meta["Station"] - df["station_height"] = float(meta["Station Height"]) + df["station_height_str"] = meta["Station Height"] # Add metadata if hasattr(df, "attrs"): From 945b1d3d5fbcb5f553dfea44ff8b02e6e618f1ce Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 09:51:35 -0700 Subject: [PATCH 25/51] cleanup --- monetio/profile/gml_ozonesonde.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 994a9077..7c1db7d2 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -28,13 +28,12 @@ def wrapper(*args, **kwargs): except ( requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, - ) as e: - print(f"Failed: {e}") + ): time.sleep(0.5 * i + rand() * 0.1) else: break else: - raise RuntimeError(f"failed after {n} tries") + raise RuntimeError(f"{func.__name__} failed after {n} tries.") return res From 8e6c98457cbf64467d092c3ab43df6e6e0471f1a Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 10:04:27 -0700 Subject: [PATCH 26/51] Add sonde total o3 strings --- monetio/profile/gml_ozonesonde.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 7c1db7d2..237ac1af 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -316,13 +316,15 @@ def get_text(): # Note: This is close to "Pottp" but not exactly the same # theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) - # Add some variables from header + # Add some variables from header (these don't change in the profile) time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) df["station"] = meta["Station"] - df["station_height_str"] = meta["Station Height"] + df["station_height_str"] = meta["Station Height"] # e.g. '1743 meters' + df["o3_tot_cmr_str"] = meta["Sonde Total O3"] + df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] # e.g. '325 (62) DU' # Add metadata if hasattr(df, "attrs"): From 86a8a208c90aca00a9523ef5eda6adf73bb7b11c Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 10:24:43 -0700 Subject: [PATCH 27/51] Test some header attr vals --- monetio/profile/gml_ozonesonde.py | 4 ++-- tests/test_gml_ozonesonde.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 237ac1af..02d69212 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -262,11 +262,11 @@ def get_text(): # Metadata meta = {} todo = blocks[3].splitlines()[::-1] - blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "] + on_val_side = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "] while todo: line = todo.pop() key, val = line.split(":", 1) - for key_ish in blah: + for key_ish in on_val_side: if key_ish in val: i = val.index(key_ish) meta[key.strip()] = val[:i].strip() diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index e774bd50..fc34a58d 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -9,6 +9,16 @@ def test_read_100m(): df = gml_ozonesonde.read_100m(url) assert len(df) > 0 + assert df.attrs["ds_attrs"]["Station"] == "Boulder, CO" + assert df.attrs["ds_attrs"]["Station Height"] == "1743 meters" + assert df.attrs["ds_attrs"]["Flight Number"] == "BU1043" + assert df.attrs["ds_attrs"]["O3 Sonde ID"] == "2z43312" + assert df.attrs["ds_attrs"]["Background"] == "0.020 microamps (0.08 mPa)" + assert df.attrs["ds_attrs"]["Flowrate"] == "29.89 sec/100ml" + assert df.attrs["ds_attrs"]["RH Corr"] == "0.31 %" + assert df.attrs["ds_attrs"]["Sonde Total O3"] == "329 (65) DU" + assert df.attrs["ds_attrs"]["Sonde Total O3 (SBUV)"] == "325 (62) DU" + def test_add_data(): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") From bbf3765f6c8532489ae7bf016268e3c8d912b051 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 10:36:25 -0700 Subject: [PATCH 28/51] Test discover files gets all places --- tests/test_gml_ozonesonde.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index fc34a58d..8f3e7898 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -4,6 +4,12 @@ from monetio import gml_ozonesonde +def test_discover_files(): + files = gml_ozonesonde.discover_files() + assert len(files) > 0 + assert set(files["place"].unique()) == set(gml_ozonesonde.PLACES) + + def test_read_100m(): url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100" df = gml_ozonesonde.read_100m(url) From c9abd1a20244b9833bde25b4a0cbf72c7a5dc676 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Feb 2024 10:58:30 -0700 Subject: [PATCH 29/51] Found some other NA vals --- monetio/profile/gml_ozonesonde.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 02d69212..e84ffc36 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -6,7 +6,7 @@ """ import re import warnings -from typing import NamedTuple, Optional +from typing import NamedTuple, Optional, Tuple, Union import numpy as np import pandas as pd @@ -177,7 +177,7 @@ class ColInfo(NamedTuple): name: str long_name: str units: str - na_val: Optional[str] + na_val: Optional[Union[str, Tuple[str, ...]]] COL_INFO_L100 = [ @@ -224,11 +224,11 @@ class ColInfo(NamedTuple): # # "O3 Res" # TODO: goes down with height so could be total ozone above? - ColInfo("o3_col", "total column ozone above", "DU", "9999"), + ColInfo("o3_col", "total column ozone above", "DU", ("9999", "99999", "99.999")), # # "O3 Uncert" # TODO: uncertainty in which ozone value? - ColInfo("o3_uncert", "uncertainty in ozone", "%", "99999.000"), + ColInfo("o3_uncert", "uncertainty in ozone", "%", ("99999.000", "99.999")), ] @@ -321,10 +321,11 @@ def get_text(): df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) - df["station"] = meta["Station"] + df["station"] = meta["Station"] # TODO: could normalize to place df["station_height_str"] = meta["Station Height"] # e.g. '1743 meters' df["o3_tot_cmr_str"] = meta["Sonde Total O3"] df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] # e.g. '325 (62) DU' + # TODO: '99999 (99999) DU' if NA, could put empty string instead? # Add metadata if hasattr(df, "attrs"): From 62a2d53f40bef5ee803e53a215fdab27df396df7 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 09:52:53 -0700 Subject: [PATCH 30/51] notes --- monetio/profile/gml_ozonesonde.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index e84ffc36..c9324cd2 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -212,7 +212,7 @@ class ColInfo(NamedTuple): ColInfo("o3", "ozone mixing ratio", "ppmv", "99.999"), # # "Ozone" - # note 1 DU = 0.001 atm-cm + # Note 1 DU = 0.001 atm-cm # TODO: goes up with height so could be ozone below? ColInfo("o3_cm", "total ozone", "atm-cm", "99.9990"), # @@ -223,11 +223,14 @@ class ColInfo(NamedTuple): ColInfo("o3_nd", "ozone number density", "10^11 cm-3", "999.999"), # # "O3 Res" - # TODO: goes down with height so could be total ozone above? + # From Owen Cooper (NOAA CSL): + # This is the amount of ozone in Dobson units above a given altitude. + # The values above the maximum balloon altitude are from a climatology. + # This is mainly for UV absorption research. ColInfo("o3_col", "total column ozone above", "DU", ("9999", "99999", "99.999")), # # "O3 Uncert" - # TODO: uncertainty in which ozone value? + # Estimated uncertainty in the ozone measurement at a given altitude. ColInfo("o3_uncert", "uncertainty in ozone", "%", ("99999.000", "99.999")), ] From a1f86b2edb943cde2929e4b6f4d26a9ef68f0696 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 10:57:12 -0700 Subject: [PATCH 31/51] Support skipping files that error So far, seems like two things: - some files have only one header block, consisting of some info lines and then the key-value meta directly after - some files don't have the o3 uncert column --- monetio/profile/gml_ozonesonde.py | 36 +++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index c9324cd2..69dbb48a 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -124,7 +124,7 @@ def get_files(place): return df -def add_data(dates, *, place=None, n_procs=1): +def add_data(dates, *, place=None, n_procs=1, errors="raise"): """Retrieve and load GML ozonesonde data as a DataFrame. Parameters @@ -135,6 +135,7 @@ def add_data(dates, *, place=None, n_procs=1): If not provided, all places will be used. n_procs : int For Dask. + errors : {'raise', 'warn', 'ignore'} """ import dask import dask.dataframe as dd @@ -142,6 +143,9 @@ def add_data(dates, *, place=None, n_procs=1): dates = pd.DatetimeIndex(dates) dates_min, dates_max = dates.min(), dates.max() + if errors not in {"raise", "warn", "ignore"}: + raise ValueError(f"Invalid errors setting: {errors!r}.") + print("Discovering files...") df_urls = discover_files(place=place) print(f"Discovered {len(df_urls)} 100-m files.") @@ -151,9 +155,21 @@ def add_data(dates, *, place=None, n_procs=1): if not urls: raise RuntimeError(f"No files found for dates {dates_min} to {dates_max}, place={place}.") + def func(fp_or_url): + try: + return read_100m(fp_or_url) + except Exception as e: + msg = f"Failed to read {fp_or_url}: {e}" + if errors == "raise": + raise RuntimeError(msg) from e + else: + if errors == "warn": + warnings.warn(msg) + return pd.DataFrame() + print(f"Aggregating {len(urls)} files...") - dfs = [dask.delayed(read_100m)(f) for f in urls] - dff = dd.from_delayed(dfs) + dfs = [dask.delayed(func)(url) for url in urls] + dff = dd.from_delayed(dfs, verify_meta=errors == "raise") df = dff.compute(num_workers=n_procs).reset_index() # Time subset again in case of times in files extending @@ -260,7 +276,10 @@ def get_text(): return text blocks = get_text().replace("\r", "").split("\n\n") - assert len(blocks) == 5 + nblocks = len(blocks) + if not nblocks == 5: + heads = "\n".join("\n".join(b.splitlines()[:2] + ["..."]) for b in blocks) + raise ValueError(f"Expected 5 blocks, got {nblocks}:\n{heads}") # Metadata meta = {} @@ -281,7 +300,7 @@ def get_text(): for k, v in meta.items(): meta[k] = re.sub(r"\s{2,}", " ", v) - assert list(meta) == [ + meta_keys_expected = [ "Station", "Station Height", "Latitude", @@ -298,8 +317,13 @@ def get_text(): "Sonde Total O3", "Sonde Total O3 (SBUV)", ] + if not list(meta) == meta_keys_expected: + raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.") - assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_L100) == 14 + data_block_ncol = len(blocks[4].splitlines()[2].split()) + if not data_block_ncol == len(COL_INFO_L100) == 14: + head = "\n".join(blocks[4].splitlines()[:4] + ["..."]) + raise ValueError(f"Expected 14 columns in data block, got {data_block_ncol}:\n{head}") names = [c.name for c in COL_INFO_L100] dtype = {c.name: float for c in COL_INFO_L100} From 7ba055de90398f4087c4af11b82219213c525a49 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 12:08:01 -0700 Subject: [PATCH 32/51] Support last col missing --- monetio/profile/gml_ozonesonde.py | 51 +++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 69dbb48a..59b5abb2 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -320,15 +320,54 @@ def get_text(): if not list(meta) == meta_keys_expected: raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.") + data_head1 = blocks[4].splitlines()[0] # TODO: without splitlines? maybe startswith + data_head1_split = data_head1.split() + data_head1_split_expected = [ + "Level", + "Press", + "Alt", + "Pottp", + "Temp", + "FtempV", + "Hum", + "Ozone", + "Ozone", + "Ozone", + "Ptemp", + "O3", + "#", + "DN", + "O3", + "Res", + "O3", + "Uncert", + ] + if not ( + data_head1_split == data_head1_split_expected[:-2] + or data_head1_split == data_head1_split_expected + ): + raise ValueError( + f"Expected data header line 1 like\n{' '.join(data_head1_split_expected)} " + f"(O3 Uncert allowed to be missing)\ngot\n{' '.join(data_head1_split)}" + ) + have_uncert = len(data_head1_split) == len(data_head1_split_expected) + + col_info = COL_INFO_L100[:] + if not have_uncert: + _ = col_info.pop() + + ncol_expected = len(col_info) data_block_ncol = len(blocks[4].splitlines()[2].split()) - if not data_block_ncol == len(COL_INFO_L100) == 14: + if not data_block_ncol == ncol_expected: head = "\n".join(blocks[4].splitlines()[:4] + ["..."]) - raise ValueError(f"Expected 14 columns in data block, got {data_block_ncol}:\n{head}") + raise ValueError( + f"Expected {ncol_expected} columns in data block, " f"got {data_block_ncol}:\n{head}" + ) - names = [c.name for c in COL_INFO_L100] - dtype = {c.name: float for c in COL_INFO_L100} + names = [c.name for c in col_info] + dtype = {c.name: float for c in col_info} dtype["lev"] = int - na_values = {c.name: c.na_val for c in COL_INFO_L100 if c.na_val is not None} + na_values = {c.name: c.na_val for c in col_info if c.na_val is not None} df = pd.read_csv( StringIO(blocks[4]), @@ -362,7 +401,7 @@ def get_text(): "long_name": c.long_name, "units": c.units, } - for c in COL_INFO_L100 + for c in col_info } return df From 6a3dbe2dbf84a8cf17e34a32a95ba274dadf188d Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 12:23:06 -0700 Subject: [PATCH 33/51] Check data block header lines with str more strict, but should be faster, and luckily the two cases seem to cover all --- monetio/profile/gml_ozonesonde.py | 57 ++++++++++++++----------------- tests/test_gml_ozonesonde.py | 14 ++++++++ 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 59b5abb2..c28be378 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -251,6 +251,17 @@ class ColInfo(NamedTuple): ] +_DATA_BLOCK_START_L100 = """\ +Level Press Alt Pottp Temp FtempV Hum Ozone Ozone Ozone Ptemp O3 # DN O3 Res O3 Uncert + Num hPa km K C C % mPa ppmv atmcm C 10^11/cc DU % +""" + +_DATA_BLOCK_START_L100_NO_UNCERT = """\ +Level Press Alt Pottp Temp FtempV Hum Ozone Ozone Ozone Ptemp O3 # DN O3 Res + Num hPa km K C C % mPa ppmv atmcm C 10^11/cc DU +""" + + def read_100m(fp_or_url): """Read a GML ozonesonde 100-m file (``.l100``). @@ -320,46 +331,28 @@ def get_text(): if not list(meta) == meta_keys_expected: raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.") - data_head1 = blocks[4].splitlines()[0] # TODO: without splitlines? maybe startswith - data_head1_split = data_head1.split() - data_head1_split_expected = [ - "Level", - "Press", - "Alt", - "Pottp", - "Temp", - "FtempV", - "Hum", - "Ozone", - "Ozone", - "Ozone", - "Ptemp", - "O3", - "#", - "DN", - "O3", - "Res", - "O3", - "Uncert", - ] - if not ( - data_head1_split == data_head1_split_expected[:-2] - or data_head1_split == data_head1_split_expected - ): + data_block = blocks[4] + if data_block.startswith(_DATA_BLOCK_START_L100): + have_uncert = True + elif data_block.startswith(_DATA_BLOCK_START_L100_NO_UNCERT): + have_uncert = False + else: + head = "\n".join(data_block.splitlines()[:2] + ["..."]) raise ValueError( - f"Expected data header line 1 like\n{' '.join(data_head1_split_expected)} " - f"(O3 Uncert allowed to be missing)\ngot\n{' '.join(data_head1_split)}" + "Data block does not start with expected header line(s) " + "(O3 Uncert allowed to be missing):\n" + f"{_DATA_BLOCK_START_L100}\n" + f"got\n{head}" ) - have_uncert = len(data_head1_split) == len(data_head1_split_expected) col_info = COL_INFO_L100[:] if not have_uncert: _ = col_info.pop() ncol_expected = len(col_info) - data_block_ncol = len(blocks[4].splitlines()[2].split()) + data_block_ncol = len(data_block[:400].splitlines()[2].split()) if not data_block_ncol == ncol_expected: - head = "\n".join(blocks[4].splitlines()[:4] + ["..."]) + head = "\n".join(data_block.splitlines()[:4] + ["..."]) raise ValueError( f"Expected {ncol_expected} columns in data block, " f"got {data_block_ncol}:\n{head}" ) @@ -370,7 +363,7 @@ def get_text(): na_values = {c.name: c.na_val for c in col_info if c.na_val is not None} df = pd.read_csv( - StringIO(blocks[4]), + StringIO(data_block), skiprows=2, header=None, delimiter=r"\s+", diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 8f3e7898..dd754b05 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -26,6 +26,20 @@ def test_read_100m(): assert df.attrs["ds_attrs"]["Sonde Total O3 (SBUV)"] == "325 (62) DU" +@pytest.mark.parametrize( + "url", + [ + # Missing 'O3 Uncert' + r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/San%20Cristobal,%20Galapagos/100%20Meter%20Average%20Files/sc204_2002_02_01_03.l100", + # Missing 'O3 Uncert' + different header blocks (only 1) + r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Narragansett,%20Rhode%20Island/100%20Meter%20Average%20Files/ri058_2004_08_05_18.l100", + ], +) +def test_read_100m_nonstd(url): + df = gml_ozonesonde.read_100m(url) + assert len(df) > 0 + + def test_add_data(): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") df = gml_ozonesonde.add_data(dates, n_procs=2) From a3981903e7a74845a788a72874dd265ea3e84ddd Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 12:44:20 -0700 Subject: [PATCH 34/51] Support 2-block case --- monetio/profile/gml_ozonesonde.py | 32 ++++++++++++++++++++++--------- tests/test_gml_ozonesonde.py | 11 +++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index c28be378..6e7ce446 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -288,13 +288,25 @@ def get_text(): blocks = get_text().replace("\r", "").split("\n\n") nblocks = len(blocks) - if not nblocks == 5: + if nblocks == 5: # normal + meta_block = blocks[3] + data_block = blocks[4] + elif nblocks == 2: + block_lines = blocks[0].splitlines() + for i, line in enumerate(block_lines): + if line.startswith(("Station:", "Station: ", "Station ")): + break + else: + raise ValueError(f"Expected to find metadata to start with Station, got:\n{blocks[0]}") + meta_block = "\n".join(block_lines[i:]) + data_block = blocks[1] + else: heads = "\n".join("\n".join(b.splitlines()[:2] + ["..."]) for b in blocks) - raise ValueError(f"Expected 5 blocks, got {nblocks}:\n{heads}") + raise ValueError(f"Expected 2 or 5 blocks, got {nblocks}:\n{heads}") # Metadata meta = {} - todo = blocks[3].splitlines()[::-1] + todo = meta_block.splitlines()[::-1] on_val_side = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "] while todo: line = todo.pop() @@ -319,8 +331,9 @@ def get_text(): "Flight Number", "Launch Date", "Launch Time", - "Radiosonde Type", - "Radiosonde Num", + # May see 'Vaisala number' and 'Vaisala humicap' instead of these two: + # "Radiosonde Type", + # "Radiosonde Num", "O3 Sonde ID", "Background", "Flowrate", @@ -328,10 +341,9 @@ def get_text(): "Sonde Total O3", "Sonde Total O3 (SBUV)", ] - if not list(meta) == meta_keys_expected: + if not set(meta) >= set(meta_keys_expected): raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.") - data_block = blocks[4] if data_block.startswith(_DATA_BLOCK_START_L100): have_uncert = True elif data_block.startswith(_DATA_BLOCK_START_L100_NO_UNCERT): @@ -354,8 +366,10 @@ def get_text(): if not data_block_ncol == ncol_expected: head = "\n".join(data_block.splitlines()[:4] + ["..."]) raise ValueError( - f"Expected {ncol_expected} columns in data block, " f"got {data_block_ncol}:\n{head}" + f"Expected {ncol_expected} columns in data block, " + f"got {data_block_ncol} in first data line:\n{head}" ) + # TODO: allow pandas to skip bad lines with `on_bad_lines='skip'`? names = [c.name for c in col_info] dtype = {c.name: float for c in col_info} @@ -380,7 +394,7 @@ def get_text(): df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) - df["station"] = meta["Station"] # TODO: could normalize to place + df["station"] = meta["Station"] # TODO: could normalize to place (in add_data?)? df["station_height_str"] = meta["Station Height"] # e.g. '1743 meters' df["o3_tot_cmr_str"] = meta["Sonde Total O3"] df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] # e.g. '325 (62) DU' diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index dd754b05..3ffcbb5b 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -40,6 +40,17 @@ def test_read_100m_nonstd(url): assert len(df) > 0 +def test_read_100m_bad_data_line(): + url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/San%20Cristobal,%20Galapagos/100%20Meter%20Average%20Files/sc204_2002_01_31_12.l100" + # Level Press Alt Pottp Temp FtempV Hum Ozone Ozone Ozone Ptemp O3 # DN O3 Res + # Num hPa km K C C % mPa ppmv atmcm C 10^11/cc DU + # 0 -6331.0 0.008 0.0-3323.0 999.9 999-6666.00 10.529 0.0000 -91.8 1583.081 260 + # 1 892.2 0.100 301.1 18.3 19.1 105 1.07 0.012 0.0009 32.3 2.649 259 + + with pytest.raises(ValueError, match="Expected 13 columns in data block"): + _ = gml_ozonesonde.read_100m(url) + + def test_add_data(): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") df = gml_ozonesonde.add_data(dates, n_procs=2) From ba349031f06aad0128ab53f8f0372ebea1d55934 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 13:19:13 -0700 Subject: [PATCH 35/51] Add different data header case besides this one and the bad data line one, all others load now (tested on Hopper) --- tests/test_gml_ozonesonde.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 3ffcbb5b..52ad5075 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -51,6 +51,15 @@ def test_read_100m_bad_data_line(): _ = gml_ozonesonde.read_100m(url) +def test_read_100m_bad_header_line(): + url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu913_2021_08_10_16.l100" + # Level Press Alt Pottp Temp FtempV Hum Ozone Ozone Ozone Ptemp O3 # DN O3 Res Ftemp Water + # Num hPa km K C C % mPa ppmv atmcm C 10^11/cc DU C ppmv + + with pytest.raises(ValueError, match="Data block does not start with expected header"): + _ = gml_ozonesonde.read_100m(url) + + def test_add_data(): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") df = gml_ozonesonde.add_data(dates, n_procs=2) From ba6fb7aab9b5a6e8b0bfcfc25684e8bf2517c236 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 13:20:03 -0700 Subject: [PATCH 36/51] Normalize station to the place names --- monetio/profile/gml_ozonesonde.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 6e7ce446..729822ef 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -175,6 +175,36 @@ def func(fp_or_url): # Time subset again in case of times in files extending df = df[df["time"].between(dates_min, dates_max, inclusive="both")] + # Normalize station + # All values, as of 2024-02-08: + # > df.station.value_counts().sort_index() + # Boulder, CO 650757 + # Hilo, Hawaii 627325 + # Hilo,Hawaii 192 + # Huntsville 10982 + # Huntsville, AL 314375 + # Mauna Loa Observatory, Hawaii 605 (different site than Hilo) + # Pago Pago, American Samoa 370141 + # San Cristobal, Galapagos, Ecuador 150244 + # South Pole 661422 + # Summit, Greenland 164620 + # Suva, Fiji 164065 + # Trinidad Head, CA 426409 + # University of Rhode Island 105878 + # helikite test 326 + # hsv 340 + repl = { + "Boulder, CO": "Boulder, Colorado", + "Hilo,Hawaii": "Hilo, Hawaii", + "Huntsville": "Huntsville, Alabama", + "Huntsville, AL": "Huntsville, Alabama", + "San Cristobal, Galapagos, Ecuador": "San Cristobal, Galapagos", + "South Pole": "South Pole, Antarctica", + "Trinidad Head, CA": "Trinidad Head, California", + } + assert set(repl.values()) <= set(PLACES) + df["station"] = df["station"].replace(repl) + # Add metadata if hasattr(df, "attrs"): df.attrs["ds_attrs"] = {"urls": urls} From 37dfbef9d89a490f83b516ec8c98211afdc8b218 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 13:24:14 -0700 Subject: [PATCH 37/51] todo --- monetio/profile/gml_ozonesonde.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 729822ef..e8cc0ce8 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -424,11 +424,11 @@ def get_text(): df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) - df["station"] = meta["Station"] # TODO: could normalize to place (in add_data?)? + df["station"] = meta["Station"] df["station_height_str"] = meta["Station Height"] # e.g. '1743 meters' df["o3_tot_cmr_str"] = meta["Sonde Total O3"] df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] # e.g. '325 (62) DU' - # TODO: '99999 (99999) DU' if NA, could put empty string instead? + # TODO: '99999 (99999) DU' if NA, could put empty string or None or NaN instead? # Add metadata if hasattr(df, "attrs"): From 0ade35e0afe03291ece02f1acb7267db8e985496 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 13:36:05 -0700 Subject: [PATCH 38/51] Fix index in aggregated frame --- monetio/profile/gml_ozonesonde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index e8cc0ce8..5408306b 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -216,7 +216,7 @@ def func(fp_or_url): for c in COL_INFO_L100 } - return df + return df.drop(columns=["index"], errors="ignore").reset_index(drop=True) class ColInfo(NamedTuple): From 251dcc38f82dd70d99c64fbc5e27c2688b39b1dc Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 8 Feb 2024 14:59:01 -0700 Subject: [PATCH 39/51] doc [skip ci] --- monetio/profile/gml_ozonesonde.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 5408306b..cf15a8ee 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -130,12 +130,15 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"): Parameters ---------- dates : sequence of datetime-like + The period between the min and max (both inclusive) + will be used to select the files to load. place : str or sequence of str, optional For example 'Boulder, Colorado'. If not provided, all places will be used. n_procs : int For Dask. - errors : {'raise', 'warn', 'ignore'} + errors : {'raise', 'warn', 'skip'} + What to do when there is an error reading a file. """ import dask import dask.dataframe as dd From 80eeec445ac31220902a4146d6530adac1966d81 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 9 Feb 2024 14:10:47 -0700 Subject: [PATCH 40/51] Add NA vals for altitude found a few cases of 99.9 --- monetio/profile/gml_ozonesonde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index cf15a8ee..439fb38d 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -240,7 +240,7 @@ class ColInfo(NamedTuple): # # "Alt" # TODO: not sure about this na val - ColInfo("altitude", "altitude", "km", "999.999"), + ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")), # # "Pottp" ColInfo("theta", "potential temperature", "K", "9999.9"), From 9b2e264255935586e0a1fb1bbebd9696c2b8153a Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 11:26:22 -0700 Subject: [PATCH 41/51] notes based on email from Bryan Johnson (NOAA GML) --- monetio/profile/gml_ozonesonde.py | 33 ++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 439fb38d..65cf4223 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -236,36 +236,45 @@ class ColInfo(NamedTuple): ColInfo("lev", "level", "", None), # # "Press" - ColInfo("press", "radiosonde corrected pressure", "hPa", "9999.9"), + # Atmospheric pressure, from the radiosonde. + ColInfo("press", "pressure", "hPa", "9999.9"), # # "Alt" - # TODO: not sure about this na val + # Altitude above sea level + # computed from radiosonde pressure and temperature + # (or GPS if available?). ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")), # # "Pottp" ColInfo("theta", "potential temperature", "K", "9999.9"), # # "Temp" - ColInfo("temp", "radiosonde corrected temperature", "degC", "999.9"), + # Atmospheric temperature, from the radiosonde. + ColInfo("temp", "air temperature", "degC", "999.9"), # # "FtempV" - ColInfo("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"), + # Frost point temperature, calculated from the radiosonde RH and temp. + ColInfo("ftempv", "frost point temperature", "degC", "999.9"), # # "Hum" - ColInfo("rh", "radiosonde corrected relative humidity", "%", "999"), + # RH, measured by the radiosonde. + ColInfo("rh", "relative humidity", "%", "999"), # # "Ozone" + # Measured by the ozone sensor cell. ColInfo("o3_press", "ozone partial pressure", "mPa", "99.90"), # # "Ozone" + # Calculated from the ozone partial pressure and atmospheric pressure. ColInfo("o3", "ozone mixing ratio", "ppmv", "99.999"), # # "Ozone" # Note 1 DU = 0.001 atm-cm - # TODO: goes up with height so could be ozone below? - ColInfo("o3_cm", "total ozone", "atm-cm", "99.9990"), + # Cumulative column ozone amount at this point in the profile. + ColInfo("o3_int", "integrated ozone below", "atm-cm", "99.9990"), # # "Ptemp" + # Pump temperature, from thermistor in the vicinity of the pump block. ColInfo("ptemp", "pump temperature", "degC", "999.9"), # # "O3 # DN" @@ -276,7 +285,7 @@ class ColInfo(NamedTuple): # This is the amount of ozone in Dobson units above a given altitude. # The values above the maximum balloon altitude are from a climatology. # This is mainly for UV absorption research. - ColInfo("o3_col", "total column ozone above", "DU", ("9999", "99999", "99.999")), + ColInfo("o3_res", "estimated total column ozone above", "DU", ("9999", "99999", "99.999")), # # "O3 Uncert" # Estimated uncertainty in the ozone measurement at a given altitude. @@ -429,8 +438,14 @@ def get_text(): df["longitude"] = float(meta["Longitude"]) df["station"] = meta["Station"] df["station_height_str"] = meta["Station Height"] # e.g. '1743 meters' + + # Sonde total column ozone amount ('325 (62) DU') in two methods: + # - CMR: extrapolate constant mixing ratio above balloon burst to get ozone above that (the residual) + # - SBUV: compute the residual from the SBUV climate tables + # The first number is the total column ozone (integrated + residual). + # The number in parentheses is the residual. df["o3_tot_cmr_str"] = meta["Sonde Total O3"] - df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] # e.g. '325 (62) DU' + df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] # TODO: '99999 (99999) DU' if NA, could put empty string or None or NaN instead? # Add metadata From 5af1ba8f7f0846d03158a1ac054a89f8d4b11c8f Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 11:32:49 -0700 Subject: [PATCH 42/51] Set '99999 (99999) DU' total col to NaN --- monetio/profile/gml_ozonesonde.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 65cf4223..868d9a84 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -428,25 +428,24 @@ def get_text(): na_values=na_values, ) - # Note: This is close to "Pottp" but not exactly the same - # theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286) - # Add some variables from header (these don't change in the profile) time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) df["longitude"] = float(meta["Longitude"]) + df["station"] = meta["Station"] df["station_height_str"] = meta["Station Height"] # e.g. '1743 meters' - # Sonde total column ozone amount ('325 (62) DU') in two methods: + # Sonde total column ozone amount ('325 (62) DU') from two methods: # - CMR: extrapolate constant mixing ratio above balloon burst to get ozone above that (the residual) # - SBUV: compute the residual from the SBUV climate tables # The first number is the total column ozone (integrated + residual). # The number in parentheses is the residual. df["o3_tot_cmr_str"] = meta["Sonde Total O3"] df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] - # TODO: '99999 (99999) DU' if NA, could put empty string or None or NaN instead? + for col in ["o3_tot_cmr_str", "o3_tot_sbuv_str"]: + df[col] = df[col].replace("99999 (99999) DU", np.nan) # Add metadata if hasattr(df, "attrs"): From 72c6da4c9a0877de30990bd4d8cdb5c456ca43bc Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 11:38:26 -0700 Subject: [PATCH 43/51] notes --- monetio/profile/gml_ozonesonde.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 868d9a84..5426ed6c 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -175,7 +175,7 @@ def func(fp_or_url): dff = dd.from_delayed(dfs, verify_meta=errors == "raise") df = dff.compute(num_workers=n_procs).reset_index() - # Time subset again in case of times in files extending + # Time subset again just in case df = df[df["time"].between(dates_min, dates_max, inclusive="both")] # Normalize station @@ -404,12 +404,12 @@ def get_text(): _ = col_info.pop() ncol_expected = len(col_info) - data_block_ncol = len(data_block[:400].splitlines()[2].split()) - if not data_block_ncol == ncol_expected: + data_block_first_ncol = len(data_block[:400].splitlines()[2].split()) + if not data_block_first_ncol == ncol_expected: head = "\n".join(data_block.splitlines()[:4] + ["..."]) raise ValueError( f"Expected {ncol_expected} columns in data block, " - f"got {data_block_ncol} in first data line:\n{head}" + f"got {data_block_first_ncol} in first data line:\n{head}" ) # TODO: allow pandas to skip bad lines with `on_bad_lines='skip'`? @@ -428,7 +428,7 @@ def get_text(): na_values=na_values, ) - # Add some variables from header (these don't change in the profile) + # Add some variables from header as columns (these don't change in the profile) time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}") df["time"] = time.tz_localize(None) df["latitude"] = float(meta["Latitude"]) From 204d487dc87cca6c3cf3fcc5889855bfe9a7e519 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 11:43:25 -0700 Subject: [PATCH 44/51] Ensure tot cols still object dtype for when Dask checks for consistency --- monetio/profile/gml_ozonesonde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 5426ed6c..12caf312 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -445,7 +445,7 @@ def get_text(): df["o3_tot_cmr_str"] = meta["Sonde Total O3"] df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"] for col in ["o3_tot_cmr_str", "o3_tot_sbuv_str"]: - df[col] = df[col].replace("99999 (99999) DU", np.nan) + df[col] = df[col].replace("99999 (99999) DU", np.nan).astype(object) # Add metadata if hasattr(df, "attrs"): From d8670f1ddadbed4f381219b9b3ed3463ad04cd68 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 13:17:01 -0700 Subject: [PATCH 45/51] notes [skip ci] --- monetio/profile/gml_ozonesonde.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 12caf312..00b08b2f 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -241,8 +241,9 @@ class ColInfo(NamedTuple): # # "Alt" # Altitude above sea level - # computed from radiosonde pressure and temperature - # (or GPS if available?). + # in the sounding computed from radiosonde pressure and temperature (or GPS if available?). + # For 100-m data, the center of the 100-m layer + # (data values included in the layer averages have altitude +/- 50 m of this). ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")), # # "Pottp" From dd7dc3122ab1f027e4651ec0fc4d54a5687c73ac Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 13:26:19 -0700 Subject: [PATCH 46/51] notes [skip ci] --- monetio/profile/gml_ozonesonde.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 00b08b2f..cd8447b8 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -244,6 +244,9 @@ class ColInfo(NamedTuple): # in the sounding computed from radiosonde pressure and temperature (or GPS if available?). # For 100-m data, the center of the 100-m layer # (data values included in the layer averages have altitude +/- 50 m of this). + # If not invalid and removed, the first row is the actual altitude of the launch, + # and the next row begins the clean 100-m intervals + # (i.e. first diff may not be 0.1, but the rest should be). ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")), # # "Pottp" From 8ce5182aba50e88b979fc2af767ddd7c29ed0b18 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 14:13:11 -0700 Subject: [PATCH 47/51] Add flight number to df otherwise can't differentiate launches at same time and place --- monetio/profile/gml_ozonesonde.py | 2 ++ tests/test_gml_ozonesonde.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index cd8447b8..29e55449 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -176,6 +176,7 @@ def func(fp_or_url): df = dff.compute(num_workers=n_procs).reset_index() # Time subset again just in case + # (file time may not match launch time; file time seems to be floored to nearest hour) df = df[df["time"].between(dates_min, dates_max, inclusive="both")] # Normalize station @@ -440,6 +441,7 @@ def get_text(): df["station"] = meta["Station"] df["station_height_str"] = meta["Station Height"] # e.g. '1743 meters' + df["flight_number"] = meta["Flight Number"] # Sonde total column ozone amount ('325 (62) DU') from two methods: # - CMR: extrapolate constant mixing ratio above balloon burst to get ozone above that (the residual) diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 52ad5075..efe99b6c 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -95,3 +95,20 @@ def test_add_data_invalid_place(place): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") with pytest.raises(ValueError, match="Invalid place"): _ = gml_ozonesonde.add_data(dates, place=place) + + +def test_same_place_and_launch_time(): + # Two files with same file time and launch time: + # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl774_2003_03_10_20.l100 + # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl775_2003_03_10_20.l100 + # File time: 2003-03-10 20 + # Launch time: 2003-03-10 20:41:11 + dates = ["2003-03-10 20", "2003-03-10 21"] + df = gml_ozonesonde.add_data(dates, place="Boulder, Colorado", n_procs=2) + assert len(df) > 0 + + # Only one launch time + assert df["time"].nunique() == 1 + + # But multiple profiles + assert df["flight_number"].nunique() == 2 From cad3899ef3bb121b65b9b600909920d254deb78e Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 14 Feb 2024 16:59:31 -0700 Subject: [PATCH 48/51] Test URLs --- tests/test_gml_ozonesonde.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index efe99b6c..6d2f6969 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -112,3 +112,8 @@ def test_same_place_and_launch_time(): # But multiple profiles assert df["flight_number"].nunique() == 2 + + assert df.attrs["ds_attrs"]["urls"] == [ + r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl774_2003_03_10_20.l100", + r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl775_2003_03_10_20.l100", + ] From 26250bdc9bebc6ed5bff12b47094beae7b673433 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 11 Apr 2024 16:39:01 -0600 Subject: [PATCH 49/51] More info about place arg --- monetio/profile/gml_ozonesonde.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index 29e55449..e5f0af91 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -135,6 +135,8 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"): place : str or sequence of str, optional For example 'Boulder, Colorado'. If not provided, all places will be used. + Valid options correspond to the directories in https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/ + and may include data from more than one unique site ('station'). n_procs : int For Dask. errors : {'raise', 'warn', 'skip'} From 2751b13312f723f963f13f9212670471b3e752d4 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 11 Apr 2024 16:47:45 -0600 Subject: [PATCH 50/51] `place` -> `location` fancier sounding not using 'siteid' since one folder's data can have multiple unique 'station' values (akin to 'siteid') --- monetio/profile/gml_ozonesonde.py | 64 ++++++++++++++++--------------- tests/test_gml_ozonesonde.py | 20 +++++----- 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index e5f0af91..c993b3d8 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -40,7 +40,7 @@ def wrapper(*args, **kwargs): return wrapper -PLACES = [ +LOCATIONS = [ "Boulder, Colorado", "Hilo, Hawaii", "Huntsville, Alabama", @@ -54,37 +54,37 @@ def wrapper(*args, **kwargs): ] -_FILES_L100_CACHE = {place: None for place in PLACES} +_FILES_L100_CACHE = {location: None for location in LOCATIONS} -def discover_files(place=None, *, n_threads=3, cache=True): +def discover_files(location=None, *, n_threads=3, cache=True): import itertools from multiprocessing.pool import ThreadPool base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde" - if place is None: - places = PLACES - elif isinstance(place, str): - places = [place] + if location is None: + locations = LOCATIONS + elif isinstance(location, str): + locations = [location] else: - places = place + locations = location - invalid = set(places) - set(PLACES) + invalid = set(locations) - set(LOCATIONS) if invalid: - raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.") + raise ValueError(f"Invalid location(s): {invalid}. Valid options: {LOCATIONS}.") @retry - def get_files(place): - cached = _FILES_L100_CACHE[place] + def get_files(location): + cached = _FILES_L100_CACHE[location] if cached is not None: return cached - if place == "South Pole, Antarctica": - url_place = "South Pole, Antartica" # note sp + if location == "South Pole, Antarctica": + url_location = "South Pole, Antartica" # note sp else: - url_place = place - url = f"{base}/{url_place}/100 Meter Average Files/".replace(" ", "%20") + url_location = location + url = f"{base}/{url_location}/100 Meter Average Files/".replace(" ", "%20") print(url) r = requests.get(url, timeout=10) @@ -103,28 +103,28 @@ def get_files(place): except ValueError: warnings.warn(f"Failed to parse file name {fn!r} for time.") t = np.nan - data.append((place, t, fn, f"{url}{fn}")) + data.append((location, t, fn, f"{url}{fn}")) if not data: - warnings.warn(f"No files detected for place {place!r}.") + warnings.warn(f"No files detected for location {location!r}.") return data - with ThreadPool(processes=min(n_threads, len(places))) as pool: - data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, places))) + with ThreadPool(processes=min(n_threads, len(locations))) as pool: + data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, locations))) - df = pd.DataFrame(data, columns=["place", "time", "fn", "url"]) + df = pd.DataFrame(data, columns=["location", "time", "fn", "url"]) if cache: - for place in places: - _FILES_L100_CACHE[place] = list( - df[df["place"] == place].itertuples(index=False, name=None) + for location in locations: + _FILES_L100_CACHE[location] = list( + df[df["location"] == location].itertuples(index=False, name=None) ) return df -def add_data(dates, *, place=None, n_procs=1, errors="raise"): +def add_data(dates, *, location=None, n_procs=1, errors="raise"): """Retrieve and load GML ozonesonde data as a DataFrame. Parameters @@ -132,11 +132,11 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"): dates : sequence of datetime-like The period between the min and max (both inclusive) will be used to select the files to load. - place : str or sequence of str, optional + location : str or sequence of str, optional For example 'Boulder, Colorado'. - If not provided, all places will be used. + If not provided, all locations will be used. Valid options correspond to the directories in https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/ - and may include data from more than one unique site ('station'). + and may include data from more than one unique site (output column 'station'). n_procs : int For Dask. errors : {'raise', 'warn', 'skip'} @@ -152,13 +152,15 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"): raise ValueError(f"Invalid errors setting: {errors!r}.") print("Discovering files...") - df_urls = discover_files(place=place) + df_urls = discover_files(location=location) print(f"Discovered {len(df_urls)} 100-m files.") urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist() if not urls: - raise RuntimeError(f"No files found for dates {dates_min} to {dates_max}, place={place}.") + raise RuntimeError( + f"No files found for dates {dates_min} to {dates_max}, location={location!r}." + ) def func(fp_or_url): try: @@ -208,7 +210,7 @@ def func(fp_or_url): "South Pole": "South Pole, Antarctica", "Trinidad Head, CA": "Trinidad Head, California", } - assert set(repl.values()) <= set(PLACES) + assert set(repl.values()) <= set(LOCATIONS) df["station"] = df["station"].replace(repl) # Add metadata diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index 6d2f6969..b12fd39b 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -7,7 +7,7 @@ def test_discover_files(): files = gml_ozonesonde.discover_files() assert len(files) > 0 - assert set(files["place"].unique()) == set(gml_ozonesonde.PLACES) + assert set(files["location"].unique()) == set(gml_ozonesonde.LOCATIONS) def test_read_100m(): @@ -74,37 +74,37 @@ def test_add_data(): assert df["station"].nunique() == latlon.nunique() -def test_add_data_place_sel(): +def test_add_data_location_sel(): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") df = gml_ozonesonde.add_data( dates, - place=["Boulder, Colorado", "South Pole, Antarctica"], + location=["Boulder, Colorado", "South Pole, Antarctica"], n_procs=2, ) assert len(df) > 0 latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str) - assert latlon.nunique() == 2, "selected two places" + assert latlon.nunique() == 2, "selected two locations" @pytest.mark.parametrize( - "place", + "location", ["asdf", ["asdf", "blah"], ("asdf", "blah")], ) -def test_add_data_invalid_place(place): +def test_add_data_invalid_location(location): dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H") - with pytest.raises(ValueError, match="Invalid place"): - _ = gml_ozonesonde.add_data(dates, place=place) + with pytest.raises(ValueError, match="Invalid location"): + _ = gml_ozonesonde.add_data(dates, location=location) -def test_same_place_and_launch_time(): +def test_same_location_and_launch_time(): # Two files with same file time and launch time: # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl774_2003_03_10_20.l100 # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl775_2003_03_10_20.l100 # File time: 2003-03-10 20 # Launch time: 2003-03-10 20:41:11 dates = ["2003-03-10 20", "2003-03-10 21"] - df = gml_ozonesonde.add_data(dates, place="Boulder, Colorado", n_procs=2) + df = gml_ozonesonde.add_data(dates, location="Boulder, Colorado", n_procs=2) assert len(df) > 0 # Only one launch time From 649a61111d42972ca663e1e203eecf70a80289e3 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 11 Apr 2024 16:53:14 -0600 Subject: [PATCH 51/51] 'siteid' in `add_data` output --- monetio/profile/gml_ozonesonde.py | 5 ++++- tests/test_gml_ozonesonde.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py index c993b3d8..d0754126 100644 --- a/monetio/profile/gml_ozonesonde.py +++ b/monetio/profile/gml_ozonesonde.py @@ -136,7 +136,7 @@ def add_data(dates, *, location=None, n_procs=1, errors="raise"): For example 'Boulder, Colorado'. If not provided, all locations will be used. Valid options correspond to the directories in https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/ - and may include data from more than one unique site (output column 'station'). + and may include data from more than one unique site (output column 'siteid'). n_procs : int For Dask. errors : {'raise', 'warn', 'skip'} @@ -213,6 +213,9 @@ def func(fp_or_url): assert set(repl.values()) <= set(LOCATIONS) df["station"] = df["station"].replace(repl) + # Normalized station name as site ID + df = df.rename(columns={"station": "siteid"}) + # Add metadata if hasattr(df, "attrs"): df.attrs["ds_attrs"] = {"urls": urls} diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py index b12fd39b..306d28b6 100644 --- a/tests/test_gml_ozonesonde.py +++ b/tests/test_gml_ozonesonde.py @@ -71,7 +71,7 @@ def test_add_data(): assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile" # NOTE: Similar to the place folder names, but not all the same - assert df["station"].nunique() == latlon.nunique() + assert df["siteid"].nunique() == latlon.nunique() def test_add_data_location_sel():