From 102113523c3ecea4b428346d71dc42e9cf8190e2 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 12 Jan 2024 11:33:20 -0700
Subject: [PATCH 01/51] Testing GML ozonesonde

---
 t3.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 t3.py

diff --git a/t3.py b/t3.py
new file mode 100644
index 00000000..8d82800d
--- /dev/null
+++ b/t3.py
@@ -0,0 +1,88 @@
+"""
+Testing loading GML ozonesondes
+"""
+# import re
+from io import StringIO
+
+import pandas as pd
+import requests
+
+# from tempfile import NamedTemporaryFile
+
+
+# from monetio import icartt
+
+# 100-m
+url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100"
+
+r = requests.get(url)
+r.raise_for_status()
+
+# # ICARTT parser doesn't seem to work for it
+# with NamedTemporaryFile(delete=False) as f:
+#     f.write(r.content)
+#     f.seek(0)
+# ic = icartt.add_data(f.name)
+
+blocks = r.text.replace("\r", "").split("\n\n")
+assert len(blocks) == 5
+
+# Metadata
+meta = {}
+todo = blocks[3].splitlines()
+blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "]
+for line in todo:
+    key, val = line.split(":", 1)
+    # maybes = re.split(r"\s{2,}", val.strip())
+    # if len(maybes) == 1:
+    #     meta[key] = val
+    # else:
+    #     meta[key] = maybes[0]
+    #     todo.extend(maybes[1:])
+    #     continue
+    for key_ish in blah:
+        if key_ish in val:
+            i = val.index(key_ish)
+            meta[key.strip()] = val[:i].strip()
+            todo.append(val[i:])
+            break
+    else:
+        meta[key.strip()] = val.strip()
+    # TODO: replace multi space in val with single
+
+col_info = [
+    # name, units, na
+    ("lev", "", None),
+    ("press", "hPa", None),
+    ("alt", "km", None),
+    ("theta", "K", None),  # "Pottp", pretty sure this potential temperature
+    ("temp", "degC", None),
+    ("ftempv", "degC", "999.9"),  # TODO: what is?
+    ("rh", "%", "999"),
+    ("press_o3", "mPa", "99.90"),
+    ("o3", "ppmv", "99.999"),
+    ("o3_tot", "atm-cm", "99.9990"),  # 1 DU = 0.001 atm-cm
+    ("pumptemp", "degC", "999.9"),  # "Ptemp", I think this is the pump temperature
+    ("o3_num", "10^11 cm-3", "999.999"),
+    ("o3_res", "DU", "9999"),
+    ("o3_uncert", "%", "99999.000"),
+]
+
+assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14
+
+names = [c[0] for c in col_info]
+dtype = {c[0]: float for c in col_info}
+dtype["lev"] = int
+na_values = {c[0]: c[2] for c in col_info if c[2] is not None}
+
+df = pd.read_csv(
+    StringIO(blocks[4]),
+    skiprows=2,
+    header=None,
+    delimiter=r"\s+",
+    names=names,
+    dtype=dtype,
+    na_values=na_values,
+)
+
+theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)  # close to "Pottp"

From eaef72bb563a6a3ccdbf4245b9824583065d8bce Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 12 Jan 2024 12:16:03 -0700
Subject: [PATCH 02/51] Move to profile group

---
 t3.py => monetio/profile/gml_ozonesonde.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename t3.py => monetio/profile/gml_ozonesonde.py (100%)

diff --git a/t3.py b/monetio/profile/gml_ozonesonde.py
similarity index 100%
rename from t3.py
rename to monetio/profile/gml_ozonesonde.py

From 22f1b42346f5bc5a2a585fb0f3603c144768315f Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 12 Jan 2024 12:24:44 -0700
Subject: [PATCH 03/51] Check meta keys; clean up

---
 monetio/profile/gml_ozonesonde.py | 42 +++++++++++++++----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 8d82800d..59adb8a2 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -1,45 +1,27 @@
 """
 Testing loading GML ozonesondes
 """
-# import re
 from io import StringIO
 
 import pandas as pd
 import requests
 
-# from tempfile import NamedTemporaryFile
-
-
-# from monetio import icartt
-
 # 100-m
 url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100"
 
 r = requests.get(url)
 r.raise_for_status()
 
-# # ICARTT parser doesn't seem to work for it
-# with NamedTemporaryFile(delete=False) as f:
-#     f.write(r.content)
-#     f.seek(0)
-# ic = icartt.add_data(f.name)
-
 blocks = r.text.replace("\r", "").split("\n\n")
 assert len(blocks) == 5
 
 # Metadata
 meta = {}
-todo = blocks[3].splitlines()
+todo = blocks[3].splitlines()[::-1]
 blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "]
-for line in todo:
+while todo:
+    line = todo.pop()
     key, val = line.split(":", 1)
-    # maybes = re.split(r"\s{2,}", val.strip())
-    # if len(maybes) == 1:
-    #     meta[key] = val
-    # else:
-    #     meta[key] = maybes[0]
-    #     todo.extend(maybes[1:])
-    #     continue
     for key_ish in blah:
         if key_ish in val:
             i = val.index(key_ish)
@@ -50,6 +32,24 @@
         meta[key.strip()] = val.strip()
     # TODO: replace multi space in val with single
 
+assert list(meta) == [
+    "Station",
+    "Station Height",
+    "Latitude",
+    "Longitude",
+    "Flight Number",
+    "Launch Date",
+    "Launch Time",
+    "Radiosonde Type",
+    "Radiosonde Num",
+    "O3 Sonde ID",
+    "Background",
+    "Flowrate",
+    "RH Corr",
+    "Sonde Total O3",
+    "Sonde Total O3 (SBUV)",
+]
+
 col_info = [
     # name, units, na
     ("lev", "", None),

From c76bacecb78643a23f8f97b254829cc66fe78f08 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 12 Jan 2024 12:27:42 -0700
Subject: [PATCH 04/51] Eliminate extra spaces in meta values

---
 monetio/profile/gml_ozonesonde.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 59adb8a2..a5160f02 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -1,6 +1,7 @@
 """
 Testing loading GML ozonesondes
 """
+import re
 from io import StringIO
 
 import pandas as pd
@@ -30,7 +31,9 @@
             break
     else:
         meta[key.strip()] = val.strip()
-    # TODO: replace multi space in val with single
+
+for k, v in meta.items():
+    meta[k] = re.sub(r"\s{2,}", " ", v)
 
 assert list(meta) == [
     "Station",

From cf86140757f4e2b15a7679034178db7e2375bf68 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 12 Jan 2024 12:39:13 -0700
Subject: [PATCH 05/51] Add time; adjust variable info

---
 monetio/profile/gml_ozonesonde.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index a5160f02..3f6f8438 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -56,18 +56,20 @@
 col_info = [
     # name, units, na
     ("lev", "", None),
-    ("press", "hPa", None),
-    ("alt", "km", None),
-    ("theta", "K", None),  # "Pottp", pretty sure this potential temperature
-    ("temp", "degC", None),
-    ("ftempv", "degC", "999.9"),  # TODO: what is?
+    ("press", "hPa", "9999.9"),
+    ("alt", "km", "999.999"),  # TODO: not sure about this na val
+    ("theta", "K", "9999.9"),  # "Pottp", pretty sure this potential temperature
+    ("temp", "degC", "999.9"),
+    ("ftempv", "degC", "999.9"),  # TODO: what is this?
     ("rh", "%", "999"),
-    ("press_o3", "mPa", "99.90"),
+    ("o3_press", "mPa", "99.90"),
     ("o3", "ppmv", "99.999"),
-    ("o3_tot", "atm-cm", "99.9990"),  # 1 DU = 0.001 atm-cm
+    ("o3_cm", "atm-cm", "99.9990"),
+    # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below?
     ("pumptemp", "degC", "999.9"),  # "Ptemp", I think this is the pump temperature
-    ("o3_num", "10^11 cm-3", "999.999"),
-    ("o3_res", "DU", "9999"),
+    ("o3_nd", "10^11 cm-3", "999.999"),
+    ("o3_col", "DU", "9999"),
+    # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above
     ("o3_uncert", "%", "99999.000"),
 ]
 
@@ -89,3 +91,6 @@
 )
 
 theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)  # close to "Pottp"
+time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
+
+df["time"] = time.tz_localize(None)

From 501e218c47f4404dfc5910cdb3490444d00a7dac Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 12 Jan 2024 12:40:23 -0700
Subject: [PATCH 06/51] Add lat/lon

---
 monetio/profile/gml_ozonesonde.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 3f6f8438..222cddd0 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -94,3 +94,5 @@
 time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
 
 df["time"] = time.tz_localize(None)
+df["latitude"] = float(meta["Latitude"])
+df["longitude"] = float(meta["Longitude"])

From 5bd8bd240547f804583e05070e10bb3eab1758fc Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 12 Jan 2024 12:42:02 -0700
Subject: [PATCH 07/51] Rename "alt" column to "altitude"

---
 monetio/profile/gml_ozonesonde.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 222cddd0..dcae4495 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -57,7 +57,7 @@
     # name, units, na
     ("lev", "", None),
     ("press", "hPa", "9999.9"),
-    ("alt", "km", "999.999"),  # TODO: not sure about this na val
+    ("altitude", "km", "999.999"),  # TODO: not sure about this na val
     ("theta", "K", "9999.9"),  # "Pottp", pretty sure this potential temperature
     ("temp", "degC", "999.9"),
     ("ftempv", "degC", "999.9"),  # TODO: what is this?

From 70926797baf59056979a745454b477307932bccb Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 14:41:09 -0700
Subject: [PATCH 08/51] Move to read func and initial test

---
 monetio/__init__.py               |   3 +-
 monetio/profile/__init__.py       |   4 +-
 monetio/profile/gml_ozonesonde.py | 189 ++++++++++++++++--------------
 tests/test_gml_ozonesonde.py      |   7 ++
 4 files changed, 110 insertions(+), 93 deletions(-)
 create mode 100644 tests/test_gml_ozonesonde.py

diff --git a/monetio/__init__.py b/monetio/__init__.py
index 9c60c152..e40494ed 100644
--- a/monetio/__init__.py
+++ b/monetio/__init__.py
@@ -1,7 +1,7 @@
 from . import grids
 from .models import camx, cmaq, fv3chem, hysplit, hytraj, ncep_grib, pardump, prepchem, raqms
 from .obs import aeronet, airnow, aqs, cems, crn, improve, ish, ish_lite, nadp, openaq, pams
-from .profile import geoms, icartt, tolnet
+from .profile import geoms, gml_ozonesonde, icartt, tolnet
 from .sat import goes
 
 __version__ = "0.2.5"
@@ -33,6 +33,7 @@
     #
     # profile obs
     "geoms",
+    "gml_ozonesonde",
     "icartt",
     "tolnet",
     #
diff --git a/monetio/profile/__init__.py b/monetio/profile/__init__.py
index aa328999..b60f8841 100644
--- a/monetio/profile/__init__.py
+++ b/monetio/profile/__init__.py
@@ -1,5 +1,5 @@
-from . import geoms, icartt, tolnet
+from . import geoms, gml_ozonesonde, icartt, tolnet
 
-__all__ = ["tolnet", "icartt", "geoms"]
+__all__ = ["tolnet", "icartt", "geoms", "gml_ozonesonde"]
 
 __name__ = "profile"
diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index dcae4495..9d81ff70 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -5,94 +5,103 @@
 from io import StringIO
 
 import pandas as pd
-import requests
-
-# 100-m
-url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100"
-
-r = requests.get(url)
-r.raise_for_status()
-
-blocks = r.text.replace("\r", "").split("\n\n")
-assert len(blocks) == 5
-
-# Metadata
-meta = {}
-todo = blocks[3].splitlines()[::-1]
-blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "]
-while todo:
-    line = todo.pop()
-    key, val = line.split(":", 1)
-    for key_ish in blah:
-        if key_ish in val:
-            i = val.index(key_ish)
-            meta[key.strip()] = val[:i].strip()
-            todo.append(val[i:])
-            break
+
+
+def read_100m(fp_or_url):
+    import requests
+
+    if fp_or_url.startswith("http"):
+        r = requests.get(fp_or_url, timeout=10)
+        r.raise_for_status()
+        text = r.text
     else:
-        meta[key.strip()] = val.strip()
-
-for k, v in meta.items():
-    meta[k] = re.sub(r"\s{2,}", " ", v)
-
-assert list(meta) == [
-    "Station",
-    "Station Height",
-    "Latitude",
-    "Longitude",
-    "Flight Number",
-    "Launch Date",
-    "Launch Time",
-    "Radiosonde Type",
-    "Radiosonde Num",
-    "O3 Sonde ID",
-    "Background",
-    "Flowrate",
-    "RH Corr",
-    "Sonde Total O3",
-    "Sonde Total O3 (SBUV)",
-]
-
-col_info = [
-    # name, units, na
-    ("lev", "", None),
-    ("press", "hPa", "9999.9"),
-    ("altitude", "km", "999.999"),  # TODO: not sure about this na val
-    ("theta", "K", "9999.9"),  # "Pottp", pretty sure this potential temperature
-    ("temp", "degC", "999.9"),
-    ("ftempv", "degC", "999.9"),  # TODO: what is this?
-    ("rh", "%", "999"),
-    ("o3_press", "mPa", "99.90"),
-    ("o3", "ppmv", "99.999"),
-    ("o3_cm", "atm-cm", "99.9990"),
-    # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below?
-    ("pumptemp", "degC", "999.9"),  # "Ptemp", I think this is the pump temperature
-    ("o3_nd", "10^11 cm-3", "999.999"),
-    ("o3_col", "DU", "9999"),
-    # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above
-    ("o3_uncert", "%", "99999.000"),
-]
-
-assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14
-
-names = [c[0] for c in col_info]
-dtype = {c[0]: float for c in col_info}
-dtype["lev"] = int
-na_values = {c[0]: c[2] for c in col_info if c[2] is not None}
-
-df = pd.read_csv(
-    StringIO(blocks[4]),
-    skiprows=2,
-    header=None,
-    delimiter=r"\s+",
-    names=names,
-    dtype=dtype,
-    na_values=na_values,
-)
-
-theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)  # close to "Pottp"
-time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
-
-df["time"] = time.tz_localize(None)
-df["latitude"] = float(meta["Latitude"])
-df["longitude"] = float(meta["Longitude"])
+        with open(fp_or_url) as f:
+            text = f.read()
+
+    blocks = text.replace("\r", "").split("\n\n")
+    assert len(blocks) == 5
+
+    # Metadata
+    meta = {}
+    todo = blocks[3].splitlines()[::-1]
+    blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "]
+    while todo:
+        line = todo.pop()
+        key, val = line.split(":", 1)
+        for key_ish in blah:
+            if key_ish in val:
+                i = val.index(key_ish)
+                meta[key.strip()] = val[:i].strip()
+                todo.append(val[i:])
+                break
+        else:
+            meta[key.strip()] = val.strip()
+
+    for k, v in meta.items():
+        meta[k] = re.sub(r"\s{2,}", " ", v)
+
+    assert list(meta) == [
+        "Station",
+        "Station Height",
+        "Latitude",
+        "Longitude",
+        "Flight Number",
+        "Launch Date",
+        "Launch Time",
+        "Radiosonde Type",
+        "Radiosonde Num",
+        "O3 Sonde ID",
+        "Background",
+        "Flowrate",
+        "RH Corr",
+        "Sonde Total O3",
+        "Sonde Total O3 (SBUV)",
+    ]
+
+    col_info = [
+        # name, units, na
+        ("lev", "", None),
+        ("press", "hPa", "9999.9"),
+        ("altitude", "km", "999.999"),  # TODO: not sure about this na val
+        ("theta", "K", "9999.9"),  # "Pottp", pretty sure this potential temperature
+        ("temp", "degC", "999.9"),
+        ("ftempv", "degC", "999.9"),  # TODO: what is this?
+        ("rh", "%", "999"),
+        ("o3_press", "mPa", "99.90"),
+        ("o3", "ppmv", "99.999"),
+        ("o3_cm", "atm-cm", "99.9990"),
+        # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below?
+        ("pumptemp", "degC", "999.9"),  # "Ptemp", I think this is the pump temperature
+        ("o3_nd", "10^11 cm-3", "999.999"),
+        ("o3_col", "DU", "9999"),
+        # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above
+        ("o3_uncert", "%", "99999.000"),
+    ]
+
+    assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14
+
+    names = [c[0] for c in col_info]
+    dtype = {c[0]: float for c in col_info}
+    dtype["lev"] = int
+    na_values = {c[0]: c[2] for c in col_info if c[2] is not None}
+
+    df = pd.read_csv(
+        StringIO(blocks[4]),
+        skiprows=2,
+        header=None,
+        delimiter=r"\s+",
+        names=names,
+        dtype=dtype,
+        na_values=na_values,
+    )
+
+    # This close to "Pottp" but not exactly the same
+    theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)  # noqa: F841
+
+    time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
+
+    df["time"] = time.tz_localize(None)
+    df["latitude"] = float(meta["Latitude"])
+    df["longitude"] = float(meta["Longitude"])
+
+    return df
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
new file mode 100644
index 00000000..86c568f0
--- /dev/null
+++ b/tests/test_gml_ozonesonde.py
@@ -0,0 +1,7 @@
+from monetio import gml_ozonesonde
+
+
+def test_read_100m():
+    url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100"
+    df = gml_ozonesonde.read_100m(url)
+    assert len(df) > 0

From 4111dbecdcabb7e438e693abcd5aeb259c37e006 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 14:42:42 -0700
Subject: [PATCH 09/51] Make URL detection a little more robust

---
 monetio/profile/gml_ozonesonde.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 9d81ff70..2923673b 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -10,7 +10,7 @@
 def read_100m(fp_or_url):
     import requests
 
-    if fp_or_url.startswith("http"):
+    if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")):
         r = requests.get(fp_or_url, timeout=10)
         r.raise_for_status()
         text = r.text

From e4bf0e92027c7f67170713ad4167862203eb4608 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 15:19:51 -0700
Subject: [PATCH 10/51] Discover 100-m files

---
 monetio/profile/gml_ozonesonde.py | 51 +++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 2923673b..12be9710 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -1,14 +1,59 @@
 """
-Testing loading GML ozonesondes
+Load GML ozonesondes from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
 """
 import re
-from io import StringIO
 
+import numpy as np
 import pandas as pd
+import requests
+
+PLACES = [
+    "Boulder, Colorado",
+    "Hilo, Hawaii",
+    "Huntsville, Alabama",
+    "Narragansett, Rhode Island",
+    "Pago Pago, American Samoa",
+    "San Cristobal, Galapagos",
+    "South Pole, Antartica",  # note sp
+    "Summit, Greenland",
+    "Suva, Fiji",
+    "Trinidad Head, California",
+]
+
+
+def discover_files():
+    base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde"
+    data = []
+    for place in PLACES:
+        url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20")
+        print(url)
+        r = requests.get(url, timeout=10)
+        r.raise_for_status()
+        for m in re.finditer(r'href="([a-z0-9_]+\.l100)"', r.text):
+            fn = m.group(1)
+            if fn.startswith("san_cristobal_"):
+                a, b = 3, -1
+            else:
+                a, b = 1, -1
+            t_str = "".join(re.split(r"[_\.]", fn)[a:b])
+            try:
+                t = pd.to_datetime(t_str, format=r"%Y%m%d%H")
+            except ValueError:
+                print(f"warning: Failed to parse {fn} for time")
+                t = np.nan
+            data.append((place, t, fn, f"{url}{fn}"))
+
+    df = pd.DataFrame(data, columns=["place", "time", "fn", "url"])
+
+    missing = set(PLACES) - set(df["place"].unique())
+    if missing:
+        print(f"warning: No files detected for these places: {missing}")
+
+    return df
 
 
 def read_100m(fp_or_url):
-    import requests
+    from io import StringIO
 
     if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")):
         r = requests.get(fp_or_url, timeout=10)

From d9fbefbb1d1b65e06f18bf392e023ac70c836e0e Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 15:49:28 -0700
Subject: [PATCH 11/51] Initial `add_data`

---
 monetio/profile/gml_ozonesonde.py | 25 ++++++++++++++++++++++++-
 tests/test_gml_ozonesonde.py      | 11 +++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 12be9710..3d2345af 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -24,7 +24,7 @@
 def discover_files():
     base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde"
     data = []
-    for place in PLACES:
+    for place in PLACES:  # TODO: multithread?
         url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20")
         print(url)
         r = requests.get(url, timeout=10)
@@ -52,6 +52,29 @@ def discover_files():
     return df
 
 
+def add_data(dates, *, n_procs=1):
+    import dask
+    import dask.dataframe as dd
+
+    dates = pd.DatetimeIndex(dates)
+    dates_min, dates_max = dates.min(), dates.max()
+
+    print("Discovering files...")
+    df_urls = discover_files()
+
+    urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist()
+
+    print("Aggregating files...")
+    dfs = [dask.delayed(read_100m)(f) for f in urls]
+    dff = dd.from_delayed(dfs)
+    df = dff.compute(num_workers=n_procs).reset_index()
+
+    # Time subset again in case of times in files extending
+    df = df[df["time"].between(dates_min, dates_max, inclusive="both")]
+
+    return df
+
+
 def read_100m(fp_or_url):
     from io import StringIO
 
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 86c568f0..2e5ffd38 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -1,3 +1,5 @@
+import pandas as pd
+
 from monetio import gml_ozonesonde
 
 
@@ -5,3 +7,12 @@ def test_read_100m():
     url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100"
     df = gml_ozonesonde.read_100m(url)
     assert len(df) > 0
+
+
+def test_add_data():
+    dates = pd.date_range("2023-01-01", "2023-02-01")[:-1]
+    df = gml_ozonesonde.add_data(dates, n_procs=2)
+    assert len(df) > 0
+
+    latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str)
+    assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile"

From 19eab98ab9c7b72655f76622332b594001303da0 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 15:58:09 -0700
Subject: [PATCH 12/51] Multithread discovering files

---
 monetio/profile/gml_ozonesonde.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 3d2345af..51a3af1d 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -21,14 +21,18 @@
 ]
 
 
-def discover_files():
+def discover_files(*, n_threads=3):
+    import itertools
+    from multiprocessing.pool import ThreadPool
+
     base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde"
-    data = []
-    for place in PLACES:  # TODO: multithread?
+
+    def get_files(place):
         url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20")
         print(url)
         r = requests.get(url, timeout=10)
         r.raise_for_status()
+        data = []
         for m in re.finditer(r'href="([a-z0-9_]+\.l100)"', r.text):
             fn = m.group(1)
             if fn.startswith("san_cristobal_"):
@@ -42,6 +46,12 @@ def discover_files():
                 print(f"warning: Failed to parse {fn} for time")
                 t = np.nan
             data.append((place, t, fn, f"{url}{fn}"))
+        if not data:
+            print(f"warning: No files detected for pace {place!r}.")
+        return data
+
+    with ThreadPool(processes=n_threads) as pool:
+        data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, PLACES)))
 
     df = pd.DataFrame(data, columns=["place", "time", "fn", "url"])
 

From b7481e8c73bbdb47c3b93ff803b590291a9ba883 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 16:17:04 -0700
Subject: [PATCH 13/51] Add place selection

---
 monetio/profile/gml_ozonesonde.py | 39 +++++++++++++++++++++++--------
 tests/test_gml_ozonesonde.py      | 26 ++++++++++++++++++++-
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 51a3af1d..5c56bc84 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -2,6 +2,7 @@
 Load GML ozonesondes from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
 """
 import re
+import warnings
 
 import numpy as np
 import pandas as pd
@@ -21,12 +22,23 @@
 ]
 
 
-def discover_files(*, n_threads=3):
+def discover_files(place=None, *, n_threads=3):
     import itertools
     from multiprocessing.pool import ThreadPool
 
     base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde"
 
+    if place is None:
+        places = PLACES
+    elif isinstance(place, str):
+        places = [place]
+    else:
+        places = place
+
+    invalid = set(places) - set(PLACES)
+    if invalid:
+        raise ValueError(f"Invalid place(s): {invalid}.")
+
     def get_files(place):
         url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20")
         print(url)
@@ -43,26 +55,33 @@ def get_files(place):
             try:
                 t = pd.to_datetime(t_str, format=r"%Y%m%d%H")
             except ValueError:
-                print(f"warning: Failed to parse {fn} for time")
+                warnings.warn(f"Failed to parse file name {fn!r} for time.")
                 t = np.nan
             data.append((place, t, fn, f"{url}{fn}"))
         if not data:
-            print(f"warning: No files detected for pace {place!r}.")
+            warnings.warn(f"No files detected for place {place!r}.")
         return data
 
     with ThreadPool(processes=n_threads) as pool:
-        data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, PLACES)))
+        data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, places)))
 
     df = pd.DataFrame(data, columns=["place", "time", "fn", "url"])
 
-    missing = set(PLACES) - set(df["place"].unique())
-    if missing:
-        print(f"warning: No files detected for these places: {missing}")
-
     return df
 
 
-def add_data(dates, *, n_procs=1):
+def add_data(dates, *, place=None, n_procs=1):
+    """
+
+    Parameters
+    ----------
+    dates : sequence of datetime-like
+    place : str or sequence of str, optional
+        For example 'Boulder, Colorado'.
+        If not provided, all places will be used.
+    n_procs : int
+        For Dask.
+    """
     import dask
     import dask.dataframe as dd
 
@@ -70,7 +89,7 @@ def add_data(dates, *, n_procs=1):
     dates_min, dates_max = dates.min(), dates.max()
 
     print("Discovering files...")
-    df_urls = discover_files()
+    df_urls = discover_files(place=place)
 
     urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist()
 
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 2e5ffd38..fa398a30 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import pytest
 
 from monetio import gml_ozonesonde
 
@@ -10,9 +11,32 @@ def test_read_100m():
 
 
 def test_add_data():
-    dates = pd.date_range("2023-01-01", "2023-02-01")[:-1]
+    dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     df = gml_ozonesonde.add_data(dates, n_procs=2)
     assert len(df) > 0
 
     latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str)
     assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile"
+
+
+def test_add_data_place_sel():
+    dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
+    df = gml_ozonesonde.add_data(
+        dates,
+        place=["Boulder, Colorado", "South Pole, Antartica"],
+        n_procs=2,
+    )
+    assert len(df) > 0
+
+    latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str)
+    assert latlon.nunique() == 2, "selected two places"
+
+
+@pytest.mark.parametrize(
+    "place",
+    ["asdf", ["asdf", "blah"], ("asdf", "blah")],
+)
+def test_add_data_invalid_place(place):
+    dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
+    with pytest.raises(ValueError, match="Invalid place"):
+        _ = gml_ozonesonde.add_data(dates, place=place)

From c02c677e6200ba8fe22a9a78742e1c0899ca03e1 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 16:21:23 -0700
Subject: [PATCH 14/51] Include valid places in error msg; smaller threadpool

if appropriate
---
 monetio/profile/gml_ozonesonde.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 5c56bc84..3286310f 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -37,7 +37,7 @@ def discover_files(place=None, *, n_threads=3):
 
     invalid = set(places) - set(PLACES)
     if invalid:
-        raise ValueError(f"Invalid place(s): {invalid}.")
+        raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.")
 
     def get_files(place):
         url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20")
@@ -62,7 +62,7 @@ def get_files(place):
             warnings.warn(f"No files detected for place {place!r}.")
         return data
 
-    with ThreadPool(processes=n_threads) as pool:
+    with ThreadPool(processes=min(n_threads, len(places))) as pool:
         data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, places)))
 
     df = pd.DataFrame(data, columns=["place", "time", "fn", "url"])

From 1bb5d1928066ae560fe4b67347fad6c7c931a7cc Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 16:28:23 -0700
Subject: [PATCH 15/51] Error if no files

since otherwise Dask raises its own error
---
 monetio/profile/gml_ozonesonde.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 3286310f..d5a22273 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -71,7 +71,7 @@ def get_files(place):
 
 
 def add_data(dates, *, place=None, n_procs=1):
-    """
+    """Retrieve and load GML ozonesonde data as a DataFrame.
 
     Parameters
     ----------
@@ -90,10 +90,14 @@ def add_data(dates, *, place=None, n_procs=1):
 
     print("Discovering files...")
     df_urls = discover_files(place=place)
+    print(f"Discovered {len(df_urls)} 100-m files.")
 
     urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist()
 
-    print("Aggregating files...")
+    if not urls:
+        raise RuntimeError(f"No files found for dates {dates_min} to {dates_max}, place={place}.")
+
+    print(f"Aggregating {len(urls)} files...")
     dfs = [dask.delayed(read_100m)(f) for f in urls]
     dff = dd.from_delayed(dfs)
     df = dff.compute(num_workers=n_procs).reset_index()

From 6a1fead1caf8c9d1c42831912c37488f3058ad3d Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 16:32:54 -0700
Subject: [PATCH 16/51] notes

---
 monetio/profile/gml_ozonesonde.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index d5a22273..ac901454 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -1,5 +1,8 @@
 """
-Load GML ozonesondes from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
+Load NOAA Global Monitoring Laboratory (GML) ozonesondes
+from https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
+
+More info: https://gml.noaa.gov/ozwv/ozsondes/
 """
 import re
 import warnings
@@ -109,6 +112,12 @@ def add_data(dates, *, place=None, n_procs=1):
 
 
 def read_100m(fp_or_url):
+    """Read a GML ozonesonde 100-m file (``.l100``).
+
+    Notes
+    -----
+    Close to ICARTT format, but not quite conformant enough to use the ICARTT reader.
+    """
     from io import StringIO
 
     if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")):

From f86fafa49cecac8aea1bde66db7224a81401d198 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Tue, 6 Feb 2024 16:39:33 -0700
Subject: [PATCH 17/51] Add attrs

though they don't survive the agg

maybe better to extract col_info so it can be used for the agg
result
---
 monetio/profile/gml_ozonesonde.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index ac901454..31c88792 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -170,6 +170,7 @@ def read_100m(fp_or_url):
 
     col_info = [
         # name, units, na
+        # TODO: long_name?
         ("lev", "", None),
         ("press", "hPa", "9999.9"),
         ("altitude", "km", "999.999"),  # TODO: not sure about this na val
@@ -205,7 +206,7 @@ def read_100m(fp_or_url):
         na_values=na_values,
     )
 
-    # This close to "Pottp" but not exactly the same
+    # This is close to "Pottp" but not exactly the same
     theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)  # noqa: F841
 
     time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
@@ -214,4 +215,8 @@ def read_100m(fp_or_url):
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
 
+    if hasattr(df, "attrs"):
+        df.attrs["ds_attrs"] = meta
+        df.attrs["var_attrs"] = {name: {"units": units} for name, units, _ in col_info}
+
     return df

From 0ab42d46a702df1892ec0b3760fc3722e1053730 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 08:39:43 -0700
Subject: [PATCH 18/51] Add initial long names

with the help of the doc

still some questions though, as the variables seem a bit different
compared to the fle ones
---
 monetio/profile/gml_ozonesonde.py | 77 +++++++++++++++++++++++--------
 1 file changed, 57 insertions(+), 20 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 31c88792..03c91286 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -169,32 +169,63 @@ def read_100m(fp_or_url):
     ]
 
     col_info = [
-        # name, units, na
-        # TODO: long_name?
-        ("lev", "", None),
-        ("press", "hPa", "9999.9"),
-        ("altitude", "km", "999.999"),  # TODO: not sure about this na val
-        ("theta", "K", "9999.9"),  # "Pottp", pretty sure this potential temperature
-        ("temp", "degC", "999.9"),
-        ("ftempv", "degC", "999.9"),  # TODO: what is this?
-        ("rh", "%", "999"),
-        ("o3_press", "mPa", "99.90"),
-        ("o3", "ppmv", "99.999"),
-        ("o3_cm", "atm-cm", "99.9990"),
-        # ^ 1 DU = 0.001 atm-cm; goes up with height so could be ozone below?
-        ("pumptemp", "degC", "999.9"),  # "Ptemp", I think this is the pump temperature
-        ("o3_nd", "10^11 cm-3", "999.999"),
-        ("o3_col", "DU", "9999"),
-        # TODO: ^ what is this? "O3 Res" goes down with height so could be total ozone above
-        ("o3_uncert", "%", "99999.000"),
+        # name, long name, units, na val
+        #
+        # "Level" (just a counter, should never be nan)
+        ("lev", "level", "", None),
+        #
+        # "Press"
+        ("press", "radiosonde corrected pressure", "hPa", "9999.9"),
+        #
+        # "Alt"
+        # TODO: not sure about this na val
+        ("altitude", "altitude", "km", "999.999"),
+        #
+        # "Pottp"
+        ("theta", "potential temperature", "K", "9999.9"),
+        #
+        # "Temp"
+        ("temp", "radiosonde corrected temperature", "degC", "999.9"),
+        #
+        # "FtempV"
+        ("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"),
+        #
+        # "Hum"
+        ("rh", "radiosonde corrected relative humidity", "%", "999"),
+        #
+        # "Ozone"
+        ("o3_press", "ozone partial pressure", "mPa", "99.90"),
+        #
+        # "Ozone"
+        ("o3", "ozone mixing ratio", "ppmv", "99.999"),
+        #
+        # "Ozone"
+        # note 1 DU = 0.001 atm-cm
+        # TODO: goes up with height so could be ozone below?
+        ("o3_cm", "total ozone", "atm-cm", "99.9990"),
+        #
+        # "Ptemp"
+        ("ptemp", "pump temperature", "degC", "999.9"),
+        #
+        # "O3 # DN"
+        ("o3_nd", "ozone number density", "10^11 cm-3", "999.999"),
+        #
+        # "O3 Res"
+        # TODO: goes down with height so could be total ozone above?
+        ("o3_col", "total column ozone above", "DU", "9999"),
+        #
+        # "O3 Uncert"
+        # TODO: uncertainty in which ozone value?
+        ("o3_uncert", "uncertainty in ozone", "%", "99999.000"),
     ]
 
+    assert all(len(c) == 4 for c in col_info)
     assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14
 
     names = [c[0] for c in col_info]
     dtype = {c[0]: float for c in col_info}
     dtype["lev"] = int
-    na_values = {c[0]: c[2] for c in col_info if c[2] is not None}
+    na_values = {c[0]: c[-1] for c in col_info if c[-1] is not None}
 
     df = pd.read_csv(
         StringIO(blocks[4]),
@@ -217,6 +248,12 @@ def read_100m(fp_or_url):
 
     if hasattr(df, "attrs"):
         df.attrs["ds_attrs"] = meta
-        df.attrs["var_attrs"] = {name: {"units": units} for name, units, _ in col_info}
+        df.attrs["var_attrs"] = {
+            name: {
+                "long_name": long_name,
+                "units": units,
+            }
+            for name, long_name, units, _ in col_info
+        }
 
     return df

From 2e849f4b4668838ccfa9f5a023f89bf09670cc88 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 08:48:00 -0700
Subject: [PATCH 19/51] Extract col info

---
 monetio/profile/gml_ozonesonde.py | 128 ++++++++++++++++--------------
 1 file changed, 68 insertions(+), 60 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 03c91286..a117b40d 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -6,6 +6,7 @@
 """
 import re
 import warnings
+from typing import NamedTuple, Optional
 
 import numpy as np
 import pandas as pd
@@ -111,6 +112,65 @@ def add_data(dates, *, place=None, n_procs=1):
     return df
 
 
+class ColInfo(NamedTuple):
+    name: str
+    long_name: str
+    units: str
+    na_val: Optional[str]
+
+
+COL_INFO_100m = [
+    # name, long name, units, na val
+    #
+    # "Level" (just a counter, should never be nan)
+    ColInfo("lev", "level", "", None),
+    #
+    # "Press"
+    ColInfo("press", "radiosonde corrected pressure", "hPa", "9999.9"),
+    #
+    # "Alt"
+    # TODO: not sure about this na val
+    ColInfo("altitude", "altitude", "km", "999.999"),
+    #
+    # "Pottp"
+    ColInfo("theta", "potential temperature", "K", "9999.9"),
+    #
+    # "Temp"
+    ColInfo("temp", "radiosonde corrected temperature", "degC", "999.9"),
+    #
+    # "FtempV"
+    ColInfo("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"),
+    #
+    # "Hum"
+    ColInfo("rh", "radiosonde corrected relative humidity", "%", "999"),
+    #
+    # "Ozone"
+    ColInfo("o3_press", "ozone partial pressure", "mPa", "99.90"),
+    #
+    # "Ozone"
+    ColInfo("o3", "ozone mixing ratio", "ppmv", "99.999"),
+    #
+    # "Ozone"
+    # note 1 DU = 0.001 atm-cm
+    # TODO: goes up with height so could be ozone below?
+    ColInfo("o3_cm", "total ozone", "atm-cm", "99.9990"),
+    #
+    # "Ptemp"
+    ColInfo("ptemp", "pump temperature", "degC", "999.9"),
+    #
+    # "O3 # DN"
+    ColInfo("o3_nd", "ozone number density", "10^11 cm-3", "999.999"),
+    #
+    # "O3 Res"
+    # TODO: goes down with height so could be total ozone above?
+    ColInfo("o3_col", "total column ozone above", "DU", "9999"),
+    #
+    # "O3 Uncert"
+    # TODO: uncertainty in which ozone value?
+    ColInfo("o3_uncert", "uncertainty in ozone", "%", "99999.000"),
+]
+
+
 def read_100m(fp_or_url):
     """Read a GML ozonesonde 100-m file (``.l100``).
 
@@ -168,64 +228,12 @@ def read_100m(fp_or_url):
         "Sonde Total O3 (SBUV)",
     ]
 
-    col_info = [
-        # name, long name, units, na val
-        #
-        # "Level" (just a counter, should never be nan)
-        ("lev", "level", "", None),
-        #
-        # "Press"
-        ("press", "radiosonde corrected pressure", "hPa", "9999.9"),
-        #
-        # "Alt"
-        # TODO: not sure about this na val
-        ("altitude", "altitude", "km", "999.999"),
-        #
-        # "Pottp"
-        ("theta", "potential temperature", "K", "9999.9"),
-        #
-        # "Temp"
-        ("temp", "radiosonde corrected temperature", "degC", "999.9"),
-        #
-        # "FtempV"
-        ("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"),
-        #
-        # "Hum"
-        ("rh", "radiosonde corrected relative humidity", "%", "999"),
-        #
-        # "Ozone"
-        ("o3_press", "ozone partial pressure", "mPa", "99.90"),
-        #
-        # "Ozone"
-        ("o3", "ozone mixing ratio", "ppmv", "99.999"),
-        #
-        # "Ozone"
-        # note 1 DU = 0.001 atm-cm
-        # TODO: goes up with height so could be ozone below?
-        ("o3_cm", "total ozone", "atm-cm", "99.9990"),
-        #
-        # "Ptemp"
-        ("ptemp", "pump temperature", "degC", "999.9"),
-        #
-        # "O3 # DN"
-        ("o3_nd", "ozone number density", "10^11 cm-3", "999.999"),
-        #
-        # "O3 Res"
-        # TODO: goes down with height so could be total ozone above?
-        ("o3_col", "total column ozone above", "DU", "9999"),
-        #
-        # "O3 Uncert"
-        # TODO: uncertainty in which ozone value?
-        ("o3_uncert", "uncertainty in ozone", "%", "99999.000"),
-    ]
-
-    assert all(len(c) == 4 for c in col_info)
-    assert len(col_info) == len(blocks[4].splitlines()[2].split()) == 14
+    assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_100m) == 14
 
-    names = [c[0] for c in col_info]
-    dtype = {c[0]: float for c in col_info}
+    names = [c.name for c in COL_INFO_100m]
+    dtype = {c.name: float for c in COL_INFO_100m}
     dtype["lev"] = int
-    na_values = {c[0]: c[-1] for c in col_info if c[-1] is not None}
+    na_values = {c.name: c.na_val for c in COL_INFO_100m if c.na_val is not None}
 
     df = pd.read_csv(
         StringIO(blocks[4]),
@@ -249,11 +257,11 @@ def read_100m(fp_or_url):
     if hasattr(df, "attrs"):
         df.attrs["ds_attrs"] = meta
         df.attrs["var_attrs"] = {
-            name: {
-                "long_name": long_name,
-                "units": units,
+            c.name: {
+                "long_name": c.long_name,
+                "units": c.units,
             }
-            for name, long_name, units, _ in col_info
+            for c in COL_INFO_100m
         }
 
     return df

From 04c3dd290e9dd3fc7617c964c115cebdcf5660c3 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 08:51:39 -0700
Subject: [PATCH 20/51] Add attrs to add_data result

---
 monetio/profile/gml_ozonesonde.py | 14 +++++++++++++-
 tests/test_gml_ozonesonde.py      |  2 ++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index a117b40d..10931662 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -109,6 +109,17 @@ def add_data(dates, *, place=None, n_procs=1):
     # Time subset again in case of times in files extending
     df = df[df["time"].between(dates_min, dates_max, inclusive="both")]
 
+    # Add metadata
+    if hasattr(df, "attrs"):
+        df.attrs["ds_attrs"] = {"urls": urls}
+        df.attrs["var_attrs"] = {
+            c.name: {
+                "long_name": c.long_name,
+                "units": c.units,
+            }
+            for c in COL_INFO_100m
+        }
+
     return df
 
 
@@ -248,12 +259,13 @@ def read_100m(fp_or_url):
     # This is close to "Pottp" but not exactly the same
     theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)  # noqa: F841
 
+    # Add some variables from header
     time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
-
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
 
+    # Add metadata
     if hasattr(df, "attrs"):
         df.attrs["ds_attrs"] = meta
         df.attrs["var_attrs"] = {
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index fa398a30..330afb38 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -15,6 +15,8 @@ def test_add_data():
     df = gml_ozonesonde.add_data(dates, n_procs=2)
     assert len(df) > 0
 
+    assert df.attrs["var_attrs"]["o3"]["units"] == "ppmv"
+
     latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str)
     assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile"
 

From 0d8a6cb969189e27b6159a15c07a9d682a593ad5 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 09:09:06 -0700
Subject: [PATCH 21/51] Cache file discovery

---
 monetio/profile/gml_ozonesonde.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 10931662..62add3a8 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -26,7 +26,10 @@
 ]
 
 
-def discover_files(place=None, *, n_threads=3):
+_FILES_L100_CACHE = {place: None for place in PLACES}
+
+
+def discover_files(place=None, *, n_threads=3, cache=True):
     import itertools
     from multiprocessing.pool import ThreadPool
 
@@ -44,6 +47,10 @@ def discover_files(place=None, *, n_threads=3):
         raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.")
 
     def get_files(place):
+        cached = _FILES_L100_CACHE[place]
+        if cached is not None:
+            return cached
+
         url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20")
         print(url)
         r = requests.get(url, timeout=10)
@@ -62,8 +69,10 @@ def get_files(place):
                 warnings.warn(f"Failed to parse file name {fn!r} for time.")
                 t = np.nan
             data.append((place, t, fn, f"{url}{fn}"))
+
         if not data:
             warnings.warn(f"No files detected for place {place!r}.")
+
         return data
 
     with ThreadPool(processes=min(n_threads, len(places))) as pool:
@@ -71,6 +80,12 @@ def get_files(place):
 
     df = pd.DataFrame(data, columns=["place", "time", "fn", "url"])
 
+    if cache:
+        for place in places:
+            _FILES_L100_CACHE[place] = list(
+                df[df["place"] == place].itertuples(index=False, name=None)
+            )
+
     return df
 
 
@@ -117,7 +132,7 @@ def add_data(dates, *, place=None, n_procs=1):
                 "long_name": c.long_name,
                 "units": c.units,
             }
-            for c in COL_INFO_100m
+            for c in COL_INFO_L100
         }
 
     return df
@@ -130,7 +145,7 @@ class ColInfo(NamedTuple):
     na_val: Optional[str]
 
 
-COL_INFO_100m = [
+COL_INFO_L100 = [
     # name, long name, units, na val
     #
     # "Level" (just a counter, should never be nan)
@@ -239,12 +254,12 @@ def read_100m(fp_or_url):
         "Sonde Total O3 (SBUV)",
     ]
 
-    assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_100m) == 14
+    assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_L100) == 14
 
-    names = [c.name for c in COL_INFO_100m]
-    dtype = {c.name: float for c in COL_INFO_100m}
+    names = [c.name for c in COL_INFO_L100]
+    dtype = {c.name: float for c in COL_INFO_L100}
     dtype["lev"] = int
-    na_values = {c.name: c.na_val for c in COL_INFO_100m if c.na_val is not None}
+    na_values = {c.name: c.na_val for c in COL_INFO_L100 if c.na_val is not None}
 
     df = pd.read_csv(
         StringIO(blocks[4]),
@@ -273,7 +288,7 @@ def read_100m(fp_or_url):
                 "long_name": c.long_name,
                 "units": c.units,
             }
-            for c in COL_INFO_100m
+            for c in COL_INFO_L100
         }
 
     return df

From 384130cca8986682bcd23e75a8d094d5731d553a Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 09:11:38 -0700
Subject: [PATCH 22/51] Antarctica

---
 monetio/profile/gml_ozonesonde.py | 10 ++++++++--
 tests/test_gml_ozonesonde.py      |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 62add3a8..8d943cd7 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -19,7 +19,7 @@
     "Narragansett, Rhode Island",
     "Pago Pago, American Samoa",
     "San Cristobal, Galapagos",
-    "South Pole, Antartica",  # note sp
+    "South Pole, Antarctica",
     "Summit, Greenland",
     "Suva, Fiji",
     "Trinidad Head, California",
@@ -51,10 +51,16 @@ def get_files(place):
         if cached is not None:
             return cached
 
-        url = f"{base}/{place}/100 Meter Average Files/".replace(" ", "%20")
+        if place == "South Pole, Antarctica":
+            url_place = "South Pole, Antartica"  # note sp
+        else:
+            url_place = place
+        url = f"{base}/{url_place}/100 Meter Average Files/".replace(" ", "%20")
         print(url)
+
         r = requests.get(url, timeout=10)
         r.raise_for_status()
+
         data = []
         for m in re.finditer(r'href="([a-z0-9_]+\.l100)"', r.text):
             fn = m.group(1)
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 330afb38..03a0acfa 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -25,7 +25,7 @@ def test_add_data_place_sel():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     df = gml_ozonesonde.add_data(
         dates,
-        place=["Boulder, Colorado", "South Pole, Antartica"],
+        place=["Boulder, Colorado", "South Pole, Antarctica"],
         n_procs=2,
     )
     assert len(df) > 0

From dbd7e30f2ad81ec6c6c11010eb4f3ee6ba1dde2e Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 09:27:07 -0700
Subject: [PATCH 23/51] Add station name and height to frame

---
 monetio/profile/gml_ozonesonde.py | 6 ++++--
 tests/test_gml_ozonesonde.py      | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 8d943cd7..89e2fe8b 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -277,14 +277,16 @@ def read_100m(fp_or_url):
         na_values=na_values,
     )
 
-    # This is close to "Pottp" but not exactly the same
-    theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)  # noqa: F841
+    # Note: This is close to "Pottp" but not exactly the same
+    # theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)
 
     # Add some variables from header
     time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
+    df["station"] = meta["Station"]
+    df["station_height"] = float(meta["Station Height"])
 
     # Add metadata
     if hasattr(df, "attrs"):
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 03a0acfa..e774bd50 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -20,6 +20,9 @@ def test_add_data():
     latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str)
     assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile"
 
+    # NOTE: Similar to the place folder names, but not all the same
+    assert df["station"].nunique() == latlon.nunique()
+
 
 def test_add_data_place_sel():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")

From 2fa6d48c8fd4d2c2ee6eb8d8335450caf5ec0036 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 09:48:57 -0700
Subject: [PATCH 24/51] Add retry for requests

did get timeout once this morn
---
 monetio/profile/gml_ozonesonde.py | 51 ++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 89e2fe8b..994a9077 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -12,6 +12,35 @@
 import pandas as pd
 import requests
 
+
+def retry(func):
+    import time
+    from functools import wraps
+    from random import random as rand
+
+    n = 3
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        for i in range(n):
+            try:
+                res = func(*args, **kwargs)
+            except (
+                requests.exceptions.ReadTimeout,
+                requests.exceptions.ConnectionError,
+            ) as e:
+                print(f"Failed: {e}")
+                time.sleep(0.5 * i + rand() * 0.1)
+            else:
+                break
+        else:
+            raise RuntimeError(f"failed after {n} tries")
+
+        return res
+
+    return wrapper
+
+
 PLACES = [
     "Boulder, Colorado",
     "Hilo, Hawaii",
@@ -46,6 +75,7 @@ def discover_files(place=None, *, n_threads=3, cache=True):
     if invalid:
         raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.")
 
+    @retry
     def get_files(place):
         cached = _FILES_L100_CACHE[place]
         if cached is not None:
@@ -213,14 +243,21 @@ def read_100m(fp_or_url):
     from io import StringIO
 
     if isinstance(fp_or_url, str) and fp_or_url.startswith(("http://", "https://")):
-        r = requests.get(fp_or_url, timeout=10)
-        r.raise_for_status()
-        text = r.text
+
+        @retry
+        def get_text():
+            r = requests.get(fp_or_url, timeout=10)
+            r.raise_for_status()
+            return r.text
+
     else:
-        with open(fp_or_url) as f:
-            text = f.read()
 
-    blocks = text.replace("\r", "").split("\n\n")
+        def get_text():
+            with open(fp_or_url) as f:
+                text = f.read()
+            return text
+
+    blocks = get_text().replace("\r", "").split("\n\n")
     assert len(blocks) == 5
 
     # Metadata
@@ -286,7 +323,7 @@ def read_100m(fp_or_url):
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
     df["station"] = meta["Station"]
-    df["station_height"] = float(meta["Station Height"])
+    df["station_height_str"] = meta["Station Height"]
 
     # Add metadata
     if hasattr(df, "attrs"):

From 945b1d3d5fbcb5f553dfea44ff8b02e6e618f1ce Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 09:51:35 -0700
Subject: [PATCH 25/51] cleanup

---
 monetio/profile/gml_ozonesonde.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 994a9077..7c1db7d2 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -28,13 +28,12 @@ def wrapper(*args, **kwargs):
             except (
                 requests.exceptions.ReadTimeout,
                 requests.exceptions.ConnectionError,
-            ) as e:
-                print(f"Failed: {e}")
+            ):
                 time.sleep(0.5 * i + rand() * 0.1)
             else:
                 break
         else:
-            raise RuntimeError(f"failed after {n} tries")
+            raise RuntimeError(f"{func.__name__} failed after {n} tries.")
 
         return res
 

From 8e6c98457cbf64467d092c3ab43df6e6e0471f1a Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 10:04:27 -0700
Subject: [PATCH 26/51] Add sonde total o3 strings

---
 monetio/profile/gml_ozonesonde.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 7c1db7d2..237ac1af 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -316,13 +316,15 @@ def get_text():
     # Note: This is close to "Pottp" but not exactly the same
     # theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)
 
-    # Add some variables from header
+    # Add some variables from header (these don't change in the profile)
     time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
     df["station"] = meta["Station"]
-    df["station_height_str"] = meta["Station Height"]
+    df["station_height_str"] = meta["Station Height"]  # e.g. '1743 meters'
+    df["o3_tot_cmr_str"] = meta["Sonde Total O3"]
+    df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]  # e.g. '325 (62) DU'
 
     # Add metadata
     if hasattr(df, "attrs"):

From 86a8a208c90aca00a9523ef5eda6adf73bb7b11c Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 10:24:43 -0700
Subject: [PATCH 27/51] Test some header attr vals

---
 monetio/profile/gml_ozonesonde.py |  4 ++--
 tests/test_gml_ozonesonde.py      | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 237ac1af..02d69212 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -262,11 +262,11 @@ def get_text():
     # Metadata
     meta = {}
     todo = blocks[3].splitlines()[::-1]
-    blah = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "]
+    on_val_side = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "]
     while todo:
         line = todo.pop()
         key, val = line.split(":", 1)
-        for key_ish in blah:
+        for key_ish in on_val_side:
             if key_ish in val:
                 i = val.index(key_ish)
                 meta[key.strip()] = val[:i].strip()
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index e774bd50..fc34a58d 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -9,6 +9,16 @@ def test_read_100m():
     df = gml_ozonesonde.read_100m(url)
     assert len(df) > 0
 
+    assert df.attrs["ds_attrs"]["Station"] == "Boulder, CO"
+    assert df.attrs["ds_attrs"]["Station Height"] == "1743 meters"
+    assert df.attrs["ds_attrs"]["Flight Number"] == "BU1043"
+    assert df.attrs["ds_attrs"]["O3 Sonde ID"] == "2z43312"
+    assert df.attrs["ds_attrs"]["Background"] == "0.020 microamps (0.08 mPa)"
+    assert df.attrs["ds_attrs"]["Flowrate"] == "29.89 sec/100ml"
+    assert df.attrs["ds_attrs"]["RH Corr"] == "0.31 %"
+    assert df.attrs["ds_attrs"]["Sonde Total O3"] == "329 (65) DU"
+    assert df.attrs["ds_attrs"]["Sonde Total O3 (SBUV)"] == "325 (62) DU"
+
 
 def test_add_data():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")

From bbf3765f6c8532489ae7bf016268e3c8d912b051 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 10:36:25 -0700
Subject: [PATCH 28/51] Test discover files gets all places

---
 tests/test_gml_ozonesonde.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index fc34a58d..8f3e7898 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -4,6 +4,12 @@
 from monetio import gml_ozonesonde
 
 
+def test_discover_files():
+    files = gml_ozonesonde.discover_files()
+    assert len(files) > 0
+    assert set(files["place"].unique()) == set(gml_ozonesonde.PLACES)
+
+
 def test_read_100m():
     url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu1043_2023_12_27_17.l100"
     df = gml_ozonesonde.read_100m(url)

From c9abd1a20244b9833bde25b4a0cbf72c7a5dc676 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 7 Feb 2024 10:58:30 -0700
Subject: [PATCH 29/51] Found some other NA vals

---
 monetio/profile/gml_ozonesonde.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 02d69212..e84ffc36 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -6,7 +6,7 @@
 """
 import re
 import warnings
-from typing import NamedTuple, Optional
+from typing import NamedTuple, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -177,7 +177,7 @@ class ColInfo(NamedTuple):
     name: str
     long_name: str
     units: str
-    na_val: Optional[str]
+    na_val: Optional[Union[str, Tuple[str, ...]]]
 
 
 COL_INFO_L100 = [
@@ -224,11 +224,11 @@ class ColInfo(NamedTuple):
     #
     # "O3 Res"
     # TODO: goes down with height so could be total ozone above?
-    ColInfo("o3_col", "total column ozone above", "DU", "9999"),
+    ColInfo("o3_col", "total column ozone above", "DU", ("9999", "99999", "99.999")),
     #
     # "O3 Uncert"
     # TODO: uncertainty in which ozone value?
-    ColInfo("o3_uncert", "uncertainty in ozone", "%", "99999.000"),
+    ColInfo("o3_uncert", "uncertainty in ozone", "%", ("99999.000", "99.999")),
 ]
 
 
@@ -321,10 +321,11 @@ def get_text():
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
-    df["station"] = meta["Station"]
+    df["station"] = meta["Station"]  # TODO: could normalize to place
     df["station_height_str"] = meta["Station Height"]  # e.g. '1743 meters'
     df["o3_tot_cmr_str"] = meta["Sonde Total O3"]
     df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]  # e.g. '325 (62) DU'
+    # TODO: '99999 (99999) DU' if NA, could put empty string instead?
 
     # Add metadata
     if hasattr(df, "attrs"):

From 62a2d53f40bef5ee803e53a215fdab27df396df7 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 09:52:53 -0700
Subject: [PATCH 30/51] notes

---
 monetio/profile/gml_ozonesonde.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index e84ffc36..c9324cd2 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -212,7 +212,7 @@ class ColInfo(NamedTuple):
     ColInfo("o3", "ozone mixing ratio", "ppmv", "99.999"),
     #
     # "Ozone"
-    # note 1 DU = 0.001 atm-cm
+    # Note 1 DU = 0.001 atm-cm
     # TODO: goes up with height so could be ozone below?
     ColInfo("o3_cm", "total ozone", "atm-cm", "99.9990"),
     #
@@ -223,11 +223,14 @@ class ColInfo(NamedTuple):
     ColInfo("o3_nd", "ozone number density", "10^11 cm-3", "999.999"),
     #
     # "O3 Res"
-    # TODO: goes down with height so could be total ozone above?
+    # From Owen Cooper (NOAA CSL):
+    #   This is the amount of ozone in Dobson units above a given altitude.
+    #   The values above the maximum balloon altitude are from a climatology.
+    #   This is mainly for UV absorption research.
     ColInfo("o3_col", "total column ozone above", "DU", ("9999", "99999", "99.999")),
     #
     # "O3 Uncert"
-    # TODO: uncertainty in which ozone value?
+    # Estimated uncertainty in the ozone measurement at a given altitude.
     ColInfo("o3_uncert", "uncertainty in ozone", "%", ("99999.000", "99.999")),
 ]
 

From a1f86b2edb943cde2929e4b6f4d26a9ef68f0696 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 10:57:12 -0700
Subject: [PATCH 31/51] Support skipping files that error

So far, there seem to be two reasons a file fails to load:
- some files have only one header block, consisting of some info lines
  and then the key-value meta directly after
- some files don't have the o3 uncert column
---
 monetio/profile/gml_ozonesonde.py | 36 +++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index c9324cd2..69dbb48a 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -124,7 +124,7 @@ def get_files(place):
     return df
 
 
-def add_data(dates, *, place=None, n_procs=1):
+def add_data(dates, *, place=None, n_procs=1, errors="raise"):
     """Retrieve and load GML ozonesonde data as a DataFrame.
 
     Parameters
@@ -135,6 +135,7 @@ def add_data(dates, *, place=None, n_procs=1):
         If not provided, all places will be used.
     n_procs : int
         For Dask.
+    errors : {'raise', 'warn', 'ignore'}
     """
     import dask
     import dask.dataframe as dd
@@ -142,6 +143,9 @@ def add_data(dates, *, place=None, n_procs=1):
     dates = pd.DatetimeIndex(dates)
     dates_min, dates_max = dates.min(), dates.max()
 
+    if errors not in {"raise", "warn", "ignore"}:
+        raise ValueError(f"Invalid errors setting: {errors!r}.")
+
     print("Discovering files...")
     df_urls = discover_files(place=place)
     print(f"Discovered {len(df_urls)} 100-m files.")
@@ -151,9 +155,21 @@ def add_data(dates, *, place=None, n_procs=1):
     if not urls:
         raise RuntimeError(f"No files found for dates {dates_min} to {dates_max}, place={place}.")
 
+    def func(fp_or_url):
+        try:
+            return read_100m(fp_or_url)
+        except Exception as e:
+            msg = f"Failed to read {fp_or_url}: {e}"
+            if errors == "raise":
+                raise RuntimeError(msg) from e
+            else:
+                if errors == "warn":
+                    warnings.warn(msg)
+                return pd.DataFrame()
+
     print(f"Aggregating {len(urls)} files...")
-    dfs = [dask.delayed(read_100m)(f) for f in urls]
-    dff = dd.from_delayed(dfs)
+    dfs = [dask.delayed(func)(url) for url in urls]
+    dff = dd.from_delayed(dfs, verify_meta=errors == "raise")
     df = dff.compute(num_workers=n_procs).reset_index()
 
     # Time subset again in case of times in files extending
@@ -260,7 +276,10 @@ def get_text():
             return text
 
     blocks = get_text().replace("\r", "").split("\n\n")
-    assert len(blocks) == 5
+    nblocks = len(blocks)
+    if not nblocks == 5:
+        heads = "\n".join("\n".join(b.splitlines()[:2] + ["..."]) for b in blocks)
+        raise ValueError(f"Expected 5 blocks, got {nblocks}:\n{heads}")
 
     # Metadata
     meta = {}
@@ -281,7 +300,7 @@ def get_text():
     for k, v in meta.items():
         meta[k] = re.sub(r"\s{2,}", " ", v)
 
-    assert list(meta) == [
+    meta_keys_expected = [
         "Station",
         "Station Height",
         "Latitude",
@@ -298,8 +317,13 @@ def get_text():
         "Sonde Total O3",
         "Sonde Total O3 (SBUV)",
     ]
+    if not list(meta) == meta_keys_expected:
+        raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.")
 
-    assert len(blocks[4].splitlines()[2].split()) == len(COL_INFO_L100) == 14
+    data_block_ncol = len(blocks[4].splitlines()[2].split())
+    if not data_block_ncol == len(COL_INFO_L100) == 14:
+        head = "\n".join(blocks[4].splitlines()[:4] + ["..."])
+        raise ValueError(f"Expected 14 columns in data block, got {data_block_ncol}:\n{head}")
 
     names = [c.name for c in COL_INFO_L100]
     dtype = {c.name: float for c in COL_INFO_L100}

From 7ba055de90398f4087c4af11b82219213c525a49 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 12:08:01 -0700
Subject: [PATCH 32/51] Support last col missing

---
 monetio/profile/gml_ozonesonde.py | 51 +++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 69dbb48a..59b5abb2 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -320,15 +320,54 @@ def get_text():
     if not list(meta) == meta_keys_expected:
         raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.")
 
+    data_head1 = blocks[4].splitlines()[0]  # TODO: without splitlines? maybe startswith
+    data_head1_split = data_head1.split()
+    data_head1_split_expected = [
+        "Level",
+        "Press",
+        "Alt",
+        "Pottp",
+        "Temp",
+        "FtempV",
+        "Hum",
+        "Ozone",
+        "Ozone",
+        "Ozone",
+        "Ptemp",
+        "O3",
+        "#",
+        "DN",
+        "O3",
+        "Res",
+        "O3",
+        "Uncert",
+    ]
+    if not (
+        data_head1_split == data_head1_split_expected[:-2]
+        or data_head1_split == data_head1_split_expected
+    ):
+        raise ValueError(
+            f"Expected data header line 1 like\n{' '.join(data_head1_split_expected)} "
+            f"(O3 Uncert allowed to be missing)\ngot\n{' '.join(data_head1_split)}"
+        )
+    have_uncert = len(data_head1_split) == len(data_head1_split_expected)
+
+    col_info = COL_INFO_L100[:]
+    if not have_uncert:
+        _ = col_info.pop()
+
+    ncol_expected = len(col_info)
     data_block_ncol = len(blocks[4].splitlines()[2].split())
-    if not data_block_ncol == len(COL_INFO_L100) == 14:
+    if not data_block_ncol == ncol_expected:
         head = "\n".join(blocks[4].splitlines()[:4] + ["..."])
-        raise ValueError(f"Expected 14 columns in data block, got {data_block_ncol}:\n{head}")
+        raise ValueError(
+            f"Expected {ncol_expected} columns in data block, " f"got {data_block_ncol}:\n{head}"
+        )
 
-    names = [c.name for c in COL_INFO_L100]
-    dtype = {c.name: float for c in COL_INFO_L100}
+    names = [c.name for c in col_info]
+    dtype = {c.name: float for c in col_info}
     dtype["lev"] = int
-    na_values = {c.name: c.na_val for c in COL_INFO_L100 if c.na_val is not None}
+    na_values = {c.name: c.na_val for c in col_info if c.na_val is not None}
 
     df = pd.read_csv(
         StringIO(blocks[4]),
@@ -362,7 +401,7 @@ def get_text():
                 "long_name": c.long_name,
                 "units": c.units,
             }
-            for c in COL_INFO_L100
+            for c in col_info
         }
 
     return df

From 6a3dbe2dbf84a8cf17e34a32a95ba274dadf188d Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 12:23:06 -0700
Subject: [PATCH 33/51] Check data block header lines with str

Stricter, but should be faster, and luckily the two header variants
(with and without 'O3 Uncert') seem to cover all files
---
 monetio/profile/gml_ozonesonde.py | 57 ++++++++++++++-----------------
 tests/test_gml_ozonesonde.py      | 14 ++++++++
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 59b5abb2..c28be378 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -251,6 +251,17 @@ class ColInfo(NamedTuple):
 ]
 
 
+_DATA_BLOCK_START_L100 = """\
+Level   Press    Alt   Pottp   Temp   FtempV   Hum  Ozone  Ozone   Ozone  Ptemp  O3 # DN O3 Res  O3 Uncert
+ Num     hPa      km     K      C       C       %    mPa    ppmv   atmcm    C   10^11/cc   DU          %
+"""
+
+_DATA_BLOCK_START_L100_NO_UNCERT = """\
+Level   Press    Alt   Pottp   Temp   FtempV   Hum  Ozone  Ozone   Ozone  Ptemp  O3 # DN O3 Res
+ Num     hPa      km     K      C       C       %    mPa    ppmv   atmcm    C   10^11/cc   DU
+"""
+
+
 def read_100m(fp_or_url):
     """Read a GML ozonesonde 100-m file (``.l100``).
 
@@ -320,46 +331,28 @@ def get_text():
     if not list(meta) == meta_keys_expected:
         raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.")
 
-    data_head1 = blocks[4].splitlines()[0]  # TODO: without splitlines? maybe startswith
-    data_head1_split = data_head1.split()
-    data_head1_split_expected = [
-        "Level",
-        "Press",
-        "Alt",
-        "Pottp",
-        "Temp",
-        "FtempV",
-        "Hum",
-        "Ozone",
-        "Ozone",
-        "Ozone",
-        "Ptemp",
-        "O3",
-        "#",
-        "DN",
-        "O3",
-        "Res",
-        "O3",
-        "Uncert",
-    ]
-    if not (
-        data_head1_split == data_head1_split_expected[:-2]
-        or data_head1_split == data_head1_split_expected
-    ):
+    data_block = blocks[4]
+    if data_block.startswith(_DATA_BLOCK_START_L100):
+        have_uncert = True
+    elif data_block.startswith(_DATA_BLOCK_START_L100_NO_UNCERT):
+        have_uncert = False
+    else:
+        head = "\n".join(data_block.splitlines()[:2] + ["..."])
         raise ValueError(
-            f"Expected data header line 1 like\n{' '.join(data_head1_split_expected)} "
-            f"(O3 Uncert allowed to be missing)\ngot\n{' '.join(data_head1_split)}"
+            "Data block does not start with expected header line(s) "
+            "(O3 Uncert allowed to be missing):\n"
+            f"{_DATA_BLOCK_START_L100}\n"
+            f"got\n{head}"
         )
-    have_uncert = len(data_head1_split) == len(data_head1_split_expected)
 
     col_info = COL_INFO_L100[:]
     if not have_uncert:
         _ = col_info.pop()
 
     ncol_expected = len(col_info)
-    data_block_ncol = len(blocks[4].splitlines()[2].split())
+    data_block_ncol = len(data_block[:400].splitlines()[2].split())
     if not data_block_ncol == ncol_expected:
-        head = "\n".join(blocks[4].splitlines()[:4] + ["..."])
+        head = "\n".join(data_block.splitlines()[:4] + ["..."])
         raise ValueError(
             f"Expected {ncol_expected} columns in data block, " f"got {data_block_ncol}:\n{head}"
         )
@@ -370,7 +363,7 @@ def get_text():
     na_values = {c.name: c.na_val for c in col_info if c.na_val is not None}
 
     df = pd.read_csv(
-        StringIO(blocks[4]),
+        StringIO(data_block),
         skiprows=2,
         header=None,
         delimiter=r"\s+",
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 8f3e7898..dd754b05 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -26,6 +26,20 @@ def test_read_100m():
     assert df.attrs["ds_attrs"]["Sonde Total O3 (SBUV)"] == "325 (62) DU"
 
 
+@pytest.mark.parametrize(
+    "url",
+    [
+        # Missing 'O3 Uncert'
+        r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/San%20Cristobal,%20Galapagos/100%20Meter%20Average%20Files/sc204_2002_02_01_03.l100",
+        # Missing 'O3 Uncert' + different header blocks (only 1)
+        r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Narragansett,%20Rhode%20Island/100%20Meter%20Average%20Files/ri058_2004_08_05_18.l100",
+    ],
+)
+def test_read_100m_nonstd(url):
+    df = gml_ozonesonde.read_100m(url)
+    assert len(df) > 0
+
+
 def test_add_data():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     df = gml_ozonesonde.add_data(dates, n_procs=2)

From a3981903e7a74845a788a72874dd265ea3e84ddd Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 12:44:20 -0700
Subject: [PATCH 34/51] Support 2-block case

---
 monetio/profile/gml_ozonesonde.py | 32 ++++++++++++++++++++++---------
 tests/test_gml_ozonesonde.py      | 11 +++++++++++
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index c28be378..6e7ce446 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -288,13 +288,25 @@ def get_text():
 
     blocks = get_text().replace("\r", "").split("\n\n")
     nblocks = len(blocks)
-    if not nblocks == 5:
+    if nblocks == 5:  # normal
+        meta_block = blocks[3]
+        data_block = blocks[4]
+    elif nblocks == 2:
+        block_lines = blocks[0].splitlines()
+        for i, line in enumerate(block_lines):
+            if line.startswith(("Station:", "Station: ", "Station  ")):
+                break
+        else:
+            raise ValueError(f"Expected to find metadata to start with Station, got:\n{blocks[0]}")
+        meta_block = "\n".join(block_lines[i:])
+        data_block = blocks[1]
+    else:
         heads = "\n".join("\n".join(b.splitlines()[:2] + ["..."]) for b in blocks)
-        raise ValueError(f"Expected 5 blocks, got {nblocks}:\n{heads}")
+        raise ValueError(f"Expected 2 or 5 blocks, got {nblocks}:\n{heads}")
 
     # Metadata
     meta = {}
-    todo = blocks[3].splitlines()[::-1]
+    todo = meta_block.splitlines()[::-1]
     on_val_side = ["Background: ", "Flowrate: ", "RH Corr: ", "Sonde Total O3 (SBUV): "]
     while todo:
         line = todo.pop()
@@ -319,8 +331,9 @@ def get_text():
         "Flight Number",
         "Launch Date",
         "Launch Time",
-        "Radiosonde Type",
-        "Radiosonde Num",
+        # May see 'Vaisala number' and 'Vaisala humicap' instead of these two:
+        # "Radiosonde Type",
+        # "Radiosonde Num",
         "O3 Sonde ID",
         "Background",
         "Flowrate",
@@ -328,10 +341,9 @@ def get_text():
         "Sonde Total O3",
         "Sonde Total O3 (SBUV)",
     ]
-    if not list(meta) == meta_keys_expected:
+    if not set(meta) >= set(meta_keys_expected):
         raise ValueError(f"Expected metadata keys {meta_keys_expected}, got {list(meta)}.")
 
-    data_block = blocks[4]
     if data_block.startswith(_DATA_BLOCK_START_L100):
         have_uncert = True
     elif data_block.startswith(_DATA_BLOCK_START_L100_NO_UNCERT):
@@ -354,8 +366,10 @@ def get_text():
     if not data_block_ncol == ncol_expected:
         head = "\n".join(data_block.splitlines()[:4] + ["..."])
         raise ValueError(
-            f"Expected {ncol_expected} columns in data block, " f"got {data_block_ncol}:\n{head}"
+            f"Expected {ncol_expected} columns in data block, "
+            f"got {data_block_ncol} in first data line:\n{head}"
         )
+        # TODO: allow pandas to skip bad lines with `on_bad_lines='skip'`?
 
     names = [c.name for c in col_info]
     dtype = {c.name: float for c in col_info}
@@ -380,7 +394,7 @@ def get_text():
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
-    df["station"] = meta["Station"]  # TODO: could normalize to place
+    df["station"] = meta["Station"]  # TODO: could normalize to place (in add_data?)?
     df["station_height_str"] = meta["Station Height"]  # e.g. '1743 meters'
     df["o3_tot_cmr_str"] = meta["Sonde Total O3"]
     df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]  # e.g. '325 (62) DU'
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index dd754b05..3ffcbb5b 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -40,6 +40,17 @@ def test_read_100m_nonstd(url):
     assert len(df) > 0
 
 
+def test_read_100m_bad_data_line():
+    url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/San%20Cristobal,%20Galapagos/100%20Meter%20Average%20Files/sc204_2002_01_31_12.l100"
+    # Level   Press    Alt   Pottp   Temp   FtempV   Hum  Ozone  Ozone   Ozone  Ptemp  O3 # DN O3 Res
+    #  Num     hPa      km     K      C       C       %    mPa    ppmv   atmcm    C   10^11/cc   DU
+    #    0 -6331.0   0.008     0.0-3323.0   999.9    999-6666.00 10.529  0.0000  -91.8 1583.081    260
+    #    1   892.2   0.100   301.1   18.3    19.1    105   1.07  0.012  0.0009   32.3    2.649    259
+
+    with pytest.raises(ValueError, match="Expected 13 columns in data block"):
+        _ = gml_ozonesonde.read_100m(url)
+
+
 def test_add_data():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     df = gml_ozonesonde.add_data(dates, n_procs=2)

From ba349031f06aad0128ab53f8f0372ebea1d55934 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 13:19:13 -0700
Subject: [PATCH 35/51] Add different data header case

Besides this file and the one with the bad data line, all other files
load now (tested on Hopper)
---
 tests/test_gml_ozonesonde.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 3ffcbb5b..52ad5075 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -51,6 +51,15 @@ def test_read_100m_bad_data_line():
         _ = gml_ozonesonde.read_100m(url)
 
 
+def test_read_100m_bad_header_line():
+    url = r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bu913_2021_08_10_16.l100"
+    # Level   Press    Alt   Pottp   Temp   FtempV   Hum  Ozone  Ozone   Ozone  Ptemp  O3 # DN O3 Res   Ftemp   Water
+    #  Num     hPa      km     K      C       C       %    mPa    ppmv   atmcm    C   10^11/cc   DU       C      ppmv
+
+    with pytest.raises(ValueError, match="Data block does not start with expected header"):
+        _ = gml_ozonesonde.read_100m(url)
+
+
 def test_add_data():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     df = gml_ozonesonde.add_data(dates, n_procs=2)

From ba6fb7aab9b5a6e8b0bfcfc25684e8bf2517c236 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 13:20:03 -0700
Subject: [PATCH 36/51] Normalize station to the place names

---
 monetio/profile/gml_ozonesonde.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 6e7ce446..729822ef 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -175,6 +175,36 @@ def func(fp_or_url):
     # Time subset again in case of times in files extending
     df = df[df["time"].between(dates_min, dates_max, inclusive="both")]
 
+    # Normalize station
+    # All values, as of 2024-02-08:
+    # > df.station.value_counts().sort_index()
+    # Boulder, CO                          650757
+    # Hilo, Hawaii                         627325
+    # Hilo,Hawaii                             192
+    # Huntsville                            10982
+    # Huntsville, AL                       314375
+    # Mauna Loa Observatory, Hawaii           605 (different site than Hilo)
+    # Pago Pago, American Samoa            370141
+    # San Cristobal, Galapagos, Ecuador    150244
+    # South Pole                           661422
+    # Summit, Greenland                    164620
+    # Suva, Fiji                           164065
+    # Trinidad Head, CA                    426409
+    # University of Rhode Island           105878
+    # helikite test                           326
+    # hsv                                     340
+    repl = {
+        "Boulder, CO": "Boulder, Colorado",
+        "Hilo,Hawaii": "Hilo, Hawaii",
+        "Huntsville": "Huntsville, Alabama",
+        "Huntsville, AL": "Huntsville, Alabama",
+        "San Cristobal, Galapagos, Ecuador": "San Cristobal, Galapagos",
+        "South Pole": "South Pole, Antarctica",
+        "Trinidad Head, CA": "Trinidad Head, California",
+    }
+    assert set(repl.values()) <= set(PLACES)
+    df["station"] = df["station"].replace(repl)
+
     # Add metadata
     if hasattr(df, "attrs"):
         df.attrs["ds_attrs"] = {"urls": urls}

From 37dfbef9d89a490f83b516ec8c98211afdc8b218 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 13:24:14 -0700
Subject: [PATCH 37/51] todo

---
 monetio/profile/gml_ozonesonde.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 729822ef..e8cc0ce8 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -424,11 +424,11 @@ def get_text():
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
-    df["station"] = meta["Station"]  # TODO: could normalize to place (in add_data?)?
+    df["station"] = meta["Station"]
     df["station_height_str"] = meta["Station Height"]  # e.g. '1743 meters'
     df["o3_tot_cmr_str"] = meta["Sonde Total O3"]
     df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]  # e.g. '325 (62) DU'
-    # TODO: '99999 (99999) DU' if NA, could put empty string instead?
+    # TODO: '99999 (99999) DU' if NA, could put empty string or None or NaN instead?
 
     # Add metadata
     if hasattr(df, "attrs"):

From 0ade35e0afe03291ece02f1acb7267db8e985496 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 13:36:05 -0700
Subject: [PATCH 38/51] Fix index in aggregated frame

---
 monetio/profile/gml_ozonesonde.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index e8cc0ce8..5408306b 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -216,7 +216,7 @@ def func(fp_or_url):
             for c in COL_INFO_L100
         }
 
-    return df
+    return df.drop(columns=["index"], errors="ignore").reset_index(drop=True)
 
 
 class ColInfo(NamedTuple):

From 251dcc38f82dd70d99c64fbc5e27c2688b39b1dc Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 8 Feb 2024 14:59:01 -0700
Subject: [PATCH 39/51] doc [skip ci]

---
 monetio/profile/gml_ozonesonde.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 5408306b..cf15a8ee 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -130,12 +130,15 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"):
     Parameters
     ----------
     dates : sequence of datetime-like
+        The period between the min and max (both inclusive)
+        will be used to select the files to load.
     place : str or sequence of str, optional
         For example 'Boulder, Colorado'.
         If not provided, all places will be used.
     n_procs : int
         For Dask.
-    errors : {'raise', 'warn', 'ignore'}
+    errors : {'raise', 'warn', 'skip'}
+        What to do when there is an error reading a file.
     """
     import dask
     import dask.dataframe as dd

From 80eeec445ac31220902a4146d6530adac1966d81 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Fri, 9 Feb 2024 14:10:47 -0700
Subject: [PATCH 40/51] Add NA vals for altitude

found a few cases of 99.9
---
 monetio/profile/gml_ozonesonde.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index cf15a8ee..439fb38d 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -240,7 +240,7 @@ class ColInfo(NamedTuple):
     #
     # "Alt"
     # TODO: not sure about this na val
-    ColInfo("altitude", "altitude", "km", "999.999"),
+    ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")),
     #
     # "Pottp"
     ColInfo("theta", "potential temperature", "K", "9999.9"),

From 9b2e264255935586e0a1fb1bbebd9696c2b8153a Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 11:26:22 -0700
Subject: [PATCH 41/51] notes

based on email from Bryan Johnson (NOAA GML)
---
 monetio/profile/gml_ozonesonde.py | 33 ++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 439fb38d..65cf4223 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -236,36 +236,45 @@ class ColInfo(NamedTuple):
     ColInfo("lev", "level", "", None),
     #
     # "Press"
-    ColInfo("press", "radiosonde corrected pressure", "hPa", "9999.9"),
+    # Atmospheric pressure, from the radiosonde.
+    ColInfo("press", "pressure", "hPa", "9999.9"),
     #
     # "Alt"
-    # TODO: not sure about this na val
+    # Altitude above sea level
+    # computed from radiosonde pressure and temperature
+    # (or GPS if available?).
     ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")),
     #
     # "Pottp"
     ColInfo("theta", "potential temperature", "K", "9999.9"),
     #
     # "Temp"
-    ColInfo("temp", "radiosonde corrected temperature", "degC", "999.9"),
+    # Atmospheric temperature, from the radiosonde.
+    ColInfo("temp", "air temperature", "degC", "999.9"),
     #
     # "FtempV"
-    ColInfo("ftempv", "frost point temperature (radiosonde)", "degC", "999.9"),
+    # Frost point temperature, calculated from the radiosonde RH and temp.
+    ColInfo("ftempv", "frost point temperature", "degC", "999.9"),
     #
     # "Hum"
-    ColInfo("rh", "radiosonde corrected relative humidity", "%", "999"),
+    # RH, measured by the radiosonde.
+    ColInfo("rh", "relative humidity", "%", "999"),
     #
     # "Ozone"
+    # Measured by the ozone sensor cell.
     ColInfo("o3_press", "ozone partial pressure", "mPa", "99.90"),
     #
     # "Ozone"
+    # Calculated from the ozone partial pressure and atmospheric pressure.
     ColInfo("o3", "ozone mixing ratio", "ppmv", "99.999"),
     #
     # "Ozone"
     # Note 1 DU = 0.001 atm-cm
-    # TODO: goes up with height so could be ozone below?
-    ColInfo("o3_cm", "total ozone", "atm-cm", "99.9990"),
+    # Cumulative column ozone amount at this point in the profile.
+    ColInfo("o3_int", "integrated ozone below", "atm-cm", "99.9990"),
     #
     # "Ptemp"
+    # Pump temperature, from thermistor in the vicinity of the pump block.
     ColInfo("ptemp", "pump temperature", "degC", "999.9"),
     #
     # "O3 # DN"
@@ -276,7 +285,7 @@ class ColInfo(NamedTuple):
     #   This is the amount of ozone in Dobson units above a given altitude.
     #   The values above the maximum balloon altitude are from a climatology.
     #   This is mainly for UV absorption research.
-    ColInfo("o3_col", "total column ozone above", "DU", ("9999", "99999", "99.999")),
+    ColInfo("o3_res", "estimated total column ozone above", "DU", ("9999", "99999", "99.999")),
     #
     # "O3 Uncert"
     # Estimated uncertainty in the ozone measurement at a given altitude.
@@ -429,8 +438,14 @@ def get_text():
     df["longitude"] = float(meta["Longitude"])
     df["station"] = meta["Station"]
     df["station_height_str"] = meta["Station Height"]  # e.g. '1743 meters'
+
+    # Sonde total column ozone amount ('325 (62) DU') in two methods:
+    # - CMR: extrapolate constant mixing ratio above balloon burst to get ozone above that (the residual)
+    # - SBUV: compute the residual from the SBUV climate tables
+    # The first number is the total column ozone (integrated + residual).
+    # The number in parentheses is the residual.
     df["o3_tot_cmr_str"] = meta["Sonde Total O3"]
-    df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]  # e.g. '325 (62) DU'
+    df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]
     # TODO: '99999 (99999) DU' if NA, could put empty string or None or NaN instead?
 
     # Add metadata

From 5af1ba8f7f0846d03158a1ac054a89f8d4b11c8f Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 11:32:49 -0700
Subject: [PATCH 42/51] Set '99999 (99999) DU' total col to NaN

---
 monetio/profile/gml_ozonesonde.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 65cf4223..868d9a84 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -428,25 +428,24 @@ def get_text():
         na_values=na_values,
     )
 
-    # Note: This is close to "Pottp" but not exactly the same
-    # theta_calc = (df.temp + 273.15) * (df.press / 1000) ** (-0.286)
-
     # Add some variables from header (these don't change in the profile)
     time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])
     df["longitude"] = float(meta["Longitude"])
+
     df["station"] = meta["Station"]
     df["station_height_str"] = meta["Station Height"]  # e.g. '1743 meters'
 
-    # Sonde total column ozone amount ('325 (62) DU') in two methods:
+    # Sonde total column ozone amount ('325 (62) DU') from two methods:
     # - CMR: extrapolate constant mixing ratio above balloon burst to get ozone above that (the residual)
     # - SBUV: compute the residual from the SBUV climate tables
     # The first number is the total column ozone (integrated + residual).
     # The number in parentheses is the residual.
     df["o3_tot_cmr_str"] = meta["Sonde Total O3"]
     df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]
-    # TODO: '99999 (99999) DU' if NA, could put empty string or None or NaN instead?
+    for col in ["o3_tot_cmr_str", "o3_tot_sbuv_str"]:
+        df[col] = df[col].replace("99999 (99999) DU", np.nan)
 
     # Add metadata
     if hasattr(df, "attrs"):

From 72c6da4c9a0877de30990bd4d8cdb5c456ca43bc Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 11:38:26 -0700
Subject: [PATCH 43/51] notes

---
 monetio/profile/gml_ozonesonde.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 868d9a84..5426ed6c 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -175,7 +175,7 @@ def func(fp_or_url):
     dff = dd.from_delayed(dfs, verify_meta=errors == "raise")
     df = dff.compute(num_workers=n_procs).reset_index()
 
-    # Time subset again in case of times in files extending
+    # Time subset again just in case
     df = df[df["time"].between(dates_min, dates_max, inclusive="both")]
 
     # Normalize station
@@ -404,12 +404,12 @@ def get_text():
         _ = col_info.pop()
 
     ncol_expected = len(col_info)
-    data_block_ncol = len(data_block[:400].splitlines()[2].split())
-    if not data_block_ncol == ncol_expected:
+    data_block_first_ncol = len(data_block[:400].splitlines()[2].split())
+    if not data_block_first_ncol == ncol_expected:
         head = "\n".join(data_block.splitlines()[:4] + ["..."])
         raise ValueError(
             f"Expected {ncol_expected} columns in data block, "
-            f"got {data_block_ncol} in first data line:\n{head}"
+            f"got {data_block_first_ncol} in first data line:\n{head}"
         )
         # TODO: allow pandas to skip bad lines with `on_bad_lines='skip'`?
 
@@ -428,7 +428,7 @@ def get_text():
         na_values=na_values,
     )
 
-    # Add some variables from header (these don't change in the profile)
+    # Add some variables from header as columns (these don't change in the profile)
     time = pd.Timestamp(f"{meta['Launch Date']} {meta['Launch Time']}")
     df["time"] = time.tz_localize(None)
     df["latitude"] = float(meta["Latitude"])

From 204d487dc87cca6c3cf3fcc5889855bfe9a7e519 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 11:43:25 -0700
Subject: [PATCH 44/51] Ensure tot cols still object dtype

so that the dtype of these columns stays consistent across files
when Dask checks the meta of the delayed DataFrames (`verify_meta`)
---
 monetio/profile/gml_ozonesonde.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 5426ed6c..12caf312 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -445,7 +445,7 @@ def get_text():
     df["o3_tot_cmr_str"] = meta["Sonde Total O3"]
     df["o3_tot_sbuv_str"] = meta["Sonde Total O3 (SBUV)"]
     for col in ["o3_tot_cmr_str", "o3_tot_sbuv_str"]:
-        df[col] = df[col].replace("99999 (99999) DU", np.nan)
+        df[col] = df[col].replace("99999 (99999) DU", np.nan).astype(object)
 
     # Add metadata
     if hasattr(df, "attrs"):

From d8670f1ddadbed4f381219b9b3ed3463ad04cd68 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 13:17:01 -0700
Subject: [PATCH 45/51] notes [skip ci]

---
 monetio/profile/gml_ozonesonde.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 12caf312..00b08b2f 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -241,8 +241,9 @@ class ColInfo(NamedTuple):
     #
     # "Alt"
     # Altitude above sea level
-    # computed from radiosonde pressure and temperature
-    # (or GPS if available?).
+    # in the sounding computed from radiosonde pressure and temperature (or GPS if available?).
+    # For 100-m data, the center of the 100-m layer
+    # (data values included in the layer averages have altitude +/- 50 m of this).
     ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")),
     #
     # "Pottp"

From dd7dc3122ab1f027e4651ec0fc4d54a5687c73ac Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 13:26:19 -0700
Subject: [PATCH 46/51] notes [skip ci]

---
 monetio/profile/gml_ozonesonde.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 00b08b2f..cd8447b8 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -244,6 +244,9 @@ class ColInfo(NamedTuple):
     # in the sounding computed from radiosonde pressure and temperature (or GPS if available?).
     # For 100-m data, the center of the 100-m layer
     # (data values included in the layer averages have altitude +/- 50 m of this).
+    # If not invalid and removed, the first row is the actual altitude of the launch,
+    # and the next row begins the clean 100-m intervals
+    # (i.e. first diff may not be 0.1, but the rest should be).
     ColInfo("altitude", "altitude", "km", ("99.9", "999.9", "99.999", "999.999")),
     #
     # "Pottp"

From 8ce5182aba50e88b979fc2af767ddd7c29ed0b18 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 14:13:11 -0700
Subject: [PATCH 47/51] Add flight number to df

otherwise can't differentiate launches at same time and place
---
 monetio/profile/gml_ozonesonde.py |  2 ++
 tests/test_gml_ozonesonde.py      | 17 +++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index cd8447b8..29e55449 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -176,6 +176,7 @@ def func(fp_or_url):
     df = dff.compute(num_workers=n_procs).reset_index()
 
     # Time subset again just in case
+    # (file time may not match launch time; file time seems to be floored to nearest hour)
     df = df[df["time"].between(dates_min, dates_max, inclusive="both")]
 
     # Normalize station
@@ -440,6 +441,7 @@ def get_text():
 
     df["station"] = meta["Station"]
     df["station_height_str"] = meta["Station Height"]  # e.g. '1743 meters'
+    df["flight_number"] = meta["Flight Number"]
 
     # Sonde total column ozone amount ('325 (62) DU') from two methods:
     # - CMR: extrapolate constant mixing ratio above balloon burst to get ozone above that (the residual)
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 52ad5075..efe99b6c 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -95,3 +95,20 @@ def test_add_data_invalid_place(place):
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     with pytest.raises(ValueError, match="Invalid place"):
         _ = gml_ozonesonde.add_data(dates, place=place)
+
+
+def test_same_place_and_launch_time():
+    # Two files with same file time and launch time:
+    # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl774_2003_03_10_20.l100
+    # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl775_2003_03_10_20.l100
+    # File time: 2003-03-10 20
+    # Launch time: 2003-03-10 20:41:11
+    dates = ["2003-03-10 20", "2003-03-10 21"]
+    df = gml_ozonesonde.add_data(dates, place="Boulder, Colorado", n_procs=2)
+    assert len(df) > 0
+
+    # Only one launch time
+    assert df["time"].nunique() == 1
+
+    # But multiple profiles
+    assert df["flight_number"].nunique() == 2

From cad3899ef3bb121b65b9b600909920d254deb78e Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Wed, 14 Feb 2024 16:59:31 -0700
Subject: [PATCH 48/51] Test URLs

---
 tests/test_gml_ozonesonde.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index efe99b6c..6d2f6969 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -112,3 +112,8 @@ def test_same_place_and_launch_time():
 
     # But multiple profiles
     assert df["flight_number"].nunique() == 2
+
+    assert df.attrs["ds_attrs"]["urls"] == [
+        r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl774_2003_03_10_20.l100",
+        r"https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl775_2003_03_10_20.l100",
+    ]

From 26250bdc9bebc6ed5bff12b47094beae7b673433 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 11 Apr 2024 16:39:01 -0600
Subject: [PATCH 49/51] More info about place arg

---
 monetio/profile/gml_ozonesonde.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index 29e55449..e5f0af91 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -135,6 +135,8 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"):
     place : str or sequence of str, optional
         For example 'Boulder, Colorado'.
         If not provided, all places will be used.
+        Valid options correspond to the directories in https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
+        and may include data from more than one unique site ('station').
     n_procs : int
         For Dask.
     errors : {'raise', 'warn', 'skip'}

From 2751b13312f723f963f13f9212670471b3e752d4 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 11 Apr 2024 16:47:45 -0600
Subject: [PATCH 50/51] `place` -> `location`

'location' is fancier sounding than 'place'.

Not using 'siteid' for the argument name, since one folder's data can have
multiple unique 'station' values (which are akin to 'siteid').
---
 monetio/profile/gml_ozonesonde.py | 64 ++++++++++++++++---------------
 tests/test_gml_ozonesonde.py      | 20 +++++-----
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index e5f0af91..c993b3d8 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -40,7 +40,7 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
-PLACES = [
+LOCATIONS = [
     "Boulder, Colorado",
     "Hilo, Hawaii",
     "Huntsville, Alabama",
@@ -54,37 +54,37 @@ def wrapper(*args, **kwargs):
 ]
 
 
-_FILES_L100_CACHE = {place: None for place in PLACES}
+_FILES_L100_CACHE = {location: None for location in LOCATIONS}
 
 
-def discover_files(place=None, *, n_threads=3, cache=True):
+def discover_files(location=None, *, n_threads=3, cache=True):
     import itertools
     from multiprocessing.pool import ThreadPool
 
     base = "https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde"
 
-    if place is None:
-        places = PLACES
-    elif isinstance(place, str):
-        places = [place]
+    if location is None:
+        locations = LOCATIONS
+    elif isinstance(location, str):
+        locations = [location]
     else:
-        places = place
+        locations = location
 
-    invalid = set(places) - set(PLACES)
+    invalid = set(locations) - set(LOCATIONS)
     if invalid:
-        raise ValueError(f"Invalid place(s): {invalid}. Valid options: {PLACES}.")
+        raise ValueError(f"Invalid location(s): {invalid}. Valid options: {LOCATIONS}.")
 
     @retry
-    def get_files(place):
-        cached = _FILES_L100_CACHE[place]
+    def get_files(location):
+        cached = _FILES_L100_CACHE[location]
         if cached is not None:
             return cached
 
-        if place == "South Pole, Antarctica":
-            url_place = "South Pole, Antartica"  # note sp
+        if location == "South Pole, Antarctica":
+            url_location = "South Pole, Antartica"  # note sp
         else:
-            url_place = place
-        url = f"{base}/{url_place}/100 Meter Average Files/".replace(" ", "%20")
+            url_location = location
+        url = f"{base}/{url_location}/100 Meter Average Files/".replace(" ", "%20")
         print(url)
 
         r = requests.get(url, timeout=10)
@@ -103,28 +103,28 @@ def get_files(place):
             except ValueError:
                 warnings.warn(f"Failed to parse file name {fn!r} for time.")
                 t = np.nan
-            data.append((place, t, fn, f"{url}{fn}"))
+            data.append((location, t, fn, f"{url}{fn}"))
 
         if not data:
-            warnings.warn(f"No files detected for place {place!r}.")
+            warnings.warn(f"No files detected for location {location!r}.")
 
         return data
 
-    with ThreadPool(processes=min(n_threads, len(places))) as pool:
-        data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, places)))
+    with ThreadPool(processes=min(n_threads, len(locations))) as pool:
+        data = list(itertools.chain.from_iterable(pool.imap_unordered(get_files, locations)))
 
-    df = pd.DataFrame(data, columns=["place", "time", "fn", "url"])
+    df = pd.DataFrame(data, columns=["location", "time", "fn", "url"])
 
     if cache:
-        for place in places:
-            _FILES_L100_CACHE[place] = list(
-                df[df["place"] == place].itertuples(index=False, name=None)
+        for location in locations:
+            _FILES_L100_CACHE[location] = list(
+                df[df["location"] == location].itertuples(index=False, name=None)
             )
 
     return df
 
 
-def add_data(dates, *, place=None, n_procs=1, errors="raise"):
+def add_data(dates, *, location=None, n_procs=1, errors="raise"):
     """Retrieve and load GML ozonesonde data as a DataFrame.
 
     Parameters
@@ -132,11 +132,11 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"):
     dates : sequence of datetime-like
         The period between the min and max (both inclusive)
         will be used to select the files to load.
-    place : str or sequence of str, optional
+    location : str or sequence of str, optional
         For example 'Boulder, Colorado'.
-        If not provided, all places will be used.
+        If not provided, all locations will be used.
         Valid options correspond to the directories in https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
-        and may include data from more than one unique site ('station').
+        and may include data from more than one unique site (output column 'station').
     n_procs : int
         For Dask.
     errors : {'raise', 'warn', 'skip'}
@@ -152,13 +152,15 @@ def add_data(dates, *, place=None, n_procs=1, errors="raise"):
         raise ValueError(f"Invalid errors setting: {errors!r}.")
 
     print("Discovering files...")
-    df_urls = discover_files(place=place)
+    df_urls = discover_files(location=location)
     print(f"Discovered {len(df_urls)} 100-m files.")
 
     urls = df_urls[df_urls["time"].between(dates_min, dates_max, inclusive="both")]["url"].tolist()
 
     if not urls:
-        raise RuntimeError(f"No files found for dates {dates_min} to {dates_max}, place={place}.")
+        raise RuntimeError(
+            f"No files found for dates {dates_min} to {dates_max}, location={location!r}."
+        )
 
     def func(fp_or_url):
         try:
@@ -208,7 +210,7 @@ def func(fp_or_url):
         "South Pole": "South Pole, Antarctica",
         "Trinidad Head, CA": "Trinidad Head, California",
     }
-    assert set(repl.values()) <= set(PLACES)
+    assert set(repl.values()) <= set(LOCATIONS)
     df["station"] = df["station"].replace(repl)
 
     # Add metadata
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index 6d2f6969..b12fd39b 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -7,7 +7,7 @@
 def test_discover_files():
     files = gml_ozonesonde.discover_files()
     assert len(files) > 0
-    assert set(files["place"].unique()) == set(gml_ozonesonde.PLACES)
+    assert set(files["location"].unique()) == set(gml_ozonesonde.LOCATIONS)
 
 
 def test_read_100m():
@@ -74,37 +74,37 @@ def test_add_data():
     assert df["station"].nunique() == latlon.nunique()
 
 
-def test_add_data_place_sel():
+def test_add_data_location_sel():
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
     df = gml_ozonesonde.add_data(
         dates,
-        place=["Boulder, Colorado", "South Pole, Antarctica"],
+        location=["Boulder, Colorado", "South Pole, Antarctica"],
         n_procs=2,
     )
     assert len(df) > 0
 
     latlon = df["latitude"].astype(str) + "," + df["longitude"].astype(str)
-    assert latlon.nunique() == 2, "selected two places"
+    assert latlon.nunique() == 2, "selected two locations"
 
 
 @pytest.mark.parametrize(
-    "place",
+    "location",
     ["asdf", ["asdf", "blah"], ("asdf", "blah")],
 )
-def test_add_data_invalid_place(place):
+def test_add_data_invalid_location(location):
     dates = pd.date_range("2023-01-01", "2023-01-31 23:59", freq="H")
-    with pytest.raises(ValueError, match="Invalid place"):
-        _ = gml_ozonesonde.add_data(dates, place=place)
+    with pytest.raises(ValueError, match="Invalid location"):
+        _ = gml_ozonesonde.add_data(dates, location=location)
 
 
-def test_same_place_and_launch_time():
+def test_same_location_and_launch_time():
     # Two files with same file time and launch time:
     # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl774_2003_03_10_20.l100
     # - https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/Boulder,%20Colorado/100%20Meter%20Average%20Files/bl775_2003_03_10_20.l100
     # File time: 2003-03-10 20
     # Launch time: 2003-03-10 20:41:11
     dates = ["2003-03-10 20", "2003-03-10 21"]
-    df = gml_ozonesonde.add_data(dates, place="Boulder, Colorado", n_procs=2)
+    df = gml_ozonesonde.add_data(dates, location="Boulder, Colorado", n_procs=2)
     assert len(df) > 0
 
     # Only one launch time

From 649a61111d42972ca663e1e203eecf70a80289e3 Mon Sep 17 00:00:00 2001
From: zmoon <zmoon92@gmail.com>
Date: Thu, 11 Apr 2024 16:53:14 -0600
Subject: [PATCH 51/51] 'siteid' in `add_data` output

---
 monetio/profile/gml_ozonesonde.py | 5 ++++-
 tests/test_gml_ozonesonde.py      | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/monetio/profile/gml_ozonesonde.py b/monetio/profile/gml_ozonesonde.py
index c993b3d8..d0754126 100644
--- a/monetio/profile/gml_ozonesonde.py
+++ b/monetio/profile/gml_ozonesonde.py
@@ -136,7 +136,7 @@ def add_data(dates, *, location=None, n_procs=1, errors="raise"):
         For example 'Boulder, Colorado'.
         If not provided, all locations will be used.
         Valid options correspond to the directories in https://gml.noaa.gov/aftp/data/ozwv/Ozonesonde/
-        and may include data from more than one unique site (output column 'station').
+        and may include data from more than one unique site (output column 'siteid').
     n_procs : int
         For Dask.
     errors : {'raise', 'warn', 'skip'}
@@ -213,6 +213,9 @@ def func(fp_or_url):
     assert set(repl.values()) <= set(LOCATIONS)
     df["station"] = df["station"].replace(repl)
 
+    # Normalized station name as site ID
+    df = df.rename(columns={"station": "siteid"})
+
     # Add metadata
     if hasattr(df, "attrs"):
         df.attrs["ds_attrs"] = {"urls": urls}
diff --git a/tests/test_gml_ozonesonde.py b/tests/test_gml_ozonesonde.py
index b12fd39b..306d28b6 100644
--- a/tests/test_gml_ozonesonde.py
+++ b/tests/test_gml_ozonesonde.py
@@ -71,7 +71,7 @@ def test_add_data():
     assert 1 < latlon.nunique() <= 10, "multiple sites; lat/lon doesn't change in profile"
 
     # NOTE: Similar to the place folder names, but not all the same
-    assert df["station"].nunique() == latlon.nunique()
+    assert df["siteid"].nunique() == latlon.nunique()
 
 
 def test_add_data_location_sel():