From 2402338684fd81ffff8106c13820aaafcce45392 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 8 Sep 2023 18:56:48 -0600 Subject: [PATCH 01/77] WIP: new reader for single openaq-fetches file --- monetio/obs/openaq.py | 114 ++++++++++++++++++++++++++++++------------ tests/test_openaq.py | 5 ++ 2 files changed, 86 insertions(+), 33 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index bf11b016..1a3a6ef1 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -1,54 +1,102 @@ -"""Short summary. - - Attributes - ---------- - url : type - Description of attribute `url`. - dates : type - Description of attribute `dates`. - df : type - Description of attribute `df`. - daily : type - Description of attribute `daily`. - objtype : type - Description of attribute `objtype`. - filelist : type - Description of attribute `filelist`. - monitor_file : type - Description of attribute `monitor_file`. - __class__ : type - Description of attribute `__class__`. - monitor_df : type - Description of attribute `monitor_df`. - savecols : type - Description of attribute `savecols`. - """ +"""OpenAQ""" import json +import warnings import pandas as pd from numpy import NaN def add_data(dates, n_procs=1): - """add openaq data from the amazon s3 server. + """Add OpenAQ data from the Amazon s3 server. + + https://openaq-fetches.s3.amazonaws.com Parameters ---------- - dates : pd.DateTimeIndex or list of datatime objects - this is a list of dates to download - n_procs : type - Description of parameter `n_procs`. + dates : pandas.DateTimeIndex or list of datetime objects + Dates of data to fetch. + n_procs : int + For Dask. Returns ------- - type - Description of returned object. - + pandas.DataFrame """ a = OPENAQ() return a.add_data(dates, num_workers=n_procs) +def read_json(fp_or_url): + """Read a json file from the OpenAQ server, returning dataframe in non-wide format. + + Parameters + ---------- + fp_or_url : str or path-like + File path or URL. + + Returns + ------- + pandas.DataFrame + """ + df = pd.read_json(fp_or_url, lines=True) + + # "attribution" is complex to deal with, just drop for now + # Seems like it can be null or a list of attribution dicts with "name" and "url" + df = df.drop(columns="attribution") + + # Expand nested columns + # Multiple ways to do this, e.g. + # - pd.DataFrame(df.date.tolist()) + # Seems to be fastest for one, works if only one level of nesting + # - pd.json_normalize(df["date"]) + # - pd.json_normalize(json.loads(df["date"].to_json(orient="records"))) + # With this method, can apply to multiple columns at once + to_expand = ["date", "averagingPeriod", "coordinates"] + new = pd.json_normalize(json.loads(df[to_expand].to_json(orient="records"))) + + # Convert to time + # If we just apply `pd.to_datetime`, we get + # - utc -> datetime64[ns, UTC] + # - local -> obj (datetime.datetime with tzinfo=tzoffset(None, ...)) + # + # But we don't need localization, we just want non-localized UTC time and UTC offset. + # + # To get the UTC time, e.g.: + # - pd.to_datetime(new["date.utc"]).dt.tz_localize(None) + # These are comparable but this seems slightly faster. 
+ # - pd.to_datetime(new["date.utc"].str.slice(None, -1)) + # + # To get UTC offset + # (we can't subtract the two time arrays since different dtypes), e.g.: + # - pd.to_timedelta(new["date.local"].str.slice(-6, None)+":00") + # Seems to be slightly faster + # - pd.to_datetime(new["date.local"]).apply(lambda t: t.utcoffset()) + time = pd.to_datetime(new["date.utc"]).dt.tz_localize(None) + utcoffset = pd.to_timedelta(new["date.local"].str.slice(-6, None) + ":00") + time_local = time + utcoffset + + # Attempting averaging period by assuming hours + # FIXME: probably not always the case... + assert (new["averagingPeriod.unit"].dropna() == "hours").all() + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", category=RuntimeWarning, message="invalid value encountered in cast" + ) + averagingPeriod = pd.to_timedelta(new["averagingPeriod.value"], unit="hours") + + # Apply new columns + df = df.drop(columns=to_expand).assign( + time=time, + time_local=time_local, + utcoffset=utcoffset, + latitude=new["coordinates.latitude"], + longitude=new["coordinates.longitude"], + averagingPeriod=averagingPeriod, + ) + + return df + + class OPENAQ: def __init__(self): import s3fs diff --git a/tests/test_openaq.py b/tests/test_openaq.py index a349544c..1ca67d51 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -15,3 +15,8 @@ def test_openaq(): assert not df.empty assert df.siteid.nunique() == 1 assert (df.country == "CN").all() and ((df.time_local - df.time) == pd.Timedelta(hours=8)).all() + + +df = openaq.read_json( + "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" +) From 4401a6a61c726bf249fd1c51f883105cd3082954 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 8 Sep 2023 19:16:30 -0600 Subject: [PATCH 02/77] Trying a larger file from Jordan's day has "nox" and "pm1" with ug m-3 units --- monetio/obs/openaq.py | 6 ++++++ tests/test_openaq.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 1a3a6ef1..3d5fc6ee 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -38,6 +38,10 @@ def read_json(fp_or_url): ------- pandas.DataFrame """ + from time import perf_counter + + tic = perf_counter() + df = pd.read_json(fp_or_url, lines=True) # "attribution" is complex to deal with, just drop for now @@ -94,6 +98,8 @@ def read_json(fp_or_url): averagingPeriod=averagingPeriod, ) + print(f"{perf_counter() - tic:.3f}s") + return df diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 1ca67d51..add5bc17 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -18,5 +18,6 @@ def test_openaq(): df = openaq.read_json( - "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" + # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB + "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB ) From 2f6c1645c92767e046960fcb0b6e47e6208376e5 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sat, 9 Sep 2023 06:46:11 -0600 Subject: [PATCH 03/77] WIP: revising OPENAQ class --- monetio/obs/openaq.py | 207 ++++++++++++++++++++++-------------------- tests/test_openaq.py | 18 +++- 2 files changed, 122 insertions(+), 103 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 3d5fc6ee..2422fd51 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -111,113 +111,110 @@ def __init__(self): self.s3bucket = 
"openaq-fetches/realtime" def _get_available_days(self, dates): + """ + Parameters + ---------- + dates : datetime-like or list of datetime-like + ``pd.to_datetime`` will be applied. + """ + # Get all day folders folders = self.fs.ls(self.s3bucket) days = [j.split("/")[2] for j in folders] - avail_dates = pd.to_datetime(days, format="%Y-%m-%d", errors="coerce") - dates = pd.to_datetime(dates).floor(freq="D") - d = pd.Series(dates, name="dates").drop_duplicates() - ad = pd.Series(avail_dates, name="dates") - return pd.merge(d, ad, how="inner") + dates_available = pd.Series( + pd.to_datetime(days, format=r"%Y-%m-%d", errors="coerce"), name="dates" + ) + + # Filter by requested dates + dates_requested = pd.Series( + pd.to_datetime(dates).floor(freq="D"), name="dates" + ).drop_duplicates() + + dates_have = pd.merge(dates_available, dates_requested, how="inner")["dates"] + if dates_have.empty: + raise ValueError(f"No data available for requested dates: {dates_requested}.") + + return dates_have def _get_files_in_day(self, date): - files = self.fs.ls("{}/{}".format(self.s3bucket, date.strftime("%Y-%m-%d"))) + """ + Parameters + ---------- + date + datetime-like object with ``.strftime`` method. + """ + sdate = date.strftime(r"%Y-%m-%d") + files = self.fs.ls(f"{self.s3bucket}/{sdate}") return files def build_urls(self, dates): - d = self._get_available_days(dates) - urls = pd.Series([], name="url") - for i in d.dates: - files = self._get_files_in_day(i) - furls = pd.Series( - [ - f.replace("openaq-fetches", "https://openaq-fetches.s3.amazonaws.com") - for f in files - ], - name="url", - ) - urls = pd.merge(urls, furls, how="outer") - return urls.url.values - - def add_data(self, dates, num_workers=1): + """ + Parameters + ---------- + dates : datetime-like or list of datetime-like + ``pd.to_datetime`` will be applied. 
+ """ + dates_ = self._get_available_days(dates) + urls = [] + for date in dates_: + files = self._get_files_in_day(date) + urls.extend(f"s3://{f}" for f in files) + return urls + + def add_data(self, dates, *, num_workers=1): import dask import dask.dataframe as dd - urls = self.build_urls(dates).tolist() - # z = dd.read_json(urls).compute() - dfs = [dask.delayed(self.read_json)(f) for f in urls] - dff = dd.from_delayed(dfs) - z = dff.compute(num_workers=num_workers) - z.coordinates.replace(to_replace=[None], value=NaN, inplace=True) - z = z.dropna().reset_index(drop=True) - js = json.loads(z[["coordinates", "date"]].to_json(orient="records")) - dff = pd.io.json.json_normalize(js) - dff.columns = dff.columns.str.split(".").str[1] - dff.rename({"local": "time_local", "utc": "time"}, axis=1, inplace=True) - - dff["time"] = pd.to_datetime(dff.time) - dff["utcoffset"] = pd.to_datetime(dff.time_local).apply(lambda x: x.utcoffset()) - zzz = z.join(dff).drop(columns=["coordinates", "date", "attribution", "averagingPeriod"]) - zzz = self._fix_units(zzz) - assert ( - zzz[~zzz.parameter.isin(["pm25", "pm4", "pm10", "bc"])].unit.dropna() == "ppm" - ).all() - zp = self._pivot_table(zzz) - zp["siteid"] = ( - zp.country + dates = pd.to_datetime(dates) + if isinstance(dates, pd.Timestamp): + dates = pd.DatetimeIndex([dates]) + dates = dates.sort_values() + + # Get URLs + urls = self.build_urls(dates) + print(f"Will load {len(urls)} files.") + if len(urls) > 0: + print(urls[0]) + if len(urls) > 2: + print("...") + if len(urls) > 1: + print(urls[-1]) + + dfs = [dask.delayed(read_json)(f) for f in urls] + df_lazy = dd.from_delayed(dfs) + df = df_lazy.compute(num_workers=num_workers) + + # TODO: not sure if necessary (doesn't seem to be?) + # df = df.coordinates.replace(to_replace=[None], value=NaN) + + # Ensure consistent units, e.g. 
ppm for molecules + self._fix_units(df) + non_molec = ["pm1", "pm25", "pm4", "pm10", "bc", "nox"] + good = (df[~df.parameter.isin(non_molec)].unit.dropna() == "ppm").all() + if not good: + unique_params = sorted(df.parameter.unique()) + molec = [p for p in unique_params if p not in non_molec] + raise ValueError(f"Expected these species to all be in ppm now: {molec}.") + good = (df[df.parameter.isin(non_molec)].unit.dropna() == "µg/m³").all() + if not good: + raise ValueError(f"Expected these species to all be in µg/m³: {non_molec}.") + + # Pivot to wide format + df = self._pivot_table(df) + + # Construct site IDs + df["siteid"] = ( + df.country + "_" - + zp.latitude.round(3).astype(str) + + df.latitude.round(3).astype(str) + "N_" - + zp.longitude.round(3).astype(str) + + df.longitude.round(3).astype(str) + "E" ) - zp["time"] = zp.time.dt.tz_localize(None) - zp["time_local"] = zp["time"] + zp["utcoffset"] - - return zp.loc[zp.time >= dates.min()] - - def read_json(self, url): - return pd.read_json(url, lines=True).dropna().sort_index(axis=1) - - # def read_json(self, url): - # df = pd.read_json(url, lines=True).dropna() - # df.coordinates.replace(to_replace=[None], - # value=pd.np.nan, - # inplace=True) - # df = df.dropna(subset=['coordinates']) - # # df = self._parse_latlon(df) - # # json_struct = json.loads(df.coordinates.to_json(orient='records')) - # # df_flat = pd.io.json.json_normalize(json_struct) - # # df = self._parse_datetime(df) - # # df = self._fix_units(df) - # # df = self._pivot_table(df) - # return df - - def _parse_latlon(self, df): - # lat = vectorize(lambda x: x['latitude']) - # lon = vectorize(lambda x: x['longitude']) - def lat(x): - return x["latitude"] - - def lon(x): - return x["longitude"] - - df["latitude"] = df.coordinates.apply(lat) - df["longitude"] = df.coordinates.apply(lon) - return df.drop(columns="coordinates") - - def _parse_datetime(self, df): - def utc(x): - return pd.to_datetime(x["utc"]) - - def local(x): - return pd.to_datetime(x["local"]) - - df["time"] = df.date.apply(utc) - df["time_local"] = df.date.apply(local) - return df.drop(columns="date") + return df.loc[(df.time >= dates.min()) & (df.time <= dates.max())] def _fix_units(self, df): + """In place, convert units to ppm for molecules.""" df.loc[df.value <= 0] = NaN # For a certain parameter, different site-times may have different units. 
# https://docs.openaq.org/docs/parameters @@ -230,36 +227,48 @@ def _fix_units(self, df): is_ug = (df.parameter == vn) & (df.unit == "µg/m³") df.loc[is_ug, "value"] /= f df.loc[is_ug, "unit"] = "ppm" - return df def _pivot_table(self, df): - w = df.pivot_table( + # Pivot + wide = df.pivot_table( values="value", index=[ "time", + "time_local", "latitude", "longitude", - "sourceName", - "sourceType", + "utcoffset", + "location", "city", "country", - "utcoffset", + "sourceName", + "sourceType", + "mobile", + "averagingPeriod", ], columns="parameter", ).reset_index() - w = w.rename( + + # Include units in variable names + wide = wide.rename( dict( + # molec co="co_ppm", o3="o3_ppm", no2="no2_ppm", so2="so2_ppm", ch4="ch4_ppm", no="no_ppm", - bc="bc_umg3", + # non-molec + pm1="pm1_ugm3", pm25="pm25_ugm3", + pm4="pm4_ugm3", pm10="pm10_ugm3", + bc="bc_ugm3", + nox="nox_ugm3", ), - axis=1, + axis="columns", errors="ignore", ) - return w + + return wide diff --git a/tests/test_openaq.py b/tests/test_openaq.py index add5bc17..af4e54e8 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -17,7 +17,17 @@ def test_openaq(): assert (df.country == "CN").all() and ((df.time_local - df.time) == pd.Timedelta(hours=8)).all() -df = openaq.read_json( - # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB - "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB -) +# df = openaq.read_json( +# # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB +# "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB +# ) + +o = openaq.OPENAQ() +days_avail = o._get_available_days(pd.date_range("2019-08-01", "2019-08-03")) +files = o._get_files_in_day(pd.to_datetime("2019-08-01")) + +from dask.diagnostics import ProgressBar + +ProgressBar().register() + +df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file From 375d72a5999cbbcf376234e780f2cc98865b80bc Mon Sep 17 00:00:00 2001 From: zmoon Date: Sat, 9 Sep 2023 06:57:14 -0600 Subject: [PATCH 04/77] WIP: want to try going through the JSON with Python and constructing the arrays or dicts for pandas that way --- monetio/obs/openaq.py | 39 +++++++++++++++++++++++++++++++++++++++ tests/test_openaq.py | 11 ++++++++--- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 2422fd51..511a578b 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -103,6 +103,45 @@ def read_json(fp_or_url): return df +def read_json2(fp_or_url): # TODO: go through the JSON with Python + """Read a json file from the OpenAQ server, returning dataframe in non-wide format. + + Parameters + ---------- + fp_or_url : str or path-like + File path or URL. 
+ + Returns + ------- + pandas.DataFrame + """ + from time import perf_counter + + tic = perf_counter() + + if fp_or_url.startswith("http"): + import requests + + r = requests.get(fp_or_url, stream=True, timeout=2) + r.raise_for_status() + else: + raise NotImplementedError + with open(fp_or_url) as f: + data = json.load(f) + + for line in r.iter_lines(): + if line: + print(line) + data = json.loads(line) + print(data) + + df = pd.DataFrame() + + print(f"{perf_counter() - tic:.3f}s") + + return df + + class OPENAQ: def __init__(self): import s3fs diff --git a/tests/test_openaq.py b/tests/test_openaq.py index af4e54e8..464cc9ad 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -26,8 +26,13 @@ def test_openaq(): days_avail = o._get_available_days(pd.date_range("2019-08-01", "2019-08-03")) files = o._get_files_in_day(pd.to_datetime("2019-08-01")) -from dask.diagnostics import ProgressBar +# from dask.diagnostics import ProgressBar -ProgressBar().register() +# ProgressBar().register() -df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file +# df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file + +df = openaq.read_json2( + "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB + # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB +) From c06e8f233b0dffc417da1f5eacb09a9e8421bd43 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 11 Sep 2023 11:04:11 -0600 Subject: [PATCH 05/77] Python processing reader mostly works --- monetio/obs/openaq.py | 76 ++++++++++++++++++++++++++++++++++++++++--- tests/test_openaq.py | 4 +-- 2 files changed, 74 insertions(+), 6 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 511a578b..15184c01 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -115,6 +115,7 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python ------- pandas.DataFrame """ + import datetime from time import perf_counter tic = perf_counter() @@ -129,13 +130,80 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python with open(fp_or_url) as f: data = json.load(f) + names = [ + "time", + "utcoffset", + "latitude", + "longitude", + # + "parameter", + "value", + "unit", + # + "averagingPeriod", + # + "location", + "city", + "country", + # + "sourceName", + "sourceType", + "mobile", + ] + rows = [] for line in r.iter_lines(): if line: - print(line) data = json.loads(line) - print(data) - - df = pd.DataFrame() + coords = data.get("coordinates") + if coords is None: + print("Skipping row since no coords:", data) + continue + + # Time + time = datetime.datetime.fromisoformat(data["date"]["utc"][:-1]) + time_local_str = data["date"]["local"] + h = int(time_local_str[-6:-3]) + m = int(time_local_str[-2:]) + utcoffset = datetime.timedelta(hours=h, minutes=m) + + # Averaging period + ap = data.get("averagingPeriod") + if ap is not None: + val = data["averagingPeriod"]["value"] + unit = data["averagingPeriod"]["unit"] + averagingPeriod = datetime.timedelta(**{unit: val}) + else: + averagingPeriod = None + + # TODO: attribution + + rows.append( + ( + time, + utcoffset, + data["coordinates"]["latitude"], + data["coordinates"]["longitude"], + # + data["parameter"], + data["value"], + data["unit"], + # + averagingPeriod, + # + data["location"], + data["city"], + data["country"], + # + data["sourceName"], + data["sourceType"], + data["mobile"], + ) + ) + + # TODO: specify 
dtype here + df = pd.DataFrame(rows, columns=names) + + df["time_local"] = df["time"] + df["utcoffset"] print(f"{perf_counter() - tic:.3f}s") diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 464cc9ad..8593967f 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -33,6 +33,6 @@ def test_openaq(): # df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file df = openaq.read_json2( - "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB - # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB + # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB + "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB ) From 10f0402ad4986e0c9c4e9d0af48945c01382b1fe Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 11 Sep 2023 18:02:50 -0600 Subject: [PATCH 06/77] Get first attribution --- monetio/obs/openaq.py | 10 +++++++++- tests/test_openaq.py | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 15184c01..f0b06813 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -146,6 +146,7 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python "city", "country", # + "attribution", "sourceName", "sourceType", "mobile", @@ -175,7 +176,13 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python else: averagingPeriod = None - # TODO: attribution + # Attribution + attrs = data.get("attribution") + if attrs is not None: + attr_names = [a["name"] for a in attrs] + # if len(attr_names) > 1: + # print(f"Taking first of {len(attr_names)}:", attr_names) + attr_name = attr_names[0] # Just the (hopefully) primary one rows.append( ( @@ -194,6 +201,7 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python data["city"], data["country"], # + attr_name, data["sourceName"], data["sourceType"], data["mobile"], diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 8593967f..d08d9165 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -31,8 +31,10 @@ def test_openaq(): # ProgressBar().register() # df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file +# df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=4) # many files df = openaq.read_json2( # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB - "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB + # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB + "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" ) From 5b20db8439fb7b80329ed031addbc12cb305062a Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 11 Sep 2023 18:34:45 -0600 Subject: [PATCH 07/77] Trying out getting attribution with larger dataset --- monetio/obs/openaq.py | 10 ++++++++-- tests/test_openaq.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index f0b06813..51f78a0f 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -120,9 +120,14 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python tic = 
perf_counter() - if fp_or_url.startswith("http"): + if isinstance(fp_or_url, str) and fp_or_url.startswith(("http", "s3")): import requests + if fp_or_url.startswith("s3"): + fp_or_url = fp_or_url.replace( + "s3://openaq-fetches/", "https://openaq-fetches.s3.amazonaws.com/" + ) + r = requests.get(fp_or_url, stream=True, timeout=2) r.raise_for_status() else: @@ -294,7 +299,7 @@ def add_data(self, dates, *, num_workers=1): if len(urls) > 1: print(urls[-1]) - dfs = [dask.delayed(read_json)(f) for f in urls] + dfs = [dask.delayed(read_json2)(f) for f in urls] df_lazy = dd.from_delayed(dfs) df = df_lazy.compute(num_workers=num_workers) @@ -356,6 +361,7 @@ def _pivot_table(self, df): "location", "city", "country", + "attribution", # currently only in Python reader "sourceName", "sourceType", "mobile", diff --git a/tests/test_openaq.py b/tests/test_openaq.py index d08d9165..acff8cc9 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -26,15 +26,15 @@ def test_openaq(): days_avail = o._get_available_days(pd.date_range("2019-08-01", "2019-08-03")) files = o._get_files_in_day(pd.to_datetime("2019-08-01")) -# from dask.diagnostics import ProgressBar +from dask.diagnostics import ProgressBar -# ProgressBar().register() +ProgressBar().register() # df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file -# df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=4) # many files +df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=4) # many files -df = openaq.read_json2( - # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB - # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB - "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" -) +# df = openaq.read_json2( +# # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB +# # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB +# "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" +# ) From 1af59cb8390814174e530f0eb863f6bf9c82917c Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 11 Sep 2023 19:59:54 -0600 Subject: [PATCH 08/77] Trying out OpenAQ API v2 --- monetio/obs/openaq_v2.py | 76 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 monetio/obs/openaq_v2.py diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py new file mode 100644 index 00000000..2a33e99a --- /dev/null +++ b/monetio/obs/openaq_v2.py @@ -0,0 +1,76 @@ +"""Get AQ data from the OpenAQ v2 REST API.""" +import json + +import pandas as pd +import requests + +t_from = "2023-09-04" +t_to = "2023-09-04T23:59:59" + +n_pages = 20 + +data = [] +for page in range(1, n_pages + 1): + print(page) + r = requests.get( + "https://api.openaq.org/v2/measurements", + headers={"Accept": "application/json"}, + params={ + "date_from": t_from, + "date_to": t_to, + "limit": 100, + # Number of results in response + # Default: 100 + # "limit + offset must be <= 100_000" + # where offset = limit * (page - 1) + # => limit * page <= 100_000 + "page": page, + # Must be <= 6000 + "parameter": ["pm25", "no2", "o3"], + # There are many parameters! 
+ }, + timeout=10, + ) + r.raise_for_status() + this_data = r.json() + data.extend(this_data["results"]) + +df = pd.DataFrame(data) + +# # Column Non-Null Count Dtype +# --- ------ -------------- ----- +# 0 locationId 2000 non-null int64 +# 1 location 2000 non-null object +# 2 parameter 2000 non-null object +# 3 value 2000 non-null float64 +# 4 date 2000 non-null object +# 5 unit 2000 non-null object +# 6 coordinates 2000 non-null object +# 7 country 2000 non-null object +# 8 city 0 non-null object # None +# 9 isMobile 2000 non-null bool +# 10 isAnalysis 0 non-null object # None +# 11 entity 2000 non-null object +# 12 sensorType 2000 non-null object + +to_expand = ["date", "coordinates"] +new = pd.json_normalize(json.loads(df[to_expand].to_json(orient="records"))) + +time = pd.to_datetime(new["date.utc"]).dt.tz_localize(None) +# utcoffset = pd.to_timedelta(new["date.local"].str.slice(-6, None) + ":00") +# time_local = time + utcoffset +# ^ Seems some have negative minutes in the tz, so this method complains +time_local = pd.to_datetime(new["date.local"].str.slice(0, 19)) +utcoffset = time_local - time + +# TODO: null case?? +lat = new["coordinates.latitude"] +lon = new["coordinates.longitude"] + +df = df.drop(columns=to_expand).assign( + time=time, + time_local=time_local, + utcoffset=utcoffset, + latitude=lat, + longitude=lon, +) From 0657d3ec1526bde7c29c7fd6bbc6064fde8e8d7b Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 11 Sep 2023 20:02:11 -0600 Subject: [PATCH 09/77] todo --- monetio/obs/openaq_v2.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 2a33e99a..ae1e68f7 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -14,7 +14,10 @@ print(page) r = requests.get( "https://api.openaq.org/v2/measurements", - headers={"Accept": "application/json"}, + headers={ + "Accept": "application/json", + # "X-API-Key": "", # TODO + }, params={ "date_from": t_from, "date_to": t_to, @@ -27,7 +30,7 @@ "page": page, # Must be <= 6000 "parameter": ["pm25", "no2", "o3"], - # There are many parameters! + # There are (too) many parameters! }, timeout=10, ) From 157be3ce5666a898dc2aa7b94e4014adf37bb35e Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 12 Sep 2023 09:08:25 -0600 Subject: [PATCH 10/77] Experimenting with smaller (finishable) queries --- monetio/obs/openaq_v2.py | 74 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index ae1e68f7..ac33cf5c 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -4,14 +4,55 @@ import pandas as pd import requests -t_from = "2023-09-04" -t_to = "2023-09-04T23:59:59" +# +# Get locations +# -n_pages = 20 +# r = requests.get( +# "https://api.openaq.org/v2/locations", +# params={ +# "country": "US", +# "page": 1, +# "limit": 500, +# }, +# ) +# r.raise_for_status() +# data = r.json() + +# +# Get cities +# + +# r = requests.get( +# "https://api.openaq.org/v2/cities", +# params={ +# "country": "US", +# "page": 1, +# "limit": 500, +# }, +# ) +# r.raise_for_status() +# data = r.json() + +# +# Get data +# + +# t_from = "2023-09-04T" +# t_to = "2023-09-04T23:59:59" + +t_from = "2023-09-03T23:59:59" +# ^ seems to be necessary to get 0 UTC +# so I guess (from < time <= to) == (from , to] is used +# i.e. 
`from` is exclusive, `to` is inclusive +t_to = "2023-09-04T23:00:00" + +res_limit_per_page = 500 # max number of results per page +n_pages = 50 # max number of pages data = [] for page in range(1, n_pages + 1): - print(page) + print(f"page {page}") r = requests.get( "https://api.openaq.org/v2/measurements", headers={ @@ -21,24 +62,45 @@ params={ "date_from": t_from, "date_to": t_to, - "limit": 100, + "limit": res_limit_per_page, # Number of results in response # Default: 100 # "limit + offset must be <= 100_000" # where offset = limit * (page - 1) # => limit * page <= 100_000 "page": page, + # Page in query results # Must be <= 6000 - "parameter": ["pm25", "no2", "o3"], + "parameter": ["pm1", "pm25", "pm10", "no2", "o3"], # There are (too) many parameters! + "country": "US", + # "city": ["Boulder", "BOULDER", "Denver", "DENVER"], + # Seems like PurpleAir sensors (often?) don't have city listed + # But can get them with the coords + radius search + "coordinates": "39.9920859,-105.2614118", # CSL-ish + # lat/lon, "up to 8 decimal points of precision" + "radius": 10_000, # meters + # Search radius has a max of 25_000 (25 km) + "include_fields": ["sourceType", "sourceName"], # not working }, timeout=10, ) r.raise_for_status() this_data = r.json() + found = this_data["meta"]["found"] + print(f"found {found}") + n = len(this_data["results"]) + if n == 0: + break + if n < res_limit_per_page: + print(f"note: results returned ({n}) < limit ({res_limit_per_page})") data.extend(this_data["results"]) +if isinstance(found, str) and found.startswith(">"): + print("warning: some query results not fetched") + df = pd.DataFrame(data) +assert not df.empty # # Column Non-Null Count Dtype # --- ------ -------------- ----- From 36c418938d5911bc31dc3c456b3328ca2c2ae79e Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 12 Sep 2023 11:59:41 -0600 Subject: [PATCH 11/77] NOx ppm; disable prints (for prog bar) --- monetio/obs/openaq.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 51f78a0f..0392b1b4 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -26,7 +26,7 @@ def add_data(dates, n_procs=1): return a.add_data(dates, num_workers=n_procs) -def read_json(fp_or_url): +def read_json(fp_or_url, *, verbose=True): """Read a json file from the OpenAQ server, returning dataframe in non-wide format. Parameters @@ -98,12 +98,13 @@ def read_json(fp_or_url): averagingPeriod=averagingPeriod, ) - print(f"{perf_counter() - tic:.3f}s") + if verbose: + print(f"{perf_counter() - tic:.3f}s") return df -def read_json2(fp_or_url): # TODO: go through the JSON with Python +def read_json2(fp_or_url, *, verbose=True): """Read a json file from the OpenAQ server, returning dataframe in non-wide format. 
Parameters @@ -162,7 +163,8 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python data = json.loads(line) coords = data.get("coordinates") if coords is None: - print("Skipping row since no coords:", data) + if verbose: + print("Skipping row since no coords:", data) continue # Time @@ -218,7 +220,8 @@ def read_json2(fp_or_url): # TODO: go through the JSON with Python df["time_local"] = df["time"] + df["utcoffset"] - print(f"{perf_counter() - tic:.3f}s") + if verbose: + print(f"{perf_counter() - tic:.3f}s") return df @@ -281,6 +284,8 @@ def build_urls(self, dates): return urls def add_data(self, dates, *, num_workers=1): + from functools import partial + import dask import dask.dataframe as dd @@ -299,7 +304,8 @@ def add_data(self, dates, *, num_workers=1): if len(urls) > 1: print(urls[-1]) - dfs = [dask.delayed(read_json2)(f) for f in urls] + func = partial(read_json2, verbose=False) + dfs = [dask.delayed(func)(url) for url in urls] df_lazy = dd.from_delayed(dfs) df = df_lazy.compute(num_workers=num_workers) @@ -308,7 +314,7 @@ def add_data(self, dates, *, num_workers=1): # Ensure consistent units, e.g. ppm for molecules self._fix_units(df) - non_molec = ["pm1", "pm25", "pm4", "pm10", "bc", "nox"] + non_molec = ["pm1", "pm25", "pm4", "pm10", "bc"] good = (df[~df.parameter.isin(non_molec)].unit.dropna() == "ppm").all() if not good: unique_params = sorted(df.parameter.unique()) @@ -343,6 +349,7 @@ def _fix_units(self, df): # - air density: 1.2 kg m -3 # rounded to 3 significant figures. fs = {"co": 1160, "o3": 1990, "so2": 2650, "no2": 1900, "ch4": 664, "no": 1240} + fs["nox"] = fs["no2"] # Need to make an assumption about NOx MW for vn, f in fs.items(): is_ug = (df.parameter == vn) & (df.unit == "µg/m³") df.loc[is_ug, "value"] /= f From 3ff4cd65e4d2120b5fc02b07e758b2340dccbaa2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 12 Sep 2023 12:01:02 -0600 Subject: [PATCH 12/77] The species Jordan wants --- monetio/obs/openaq_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index ac33cf5c..6aa0c6e6 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -71,7 +71,7 @@ "page": page, # Page in query results # Must be <= 6000 - "parameter": ["pm1", "pm25", "pm10", "no2", "o3"], + "parameter": ["o3", "pm25", "pm10", "co", "no2"], # There are (too) many parameters! "country": "US", # "city": ["Boulder", "BOULDER", "Denver", "DENVER"], From b096dd74f84d56a537e2e2021afa44132d530c78 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 15 Sep 2023 11:24:55 -0600 Subject: [PATCH 13/77] Pass API key --- monetio/obs/openaq_v2.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 6aa0c6e6..0383db24 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -1,9 +1,17 @@ """Get AQ data from the OpenAQ v2 REST API.""" import json +import os import pandas as pd import requests +API_KEY = os.environ.get("OPENAQ_API_KEY", None) +if API_KEY is None: + print( + "warning: non-cached requests will be slow without API key. " + "Obtain one and set your OPENAQ_API_KEY environment variable." 
+ ) + # # Get locations # @@ -57,7 +65,7 @@ "https://api.openaq.org/v2/measurements", headers={ "Accept": "application/json", - # "X-API-Key": "", # TODO + "X-API-Key": API_KEY, }, params={ "date_from": t_from, From d41c645314627fc9ac8c11d6f75a1f9fc15dd1ba Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 18 Oct 2023 16:40:24 -0600 Subject: [PATCH 14/77] Currently converting NOx to ppmv --- monetio/obs/openaq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 0392b1b4..519efc9a 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -393,7 +393,8 @@ def _pivot_table(self, df): pm4="pm4_ugm3", pm10="pm10_ugm3", bc="bc_ugm3", - nox="nox_ugm3", + # + nox="nox_ppm", ), axis="columns", errors="ignore", From 7a69d0626e091d41e850efacc3fa9d8979a60086 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 18 Oct 2023 17:05:54 -0600 Subject: [PATCH 15/77] WIP --- monetio/obs/openaq_v2.py | 287 +++++++++++++++++++++------------------ 1 file changed, 153 insertions(+), 134 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 0383db24..c98b651f 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -12,138 +12,157 @@ "Obtain one and set your OPENAQ_API_KEY environment variable." ) -# -# Get locations -# - -# r = requests.get( -# "https://api.openaq.org/v2/locations", -# params={ -# "country": "US", -# "page": 1, -# "limit": 500, -# }, -# ) -# r.raise_for_status() -# data = r.json() - -# -# Get cities -# - -# r = requests.get( -# "https://api.openaq.org/v2/cities", -# params={ -# "country": "US", -# "page": 1, -# "limit": 500, -# }, -# ) -# r.raise_for_status() -# data = r.json() - -# -# Get data -# - -# t_from = "2023-09-04T" -# t_to = "2023-09-04T23:59:59" - -t_from = "2023-09-03T23:59:59" -# ^ seems to be necessary to get 0 UTC -# so I guess (from < time <= to) == (from , to] is used -# i.e. `from` is exclusive, `to` is inclusive -t_to = "2023-09-04T23:00:00" - -res_limit_per_page = 500 # max number of results per page -n_pages = 50 # max number of pages - -data = [] -for page in range(1, n_pages + 1): - print(f"page {page}") - r = requests.get( - "https://api.openaq.org/v2/measurements", - headers={ - "Accept": "application/json", - "X-API-Key": API_KEY, - }, - params={ - "date_from": t_from, - "date_to": t_to, - "limit": res_limit_per_page, - # Number of results in response - # Default: 100 - # "limit + offset must be <= 100_000" - # where offset = limit * (page - 1) - # => limit * page <= 100_000 - "page": page, - # Page in query results - # Must be <= 6000 - "parameter": ["o3", "pm25", "pm10", "co", "no2"], - # There are (too) many parameters! - "country": "US", - # "city": ["Boulder", "BOULDER", "Denver", "DENVER"], - # Seems like PurpleAir sensors (often?) 
don't have city listed - # But can get them with the coords + radius search - "coordinates": "39.9920859,-105.2614118", # CSL-ish - # lat/lon, "up to 8 decimal points of precision" - "radius": 10_000, # meters - # Search radius has a max of 25_000 (25 km) - "include_fields": ["sourceType", "sourceName"], # not working - }, - timeout=10, + +def _consume(url, *, params=None, timeout=3, limit=500, npages=None): + """Consume a paginated OpenAQ API endpoint.""" + if params is None: + params = {} + + if npages is None: + # Maximize + # "limit + offset must be <= 100_000" + # where offset = limit * (page - 1) + # => limit * page <= 100_000 + # and also page must be <= 6_000 + npages = min(100_000 // limit, 6_000) + + params["limit"] = limit + + headers = { + "Accept": "application/json", + "X-API-Key": API_KEY, + } + + data = [] + for page in range(1, npages + 1): + params["page"] = page + r = requests.get(url, params=params, headers=headers, timeout=timeout) + r.raise_for_status() + + this_data = r.json() + found = this_data["meta"]["found"] + print(f"found {found}") + n = len(this_data["results"]) + if n == 0: + break + if n < limit: + print(f"note: results returned ({n}) < limit ({limit})") + data.extend(this_data["results"]) + + if isinstance(found, str) and found.startswith(">"): + print(f"warning: some query results not fetched ('found' is {found!r})") + + return data + + +def get_locations(): + """Get locations from OpenAQ v2 API.""" + return _consume("https://api.openaq.org/v2/locations") + + +def add_data(): + """Get OpenAQ API v2 data, including low-cost sensors.""" + + # t_from = "2023-09-04T" + # t_to = "2023-09-04T23:59:59" + + t_from = "2023-09-03T23:59:59" + # ^ seems to be necessary to get 0 UTC + # so I guess (from < time <= to) == (from , to] is used + # i.e. `from` is exclusive, `to` is inclusive + t_to = "2023-09-04T23:00:00" + + res_limit_per_page = 500 # max number of results per page + n_pages = 50 # max number of pages + + data = [] + for page in range(1, n_pages + 1): + print(f"page {page}") + r = requests.get( + "https://api.openaq.org/v2/measurements", + headers={ + "Accept": "application/json", + "X-API-Key": API_KEY, + }, + params={ + "date_from": t_from, + "date_to": t_to, + "limit": res_limit_per_page, + # Number of results in response + # Default: 100 + # "limit + offset must be <= 100_000" + # where offset = limit * (page - 1) + # => limit * page <= 100_000 + "page": page, + # Page in query results + # Must be <= 6000 + "parameter": ["o3", "pm25", "pm10", "co", "no2"], + # There are (too) many parameters! + "country": "US", + # "city": ["Boulder", "BOULDER", "Denver", "DENVER"], + # Seems like PurpleAir sensors (often?) 
don't have city listed + # But can get them with the coords + radius search + "coordinates": "39.9920859,-105.2614118", # CSL-ish + # lat/lon, "up to 8 decimal points of precision" + "radius": 10_000, # meters + # Search radius has a max of 25_000 (25 km) + "include_fields": ["sourceType", "sourceName"], # not working + }, + timeout=10, + ) + r.raise_for_status() + this_data = r.json() + found = this_data["meta"]["found"] + print(f"found {found}") + n = len(this_data["results"]) + if n == 0: + break + if n < res_limit_per_page: + print(f"note: results returned ({n}) < limit ({res_limit_per_page})") + data.extend(this_data["results"]) + + if isinstance(found, str) and found.startswith(">"): + print("warning: some query results not fetched") + + df = pd.DataFrame(data) + assert not df.empty + + # # Column Non-Null Count Dtype + # --- ------ -------------- ----- + # 0 locationId 2000 non-null int64 + # 1 location 2000 non-null object + # 2 parameter 2000 non-null object + # 3 value 2000 non-null float64 + # 4 date 2000 non-null object + # 5 unit 2000 non-null object + # 6 coordinates 2000 non-null object + # 7 country 2000 non-null object + # 8 city 0 non-null object # None + # 9 isMobile 2000 non-null bool + # 10 isAnalysis 0 non-null object # None + # 11 entity 2000 non-null object + # 12 sensorType 2000 non-null object + + to_expand = ["date", "coordinates"] + new = pd.json_normalize(json.loads(df[to_expand].to_json(orient="records"))) + + time = pd.to_datetime(new["date.utc"]).dt.tz_localize(None) + # utcoffset = pd.to_timedelta(new["date.local"].str.slice(-6, None) + ":00") + # time_local = time + utcoffset + # ^ Seems some have negative minutes in the tz, so this method complains + time_local = pd.to_datetime(new["date.local"].str.slice(0, 19)) + utcoffset = time_local - time + + # TODO: null case?? + lat = new["coordinates.latitude"] + lon = new["coordinates.longitude"] + + df = df.drop(columns=to_expand).assign( + time=time, + time_local=time_local, + utcoffset=utcoffset, + latitude=lat, + longitude=lon, ) - r.raise_for_status() - this_data = r.json() - found = this_data["meta"]["found"] - print(f"found {found}") - n = len(this_data["results"]) - if n == 0: - break - if n < res_limit_per_page: - print(f"note: results returned ({n}) < limit ({res_limit_per_page})") - data.extend(this_data["results"]) - -if isinstance(found, str) and found.startswith(">"): - print("warning: some query results not fetched") - -df = pd.DataFrame(data) -assert not df.empty - -# # Column Non-Null Count Dtype -# --- ------ -------------- ----- -# 0 locationId 2000 non-null int64 -# 1 location 2000 non-null object -# 2 parameter 2000 non-null object -# 3 value 2000 non-null float64 -# 4 date 2000 non-null object -# 5 unit 2000 non-null object -# 6 coordinates 2000 non-null object -# 7 country 2000 non-null object -# 8 city 0 non-null object # None -# 9 isMobile 2000 non-null bool -# 10 isAnalysis 0 non-null object # None -# 11 entity 2000 non-null object -# 12 sensorType 2000 non-null object - -to_expand = ["date", "coordinates"] -new = pd.json_normalize(json.loads(df[to_expand].to_json(orient="records"))) - -time = pd.to_datetime(new["date.utc"]).dt.tz_localize(None) -# utcoffset = pd.to_timedelta(new["date.local"].str.slice(-6, None) + ":00") -# time_local = time + utcoffset -# ^ Seems some have negative minutes in the tz, so this method complains -time_local = pd.to_datetime(new["date.local"].str.slice(0, 19)) -utcoffset = time_local - time - -# TODO: null case?? 
-lat = new["coordinates.latitude"] -lon = new["coordinates.longitude"] - -df = df.drop(columns=to_expand).assign( - time=time, - time_local=time_local, - utcoffset=utcoffset, - latitude=lat, - longitude=lon, -) + + return df From f91a7c59fd990eaf8554706bfd0c02523a53481b Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 09:42:01 -0600 Subject: [PATCH 16/77] Testing get-locations --- monetio/obs/openaq_v2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index c98b651f..b7122559 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -8,12 +8,12 @@ API_KEY = os.environ.get("OPENAQ_API_KEY", None) if API_KEY is None: print( - "warning: non-cached requests will be slow without API key. " + "warning: non-cached requests to the OpenAQ v2 web API will be slow without an API key. " "Obtain one and set your OPENAQ_API_KEY environment variable." ) -def _consume(url, *, params=None, timeout=3, limit=500, npages=None): +def _consume(url, *, params=None, timeout=10, limit=500, npages=None): """Consume a paginated OpenAQ API endpoint.""" if params is None: params = {} @@ -41,8 +41,8 @@ def _consume(url, *, params=None, timeout=3, limit=500, npages=None): this_data = r.json() found = this_data["meta"]["found"] - print(f"found {found}") n = len(this_data["results"]) + print(f"page={page} found={found!r} n={n}") if n == 0: break if n < limit: @@ -55,9 +55,9 @@ def _consume(url, *, params=None, timeout=3, limit=500, npages=None): return data -def get_locations(): +def get_locations(**kwargs): """Get locations from OpenAQ v2 API.""" - return _consume("https://api.openaq.org/v2/locations") + return _consume("https://api.openaq.org/v2/locations", **kwargs) def add_data(): From 4ff270e7d0a6dbb40850fbd50091d9fda648a51e Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 10:29:50 -0600 Subject: [PATCH 17/77] Locations as df --- monetio/obs/openaq_v2.py | 49 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index b7122559..2dee1035 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -51,13 +51,60 @@ def _consume(url, *, params=None, timeout=10, limit=500, npages=None): if isinstance(found, str) and found.startswith(">"): print(f"warning: some query results not fetched ('found' is {found!r})") + elif isinstance(found, int) and len(data) < found: + print(f"warning: some query results not fetched (found={found}, got {len(data)} results)") return data def get_locations(**kwargs): """Get locations from OpenAQ v2 API.""" - return _consume("https://api.openaq.org/v2/locations", **kwargs) + + data = _consume("https://api.openaq.org/v2/locations", **kwargs) + + # Some fields with scalar values to take + some_scalars = [ + "id", + "name", + "city", + "country", + # "entity", # all null + "isMobile", + # "isAnalysis", # all null + # "sensorType", # all null + "firstUpdated", + "lastUpdated", + ] + + data2 = [] + for d in data: + lat = d["coordinates"]["latitude"] + lon = d["coordinates"]["longitude"] + parameters = [p["parameter"] for p in d["parameters"]] + manufacturer = d["manufacturers"][0]["manufacturerName"] if d["manufacturers"] else None + d2 = {k: d[k] for k in some_scalars} + d2.update( + latitude=lat, + longitude=lon, + parameters=parameters, + manufacturer=manufacturer, + ) + data2.append(d2) + + df = pd.DataFrame(data2) + + # Compute datetimes (the timestamps are already in 
UTC, but with tz specified) + assert (df.firstUpdated.str.slice(-6, None) == "+00:00").all() + df["firstUpdated"] = pd.to_datetime(df.firstUpdated.str.slice(0, -6)) + assert df.lastUpdated.str.slice(-6, None).eq("+00:00").all() + df["lastUpdated"] = pd.to_datetime(df.lastUpdated.str.slice(0, -6)) + + # Site ID + df = df.rename(columns={"id": "siteid"}) + df["siteid"] = df.siteid.astype(str) + df = df.drop_duplicates("siteid", keep="first").reset_index(drop=True) # seem to be some dupes + + return df def add_data(): From af86394b60253dcd03cd7f55a9d02d1764732141 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 10:37:47 -0600 Subject: [PATCH 18/77] notes --- monetio/obs/openaq_v2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 2dee1035..40a49957 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -1,4 +1,7 @@ -"""Get AQ data from the OpenAQ v2 REST API.""" +"""Get AQ data from the OpenAQ v2 REST API. + +https://api.openaq.org/docs#/v2 +""" import json import os From d59e647bed01fc37d3d34d9663989b52e2b78d06 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 10:45:31 -0600 Subject: [PATCH 19/77] Get parameters list --- monetio/obs/openaq_v2.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 40a49957..19259078 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -61,7 +61,10 @@ def _consume(url, *, params=None, timeout=10, limit=500, npages=None): def get_locations(**kwargs): - """Get locations from OpenAQ v2 API.""" + """Get available site info (including IDs) from OpenAQ v2 API. + + https://api.openaq.org/docs#/v2/locations_get_v2_locations_get + """ data = _consume("https://api.openaq.org/v2/locations", **kwargs) @@ -110,6 +113,16 @@ def get_locations(**kwargs): return df +def get_parameters(**kwargs): + """Get supported parameter info from OpenAQ v2 API.""" + + data = _consume("https://api.openaq.org/v2/parameters", **kwargs) + + df = pd.DataFrame(data) + + return df + + def add_data(): """Get OpenAQ API v2 data, including low-cost sensors.""" From 919bb9171b911f160fa69a47a5a0deb53133a1f7 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 11:10:29 -0600 Subject: [PATCH 20/77] Trying lat/lon box method Barry's idea --- monetio/obs/openaq_v2.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 19259078..e25d82ce 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -123,6 +123,27 @@ def get_parameters(**kwargs): return df +def get_latlonbox_sites(latlonbox, **kwargs): + """ + Parameters + ---------- + latlonbox : array-like of float + ``[lat1, lon1, lat2, lon2]`` (lower-left corner, upper-right corner) + """ + lat1, lon1, lat2, lon2 = latlonbox + sites = get_locations(**kwargs) + + in_box = ( + (sites.latitude >= lat1) + & (sites.latitude <= lat2) + & (sites.longitude >= lon1) + & (sites.longitude <= lon2) + ) + # TODO: need to account for case of box crossing antimeridian + + return sites[in_box].reset_index(drop=True) + + def add_data(): """Get OpenAQ API v2 data, including low-cost sensors.""" From 8510ff5b82fbeab3104c12407c26186af8ae1e07 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 12:56:14 -0600 Subject: [PATCH 21/77] Towards working add-data fn --- monetio/obs/openaq_v2.py | 179 +++++++++++++++++++++++++-------------- 1 file changed, 115 
insertions(+), 64 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index e25d82ce..a46c4e8c 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -16,7 +16,7 @@ ) -def _consume(url, *, params=None, timeout=10, limit=500, npages=None): +def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): """Consume a paginated OpenAQ API endpoint.""" if params is None: params = {} @@ -39,7 +39,15 @@ def _consume(url, *, params=None, timeout=10, limit=500, npages=None): data = [] for page in range(1, npages + 1): params["page"] = page - r = requests.get(url, params=params, headers=headers, timeout=timeout) + + tries = 0 + while tries < retry: + r = requests.get(url, params=params, headers=headers, timeout=timeout) + if r.status_code == 408: + tries += 1 + print(f"warning: request timed out (try {tries}/{retry})") + else: + break r.raise_for_status() this_data = r.json() @@ -74,10 +82,10 @@ def get_locations(**kwargs): "name", "city", "country", - # "entity", # all null + # "entity", # all null (from /measurements we do get values) "isMobile", # "isAnalysis", # all null - # "sensorType", # all null + # "sensorType", # all null (from /measurements we do get values) "firstUpdated", "lastUpdated", ] @@ -144,72 +152,115 @@ def get_latlonbox_sites(latlonbox, **kwargs): return sites[in_box].reset_index(drop=True) -def add_data(): +def add_data( + dates, + *, + parameters=None, + query_time_split="1H", + **kwargs, +): """Get OpenAQ API v2 data, including low-cost sensors.""" - # t_from = "2023-09-04T" - # t_to = "2023-09-04T23:59:59" - - t_from = "2023-09-03T23:59:59" - # ^ seems to be necessary to get 0 UTC - # so I guess (from < time <= to) == (from , to] is used - # i.e. `from` is exclusive, `to` is inclusive - t_to = "2023-09-04T23:00:00" - - res_limit_per_page = 500 # max number of results per page - n_pages = 50 # max number of pages + dates = pd.DatetimeIndex(dates) + if parameters is None: + parameters = ["pm25", "o3"] + elif isinstance(parameters, str): + parameters = [parameters] + query_dt = pd.to_timedelta(query_time_split) + date_min, date_max = dates.min(), dates.max() + if date_min == date_max: + raise ValueError("must provide at least two unique datetimes") + + def iter_time_slices(): + # seems that (from < time <= to) == (from , to] is used + # i.e. `from` is exclusive, `to` is inclusive + one_sec = pd.Timedelta(seconds=1) + t = date_min + while t < date_max: + t_next = t + query_dt + yield t - one_sec, t_next + t = t_next data = [] - for page in range(1, n_pages + 1): - print(f"page {page}") - r = requests.get( - "https://api.openaq.org/v2/measurements", - headers={ - "Accept": "application/json", - "X-API-Key": API_KEY, - }, - params={ - "date_from": t_from, - "date_to": t_to, - "limit": res_limit_per_page, - # Number of results in response - # Default: 100 - # "limit + offset must be <= 100_000" - # where offset = limit * (page - 1) - # => limit * page <= 100_000 - "page": page, - # Page in query results - # Must be <= 6000 - "parameter": ["o3", "pm25", "pm10", "co", "no2"], - # There are (too) many parameters! - "country": "US", - # "city": ["Boulder", "BOULDER", "Denver", "DENVER"], - # Seems like PurpleAir sensors (often?) 
don't have city listed - # But can get them with the coords + radius search - "coordinates": "39.9920859,-105.2614118", # CSL-ish - # lat/lon, "up to 8 decimal points of precision" - "radius": 10_000, # meters - # Search radius has a max of 25_000 (25 km) - "include_fields": ["sourceType", "sourceName"], # not working - }, - timeout=10, - ) - r.raise_for_status() - this_data = r.json() - found = this_data["meta"]["found"] - print(f"found {found}") - n = len(this_data["results"]) - if n == 0: - break - if n < res_limit_per_page: - print(f"note: results returned ({n}) < limit ({res_limit_per_page})") - data.extend(this_data["results"]) - - if isinstance(found, str) and found.startswith(">"): - print("warning: some query results not fetched") + for parameter in parameters: + for t_from, t_to in iter_time_slices(): + print(f"parameter={parameter!r} t_from={t_from} t_to={t_to}") + data_ = _consume( + "https://api.openaq.org/v2/measurements", + params={ + "date_from": t_from, + "date_to": t_to, + "parameter": parameter, + }, + **kwargs, + ) + data.extend(data_) + + # # t_from = "2023-09-04T" + # # t_to = "2023-09-04T23:59:59" + + # t_from = "2023-09-03T23:59:59" + # # ^ seems to be necessary to get 0 UTC + # # so I guess (from < time <= to) == (from , to] is used + # # i.e. `from` is exclusive, `to` is inclusive + # t_to = "2023-09-04T23:00:00" + + # res_limit_per_page = 500 # max number of results per page + # n_pages = 50 # max number of pages + + # data = [] + # for page in range(1, n_pages + 1): + # print(f"page {page}") + # r = requests.get( + # "https://api.openaq.org/v2/measurements", + # headers={ + # "Accept": "application/json", + # "X-API-Key": API_KEY, + # }, + # params={ + # "date_from": t_from, + # "date_to": t_to, + # "limit": res_limit_per_page, + # # Number of results in response + # # Default: 100 + # # "limit + offset must be <= 100_000" + # # where offset = limit * (page - 1) + # # => limit * page <= 100_000 + # "page": page, + # # Page in query results + # # Must be <= 6000 + # "parameter": ["o3", "pm25", "pm10", "co", "no2"], + # # There are (too) many parameters! + # "country": "US", + # # "city": ["Boulder", "BOULDER", "Denver", "DENVER"], + # # Seems like PurpleAir sensors (often?) 
don't have city listed + # # But can get them with the coords + radius search + # "coordinates": "39.9920859,-105.2614118", # CSL-ish + # # lat/lon, "up to 8 decimal points of precision" + # "radius": 10_000, # meters + # # Search radius has a max of 25_000 (25 km) + # "include_fields": ["sourceType", "sourceName"], # not working + # }, + # timeout=10, + # ) + # r.raise_for_status() + # this_data = r.json() + # found = this_data["meta"]["found"] + # print(f"found {found}") + # n = len(this_data["results"]) + # if n == 0: + # break + # if n < res_limit_per_page: + # print(f"note: results returned ({n}) < limit ({res_limit_per_page})") + # data.extend(this_data["results"]) + + # if isinstance(found, str) and found.startswith(">"): + # print("warning: some query results not fetched") df = pd.DataFrame(data) - assert not df.empty + if df.empty: + print("warning: no data found") + return df # # Column Non-Null Count Dtype # --- ------ -------------- ----- From 97b68dec192d783dd2993663cec4a805b24849cc Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 15:36:17 -0600 Subject: [PATCH 22/77] Initial search radius(es) option --- monetio/obs/openaq_v2.py | 111 ++++++++++++++------------------------- 1 file changed, 39 insertions(+), 72 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index a46c4e8c..7eee948f 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -156,10 +156,17 @@ def add_data( dates, *, parameters=None, + search_radius=None, query_time_split="1H", **kwargs, ): - """Get OpenAQ API v2 data, including low-cost sensors.""" + """Get OpenAQ API v2 data, including low-cost sensors. + + Parameters + ---------- + search_radius : dict + Mapping coords (lat, lon) [deg] to search radius [m] (max of 25 km). + """ dates = pd.DatetimeIndex(dates) if parameters is None: @@ -181,81 +188,41 @@ def iter_time_slices(): yield t - one_sec, t_next t = t_next + params = {} data = [] for parameter in parameters: + params.update(parameter=parameter) for t_from, t_to in iter_time_slices(): - print(f"parameter={parameter!r} t_from={t_from} t_to={t_to}") - data_ = _consume( - "https://api.openaq.org/v2/measurements", - params={ - "date_from": t_from, - "date_to": t_to, - "parameter": parameter, - }, - **kwargs, + params.update( + date_from=t_from, + date_to=t_to, ) - data.extend(data_) - - # # t_from = "2023-09-04T" - # # t_to = "2023-09-04T23:59:59" - - # t_from = "2023-09-03T23:59:59" - # # ^ seems to be necessary to get 0 UTC - # # so I guess (from < time <= to) == (from , to] is used - # # i.e. `from` is exclusive, `to` is inclusive - # t_to = "2023-09-04T23:00:00" - - # res_limit_per_page = 500 # max number of results per page - # n_pages = 50 # max number of pages - - # data = [] - # for page in range(1, n_pages + 1): - # print(f"page {page}") - # r = requests.get( - # "https://api.openaq.org/v2/measurements", - # headers={ - # "Accept": "application/json", - # "X-API-Key": API_KEY, - # }, - # params={ - # "date_from": t_from, - # "date_to": t_to, - # "limit": res_limit_per_page, - # # Number of results in response - # # Default: 100 - # # "limit + offset must be <= 100_000" - # # where offset = limit * (page - 1) - # # => limit * page <= 100_000 - # "page": page, - # # Page in query results - # # Must be <= 6000 - # "parameter": ["o3", "pm25", "pm10", "co", "no2"], - # # There are (too) many parameters! - # "country": "US", - # # "city": ["Boulder", "BOULDER", "Denver", "DENVER"], - # # Seems like PurpleAir sensors (often?) 
don't have city listed - # # But can get them with the coords + radius search - # "coordinates": "39.9920859,-105.2614118", # CSL-ish - # # lat/lon, "up to 8 decimal points of precision" - # "radius": 10_000, # meters - # # Search radius has a max of 25_000 (25 km) - # "include_fields": ["sourceType", "sourceName"], # not working - # }, - # timeout=10, - # ) - # r.raise_for_status() - # this_data = r.json() - # found = this_data["meta"]["found"] - # print(f"found {found}") - # n = len(this_data["results"]) - # if n == 0: - # break - # if n < res_limit_per_page: - # print(f"note: results returned ({n}) < limit ({res_limit_per_page})") - # data.extend(this_data["results"]) - - # if isinstance(found, str) and found.startswith(">"): - # print("warning: some query results not fetched") + if search_radius is not None: + for coords, radius in search_radius.items(): + if not 0 < radius <= 25_000: + raise ValueError(f"invalid radius {radius!r}") + params.update( + coordinates=f"{coords[0]:.8f},{coords[1]:.8f}", + radius=radius, + ) + print( + f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}' " + f"coords={coords} radius={radius}" + ) + data_ = _consume( + "https://api.openaq.org/v2/measurements", + params=params, + **kwargs, + ) + data.extend(data_) + else: + print(f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}'") + data_ = _consume( + "https://api.openaq.org/v2/measurements", + params=params, + **kwargs, + ) + data.extend(data_) df = pd.DataFrame(data) if df.empty: From 3ab310211f655fd01a1329238e14895cf955d77e Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 15:42:00 -0600 Subject: [PATCH 23/77] docstring work --- monetio/obs/openaq_v2.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 7eee948f..191db17d 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -164,8 +164,18 @@ def add_data( Parameters ---------- - search_radius : dict + parameters : str or list of str, optional + For example, ``'o3'`` or ``['pm25', 'o3']`` (default). + search_radius : dict, optional Mapping coords (lat, lon) [deg] to search radius [m] (max of 25 km). + query_time_split + Frequency to use when splitting the queries in time, + in a format that ``pandas.to_timedelta`` will understand. + This is necessary since there is a 100k limit on the number of results. + However, if you are using search radii, e.g., you may want to set this + to something higher in order to increase the query return speed. + Default: 1 hour + (OpenAQ data are hourly, so setting to something smaller won't help). """ dates = pd.DatetimeIndex(dates) From 5cc7d08bdee4bc00daf04d24404fe4ed80f0eb9c Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 15:50:43 -0600 Subject: [PATCH 24/77] No time split option; more input validation --- monetio/obs/openaq_v2.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 191db17d..1c6d5fb7 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -174,6 +174,7 @@ def add_data( This is necessary since there is a 100k limit on the number of results. However, if you are using search radii, e.g., you may want to set this to something higher in order to increase the query return speed. + Set to ``None`` for no time splitting. Default: 1 hour (OpenAQ data are hourly, so setting to something smaller won't help). 
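    Examples
    --------
    A rough sketch of intended use (assumes an ``OPENAQ_API_KEY`` is set in the
    environment; the dates, parameter, and split chosen here are only illustrative)::

        import pandas as pd
        import monetio.obs.openaq_v2 as openaq

        dates = pd.date_range("2023-09-04", "2023-09-04 23:00", freq="1H")
        df = openaq.add_data(dates, parameters="pm25", query_time_split="6H")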
""" @@ -184,19 +185,26 @@ def add_data( elif isinstance(parameters, str): parameters = [parameters] query_dt = pd.to_timedelta(query_time_split) + if query_dt is not None and query_dt <= pd.Timedelta(0): + raise ValueError( + f"query_time_split must be positive, got {query_dt} from {query_time_split!r}" + ) date_min, date_max = dates.min(), dates.max() - if date_min == date_max: + if date_min == date_max or len(dates) == 0: raise ValueError("must provide at least two unique datetimes") def iter_time_slices(): # seems that (from < time <= to) == (from , to] is used # i.e. `from` is exclusive, `to` is inclusive one_sec = pd.Timedelta(seconds=1) - t = date_min - while t < date_max: - t_next = t + query_dt - yield t - one_sec, t_next - t = t_next + if query_dt is not None: + t = date_min + while t < date_max: + t_next = t + query_dt + yield t - one_sec, t_next + t = t_next + else: + yield date_min - one_sec, date_max params = {} data = [] From 916d7dea2278fa42359b1edad7e5eaed33ec2b4f Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 16:13:22 -0600 Subject: [PATCH 25/77] country option --- monetio/obs/openaq_v2.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 1c6d5fb7..953fea86 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -156,6 +156,7 @@ def add_data( dates, *, parameters=None, + country=None, search_radius=None, query_time_split="1H", **kwargs, @@ -166,10 +167,13 @@ def add_data( ---------- parameters : str or list of str, optional For example, ``'o3'`` or ``['pm25', 'o3']`` (default). + country : str or list of str, optional + For example, ``'US'`` or ``['US', 'CA']`` (two-letter country codes). + Default: full dataset (no limitation by country). search_radius : dict, optional Mapping coords (lat, lon) [deg] to search radius [m] (max of 25 km). query_time_split - Frequency to use when splitting the queries in time, + Frequency to use when splitting the web API queries in time, in a format that ``pandas.to_timedelta`` will understand. This is necessary since there is a 100k limit on the number of results. 
However, if you are using search radii, e.g., you may want to set this @@ -207,6 +211,9 @@ def iter_time_slices(): yield date_min - one_sec, date_max params = {} + if country is not None: + params.update(country=country) + data = [] for parameter in parameters: params.update(parameter=parameter) From 6a8107b6f0d5830adad40278696350527101a3a9 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 16:16:09 -0600 Subject: [PATCH 26/77] consistency --- monetio/obs/openaq_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 953fea86..4d640516 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -108,7 +108,7 @@ def get_locations(**kwargs): df = pd.DataFrame(data2) # Compute datetimes (the timestamps are already in UTC, but with tz specified) - assert (df.firstUpdated.str.slice(-6, None) == "+00:00").all() + assert df.firstUpdated.str.slice(-6, None).eq("+00:00").all() df["firstUpdated"] = pd.to_datetime(df.firstUpdated.str.slice(0, -6)) assert df.lastUpdated.str.slice(-6, None).eq("+00:00").all() df["lastUpdated"] = pd.to_datetime(df.lastUpdated.str.slice(0, -6)) From e86d927934d6517746c570eef099bdf236526bfc Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 16:27:32 -0600 Subject: [PATCH 27/77] Use logger --- monetio/obs/openaq_v2.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 4d640516..e8cb7ce5 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -3,11 +3,14 @@ https://api.openaq.org/docs#/v2 """ import json +import logging import os import pandas as pd import requests +logger = logging.getLogger(__name__) + API_KEY = os.environ.get("OPENAQ_API_KEY", None) if API_KEY is None: print( @@ -45,7 +48,7 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): r = requests.get(url, params=params, headers=headers, timeout=timeout) if r.status_code == 408: tries += 1 - print(f"warning: request timed out (try {tries}/{retry})") + logger.warning(f"request timed out (try {tries}/{retry})") else: break r.raise_for_status() @@ -53,11 +56,11 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): this_data = r.json() found = this_data["meta"]["found"] n = len(this_data["results"]) - print(f"page={page} found={found!r} n={n}") + logger.info(f"page={page} found={found!r} n={n}") if n == 0: break if n < limit: - print(f"note: results returned ({n}) < limit ({limit})") + logger.info(f"note: results returned ({n}) < limit ({limit})") data.extend(this_data["results"]) if isinstance(found, str) and found.startswith(">"): @@ -95,7 +98,13 @@ def get_locations(**kwargs): lat = d["coordinates"]["latitude"] lon = d["coordinates"]["longitude"] parameters = [p["parameter"] for p in d["parameters"]] - manufacturer = d["manufacturers"][0]["manufacturerName"] if d["manufacturers"] else None + mfs = d["manufacturers"] + if mfs: + manufacturer = mfs[0]["manufacturerName"] + if len(mfs) > 1: + logger.info(f"site {d['id']} has multiple manufacturers: {mfs}") + else: + manufacturer = None d2 = {k: d[k] for k in some_scalars} d2.update( latitude=lat, @@ -230,7 +239,7 @@ def iter_time_slices(): coordinates=f"{coords[0]:.8f},{coords[1]:.8f}", radius=radius, ) - print( + logger.info( f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}' " f"coords={coords} radius={radius}" ) @@ -241,7 +250,7 @@ def iter_time_slices(): ) data.extend(data_) else: - 
print(f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}'") + logger.info(f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}'") data_ = _consume( "https://api.openaq.org/v2/measurements", params=params, From 8a11fe3a7a89cc181db26fb11729347b5ca15fe5 Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Oct 2023 17:18:43 -0600 Subject: [PATCH 28/77] fix --- monetio/obs/openaq_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index e8cb7ce5..09082953 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -48,7 +48,7 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): r = requests.get(url, params=params, headers=headers, timeout=timeout) if r.status_code == 408: tries += 1 - logger.warning(f"request timed out (try {tries}/{retry})") + logger.info(f"request timed out (try {tries}/{retry})") else: break r.raise_for_status() @@ -213,7 +213,7 @@ def iter_time_slices(): if query_dt is not None: t = date_min while t < date_max: - t_next = t + query_dt + t_next = min(t + query_dt, date_max) yield t - one_sec, t_next t = t_next else: From 725b746b027fcd00ff237d1da146f14b9e1c93ae Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 10:58:24 -0600 Subject: [PATCH 29/77] Sites option --- monetio/obs/openaq_v2.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 09082953..bd6e8c84 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -72,7 +72,7 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): def get_locations(**kwargs): - """Get available site info (including IDs) from OpenAQ v2 API. + """Get available site info (including site IDs) from OpenAQ v2 API. https://api.openaq.org/docs#/v2/locations_get_v2_locations_get """ @@ -167,6 +167,7 @@ def add_data( parameters=None, country=None, search_radius=None, + sites=None, query_time_split="1H", **kwargs, ): @@ -181,6 +182,10 @@ def add_data( Default: full dataset (no limitation by country). search_radius : dict, optional Mapping coords (lat, lon) [deg] to search radius [m] (max of 25 km). + sites : list of str, optional + Site ID(s) to include, e.g. a specific known site + or group of sites from :func:`get_latlonbox_sites`. + Default: full dataset (no limitation by site). query_time_split Frequency to use when splitting the web API queries in time, in a format that ``pandas.to_timedelta`` will understand. 
@@ -222,6 +227,8 @@ def iter_time_slices(): params = {} if country is not None: params.update(country=country) + if sites is not None: + params.update(location_id=sites) data = [] for parameter in parameters: From a6a35881df3b14528c0d5ae83a642d0037aa8efa Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 11:18:07 -0600 Subject: [PATCH 30/77] Start tests for OpenAQ v2 --- tests/test_openaq.py | 14 +++++++------- tests/test_openaq_v2.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 7 deletions(-) create mode 100644 tests/test_openaq_v2.py diff --git a/tests/test_openaq.py b/tests/test_openaq.py index acff8cc9..b0e2605d 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -22,16 +22,16 @@ def test_openaq(): # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB # ) -o = openaq.OPENAQ() -days_avail = o._get_available_days(pd.date_range("2019-08-01", "2019-08-03")) -files = o._get_files_in_day(pd.to_datetime("2019-08-01")) +# o = openaq.OPENAQ() +# days_avail = o._get_available_days(pd.date_range("2019-08-01", "2019-08-03")) +# files = o._get_files_in_day(pd.to_datetime("2019-08-01")) -from dask.diagnostics import ProgressBar +# from dask.diagnostics import ProgressBar -ProgressBar().register() +# ProgressBar().register() -# df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file -df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=4) # many files +# # df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file +# df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=4) # many files # df = openaq.read_json2( # # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py new file mode 100644 index 00000000..33a3f792 --- /dev/null +++ b/tests/test_openaq_v2.py @@ -0,0 +1,10 @@ +import monetio.obs.openaq_v2 as openaq + + +def test_get_parameters(): + params = openaq.get_parameters() + assert 50 <= len(params) <= 500 + assert params.id.nunique() == len(params) + assert params.name.nunique() < len(params), "dupes for different units etc." + assert "pm25" in params.name.values + assert "o3" in params.name.values From dd854e1d71742379b628dc74f7534208c8341828 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 11:26:38 -0600 Subject: [PATCH 31/77] Test get locations --- tests/test_openaq_v2.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 33a3f792..8ad30a82 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -8,3 +8,15 @@ def test_get_parameters(): assert params.name.nunique() < len(params), "dupes for different units etc." 
assert "pm25" in params.name.values assert "o3" in params.name.values + + +def test_get_locations(): + sites = openaq.get_locations(npages=2, limit=100) + assert len(sites) <= 200 + assert sites.siteid.nunique() == len(sites) + assert sites.dtypes["firstUpdated"] == "datetime64[ns]" + assert sites.dtypes["lastUpdated"] == "datetime64[ns]" + assert sites.dtypes["latitude"] == "float64" + assert sites.dtypes["longitude"] == "float64" + assert sites["latitude"].isnull().sum() == 0 + assert sites["longitude"].isnull().sum() == 0 From dee4d742c086b80a4861054774b8dbbba910be04 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 11:31:24 -0600 Subject: [PATCH 32/77] docstring --- monetio/obs/openaq_v2.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index bd6e8c84..a1caab42 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -20,7 +20,24 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): - """Consume a paginated OpenAQ API endpoint.""" + """Consume a paginated OpenAQ API endpoint. + + Parameters + ---------- + params : dict, optional + Parameters for the GET request to the API. + Don't pass ``limit``, ``page``, or ``offset`` here, since they are covered + by the `limit` and `npages` kwargs. + timeout : float or tuple + Seconds to wait for the server before giving up. Passed to ``requests.get``. + retry : int + Number of times to retry the request if it times out. + limit : int + Max number of results per page. + npages : int, optional + Number of pages to fetch. + By default, try to fetch as many as needed to get all results. + """ if params is None: params = {} From 50059f4fbded62646706462bf73e1b312a95c4e6 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 11:35:21 -0600 Subject: [PATCH 33/77] docstrings --- monetio/obs/openaq_v2.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index a1caab42..d65d586b 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -91,6 +91,8 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): def get_locations(**kwargs): """Get available site info (including site IDs) from OpenAQ v2 API. + kwargs are passed to :func:`_consume`. + https://api.openaq.org/docs#/v2/locations_get_v2_locations_get """ @@ -148,7 +150,10 @@ def get_locations(**kwargs): def get_parameters(**kwargs): - """Get supported parameter info from OpenAQ v2 API.""" + """Get supported parameter info from OpenAQ v2 API. + + kwargs are passed to :func:`_consume`. + """ data = _consume("https://api.openaq.org/v2/parameters", **kwargs) @@ -158,7 +163,10 @@ def get_parameters(**kwargs): def get_latlonbox_sites(latlonbox, **kwargs): - """ + """From all available sites, return those within a lat/lon box. + + kwargs are passed to :func:`_consume`. + Parameters ---------- latlonbox : array-like of float @@ -190,6 +198,9 @@ def add_data( ): """Get OpenAQ API v2 data, including low-cost sensors. + kwargs are passed to :func:`_consume`, + though currently ``params`` can't be one of them. 
+ Parameters ---------- parameters : str or list of str, optional From 8fb1c1db5ea756a4c05fdb21146f4a01ef042ba7 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 11:36:23 -0600 Subject: [PATCH 34/77] Site ID as string --- monetio/obs/openaq_v2.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index d65d586b..194fd854 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -336,4 +336,8 @@ def iter_time_slices(): longitude=lon, ) + # Site ID + df = df.rename(columns={"id": "siteid"}) + df["siteid"] = df.siteid.astype(str) + return df From 54186850a2f457ce9d3e65c43f20a1f036a9de8a Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 12:58:24 -0600 Subject: [PATCH 35/77] Test get selected data --- monetio/obs/openaq_v2.py | 3 ++- tests/test_openaq_v2.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 194fd854..53e36233 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -1,5 +1,6 @@ """Get AQ data from the OpenAQ v2 REST API. +https://openaq.org/ https://api.openaq.org/docs#/v2 """ import json @@ -337,7 +338,7 @@ def iter_time_slices(): ) # Site ID - df = df.rename(columns={"id": "siteid"}) + df = df.rename(columns={"locationId": "siteid"}) df["siteid"] = df.siteid.astype(str) return df diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 8ad30a82..29dc0747 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -1,3 +1,5 @@ +import pandas as pd + import monetio.obs.openaq_v2 as openaq @@ -20,3 +22,34 @@ def test_get_locations(): assert sites.dtypes["longitude"] == "float64" assert sites["latitude"].isnull().sum() == 0 assert sites["longitude"].isnull().sum() == 0 + + +def test_get_data_near_ncwcp_sites(): + sites = [ + # AirGradient monitor + 1236068, + # PurpleAir sensors + 1118827, + 357301, + 273440, + 271155, + ] + dates = pd.date_range("2023-08-01", "2023-08-01 01:00", freq="1H") + df = openaq.add_data(dates, sites=sites) + assert len(df) > 0 + assert "pm25" in df.parameter.values + assert df.latitude.round().eq(39).all() + assert df.longitude.round().eq(-77).all() + assert (sorted(df.time.unique()) == dates).all() + assert set(df.siteid) == {str(site) for site in sites} + + +def test_get_data_near_ncwcp_search_radius(): + latlon = 38.9721, -76.9248 + dates = pd.date_range("2023-08-01", "2023-08-01 01:00", freq="1H") + df = openaq.add_data(dates, search_radius={latlon: 5_000}) + assert len(df) > 0 + assert "pm25" in df.parameter.values + assert df.latitude.round().eq(39).all() + assert df.longitude.round().eq(-77).all() + assert (sorted(df.time.unique()) == dates).all() From dd4a2de6784f50186fbe5e8bec90f08f241a1e86 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 13:26:09 -0600 Subject: [PATCH 36/77] docstrings --- monetio/obs/openaq.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 519efc9a..17bfdf27 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -1,4 +1,7 @@ -"""OpenAQ""" +"""Get v1 (government-only) OpenAQ data from AWS. + +https://openaq.org/ +""" import json import warnings @@ -7,7 +10,7 @@ def add_data(dates, n_procs=1): - """Add OpenAQ data from the Amazon s3 server. + """Add OpenAQ data from the OpenAQ S3 bucket. 
https://openaq-fetches.s3.amazonaws.com @@ -27,7 +30,7 @@ def add_data(dates, n_procs=1): def read_json(fp_or_url, *, verbose=True): - """Read a json file from the OpenAQ server, returning dataframe in non-wide format. + """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in non-wide format. Parameters ---------- @@ -105,7 +108,7 @@ def read_json(fp_or_url, *, verbose=True): def read_json2(fp_or_url, *, verbose=True): - """Read a json file from the OpenAQ server, returning dataframe in non-wide format. + """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in non-wide format. Parameters ---------- From bdd932d983465ff16ad66a1ea9f7dadf518b740b Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 13:30:49 -0600 Subject: [PATCH 37/77] verbose false --- monetio/obs/openaq.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 17bfdf27..412a8569 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -29,7 +29,7 @@ def add_data(dates, n_procs=1): return a.add_data(dates, num_workers=n_procs) -def read_json(fp_or_url, *, verbose=True): +def read_json(fp_or_url, *, verbose=False): """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in non-wide format. Parameters @@ -107,7 +107,7 @@ def read_json(fp_or_url, *, verbose=True): return df -def read_json2(fp_or_url, *, verbose=True): +def read_json2(fp_or_url, *, verbose=False): """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in non-wide format. Parameters @@ -136,8 +136,6 @@ def read_json2(fp_or_url, *, verbose=True): r.raise_for_status() else: raise NotImplementedError - with open(fp_or_url) as f: - data = json.load(f) names = [ "time", From ac63e9b62e85e77349dc60b71cfe75a2f8a70978 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 13:37:15 -0600 Subject: [PATCH 38/77] docstrings --- monetio/obs/openaq.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 412a8569..5acf4811 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -1,6 +1,7 @@ """Get v1 (government-only) OpenAQ data from AWS. 
https://openaq.org/ +https://openaq-fetches.s3.amazonaws.com/index.html """ import json import warnings @@ -188,8 +189,9 @@ def read_json2(fp_or_url, *, verbose=False): attrs = data.get("attribution") if attrs is not None: attr_names = [a["name"] for a in attrs] - # if len(attr_names) > 1: - # print(f"Taking first of {len(attr_names)}:", attr_names) + if verbose: + if len(attr_names) > 1: + print(f"Taking first of {len(attr_names)}:", attr_names) attr_name = attr_names[0] # Just the (hopefully) primary one rows.append( @@ -216,7 +218,6 @@ def read_json2(fp_or_url, *, verbose=False): ) ) - # TODO: specify dtype here df = pd.DataFrame(rows, columns=names) df["time_local"] = df["time"] + df["utcoffset"] @@ -243,14 +244,16 @@ def _get_available_days(self, dates): """ # Get all day folders folders = self.fs.ls(self.s3bucket) - days = [j.split("/")[2] for j in folders] + days = [folder.split("/")[2] for folder in folders] dates_available = pd.Series( - pd.to_datetime(days, format=r"%Y-%m-%d", errors="coerce"), name="dates" + pd.to_datetime(days, format=r"%Y-%m-%d", errors="coerce"), + name="dates", ) # Filter by requested dates dates_requested = pd.Series( - pd.to_datetime(dates).floor(freq="D"), name="dates" + pd.to_datetime(dates).floor(freq="D"), + name="dates", ).drop_duplicates() dates_have = pd.merge(dates_available, dates_requested, how="inner")["dates"] @@ -285,6 +288,7 @@ def build_urls(self, dates): return urls def add_data(self, dates, *, num_workers=1): + """Get data for `dates`, using `num_workers` Dask workers.""" from functools import partial import dask @@ -357,6 +361,7 @@ def _fix_units(self, df): df.loc[is_ug, "unit"] = "ppm" def _pivot_table(self, df): + """Convert to wide format, with one column per parameter.""" # Pivot wide = df.pivot_table( values="value", From 7eea32bfc436c6a155742ac5589606bd3b09eea7 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 14:01:04 -0600 Subject: [PATCH 39/77] Support using either reader --- monetio/obs/openaq.py | 56 +++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 5acf4811..ffbf21ff 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -15,6 +15,9 @@ def add_data(dates, n_procs=1): https://openaq-fetches.s3.amazonaws.com + Note that the source files are daily, so requesting a single day or single hour within it + will take the same amount of time. 
+ Parameters ---------- dates : pandas.DateTimeIndex or list of datetime objects @@ -229,12 +232,22 @@ def read_json2(fp_or_url, *, verbose=False): class OPENAQ: - def __init__(self): + def __init__(self, *, engine="pandas"): + from functools import partial + import s3fs self.fs = s3fs.S3FileSystem(anon=True) self.s3bucket = "openaq-fetches/realtime" + if engine == "pandas": + self.read = partial(read_json, verbose=False) + elif engine == "python": + self.read = partial(read_json2, verbose=False) + else: + raise ValueError("engine must be 'pandas' or 'python'.") + self.engine = engine + def _get_available_days(self, dates): """ Parameters @@ -289,8 +302,6 @@ def build_urls(self, dates): def add_data(self, dates, *, num_workers=1): """Get data for `dates`, using `num_workers` Dask workers.""" - from functools import partial - import dask import dask.dataframe as dd @@ -309,13 +320,13 @@ def add_data(self, dates, *, num_workers=1): if len(urls) > 1: print(urls[-1]) - func = partial(read_json2, verbose=False) + func = self.read dfs = [dask.delayed(func)(url) for url in urls] df_lazy = dd.from_delayed(dfs) df = df_lazy.compute(num_workers=num_workers) # TODO: not sure if necessary (doesn't seem to be?) - # df = df.coordinates.replace(to_replace=[None], value=NaN) + # df["coordinates"] = df.coordinates.replace(to_replace=[None], value=NaN) # Ensure consistent units, e.g. ppm for molecules self._fix_units(df) @@ -362,24 +373,29 @@ def _fix_units(self, df): def _pivot_table(self, df): """Convert to wide format, with one column per parameter.""" + + index = [ + "time", + "time_local", + "latitude", + "longitude", + "utcoffset", + "location", + "city", + "country", + "attribution", # currently only in Python reader + "sourceName", + "sourceType", + "mobile", + "averagingPeriod", + ] + if self.engine == "pandas": + index.remove("attribution") + # Pivot wide = df.pivot_table( values="value", - index=[ - "time", - "time_local", - "latitude", - "longitude", - "utcoffset", - "location", - "city", - "country", - "attribution", # currently only in Python reader - "sourceName", - "sourceType", - "mobile", - "averagingPeriod", - ], + index=index, columns="parameter", ).reset_index() From 44102c092c6b46bc828504bcd5de66f014f50c42 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 14:41:16 -0600 Subject: [PATCH 40/77] More test; fix averaging period calc for other units --- monetio/obs/openaq.py | 22 ++++++++++------------ tests/test_openaq.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index ffbf21ff..6a2813d7 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -4,7 +4,6 @@ https://openaq-fetches.s3.amazonaws.com/index.html """ import json -import warnings import pandas as pd from numpy import NaN @@ -47,6 +46,8 @@ def read_json(fp_or_url, *, verbose=False): """ from time import perf_counter + import numpy as np + tic = perf_counter() df = pd.read_json(fp_or_url, lines=True) @@ -86,14 +87,14 @@ def read_json(fp_or_url, *, verbose=False): utcoffset = pd.to_timedelta(new["date.local"].str.slice(-6, None) + ":00") time_local = time + utcoffset - # Attempting averaging period by assuming hours - # FIXME: probably not always the case... 
- assert (new["averagingPeriod.unit"].dropna() == "hours").all() - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", category=RuntimeWarning, message="invalid value encountered in cast" - ) - averagingPeriod = pd.to_timedelta(new["averagingPeriod.value"], unit="hours") + # Convert averaging period to timedelta + value = new["averagingPeriod.value"] + units = new["averagingPeriod.unit"] + unique_units = units.dropna().unique() + averagingPeriod = pd.Series(np.full(len(new), NaN, dtype="timedelta64[ns]")) + for unit in unique_units: + is_unit = units == unit + averagingPeriod.loc[is_unit] = pd.to_timedelta(value[is_unit], unit=unit) # Apply new columns df = df.drop(columns=to_expand).assign( @@ -325,9 +326,6 @@ def add_data(self, dates, *, num_workers=1): df_lazy = dd.from_delayed(dfs) df = df_lazy.compute(num_workers=num_workers) - # TODO: not sure if necessary (doesn't seem to be?) - # df["coordinates"] = df.coordinates.replace(to_replace=[None], value=NaN) - # Ensure consistent units, e.g. ppm for molecules self._fix_units(df) non_molec = ["pm1", "pm25", "pm4", "pm10", "bc"] diff --git a/tests/test_openaq.py b/tests/test_openaq.py index b0e2605d..0756a318 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -15,6 +15,42 @@ def test_openaq(): assert not df.empty assert df.siteid.nunique() == 1 assert (df.country == "CN").all() and ((df.time_local - df.time) == pd.Timedelta(hours=8)).all() + assert df.latitude.isnull().sum() == 0 + assert df.longitude.isnull().sum() == 0 + assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" + assert df.averagingPeriod.eq(pd.Timedelta("1H")).all() + + +@pytest.mark.parametrize( + "url", + [ + "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson", # 1 MB + "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson", # 10 MB" + ], +) +def test_read(url): + df = openaq.read_json(url) + df2 = openaq.read_json2(url) + assert len(df) > 0 + + if "2019-08-01" in url: + assert len(df2) < len(df), "some that didn't have coords were skipped" + assert df.latitude.isnull().sum() > 0 + else: + assert len(df2) == len(df) + + assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" + assert not df.averagingPeriod.isnull().all() + assert df.averagingPeriod.dropna().gt(pd.Timedelta(0)).all() + + +def test_openaq_2023(): + # Period from Jordan's NRT example (#130) + df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=2) # many files + assert len(df) > 0 + assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" + assert not df.averagingPeriod.isnull().all() + assert df.averagingPeriod.dropna().gt(pd.Timedelta(0)).all() # df = openaq.read_json( @@ -36,5 +72,4 @@ def test_openaq(): # df = openaq.read_json2( # # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB # # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB -# "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # ) From 2e55bf8d78002796c77db80b90630e6c0f49e242 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 14:47:33 -0600 Subject: [PATCH 41/77] Cap for faster test random sample though --- monetio/obs/openaq.py | 7 +++++++ tests/test_openaq.py | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 
6a2813d7..11cdf23d 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -8,6 +8,8 @@ import pandas as pd from numpy import NaN +_URL_CAP = None # set to int to limit number of files loaded for testing + def add_data(dates, n_procs=1): """Add OpenAQ data from the OpenAQ S3 bucket. @@ -321,6 +323,11 @@ def add_data(self, dates, *, num_workers=1): if len(urls) > 1: print(urls[-1]) + if _URL_CAP is not None and len(urls) > _URL_CAP: + import random + + urls = random.sample(urls, _URL_CAP) + func = self.read dfs = [dask.delayed(func)(url) for url in urls] df_lazy = dd.from_delayed(dfs) diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 0756a318..847ca18b 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -5,6 +5,8 @@ from monetio import openaq +openaq._URL_CAP = 4 + @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires Python 3.7+") def test_openaq(): @@ -46,7 +48,9 @@ def test_read(url): def test_openaq_2023(): # Period from Jordan's NRT example (#130) - df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=2) # many files + # There are many files in this period (~ 100?) + # Disable cap setting to test whole set of files + df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=2) assert len(df) > 0 assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" assert not df.averagingPeriod.isnull().all() From c421abad07f6758ff4c11bd01781e70ce75e9ec8 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 15:34:31 -0600 Subject: [PATCH 42/77] Refactor a bit --- monetio/obs/openaq.py | 124 +++++++++++++++++++++--------------------- tests/test_openaq.py | 4 ++ 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 11cdf23d..7aeeb8c5 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -235,6 +235,28 @@ def read_json2(fp_or_url, *, verbose=False): class OPENAQ: + NON_MOLEC_PARAMS = [ + "pm1", + "pm25", + "pm4", + "pm10", + "bc", + ] + + PPM_TO_UGM3 = { + "o3": 1990, + "co": 1160, + "no2": 1900, + "no": 1240, + "so2": 2650, + "ch4": 664, + "co2": 1820, + } + # These conversion factors are based on + # - air average molecular weight: 29 g/mol + # - air density: 1.2 kg m -3 + # rounded to 3 significant figures. + def __init__(self, *, engine="pandas"): from functools import partial @@ -328,14 +350,34 @@ def add_data(self, dates, *, num_workers=1): urls = random.sample(urls, _URL_CAP) + # Read JSON files func = self.read dfs = [dask.delayed(func)(url) for url in urls] df_lazy = dd.from_delayed(dfs) df = df_lazy.compute(num_workers=num_workers) - # Ensure consistent units, e.g. ppm for molecules - self._fix_units(df) - non_molec = ["pm1", "pm25", "pm4", "pm10", "bc"] + # Measurements like air comp shouldn't be negative + non_neg_units = [ + "ng/m3", + "particles/cm³", + "ppb", + "ppm", + "ugm3", + "umol/mol", + "µg/m³", + ] + df.loc[df.unit.isin(non_neg_units) & (df.value <= 0), "value"] = NaN + # Assume value 0 implies below detection limit + + # Convert to consistent units for molecules (ppmv) + # (For a certain parameter, different site-times may have different units.) 
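        # A rough sanity check on where these factors come from (assuming ~25 degC, 1 atm,
        # with the 29 g/mol air MW and 1.2 kg/m3 air density noted above):
        #   ug/m3 per ppmv ~= 1000 * rho_air[kg/m3] * (MW_X / MW_air)
        # e.g. O3 (MW ~48 g/mol): 1000 * 1.2 * 48 / 29 ~= 1990, matching PPM_TO_UGM3["o3"],
        # so 100 ug/m3 of O3 corresponds to roughly 100 / 1990 ~= 0.050 ppm.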
+ for vn, f in self.PPM_TO_UGM3.items(): + is_ug = (df.parameter == vn) & (df.unit == "µg/m³") + df.loc[is_ug, "value"] /= f + df.loc[is_ug, "unit"] = "ppm" + + # Ensure consistent units + non_molec = self.NON_MOLEC_PARAMS good = (df[~df.parameter.isin(non_molec)].unit.dropna() == "ppm").all() if not good: unique_params = sorted(df.parameter.unique()) @@ -345,40 +387,7 @@ def add_data(self, dates, *, num_workers=1): if not good: raise ValueError(f"Expected these species to all be in µg/m³: {non_molec}.") - # Pivot to wide format - df = self._pivot_table(df) - - # Construct site IDs - df["siteid"] = ( - df.country - + "_" - + df.latitude.round(3).astype(str) - + "N_" - + df.longitude.round(3).astype(str) - + "E" - ) - - return df.loc[(df.time >= dates.min()) & (df.time <= dates.max())] - - def _fix_units(self, df): - """In place, convert units to ppm for molecules.""" - df.loc[df.value <= 0] = NaN - # For a certain parameter, different site-times may have different units. - # https://docs.openaq.org/docs/parameters - # These conversion factors are based on - # - air average molecular weight: 29 g/mol - # - air density: 1.2 kg m -3 - # rounded to 3 significant figures. - fs = {"co": 1160, "o3": 1990, "so2": 2650, "no2": 1900, "ch4": 664, "no": 1240} - fs["nox"] = fs["no2"] # Need to make an assumption about NOx MW - for vn, f in fs.items(): - is_ug = (df.parameter == vn) & (df.unit == "µg/m³") - df.loc[is_ug, "value"] /= f - df.loc[is_ug, "unit"] = "ppm" - - def _pivot_table(self, df): - """Convert to wide format, with one column per parameter.""" - + # Pivot to wide format (each parameter gets its own column) index = [ "time", "time_local", @@ -396,35 +405,26 @@ def _pivot_table(self, df): ] if self.engine == "pandas": index.remove("attribution") - - # Pivot - wide = df.pivot_table( + df = df.pivot_table( values="value", index=index, columns="parameter", ).reset_index() + df = df.rename(columns={p: f"{p}_ugm3" for p in self.NON_MOLEC_PARAMS}, errors="ignore") + df = df.rename(columns={p: f"{p}_ppm" for p in self.PPM_TO_UGM3}, errors="ignore") - # Include units in variable names - wide = wide.rename( - dict( - # molec - co="co_ppm", - o3="o3_ppm", - no2="no2_ppm", - so2="so2_ppm", - ch4="ch4_ppm", - no="no_ppm", - # non-molec - pm1="pm1_ugm3", - pm25="pm25_ugm3", - pm4="pm4_ugm3", - pm10="pm10_ugm3", - bc="bc_ugm3", - # - nox="nox_ppm", - ), - axis="columns", - errors="ignore", + # Construct site IDs + df["siteid"] = ( + df.country + + "_" + + df.latitude.round(3).astype(str) + + "N_" + + df.longitude.round(3).astype(str) + + "E" ) - return wide + return df.loc[(df.time >= dates.min()) & (df.time <= dates.max())] + + +# Need to make an assumption about NOx MW +OPENAQ.PPM_TO_UGM3["nox"] = OPENAQ.PPM_TO_UGM3["no2"] diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 847ca18b..29d75b41 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -21,6 +21,7 @@ def test_openaq(): assert df.longitude.isnull().sum() == 0 assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" assert df.averagingPeriod.eq(pd.Timedelta("1H")).all() + assert df.pm25_ugm3.gt(0).all() @pytest.mark.parametrize( @@ -50,11 +51,14 @@ def test_openaq_2023(): # Period from Jordan's NRT example (#130) # There are many files in this period (~ 100?) 
# Disable cap setting to test whole set of files + # NOTE: possible to get empty df with the random URL selection df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=2) assert len(df) > 0 assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" assert not df.averagingPeriod.isnull().all() assert df.averagingPeriod.dropna().gt(pd.Timedelta(0)).all() + assert df.pm25_ugm3.dropna().gt(0).all() + assert df.o3_ppm.dropna().gt(0).all() # df = openaq.read_json( From 5299b3b374bee975367f79ec9b0518d13ca6e4c9 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 15:37:08 -0600 Subject: [PATCH 43/77] cleanup --- monetio/obs/openaq.py | 14 ++++++++++---- tests/test_openaq.py | 22 ---------------------- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 7aeeb8c5..5b035402 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -242,6 +242,7 @@ class OPENAQ: "pm10", "bc", ] + """Parameters that are not molecules and should be in µg/m³ units.""" PPM_TO_UGM3 = { "o3": 1990, @@ -252,10 +253,15 @@ class OPENAQ: "ch4": 664, "co2": 1820, } - # These conversion factors are based on - # - air average molecular weight: 29 g/mol - # - air density: 1.2 kg m -3 - # rounded to 3 significant figures. + """Conversion factors from ppmv to µg/m³. + + Based on + + - air average molecular weight: 29 g/mol + - air density: 1.2 kg m -3 + + and rounded to 3 significant figures. + """ def __init__(self, *, engine="pandas"): from functools import partial diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 29d75b41..bc2c58dc 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -59,25 +59,3 @@ def test_openaq_2023(): assert df.averagingPeriod.dropna().gt(pd.Timedelta(0)).all() assert df.pm25_ugm3.dropna().gt(0).all() assert df.o3_ppm.dropna().gt(0).all() - - -# df = openaq.read_json( -# # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB -# "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB -# ) - -# o = openaq.OPENAQ() -# days_avail = o._get_available_days(pd.date_range("2019-08-01", "2019-08-03")) -# files = o._get_files_in_day(pd.to_datetime("2019-08-01")) - -# from dask.diagnostics import ProgressBar - -# ProgressBar().register() - -# # df = openaq.add_data(["2016-08-01", "2016-08-01 23:00"], n_procs=1) # one file -# df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=4) # many files - -# df = openaq.read_json2( -# # "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson" # 1 MB -# # "https://openaq-fetches.s3.amazonaws.com/realtime/2023-09-04/1693798742_realtime_1c4e466d-c461-4c8d-b604-1e81cf2df73a.ndjson" # 10 MB -# ) From f122df96fb43d3cb7f488fc8ecae0579bc21b669 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 15:41:31 -0600 Subject: [PATCH 44/77] Test parameter accounting --- tests/test_openaq.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_openaq.py b/tests/test_openaq.py index bc2c58dc..2453b7ec 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -59,3 +59,27 @@ def test_openaq_2023(): assert df.averagingPeriod.dropna().gt(pd.Timedelta(0)).all() assert df.pm25_ugm3.dropna().gt(0).all() assert df.o3_ppm.dropna().gt(0).all() + + +def test_parameter_coverage(): + # From https://openaq.org/developers/help/ ("What pollutants are available on OpenAQ?") + # these are the parameters to account 
for: + params = [ + "pm1", + "pm25", + "pm4", + "pm10", + "bc", + "o3", + "co", + "no2", + "no", + "nox", + "so2", + "ch4", + "co2", + ] + assert len(params) == 13 + assert sorted(openaq.OPENAQ.NON_MOLEC_PARAMS + list(openaq.OPENAQ.PPM_TO_UGM3)) == sorted( + params + ) From a7f83afe0e9f2f443b90d33c61ad6ed025773012 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 24 Oct 2023 16:51:48 -0600 Subject: [PATCH 45/77] WIP: dealing with time-site dupes --- monetio/obs/openaq.py | 34 +++++++++++++++++++++------------- tests/test_openaq.py | 10 ++++++++++ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 5b035402..dbeeeb9d 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -333,6 +333,8 @@ def build_urls(self, dates): def add_data(self, dates, *, num_workers=1): """Get data for `dates`, using `num_workers` Dask workers.""" + import hashlib + import dask import dask.dataframe as dd @@ -411,23 +413,29 @@ def add_data(self, dates, *, num_workers=1): ] if self.engine == "pandas": index.remove("attribution") - df = df.pivot_table( - values="value", - index=index, - columns="parameter", - ).reset_index() + df = ( + df[(df.averagingPeriod == pd.Timedelta("1H")) & (df.city != "N/A")] + .pivot_table( + values="value", + index=index, + columns="parameter", + ) + .reset_index() + ) df = df.rename(columns={p: f"{p}_ugm3" for p in self.NON_MOLEC_PARAMS}, errors="ignore") df = df.rename(columns={p: f"{p}_ppm" for p in self.PPM_TO_UGM3}, errors="ignore") # Construct site IDs - df["siteid"] = ( - df.country - + "_" - + df.latitude.round(3).astype(str) - + "N_" - + df.longitude.round(3).astype(str) - + "E" - ) + # Sometimes, at a given time, there are multiple measurements at the same lat/lon + # with different location names. + # Occasionally, there are rows that appear to actual duplicates + # (e.g. 
all same except one col is null in one or something) + def do_hash(b): + return hashlib.sha1(b, usedforsecurity=False).hexdigest() + + # to_hash = df.latitude.astype(str) + " " + df.longitude.astype(str) + to_hash = df.location + " " + df.latitude.astype(str) + " " + df.longitude.astype(str) + df["siteid"] = df.country + "_" + to_hash.str.encode("utf-8").apply(do_hash).str.slice(0, 7) return df.loc[(df.time >= dates.min()) & (df.time <= dates.max())] diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 2453b7ec..185577a2 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -14,13 +14,17 @@ def test_openaq(): # Browse the archive at https://openaq-fetches.s3.amazonaws.com/index.html dates = pd.date_range(start="2013-11-26", end="2013-11-27", freq="H")[:-1] df = openaq.add_data(dates) + assert not df.empty assert df.siteid.nunique() == 1 assert (df.country == "CN").all() and ((df.time_local - df.time) == pd.Timedelta(hours=8)).all() + assert df.latitude.isnull().sum() == 0 assert df.longitude.isnull().sum() == 0 + assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" assert df.averagingPeriod.eq(pd.Timedelta("1H")).all() + assert df.pm25_ugm3.gt(0).all() @@ -34,6 +38,7 @@ def test_openaq(): def test_read(url): df = openaq.read_json(url) df2 = openaq.read_json2(url) + assert len(df) > 0 if "2019-08-01" in url: @@ -53,10 +58,15 @@ def test_openaq_2023(): # Disable cap setting to test whole set of files # NOTE: possible to get empty df with the random URL selection df = openaq.add_data(["2023-09-04", "2023-09-04 23:00"], n_procs=2) + assert len(df) > 0 + + assert (df.time.astype(str) + df.siteid).nunique() == len(df) + assert df.dtypes["averagingPeriod"] == "timedelta64[ns]" assert not df.averagingPeriod.isnull().all() assert df.averagingPeriod.dropna().gt(pd.Timedelta(0)).all() + assert df.pm25_ugm3.dropna().gt(0).all() assert df.o3_ppm.dropna().gt(0).all() From fc6ddd9eba056fd60402e6989e03f778df8f05c2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 25 Oct 2023 15:08:09 -0600 Subject: [PATCH 46/77] `wide_fmt` option --- monetio/obs/openaq.py | 134 +++++++++++++++++++++++---------------- monetio/obs/openaq_v2.py | 6 ++ tests/test_openaq.py | 21 ++++-- tests/test_openaq_v2.py | 26 +++++--- 4 files changed, 121 insertions(+), 66 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index dbeeeb9d..53b0ef07 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -11,7 +11,7 @@ _URL_CAP = None # set to int to limit number of files loaded for testing -def add_data(dates, n_procs=1): +def add_data(dates, *, n_procs=1, wide_fmt=True): """Add OpenAQ data from the OpenAQ S3 bucket. https://openaq-fetches.s3.amazonaws.com @@ -31,11 +31,11 @@ def add_data(dates, n_procs=1): pandas.DataFrame """ a = OPENAQ() - return a.add_data(dates, num_workers=n_procs) + return a.add_data(dates, num_workers=n_procs, wide_fmt=wide_fmt) def read_json(fp_or_url, *, verbose=False): - """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in non-wide format. + """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in original long format. Parameters ---------- @@ -115,7 +115,9 @@ def read_json(fp_or_url, *, verbose=False): def read_json2(fp_or_url, *, verbose=False): - """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in non-wide format. + """Read a JSON file from the OpenAQ S3 bucket, returning dataframe in original long format. + + This provides 'attribution', while :func:`read_json` does not. 
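    For example (a sketch; the URL is the small file used in the test suite)::

        url = "https://openaq-fetches.s3.amazonaws.com/realtime/2019-08-01/1564644065.ndjson"
        df1 = read_json(url)
        df2 = read_json2(url)  # includes 'attribution'; for this file it also skips rows lacking coordinates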
Parameters ---------- @@ -264,6 +266,17 @@ class OPENAQ: """ def __init__(self, *, engine="pandas"): + """ + Parameters + ---------- + engine : str, optional + _description_, by default "pandas" + + Raises + ------ + ValueError + _description_ + """ from functools import partial import s3fs @@ -331,8 +344,19 @@ def build_urls(self, dates): urls.extend(f"s3://{f}" for f in files) return urls - def add_data(self, dates, *, num_workers=1): - """Get data for `dates`, using `num_workers` Dask workers.""" + def add_data(self, dates, *, num_workers=1, wide_fmt=True): + """Get data for `dates`, using `num_workers` Dask workers. + + Parameters + ---------- + wide_fmt : bool + If True, return data in wide format + (each parameter gets its own column, + as opposed to long format with 'parameter', 'value', and 'units' columns). + Accordingly, convert units to consistent units + (ppmv for molecules, µg/m³ for others) + and rename columns to reflect units. + """ import hashlib import dask @@ -364,6 +388,9 @@ def add_data(self, dates, *, num_workers=1): df_lazy = dd.from_delayed(dfs) df = df_lazy.compute(num_workers=num_workers) + # Ensure data within requested time window + df = df.loc[(df.time >= dates.min()) & (df.time <= dates.max())] + # Measurements like air comp shouldn't be negative non_neg_units = [ "ng/m3", @@ -377,53 +404,54 @@ def add_data(self, dates, *, num_workers=1): df.loc[df.unit.isin(non_neg_units) & (df.value <= 0), "value"] = NaN # Assume value 0 implies below detection limit - # Convert to consistent units for molecules (ppmv) - # (For a certain parameter, different site-times may have different units.) - for vn, f in self.PPM_TO_UGM3.items(): - is_ug = (df.parameter == vn) & (df.unit == "µg/m³") - df.loc[is_ug, "value"] /= f - df.loc[is_ug, "unit"] = "ppm" - - # Ensure consistent units - non_molec = self.NON_MOLEC_PARAMS - good = (df[~df.parameter.isin(non_molec)].unit.dropna() == "ppm").all() - if not good: - unique_params = sorted(df.parameter.unique()) - molec = [p for p in unique_params if p not in non_molec] - raise ValueError(f"Expected these species to all be in ppm now: {molec}.") - good = (df[df.parameter.isin(non_molec)].unit.dropna() == "µg/m³").all() - if not good: - raise ValueError(f"Expected these species to all be in µg/m³: {non_molec}.") - - # Pivot to wide format (each parameter gets its own column) - index = [ - "time", - "time_local", - "latitude", - "longitude", - "utcoffset", - "location", - "city", - "country", - "attribution", # currently only in Python reader - "sourceName", - "sourceType", - "mobile", - "averagingPeriod", - ] - if self.engine == "pandas": - index.remove("attribution") - df = ( - df[(df.averagingPeriod == pd.Timedelta("1H")) & (df.city != "N/A")] - .pivot_table( - values="value", - index=index, - columns="parameter", + if wide_fmt: + # Convert to consistent units for molecules (ppmv) + # (For a certain parameter, different site-times may have different units.) 
+ for vn, f in self.PPM_TO_UGM3.items(): + is_ug = (df.parameter == vn) & (df.unit == "µg/m³") + df.loc[is_ug, "value"] /= f + df.loc[is_ug, "unit"] = "ppm" + + # Ensure consistent units + non_molec = self.NON_MOLEC_PARAMS + good = (df[~df.parameter.isin(non_molec)].unit.dropna() == "ppm").all() + if not good: + unique_params = sorted(df.parameter.unique()) + molec = [p for p in unique_params if p not in non_molec] + raise ValueError(f"Expected these species to all be in ppm now: {molec}.") + good = (df[df.parameter.isin(non_molec)].unit.dropna() == "µg/m³").all() + if not good: + raise ValueError(f"Expected these species to all be in µg/m³: {non_molec}.") + + # Pivot to wide format (each parameter gets its own column) + index = [ + "time", + "time_local", + "latitude", + "longitude", + "utcoffset", + "location", + "city", + "country", + "attribution", # currently only in Python reader + "sourceName", + "sourceType", + "mobile", + "averagingPeriod", + ] + if self.engine == "pandas": + index.remove("attribution") + df = ( + df[(df.averagingPeriod == pd.Timedelta("1H")) & (df.city != "N/A")] + .pivot_table( + values="value", + index=index, + columns="parameter", + ) + .reset_index() ) - .reset_index() - ) - df = df.rename(columns={p: f"{p}_ugm3" for p in self.NON_MOLEC_PARAMS}, errors="ignore") - df = df.rename(columns={p: f"{p}_ppm" for p in self.PPM_TO_UGM3}, errors="ignore") + df = df.rename(columns={p: f"{p}_ugm3" for p in self.NON_MOLEC_PARAMS}, errors="ignore") + df = df.rename(columns={p: f"{p}_ppm" for p in self.PPM_TO_UGM3}, errors="ignore") # Construct site IDs # Sometimes, at a given time, there are multiple measurements at the same lat/lon @@ -437,7 +465,7 @@ def do_hash(b): to_hash = df.location + " " + df.latitude.astype(str) + " " + df.longitude.astype(str) df["siteid"] = df.country + "_" + to_hash.str.encode("utf-8").apply(do_hash).str.slice(0, 7) - return df.loc[(df.time >= dates.min()) & (df.time <= dates.max())] + return df # Need to make an assumption about NOx MW diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 53e36233..ad1183f2 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -195,6 +195,7 @@ def add_data( search_radius=None, sites=None, query_time_split="1H", + wide_fmt=False, # FIXME: probably want to default to True **kwargs, ): """Get OpenAQ API v2 data, including low-cost sensors. @@ -224,6 +225,8 @@ def add_data( Set to ``None`` for no time splitting. Default: 1 hour (OpenAQ data are hourly, so setting to something smaller won't help). + wide_fmt : bool + Convert dataframe to wide format (one column per parameter). """ dates = pd.DatetimeIndex(dates) @@ -240,6 +243,9 @@ def add_data( if date_min == date_max or len(dates) == 0: raise ValueError("must provide at least two unique datetimes") + if wide_fmt is True: + raise NotImplementedError("wide format not implemented yet") + def iter_time_slices(): # seems that (from < time <= to) == (from , to] is used # i.e. 
`from` is exclusive, `to` is inclusive diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 185577a2..546ea08b 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -7,12 +7,14 @@ openaq._URL_CAP = 4 +# First date in the archive, just one file +# Browse the archive at https://openaq-fetches.s3.amazonaws.com/index.html +FIRST_DAY = pd.date_range(start="2013-11-26", end="2013-11-27", freq="H")[:-1] + @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires Python 3.7+") -def test_openaq(): - # First date in the archive, just one file - # Browse the archive at https://openaq-fetches.s3.amazonaws.com/index.html - dates = pd.date_range(start="2013-11-26", end="2013-11-27", freq="H")[:-1] +def test_openaq_first_date(): + dates = FIRST_DAY df = openaq.add_data(dates) assert not df.empty @@ -28,6 +30,17 @@ def test_openaq(): assert df.pm25_ugm3.gt(0).all() +def test_openaq_long_fmt(): + dates = FIRST_DAY + df = openaq.add_data(dates, wide_fmt=False) + + assert not df.empty + + assert {"parameter", "value", "unit"} < set(df.columns) + assert "pm25_ugm3" not in df.columns + assert "pm25" in df.parameter.values + + @pytest.mark.parametrize( "url", [ diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 29dc0747..0b626caa 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -1,7 +1,18 @@ import pandas as pd +import pytest import monetio.obs.openaq_v2 as openaq +SITES_NEAR_NCWCP = [ + # AirGradient monitor + 1236068, + # PurpleAir sensors + 1118827, + 357301, + 273440, + 271155, +] + def test_get_parameters(): params = openaq.get_parameters() @@ -25,15 +36,7 @@ def test_get_locations(): def test_get_data_near_ncwcp_sites(): - sites = [ - # AirGradient monitor - 1236068, - # PurpleAir sensors - 1118827, - 357301, - 273440, - 271155, - ] + sites = SITES_NEAR_NCWCP dates = pd.date_range("2023-08-01", "2023-08-01 01:00", freq="1H") df = openaq.add_data(dates, sites=sites) assert len(df) > 0 @@ -53,3 +56,8 @@ def test_get_data_near_ncwcp_search_radius(): assert df.latitude.round().eq(39).all() assert df.longitude.round().eq(-77).all() assert (sorted(df.time.unique()) == dates).all() + + +def test_get_data_wide_error(): + with pytest.raises(NotImplementedError, match="wide format not implemented"): + openaq.add_data(["2023-08-01", "2023-08-02"], wide_fmt=True) From f733ad5f74938db217ff985249f4831ad7a99927 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 25 Oct 2023 16:04:22 -0600 Subject: [PATCH 47/77] doc --- monetio/obs/openaq.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 53b0ef07..8fe56353 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -349,6 +349,8 @@ def add_data(self, dates, *, num_workers=1, wide_fmt=True): Parameters ---------- + num_workers : int + Number of Dask workers to use to read the JSON files. 
wide_fmt : bool If True, return data in wide format (each parameter gets its own column, From a9e810bdad1770e76a200e9759976a2be401dd9b Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 25 Oct 2023 19:23:25 -0600 Subject: [PATCH 48/77] Remove averaging period from pivot table index still selecting 1H but warning message if multiple values in the dataset --- monetio/obs/openaq.py | 40 ++++++++++++++++++++++++++++++++++++---- tests/test_openaq.py | 1 + 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 8fe56353..da6b1e7b 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -4,10 +4,12 @@ https://openaq-fetches.s3.amazonaws.com/index.html """ import json +import warnings import pandas as pd from numpy import NaN +_URL_CAP_RANDOM_SAMPLE = False # if false, take from end of list _URL_CAP = None # set to int to limit number of files loaded for testing @@ -380,9 +382,12 @@ def add_data(self, dates, *, num_workers=1, wide_fmt=True): print(urls[-1]) if _URL_CAP is not None and len(urls) > _URL_CAP: - import random + if _URL_CAP_RANDOM_SAMPLE: + import random - urls = random.sample(urls, _URL_CAP) + urls = random.sample(urls, _URL_CAP) + else: + urls = urls[-_URL_CAP:] # Read JSON files func = self.read @@ -425,6 +430,25 @@ def add_data(self, dates, *, num_workers=1, wide_fmt=True): if not good: raise ValueError(f"Expected these species to all be in µg/m³: {non_molec}.") + # Determine averaging periods for each parameter + aps = {} + for p, g in df.groupby("parameter"): + aps[p] = g.averagingPeriod.dropna().unique() + mult_ap_lines = [] + for p, ap in aps.items(): + if len(ap) > 1: + counts = df.averagingPeriod.loc[df.parameter == p].dropna().value_counts() + s_counts = ", ".join(f"'{v}' ({n})" for v, n in counts.items()) + mult_ap_lines.append(f"{p!r}: {s_counts}") + if mult_ap_lines: + s_mults = "\n".join(f"- {s}" for s in mult_ap_lines) + warnings.warn( + "Multiple averaging periods for" + f"\n{s_mults}" + "\nWill select data with averaging period 1H. " + "Use wide_fmt=False if you want all data." 
+ ) + # Pivot to wide format (each parameter gets its own column) index = [ "time", @@ -439,12 +463,19 @@ def add_data(self, dates, *, num_workers=1, wide_fmt=True): "sourceName", "sourceType", "mobile", - "averagingPeriod", + # "averagingPeriod", # different parameters may have different standard averaging periods ] if self.engine == "pandas": index.remove("attribution") + + # NOTE: 1H is the most common averaging period by far + # NOTE: seems that some sites have dupe rows with city == "N/A" + na_locations = ["Wampanoag Laboratory"] df = ( - df[(df.averagingPeriod == pd.Timedelta("1H")) & (df.city != "N/A")] + df[ + (df.averagingPeriod == pd.Timedelta("1H")) + & ~(df.location.isin(na_locations) & (df.city == "N/A")) + ] .pivot_table( values="value", index=index, @@ -452,6 +483,7 @@ def add_data(self, dates, *, num_workers=1, wide_fmt=True): ) .reset_index() ) + df["averagingPeriod"] = pd.Timedelta("1H") # TODO: could just not include df = df.rename(columns={p: f"{p}_ugm3" for p in self.NON_MOLEC_PARAMS}, errors="ignore") df = df.rename(columns={p: f"{p}_ppm" for p in self.PPM_TO_UGM3}, errors="ignore") diff --git a/tests/test_openaq.py b/tests/test_openaq.py index 546ea08b..a23a4808 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -5,6 +5,7 @@ from monetio import openaq +# openaq._URL_CAP_RANDOM_SAMPLE = True openaq._URL_CAP = 4 # First date in the archive, just one file From cb000b550ceea8b4d97e7929f08d9744300a0621 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 25 Oct 2023 19:36:48 -0600 Subject: [PATCH 49/77] Full openaq v1 test suite only in 3.7+ --- tests/test_openaq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_openaq.py b/tests/test_openaq.py index a23a4808..665e9c31 100644 --- a/tests/test_openaq.py +++ b/tests/test_openaq.py @@ -5,6 +5,9 @@ from monetio import openaq +if sys.version_info < (3, 7): + pytest.skip("requires Python 3.7+", allow_module_level=True) + # openaq._URL_CAP_RANDOM_SAMPLE = True openaq._URL_CAP = 4 @@ -13,7 +16,6 @@ FIRST_DAY = pd.date_range(start="2013-11-26", end="2013-11-27", freq="H")[:-1] -@pytest.mark.skipif(sys.version_info < (3, 7), reason="requires Python 3.7+") def test_openaq_first_date(): dates = FIRST_DAY df = openaq.add_data(dates) From 462d48840fa7d4ef1a35f5557fec5f213afc963c Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 25 Oct 2023 22:10:39 -0600 Subject: [PATCH 50/77] `usedforsecurity` is 3.9+ --- monetio/obs/openaq.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index da6b1e7b..41c68661 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -4,11 +4,14 @@ https://openaq-fetches.s3.amazonaws.com/index.html """ import json +import sys import warnings import pandas as pd from numpy import NaN +_PY39_PLUS = sys.version_info >= (3, 9) + _URL_CAP_RANDOM_SAMPLE = False # if false, take from end of list _URL_CAP = None # set to int to limit number of files loaded for testing @@ -492,8 +495,15 @@ def add_data(self, dates, *, num_workers=1, wide_fmt=True): # with different location names. # Occasionally, there are rows that appear to actual duplicates # (e.g. 
all same except one col is null in one or something) - def do_hash(b): - return hashlib.sha1(b, usedforsecurity=False).hexdigest() + if _PY39_PLUS: + + def do_hash(b): + return hashlib.sha1(b, usedforsecurity=False).hexdigest() + + else: + + def do_hash(b): + return hashlib.sha1(b).hexdigest() # to_hash = df.latitude.astype(str) + " " + df.longitude.astype(str) to_hash = df.location + " " + df.latitude.astype(str) + " " + df.longitude.astype(str) From aaebef6f29e3bbbf188986a83f504f38dc2f106f Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 27 Oct 2023 14:25:24 -0600 Subject: [PATCH 51/77] Log a bit of info about dupe location IDs --- monetio/obs/openaq_v2.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index ad1183f2..7120b0b3 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -145,6 +145,11 @@ def get_locations(**kwargs): # Site ID df = df.rename(columns={"id": "siteid"}) df["siteid"] = df.siteid.astype(str) + maybe_dupe_rows = df[df.siteid.duplicated(keep=False)].sort_values("siteid") + if not maybe_dupe_rows.empty: + logger.info( + f"note: found {len(maybe_dupe_rows)} rows with duplicate site IDs:\n{maybe_dupe_rows}" + ) df = df.drop_duplicates("siteid", keep="first").reset_index(drop=True) # seem to be some dupes return df From 419318db2ab63c1d4431e5bf5b6edd01ce367baf Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 27 Oct 2023 14:41:24 -0600 Subject: [PATCH 52/77] Wait some time before retry --- monetio/obs/openaq_v2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 7120b0b3..9fbf0b3f 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -39,6 +39,8 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): Number of pages to fetch. By default, try to fetch as many as needed to get all results. 
""" + import time + if params is None: params = {} @@ -67,6 +69,7 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): if r.status_code == 408: tries += 1 logger.info(f"request timed out (try {tries}/{retry})") + time.sleep(tries) else: break r.raise_for_status() From 1bb0e5c3d24c4b5734894a302cc13708d64cb203 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 13:23:38 -0500 Subject: [PATCH 53/77] Add entity and sensor-type options also: - retry on 429 - OpenAQ no longer has PurpleAir (as of 2024-Mar-11 according to Twitter) --- monetio/obs/openaq_v2.py | 21 +++++++++++++++++- tests/test_openaq_v2.py | 48 +++++++++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 9fbf0b3f..795a79e2 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -57,6 +57,7 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): headers = { "Accept": "application/json", "X-API-Key": API_KEY, + "User-Agent": "monetio", } data = [] @@ -70,6 +71,10 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): tries += 1 logger.info(f"request timed out (try {tries}/{retry})") time.sleep(tries) + if r.status_code == 429: + tries += 1 + logger.info(f"rate limited (try {tries}/{retry})") + time.sleep(tries * 1.5) else: break r.raise_for_status() @@ -202,6 +207,8 @@ def add_data( country=None, search_radius=None, sites=None, + entity=None, + sensor_type=None, query_time_split="1H", wide_fmt=False, # FIXME: probably want to default to True **kwargs, @@ -224,6 +231,12 @@ def add_data( Site ID(s) to include, e.g. a specific known site or group of sites from :func:`get_latlonbox_sites`. Default: full dataset (no limitation by site). + entity : str or list of str, optional + Options: ``'government'``, ``'research'``, ``'community'``. + Default: full dataset (no limitation by entity). + sensor_type : str or list of str, optional + Options: ``'low-cost sensor'``, ``'reference grade'``. + Default: full dataset (no limitation by sensor type). query_time_split Frequency to use when splitting the web API queries in time, in a format that ``pandas.to_timedelta`` will understand. @@ -272,6 +285,10 @@ def iter_time_slices(): params.update(country=country) if sites is not None: params.update(location_id=sites) + if entity is not None: + params.update(entity=entity) + if sensor_type is not None: + params.update(sensor_type=sensor_type) data = [] for parameter in parameters: @@ -284,7 +301,9 @@ def iter_time_slices(): if search_radius is not None: for coords, radius in search_radius.items(): if not 0 < radius <= 25_000: - raise ValueError(f"invalid radius {radius!r}") + raise ValueError( + f"invalid radius {radius!r}. Must be positive and <= 25000 (25 km)." 
+ ) params.update( coordinates=f"{coords[0]:.8f},{coords[1]:.8f}", radius=radius, diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 0b626caa..b0860590 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -3,14 +3,21 @@ import monetio.obs.openaq_v2 as openaq +LATLON_NCWCP = 38.9721, -76.9248 SITES_NEAR_NCWCP = [ # AirGradient monitor 1236068, - # PurpleAir sensors - 1118827, - 357301, - 273440, - 271155, + 1719392, + # # PurpleAir sensors + # 1118827, + # 357301, + # 273440, + # 271155, + # NASA GSFC + 2978434, + # Beltsville (AirNow) + 3832, + 843, ] @@ -44,18 +51,43 @@ def test_get_data_near_ncwcp_sites(): assert df.latitude.round().eq(39).all() assert df.longitude.round().eq(-77).all() assert (sorted(df.time.unique()) == dates).all() - assert set(df.siteid) == {str(site) for site in sites} + assert set(df.siteid) <= {str(site) for site in sites} def test_get_data_near_ncwcp_search_radius(): - latlon = 38.9721, -76.9248 + latlon = LATLON_NCWCP dates = pd.date_range("2023-08-01", "2023-08-01 01:00", freq="1H") - df = openaq.add_data(dates, search_radius={latlon: 5_000}) + df = openaq.add_data(dates, search_radius={latlon: 10_000}) assert len(df) > 0 assert "pm25" in df.parameter.values assert df.latitude.round().eq(39).all() assert df.longitude.round().eq(-77).all() assert (sorted(df.time.unique()) == dates).all() + assert not df.sensorType.eq("low-cost sensor").all() + assert df.entity.eq("Governmental Organization").all() + + +def test_get_data_near_ncwcp_sensor_type(): + latlon = LATLON_NCWCP + dates = pd.date_range("2023-08-01", "2023-08-01 03:00", freq="1H") + df = openaq.add_data(dates, sensor_type="low-cost sensor", search_radius={latlon: 25_000}) + assert len(df) > 0 + assert df.sensorType.eq("low-cost sensor").all() + + +@pytest.mark.parametrize( + "entity", + [ + "research", + "community", + ["research", "community"], + ], +) +def test_get_data_near_ncwcp_entity(entity): + latlon = LATLON_NCWCP + dates = pd.date_range("2023-08-01", "2023-08-01 01:00", freq="1H") + df = openaq.add_data(dates, entity=entity, search_radius={latlon: 25_000}) + assert df.empty def test_get_data_wide_error(): From 6f881240cf1a073fedf967484d91881e1b23afce Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 13:31:56 -0500 Subject: [PATCH 54/77] Add some randomness to sleeps --- monetio/obs/openaq_v2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 795a79e2..f0b0d98b 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -40,6 +40,7 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): By default, try to fetch as many as needed to get all results. 
""" import time + from random import random as rand if params is None: params = {} @@ -70,11 +71,11 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): if r.status_code == 408: tries += 1 logger.info(f"request timed out (try {tries}/{retry})") - time.sleep(tries) + time.sleep(tries + 0.1 * rand()) if r.status_code == 429: tries += 1 logger.info(f"rate limited (try {tries}/{retry})") - time.sleep(tries * 1.5) + time.sleep(tries * 1.5 + 0.1 * rand()) else: break r.raise_for_status() From cc79ec68d48dea4c4915c3b8e96ed79841a25d4e Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 13:46:01 -0500 Subject: [PATCH 55/77] Pass endpoint to consume instead --- monetio/obs/openaq_v2.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index f0b0d98b..1befe380 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -19,12 +19,21 @@ "Obtain one and set your OPENAQ_API_KEY environment variable." ) +_BASE_URL = "https://api.openaq.org" +_ENDPOINTS = { + "locations": "/v2/locations", + "parameters": "/v2/parameters", + "measurements": "/v2/measurements", +} -def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): + +def _consume(endpoint, *, params=None, timeout=10, retry=5, limit=500, npages=None): """Consume a paginated OpenAQ API endpoint. Parameters ---------- + endpoint : str + API endpoint, e.g. ``'/v2/locations'``, ``'/v2/parameters'``, ``'/v2/measurements'``. params : dict, optional Parameters for the GET request to the API. Don't pass ``limit``, ``page``, or ``offset`` here, since they are covered @@ -42,6 +51,12 @@ def _consume(url, *, params=None, timeout=10, retry=5, limit=500, npages=None): import time from random import random as rand + if not endpoint.startswith("/"): + endpoint = "/" + endpoint + if not endpoint.startswith("/v2"): + endpoint = "/v2" + endpoint + url = _BASE_URL + endpoint + if params is None: params = {} @@ -106,7 +121,7 @@ def get_locations(**kwargs): https://api.openaq.org/docs#/v2/locations_get_v2_locations_get """ - data = _consume("https://api.openaq.org/v2/locations", **kwargs) + data = _consume(_ENDPOINTS["locations"], **kwargs) # Some fields with scalar values to take some_scalars = [ @@ -170,7 +185,7 @@ def get_parameters(**kwargs): kwargs are passed to :func:`_consume`. 
""" - data = _consume("https://api.openaq.org/v2/parameters", **kwargs) + data = _consume(_ENDPOINTS["parameters"], **kwargs) df = pd.DataFrame(data) @@ -314,7 +329,7 @@ def iter_time_slices(): f"coords={coords} radius={radius}" ) data_ = _consume( - "https://api.openaq.org/v2/measurements", + _ENDPOINTS["measurements"], params=params, **kwargs, ) @@ -322,7 +337,7 @@ def iter_time_slices(): else: logger.info(f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}'") data_ = _consume( - "https://api.openaq.org/v2/measurements", + _ENDPOINTS["measurements"], params=params, **kwargs, ) From 149d2bf460ae16d3d700d81dd3ec8ba27ffd8317 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 13:57:51 -0500 Subject: [PATCH 56/77] Wait until usage to warn about API key parameters should be fine keyless --- monetio/obs/openaq_v2.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 1befe380..833547b1 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -3,9 +3,11 @@ https://openaq.org/ https://api.openaq.org/docs#/v2 """ +import functools import json import logging import os +import warnings import pandas as pd import requests @@ -13,11 +15,22 @@ logger = logging.getLogger(__name__) API_KEY = os.environ.get("OPENAQ_API_KEY", None) -if API_KEY is None: - print( - "warning: non-cached requests to the OpenAQ v2 web API will be slow without an API key. " - "Obtain one and set your OPENAQ_API_KEY environment variable." - ) + + +def _api_key_warning(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if API_KEY is None: + warnings.warn( + "Non-cached requests to the OpenAQ v2 web API will be slow without an API key. " + "Obtain one (https://docs.openaq.org/docs/getting-started#api-key) " + "and set your OPENAQ_API_KEY environment variable.", + stacklevel=2, + ) + return func(*args, **kwargs) + + return wrapper + _BASE_URL = "https://api.openaq.org" _ENDPOINTS = { @@ -113,6 +126,7 @@ def _consume(endpoint, *, params=None, timeout=10, retry=5, limit=500, npages=No return data +@_api_key_warning def get_locations(**kwargs): """Get available site info (including site IDs) from OpenAQ v2 API. @@ -216,6 +230,7 @@ def get_latlonbox_sites(latlonbox, **kwargs): return sites[in_box].reset_index(drop=True) +@_api_key_warning def add_data( dates, *, From 3c8890ebd5aec2ac6058f2cdecfba70ce1811a4f Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 14:18:44 -0500 Subject: [PATCH 57/77] Validate search radii earlier --- monetio/obs/openaq_v2.py | 15 ++++++++++----- tests/test_openaq_v2.py | 13 +++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 833547b1..12757ab1 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -257,7 +257,9 @@ def add_data( For example, ``'US'`` or ``['US', 'CA']`` (two-letter country codes). Default: full dataset (no limitation by country). search_radius : dict, optional - Mapping coords (lat, lon) [deg] to search radius [m] (max of 25 km). + Mapping of coords tuple (lat, lon) [deg] to search radius [m] (max of 25 km). + For example: ``search_radius={(39.0, -77.0): 10_000}``. + Note that this dict can contain multiple entries. sites : list of str, optional Site ID(s) to include, e.g. a specific known site or group of sites from :func:`get_latlonbox_sites`. 
@@ -295,6 +297,13 @@ def add_data( if date_min == date_max or len(dates) == 0: raise ValueError("must provide at least two unique datetimes") + for coords, radius in search_radius.items(): + if not 0 < radius <= 25_000: + raise ValueError( + f"invalid radius {radius!r} for location {coords!r}. " + "Must be positive and <= 25000 (25 km)." + ) + if wide_fmt is True: raise NotImplementedError("wide format not implemented yet") @@ -331,10 +340,6 @@ def iter_time_slices(): ) if search_radius is not None: for coords, radius in search_radius.items(): - if not 0 < radius <= 25_000: - raise ValueError( - f"invalid radius {radius!r}. Must be positive and <= 25000 (25 km)." - ) params.update( coordinates=f"{coords[0]:.8f},{coords[1]:.8f}", radius=radius, diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index b0860590..63ce1e82 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -93,3 +93,16 @@ def test_get_data_near_ncwcp_entity(entity): def test_get_data_wide_error(): with pytest.raises(NotImplementedError, match="wide format not implemented"): openaq.add_data(["2023-08-01", "2023-08-02"], wide_fmt=True) + + +@pytest.mark.parametrize( + "radius", + [ + 0, + -1, + 25001, + ], +) +def test_get_data_bad_radius(radius): + with pytest.raises(ValueError, match="invalid radius"): + openaq.add_data(["2023-08-01", "2023-08-02"], search_radius={LATLON_NCWCP: radius}) From 5e49e3cbf154c25df3448d6879203c6d45f27898 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 14:58:33 -0500 Subject: [PATCH 58/77] Generator for params for the separate queries --- monetio/obs/openaq_v2.py | 84 ++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 12757ab1..ecf192a5 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -95,6 +95,7 @@ def _consume(endpoint, *, params=None, timeout=10, retry=5, limit=500, npages=No tries = 0 while tries < retry: + logger.debug(f"GET {url} params={params}") r = requests.get(url, params=params, headers=headers, timeout=timeout) if r.status_code == 408: tries += 1 @@ -297,12 +298,13 @@ def add_data( if date_min == date_max or len(dates) == 0: raise ValueError("must provide at least two unique datetimes") - for coords, radius in search_radius.items(): - if not 0 < radius <= 25_000: - raise ValueError( - f"invalid radius {radius!r} for location {coords!r}. " - "Must be positive and <= 25000 (25 km)." - ) + if search_radius is not None: + for coords, radius in search_radius.items(): + if not 0 < radius <= 25_000: + raise ValueError( + f"invalid radius {radius!r} for location {coords!r}. " + "Must be positive and <= 25000 (25 km)." 
+ ) if wide_fmt is True: raise NotImplementedError("wide format not implemented yet") @@ -320,48 +322,46 @@ def iter_time_slices(): else: yield date_min - one_sec, date_max - params = {} + base_params = {} if country is not None: - params.update(country=country) + base_params.update(country=country) if sites is not None: - params.update(location_id=sites) + base_params.update(location_id=sites) if entity is not None: - params.update(entity=entity) + base_params.update(entity=entity) if sensor_type is not None: - params.update(sensor_type=sensor_type) + base_params.update(sensor_type=sensor_type) + + def iter_queries(): + for parameter in parameters: + for t_from, t_to in iter_time_slices(): + if search_radius is not None: + for coords, radius in search_radius.items(): + lat, lon = coords + yield { + **base_params, + "parameter": parameter, + "date_from": t_from, + "date_to": t_to, + "coordinates": f"{lat:.8f},{lon:.8f}", + "radius": radius, + } + else: + yield { + **base_params, + "parameter": parameter, + "date_from": t_from, + "date_to": t_to, + } data = [] - for parameter in parameters: - params.update(parameter=parameter) - for t_from, t_to in iter_time_slices(): - params.update( - date_from=t_from, - date_to=t_to, - ) - if search_radius is not None: - for coords, radius in search_radius.items(): - params.update( - coordinates=f"{coords[0]:.8f},{coords[1]:.8f}", - radius=radius, - ) - logger.info( - f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}' " - f"coords={coords} radius={radius}" - ) - data_ = _consume( - _ENDPOINTS["measurements"], - params=params, - **kwargs, - ) - data.extend(data_) - else: - logger.info(f"parameter={parameter!r} t_from='{t_from}' t_to='{t_to}'") - data_ = _consume( - _ENDPOINTS["measurements"], - params=params, - **kwargs, - ) - data.extend(data_) + for params in iter_queries(): + data_ = _consume( + _ENDPOINTS["measurements"], + params=params, + **kwargs, + ) + data.extend(data_) df = pd.DataFrame(data) if df.empty: From ee81f6accc7b578a225260bb32abaef196ff26c2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 15:11:07 -0500 Subject: [PATCH 59/77] Initial multi-thread support --- monetio/obs/openaq_v2.py | 29 +++++++++++++++++++++-------- tests/test_openaq_v2.py | 2 +- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index ecf192a5..b01ae022 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -354,14 +354,27 @@ def iter_queries(): "date_to": t_to, } - data = [] - for params in iter_queries(): - data_ = _consume( - _ENDPOINTS["measurements"], - params=params, - **kwargs, - ) - data.extend(data_) + threads = kwargs.pop("threads", None) + if threads is not None: + import concurrent.futures + from itertools import chain + + with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: + data = chain.from_iterable( + executor.map( + lambda params: _consume(_ENDPOINTS["measurements"], params=params, **kwargs), + iter_queries(), + ) + ) + else: + data = [] + for params in iter_queries(): + this_data = _consume( + _ENDPOINTS["measurements"], + params=params, + **kwargs, + ) + data.extend(this_data) df = pd.DataFrame(data) if df.empty: diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 63ce1e82..05112a5a 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -57,7 +57,7 @@ def test_get_data_near_ncwcp_sites(): def test_get_data_near_ncwcp_search_radius(): latlon = LATLON_NCWCP dates = pd.date_range("2023-08-01", 
"2023-08-01 01:00", freq="1H") - df = openaq.add_data(dates, search_radius={latlon: 10_000}) + df = openaq.add_data(dates, search_radius={latlon: 10_000}, threads=2) assert len(df) > 0 assert "pm25" in df.parameter.values assert df.latitude.round().eq(39).all() From a62c16f716856cc96442941a394078e0406e0350 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 16:31:48 -0500 Subject: [PATCH 60/77] Allow passing single time --- monetio/obs/openaq_v2.py | 39 +++++++++++++++++++++++++++------------ tests/test_openaq_v2.py | 7 +++++++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index b01ae022..64dc73ee 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -97,14 +97,13 @@ def _consume(endpoint, *, params=None, timeout=10, retry=5, limit=500, npages=No while tries < retry: logger.debug(f"GET {url} params={params}") r = requests.get(url, params=params, headers=headers, timeout=timeout) + tries += 1 if r.status_code == 408: - tries += 1 logger.info(f"request timed out (try {tries}/{retry})") time.sleep(tries + 0.1 * rand()) - if r.status_code == 429: - tries += 1 + elif r.status_code == 429: logger.info(f"rate limited (try {tries}/{retry})") - time.sleep(tries * 1.5 + 0.1 * rand()) + time.sleep(tries * 2 + 0.2 * rand()) else: break r.raise_for_status() @@ -252,6 +251,10 @@ def add_data( Parameters ---------- + dates : datetime-like or array-like of datetime-like + One desired date/time or + an array, of which the min and max wil be used + as inclusive time bounds of the desired data. parameters : str or list of str, optional For example, ``'o3'`` or ``['pm25', 'o3']`` (default). country : str or list of str, optional @@ -280,23 +283,35 @@ def add_data( Set to ``None`` for no time splitting. Default: 1 hour (OpenAQ data are hourly, so setting to something smaller won't help). + Ignored if only one date/time is provided. wide_fmt : bool Convert dataframe to wide format (one column per parameter). """ - dates = pd.DatetimeIndex(dates) + dates = pd.to_datetime(dates) + if pd.api.types.is_scalar(dates): + dates = pd.DatetimeIndex([dates]) + dates = dates.dropna() + if dates.empty: + raise ValueError("must provide at least one datetime-like") + if parameters is None: parameters = ["pm25", "o3"] elif isinstance(parameters, str): parameters = [parameters] - query_dt = pd.to_timedelta(query_time_split) - if query_dt is not None and query_dt <= pd.Timedelta(0): - raise ValueError( - f"query_time_split must be positive, got {query_dt} from {query_time_split!r}" - ) + + query_dt = pd.to_timedelta(query_time_split) if len(dates) > 1 else None date_min, date_max = dates.min(), dates.max() - if date_min == date_max or len(dates) == 0: - raise ValueError("must provide at least two unique datetimes") + if query_dt is not None: + if query_dt <= pd.Timedelta(0): + raise ValueError( + f"query_time_split must be positive, got {query_dt} from {query_time_split!r}" + ) + if date_min == date_max: + raise ValueError( + "must provide at least two unique datetimes to use query_time_split. " + "Set query_time_split=None to disable time splitting." 
+ ) if search_radius is not None: for coords, radius in search_radius.items(): diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 05112a5a..2797291e 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -75,6 +75,13 @@ def test_get_data_near_ncwcp_sensor_type(): assert df.sensorType.eq("low-cost sensor").all() +def test_get_data_single_dt_single_site(): + site = 843 + dates = "2023-08-01" + df = openaq.add_data(dates, parameters="o3", sites=site) + assert len(df) == 1 + + @pytest.mark.parametrize( "entity", [ From ee6dbf778b645e95dacc1a4120a4261acc84c4af Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 16:35:28 -0500 Subject: [PATCH 61/77] More renames --- monetio/obs/openaq_v2.py | 9 ++++++++- tests/test_openaq_v2.py | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 64dc73ee..57137acb 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -435,7 +435,14 @@ def iter_queries(): ) # Site ID - df = df.rename(columns={"locationId": "siteid"}) + df = df.rename( + columns={ + "locationId": "siteid", + "isMobile": "is_mobile", + "isAnalysis": "is_analysis", + "sensorType": "sensor_type", + }, + ) df["siteid"] = df.siteid.astype(str) return df diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 2797291e..78246cb1 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -63,7 +63,7 @@ def test_get_data_near_ncwcp_search_radius(): assert df.latitude.round().eq(39).all() assert df.longitude.round().eq(-77).all() assert (sorted(df.time.unique()) == dates).all() - assert not df.sensorType.eq("low-cost sensor").all() + assert not df.sensor_type.eq("low-cost sensor").all() assert df.entity.eq("Governmental Organization").all() @@ -72,7 +72,7 @@ def test_get_data_near_ncwcp_sensor_type(): dates = pd.date_range("2023-08-01", "2023-08-01 03:00", freq="1H") df = openaq.add_data(dates, sensor_type="low-cost sensor", search_radius={latlon: 25_000}) assert len(df) > 0 - assert df.sensorType.eq("low-cost sensor").all() + assert df.sensor_type.eq("low-cost sensor").all() def test_get_data_single_dt_single_site(): From 06b490939b376a6d3c0529f96a28671942388065 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 23 Aug 2024 17:33:17 -0500 Subject: [PATCH 62/77] Wait more when rate-limited --- monetio/obs/openaq_v2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 57137acb..d368d44b 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -102,8 +102,9 @@ def _consume(endpoint, *, params=None, timeout=10, retry=5, limit=500, npages=No logger.info(f"request timed out (try {tries}/{retry})") time.sleep(tries + 0.1 * rand()) elif r.status_code == 429: + # Note: reponse headers don't seem to include Retry-After logger.info(f"rate limited (try {tries}/{retry})") - time.sleep(tries * 2 + 0.2 * rand()) + time.sleep(tries * 5 + 0.2 * rand()) else: break r.raise_for_status() From e497ab4092ac697fe1f4503e817dee4958f16fb3 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 28 Aug 2024 14:07:35 -0500 Subject: [PATCH 63/77] Neg -> NaN --- monetio/obs/openaq_v2.py | 40 +++++++++++++++++++++++++++++++++++++++- tests/test_openaq_v2.py | 1 + 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index d368d44b..00d7bdc6 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -9,6 +9,7 @@ import os import 
warnings +import numpy as np import pandas as pd import requests @@ -435,7 +436,7 @@ def iter_queries(): longitude=lon, ) - # Site ID + # Rename columns and ensure site ID is string df = df.rename( columns={ "locationId": "siteid", @@ -446,4 +447,41 @@ def iter_queries(): ) df["siteid"] = df.siteid.astype(str) + # Most variables invalid if < 0 + # > preferredUnit.value_counts() + # ppb 19 + # µg/m³ 13 + # ppm 10 + # particles/cm³ 8 + # % 3 relative humidity + # umol/mol 1 + # ng/m3 1 + # deg 1 wind direction + # m/s 1 wind speed + # deg_c 1 + # hpa 1 + # ugm3 1 + # c 1 + # f 1 + # mb 1 + # iaq 1 + non_neg_units = [ + "particles/cm³", + "ppm", + "ppb", + "umol/mol", + "µg/m³", + "ugm3", + "ng/m3", + "iaq", + # + "%", + # + "m/s", + # + "hpa", + "mb", + ] + df.loc[df.unit.isin(non_neg_units) & (df.value < 0), "value"] = np.nan + return df diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 78246cb1..c44512d5 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -52,6 +52,7 @@ def test_get_data_near_ncwcp_sites(): assert df.longitude.round().eq(-77).all() assert (sorted(df.time.unique()) == dates).all() assert set(df.siteid) <= {str(site) for site in sites} + assert not df.value.isna().all() and not df.value.lt(0).any() def test_get_data_near_ncwcp_search_radius(): From 427bddcfe1892ceb539f01dc0b66b70d10dffab3 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 30 Aug 2024 10:32:28 -0500 Subject: [PATCH 64/77] Initial wide format transformation --- .pre-commit-config.yaml | 1 + monetio/obs/openaq_v2.py | 93 ++++++++++++++++++++++++++++++++++++++-- tests/test_openaq_v2.py | 16 ++++--- 3 files changed, 102 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c99b1a0a..4a918888 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,6 +6,7 @@ repos: exclude: tdump\.[0-9]* - id: end-of-file-fixer - id: check-docstring-first + exclude: monetio/obs/openaq_v2\.py - id: check-yaml - repo: https://github.com/asottile/pyupgrade diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 00d7bdc6..2fb6dd89 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -17,6 +17,37 @@ API_KEY = os.environ.get("OPENAQ_API_KEY", None) +_PPM_TO_UGM3 = { + "o3": 1990, + "co": 1160, + "no2": 1900, + "no": 1240, + "so2": 2650, + "ch4": 664, + "co2": 1820, +} +"""Conversion factors from ppmv to µg/m³. + +Based on + +- air average molecular weight: 29 g/mol +- air density: 1.2 kg m -3 + +and rounded to 3 significant figures. +""" + +# NOx assumption +_PPM_TO_UGM3["nox"] = _PPM_TO_UGM3["no2"] + +_NON_MOLEC_PARAMS = [ + "pm1", + "pm25", + "pm4", + "pm10", + "bc", +] +"""Parameters that are not molecules and should be in µg/m³ units.""" + def _api_key_warning(func): @functools.wraps(func) @@ -288,6 +319,12 @@ def add_data( Ignored if only one date/time is provided. wide_fmt : bool Convert dataframe to wide format (one column per parameter). + Note that for some variables that involves conversion from + µg/m³ to ppmv. + This conversion is based on an average air molecular weight of 29 g/mol + and an air density of 1.2 kg/m³. + Use ``wide_fmt=False`` if you want to do the conversion yourself. + In some cases, the conversion to wide format also reduces the amount of data returned. """ dates = pd.to_datetime(dates) @@ -323,9 +360,6 @@ def add_data( "Must be positive and <= 25000 (25 km)." 
) - if wide_fmt is True: - raise NotImplementedError("wide format not implemented yet") - def iter_time_slices(): # seems that (from < time <= to) == (from , to] is used # i.e. `from` is exclusive, `to` is inclusive @@ -484,4 +518,57 @@ def iter_queries(): ] df.loc[df.unit.isin(non_neg_units) & (df.value < 0), "value"] = np.nan + if wide_fmt: + # Normalize units + for vn, f in _PPM_TO_UGM3.items(): + is_ug = (df.parameter == vn) & (df.unit == "µg/m³") + df.loc[is_ug, "value"] /= f + df.loc[is_ug, "unit"] = "ppm" + + # Warn if inconsistent units + p_units = df.groupby("parameter").unit.unique() + unique = p_units.apply(len).eq(1) + if not unique.all(): + p_units_non_unique = p_units[~unique] + warnings.warn(f"inconsistent units among parameters:\n{p_units_non_unique}") + + # Pivot + index = [ + "siteid", + "time", + "latitude", + "longitude", + "time_local", + "utcoffset", + # + "location", + "city", + "country", + # + "entity", + "sensor_type", + "is_mobile", + "is_analysis", + ] + dupes = df[df.duplicated(keep=False)] + if not dupes.empty: + logging.info(f"found {dupes.sum()} duplicated rows") + for col in index: + if df[col].isnull().all(): + index.remove(col) + warnings.warn(f"dropping {col!r} from index for wide fmt (all null)") + df = ( + df.drop_duplicates(keep="first") + .pivot_table( + values="value", + index=index, + columns="parameter", + ) + .reset_index() + ) + + # Rename so that units are clear + df = df.rename(columns={p: f"{p}_ugm3" for p in _NON_MOLEC_PARAMS}, errors="ignore") + df = df.rename(columns={p: f"{p}_ppm" for p in _PPM_TO_UGM3}, errors="ignore") + return df diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index c44512d5..2ed54192 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -55,6 +55,17 @@ def test_get_data_near_ncwcp_sites(): assert not df.value.isna().all() and not df.value.lt(0).any() +def test_get_data_near_ncwcp_sites_wide(): + sites = SITES_NEAR_NCWCP + dates = pd.date_range("2023-08-01", "2023-08-01 01:00", freq="1H") + + with pytest.warns(UserWarning, match=r"dropping '.*' from index for wide fmt \(all null\)"): + df = openaq.add_data(dates, sites=sites, wide_fmt=True) + assert len(df) > 0 + assert {"pm25_ugm3", "o3_ppm"} <= set(df.columns) + assert not {"parameter", "value", "unit"} <= set(df.columns) + + def test_get_data_near_ncwcp_search_radius(): latlon = LATLON_NCWCP dates = pd.date_range("2023-08-01", "2023-08-01 01:00", freq="1H") @@ -98,11 +109,6 @@ def test_get_data_near_ncwcp_entity(entity): assert df.empty -def test_get_data_wide_error(): - with pytest.raises(NotImplementedError, match="wide format not implemented"): - openaq.add_data(["2023-08-01", "2023-08-02"], wide_fmt=True) - - @pytest.mark.parametrize( "radius", [ From 5c865a95da9d5858898c98f97f8b03c7213769f0 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 10:51:11 -0500 Subject: [PATCH 65/77] Document some API-related params --- monetio/obs/openaq_v2.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 2fb6dd89..ee2d035d 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -1,6 +1,7 @@ """Get AQ data from the OpenAQ v2 REST API. https://openaq.org/ + https://api.openaq.org/docs#/v2 """ import functools @@ -279,9 +280,6 @@ def add_data( ): """Get OpenAQ API v2 data, including low-cost sensors. - kwargs are passed to :func:`_consume`, - though currently ``params`` can't be one of them. 
- Parameters ---------- dates : datetime-like or array-like of datetime-like @@ -325,6 +323,13 @@ def add_data( and an air density of 1.2 kg/m³. Use ``wide_fmt=False`` if you want to do the conversion yourself. In some cases, the conversion to wide format also reduces the amount of data returned. + retry : int, default: 5 + Number of times to retry an API request if it times out. + timeout : float, default: 10 + Seconds to wait for the server before giving up, for a single request. + threads : int, optional + Number of threads to use for fetching data. + Default: no multi-threading. """ dates = pd.to_datetime(dates) From b6329f0d2eb994ac927dc00995ced7374cb2b7f5 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 11:35:11 -0500 Subject: [PATCH 66/77] Import --- monetio/__init__.py | 16 +++++++++++++++- monetio/obs/__init__.py | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/monetio/__init__.py b/monetio/__init__.py index 8796418e..5b0c82e8 100644 --- a/monetio/__init__.py +++ b/monetio/__init__.py @@ -1,6 +1,19 @@ from . import grids from .models import camx, cmaq, fv3chem, hysplit, hytraj, ncep_grib, pardump, prepchem, raqms -from .obs import aeronet, airnow, aqs, cems, crn, improve, ish, ish_lite, nadp, openaq, pams +from .obs import ( + aeronet, + airnow, + aqs, + cems, + crn, + improve, + ish, + ish_lite, + nadp, + openaq, + openaq_v2, + pams, +) from .profile import geoms, icartt, tolnet from .sat import goes @@ -29,6 +42,7 @@ "ish_lite", "nadp", "openaq", + "openaq_v2", "pams", # # profile obs diff --git a/monetio/obs/__init__.py b/monetio/obs/__init__.py index 58ce51f6..2add7e9f 100644 --- a/monetio/obs/__init__.py +++ b/monetio/obs/__init__.py @@ -10,6 +10,7 @@ ish_lite, nadp, openaq, + openaq_v2, pams, ) @@ -25,6 +26,7 @@ "cems_mod", "nadp", "openaq", + "openaq_v2", "pams", ] From a3e1da3167c1d1265c930add7aeee57c38ba5724 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 11:58:26 -0500 Subject: [PATCH 67/77] Seems like API key is required now --- monetio/obs/openaq_v2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index ee2d035d..3c93eb82 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -55,7 +55,8 @@ def _api_key_warning(func): def wrapper(*args, **kwargs): if API_KEY is None: warnings.warn( - "Non-cached requests to the OpenAQ v2 web API will be slow without an API key. " + "Non-cached requests to the OpenAQ v2 web API will be slow without an API key, " + "or might fail (HTTP error 401). 
" "Obtain one (https://docs.openaq.org/docs/getting-started#api-key) " "and set your OPENAQ_API_KEY environment variable.", stacklevel=2, From f1687e8ecd7adfa4b85e5f5fd321d85480097c7e Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 12:42:24 -0500 Subject: [PATCH 68/77] Ensure site has just one location name --- monetio/obs/openaq_v2.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 3c93eb82..13c672bf 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -538,6 +538,22 @@ def iter_queries(): p_units_non_unique = p_units[~unique] warnings.warn(f"inconsistent units among parameters:\n{p_units_non_unique}") + # Location name should be unique for given site (location) ID + site_names = df.groupby("siteid").location.unique() + unique = site_names.apply(len).eq(1) + if not unique.all(): + site_names_non_unique = site_names[~unique] + warnings.warn( + f"non-unique location names among site IDs:\n{site_names_non_unique}" + "\nUsing first." + ) + df = df.drop(columns=["location"]).merge( + site_names.str.get(0), + left_on="siteid", + right_index=True, + how="left", + ) + # Pivot index = [ "siteid", From 2d3dfeda98ba320dc4db4e8632cc24b28e889695 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 12:55:55 -0500 Subject: [PATCH 69/77] len --- monetio/obs/openaq_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 13c672bf..9b70af84 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -574,7 +574,7 @@ def iter_queries(): ] dupes = df[df.duplicated(keep=False)] if not dupes.empty: - logging.info(f"found {dupes.sum()} duplicated rows") + logging.info(f"found {len(dupes)} duplicated rows") for col in index: if df[col].isnull().all(): index.remove(col) From f8558777824a8312e650559ae078159193566a30 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 13:09:03 -0500 Subject: [PATCH 70/77] Ensure lat/lon unique for site as well again just for wide format --- monetio/obs/openaq_v2.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 9b70af84..58e28c0d 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -538,21 +538,22 @@ def iter_queries(): p_units_non_unique = p_units[~unique] warnings.warn(f"inconsistent units among parameters:\n{p_units_non_unique}") - # Location name should be unique for given site (location) ID - site_names = df.groupby("siteid").location.unique() - unique = site_names.apply(len).eq(1) - if not unique.all(): - site_names_non_unique = site_names[~unique] - warnings.warn( - f"non-unique location names among site IDs:\n{site_names_non_unique}" - "\nUsing first." - ) - df = df.drop(columns=["location"]).merge( - site_names.str.get(0), - left_on="siteid", - right_index=True, - how="left", - ) + # Certain metadata should be unique for a given site but sometimes aren't + # (e.g. location names of different specificity, slight differences in lat/lon coords) + for col in ["location", "latitude", "longitude"]: + site_col = df.groupby("siteid")[col].unique() + unique = site_col.apply(len).eq(1) + if not unique.all(): + site_col_non_unique = site_col[~unique] + warnings.warn( + f"non-unique {col!r} among site IDs:\n{site_col_non_unique}" "\nUsing first." 
+ ) + df = df.drop(columns=[col]).merge( + site_col.str.get(0), + left_on="siteid", + right_index=True, + how="left", + ) # Pivot index = [ From 70cd53adf22f27ac8524d16a430477ea0ab35adb Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 14:52:29 -0500 Subject: [PATCH 71/77] Fix typos --- monetio/obs/openaq_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 58e28c0d..188f86ea 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -136,7 +136,7 @@ def _consume(endpoint, *, params=None, timeout=10, retry=5, limit=500, npages=No logger.info(f"request timed out (try {tries}/{retry})") time.sleep(tries + 0.1 * rand()) elif r.status_code == 429: - # Note: reponse headers don't seem to include Retry-After + # Note: response headers don't seem to include Retry-After logger.info(f"rate limited (try {tries}/{retry})") time.sleep(tries * 5 + 0.2 * rand()) else: @@ -285,7 +285,7 @@ def add_data( ---------- dates : datetime-like or array-like of datetime-like One desired date/time or - an array, of which the min and max wil be used + an array, of which the min and max will be used as inclusive time bounds of the desired data. parameters : str or list of str, optional For example, ``'o3'`` or ``['pm25', 'o3']`` (default). From 5729d87e5ef64ec638a82394d61b656aebcbfa9c Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 15:14:41 -0500 Subject: [PATCH 72/77] Provide OpenAQ API key --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 69e5cf35..06bce9da 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,9 @@ on: schedule: - cron: "0 12 * * 1" +env: + OPENAQ_API_KEY: ${{ secrets.OPENAQ_API_KEY }} + jobs: test: name: Test (Py ${{ matrix.python-version }}) From de87dd039a5204e6c48a13e8511eb04d13eb7718 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 15:21:25 -0500 Subject: [PATCH 73/77] Skip on 3.6 --- tests/test_openaq_v2.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index 2ed54192..c9b5a6f8 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -1,8 +1,13 @@ +import sys + import pandas as pd import pytest import monetio.obs.openaq_v2 as openaq +if sys.version_info < (3, 7): + pytest.skip("asdf", allow_module_level=True) + LATLON_NCWCP = 38.9721, -76.9248 SITES_NEAR_NCWCP = [ # AirGradient monitor From 8f0be93e217146710d7b8e361727d9d46c391aba Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 15:38:18 -0500 Subject: [PATCH 74/77] Quote API key --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06bce9da..1eacb69d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,7 @@ on: - cron: "0 12 * * 1" env: - OPENAQ_API_KEY: ${{ secrets.OPENAQ_API_KEY }} + OPENAQ_API_KEY: "${{ secrets.OPENAQ_API_KEY }}" jobs: test: From bbc16ecc930c35d681d06fd1f1efa7f319e534bb Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 15:48:48 -0500 Subject: [PATCH 75/77] Check length --- monetio/obs/openaq_v2.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 188f86ea..56a8d934 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -17,6 +17,10 @@ logger = logging.getLogger(__name__) API_KEY = 
os.environ.get("OPENAQ_API_KEY", None) +if API_KEY is not None: + API_KEY = API_KEY.strip() + if len(API_KEY) != 64: + warnings.warn(f"API key length is {len(API_KEY)}, expected 64") _PPM_TO_UGM3 = { "o3": 1990, From 4490a50d7e76bcc7d2e10d33feb5ff5ef69c53dc Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 11 Sep 2024 16:12:01 -0500 Subject: [PATCH 76/77] Skip if on CI and API key is empty string "With the exception of GITHUB_TOKEN, secrets are not passed to the runner when a workflow is triggered from a forked repository." --- tests/test_openaq_v2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_openaq_v2.py b/tests/test_openaq_v2.py index c9b5a6f8..3e19335f 100644 --- a/tests/test_openaq_v2.py +++ b/tests/test_openaq_v2.py @@ -1,12 +1,16 @@ -import sys +import os import pandas as pd import pytest import monetio.obs.openaq_v2 as openaq -if sys.version_info < (3, 7): - pytest.skip("asdf", allow_module_level=True) +if ( + os.environ.get("CI", "false").lower() not in {"false", "0"} + and os.environ.get("OPENAQ_API_KEY", "") == "" +): + # PRs from forks don't get the secret + pytest.skip("no API key", allow_module_level=True) LATLON_NCWCP = 38.9721, -76.9248 SITES_NEAR_NCWCP = [ From 7d16a4470a3ce98fa98ab6ade8ef8bb25ad0253f Mon Sep 17 00:00:00 2001 From: zmoon Date: Thu, 19 Sep 2024 11:56:29 -0500 Subject: [PATCH 77/77] API key notes in module docstring --- monetio/obs/openaq.py | 1 + monetio/obs/openaq_v2.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/monetio/obs/openaq.py b/monetio/obs/openaq.py index 56dcaa7e..9ca08601 100644 --- a/monetio/obs/openaq.py +++ b/monetio/obs/openaq.py @@ -1,6 +1,7 @@ """Get v1 (government-only) OpenAQ data from AWS. https://openaq.org/ + https://openaq-fetches.s3.amazonaws.com/index.html """ diff --git a/monetio/obs/openaq_v2.py b/monetio/obs/openaq_v2.py index 889db5eb..d709f3c5 100644 --- a/monetio/obs/openaq_v2.py +++ b/monetio/obs/openaq_v2.py @@ -1,5 +1,14 @@ """Get AQ data from the OpenAQ v2 REST API. +Visit https://docs.openaq.org/docs/getting-started to get an API key +and set environment variable ``OPENAQ_API_KEY`` to use it. + +For example, in Bash: + +.. code-block:: bash + + export OPENAQ_API_KEY="your_api_key_here" + https://openaq.org/ https://api.openaq.org/docs#/v2 @@ -60,8 +69,8 @@ def _api_key_warning(func): def wrapper(*args, **kwargs): if API_KEY is None: warnings.warn( - "Non-cached requests to the OpenAQ v2 web API will be slow without an API key, " - "or might fail (HTTP error 401). " + "Non-cached requests to the OpenAQ v2 web API will be slow without an API key " + "or requests will fail (HTTP error 401). " "Obtain one (https://docs.openaq.org/docs/getting-started#api-key) " "and set your OPENAQ_API_KEY environment variable.", stacklevel=2,
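(Closing illustration, not part of any patch above: a short end-to-end sketch of the v2 reader as it stands at the end of this series. It assumes ``OPENAQ_API_KEY`` is already set in the environment, per the module docstring added in the final patch; the coordinates and radius mirror the NCWCP values used in the test suite.)

    import pandas as pd

    from monetio.obs import openaq_v2

    # Hourly PM2.5 and O3 within 10 km of NCWCP for one day, in wide format
    # (one column per parameter, e.g. "pm25_ugm3" and "o3_ppm")
    dates = pd.date_range("2023-08-01", "2023-08-02", freq="1H")
    df = openaq_v2.add_data(
        dates,
        parameters=["pm25", "o3"],
        search_radius={(38.9721, -76.9248): 10_000},
        threads=2,  # optional multi-threaded fetching
        wide_fmt=True,  # set False for long format (parameter/value/unit columns)
    )
    print(df[["siteid", "time", "pm25_ugm3", "o3_ppm"]].head())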