Skip to content

Commit

Permalink
Merge pull request #1369 from metno/expand-colocated-data-to_dataframe
Browse files Browse the repository at this point in the history
Improve ColocatedData.to_dataframe() method
  • Loading branch information
lewisblake authored Oct 10, 2024
2 parents f6148b2 + 23809dc commit 661b82f
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 12 deletions.
69 changes: 57 additions & 12 deletions pyaerocom/colocation/colocated_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1299,27 +1299,72 @@ def read_netcdf(self, file_path):
def to_dataframe(self) -> pd.DataFrame:
    """Convert this object into a pandas.DataFrame.

    The observation values (index 0 along the first dimension of
    ``self.data``) and the model values (index 1) are each converted to a
    dataframe and then outer-merged on their shared coordinates.

    The following columns will be available in the resulting dataframe:

    - time: Time.
    - station_name: Station name.
    - data_source_obs: Data source obs (eg. EBASMC).
    - data_source_mod: Data source model (eg. EMEP).
    - latitude.
    - longitude.
    - altitude.
    - {var_name}_obs: Variable value of observation.
    - {var_name}_mod: Variable value of model.

    ``{var_name}`` is ``self.var_name[0]`` for both value columns.

    Returns
    -------
    pandas.DataFrame
        One row per combination of the merge keys.

    Raises
    ------
    NotImplementedError
        If the underlying data has 4 dimensions.
    """
    if self.data.ndim == 4:
        raise NotImplementedError(
            "to_dataframe() is not implemented for 4-dimensional colocated data"
        )

    # Both frames deliberately use the same value-column name: pandas only
    # applies the "_obs"/"_mod" suffixes to columns present in both inputs.
    value_column = self.var_name[0]
    obs_df = self.data[0, :, :].to_dataframe(name=value_column).reset_index()
    mod_df = self.data[1, :, :].to_dataframe(name=value_column).reset_index()

    merge_keys = ["time", "station_name", "latitude", "longitude", "altitude"]
    return pd.merge(
        obs_df,
        mod_df,
        how="outer",
        on=merge_keys,
        suffixes=("_obs", "_mod"),
    )

@staticmethod
def _validate_dataframe_for_import(df: pd.DataFrame):
    """Validate that a dataframe will likely work with
    ColocatedData.from_dataframe().

    :param df: The pandas dataframe to be validated.
    :raises TypeError: If ``df`` is not a pandas DataFrame.
    :raises ValueError: If ``df`` does not have exactly 9 columns, lacks one
        of the columns checked here, or contains more than one distinct
        value in ``data_source_obs`` or ``data_source_mod``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"Expected pandas DataFrame, got {type(df)}")

    if (tmp := df.shape[1]) != 9:
        raise ValueError(f"Expected DataFrame with 9 columns, got {tmp}")

    # Check the columns used below before indexing into them, so a missing
    # column surfaces as a validation error rather than a KeyError.
    # TODO: Check that the remaining required columns exist.
    for column in ("time", "data_source_obs", "data_source_mod"):
        if column not in df.columns:
            raise ValueError(f"Missing column '{column}'")

    if (tmp := len(df["data_source_obs"].unique())) != 1:
        raise ValueError(f"Expected dataframe with 1 unique data_source_obs, got {tmp}.")

    if (tmp := len(df["data_source_mod"].unique())) != 1:
        raise ValueError(f"Expected dataframe with 1 unique data_source_mod, got {tmp}.")

@staticmethod
def from_dataframe(df: pd.DataFrame) -> ColocatedData:
    """Create colocated Data object from dataframe

    Note
    ----
    This is intended to be used as back-conversion from :func:`to_dataframe`
    and methods that use the latter (e.g. :func:`to_csv`).

    NOTE(review): only the input validation is implemented so far. No
    object is built yet, so the call currently returns ``None`` despite the
    return annotation.

    Raises
    ------
    TypeError, ValueError
        Propagated from :func:`_validate_dataframe_for_import` when ``df``
        does not have the expected layout.
    """
    ColocatedData._validate_dataframe_for_import(df)

def to_csv(self, out_dir, savename=None):
"""Save data object as .csv file
Expand Down
71 changes: 71 additions & 0 deletions tests/colocation/test_colocated_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,3 +498,74 @@ def test_ColocatedData_resample_time(coldata: ColocatedData, args: dict, mean):

resampled_mean = resampled.data.mean().data
assert resampled_mean == pytest.approx(mean, abs=1e-3, nan_ok=True)


@pytest.mark.parametrize(
    "coldataset",
    (
        pytest.param(
            "tm5_aeronet",
        ),
        pytest.param(
            "fake_3d_hr",
        ),
        pytest.param(
            "fake_3d",
        ),
    ),
)
def test_ColocatedData_to_dataframe(coldata: ColocatedData):
    """Check column layout and non-null metadata of ``to_dataframe()``."""
    df = coldata.to_dataframe()

    # to_dataframe() names both value columns after var_name[0] and only
    # distinguishes them by the "_obs"/"_mod" suffix.
    var_name = coldata.var_name[0]
    exp_columns = {
        "time",
        "station_name",
        "data_source_obs",
        "latitude",
        "longitude",
        "altitude",
        f"{var_name}_obs",
        "data_source_mod",
        f"{var_name}_mod",
    }

    assert df.shape[1] == 9
    assert set(df.columns) == exp_columns
    assert not df["time"].isnull().values.any()
    assert not df["data_source_obs"].isnull().values.any()
    assert not df["data_source_mod"].isnull().values.any()


@pytest.mark.parametrize("coldataset", ["fake_4d"])
def test_ColocatedData_to_dataframe_exception(coldata: ColocatedData):
    """``to_dataframe()`` must raise NotImplementedError for the 4D dataset."""
    # presumably the `coldata` fixture is selected via `coldataset`; verify in conftest
    with pytest.raises(NotImplementedError):
        coldata.to_dataframe()


@pytest.mark.parametrize("coldataset", ["tm5_aeronet", "fake_3d_hr", "fake_3d"])
def test_ColocatedData_from_dataframe(coldata: ColocatedData):
    """Round-trip smoke test: ``from_dataframe()`` accepts ``to_dataframe()`` output."""
    exported = coldata.to_dataframe()

    # No assertion on the result: the test only requires that no exception is raised.
    ColocatedData.from_dataframe(exported)

0 comments on commit 661b82f

Please sign in to comment.