Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/Check input for NaNs when available_mask = 1 #894

Merged
merged 8 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 124 additions & 1 deletion nbs/core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,8 @@
" self.id_col = id_col\n",
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
" self.time_col = time_col\n",
" self.target_col = target_col\n",
" self._check_nan(df, static_df, id_col, time_col, target_col)\n",
" \n",
" dataset, uids, last_dates, ds = TimeSeriesDataset.from_df(\n",
" df=df,\n",
" static_df=static_df,\n",
Expand All @@ -358,6 +360,29 @@
" self._scalers_fit_transform(dataset)\n",
" return dataset, uids, last_dates, ds\n",
"\n",
"\n",
" def _check_nan(self, df, static_df, id_col, time_col, target_col):\n",
" cols_with_nans = []\n",
"\n",
" temporal_cols = [target_col] + [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
" if \"available_mask\" in temporal_cols:\n",
" available_mask = df[\"available_mask\"].to_numpy().astype(bool)\n",
" else:\n",
" available_mask = np.full(df.shape[0], True)\n",
"\n",
" df_to_check = ufp.filter_with_mask(df, available_mask)\n",
" for col in temporal_cols:\n",
" if ufp.is_nan_or_none(df_to_check[col]).any():\n",
" cols_with_nans.append(col)\n",
"\n",
" if static_df is not None:\n",
" for col in [x for x in static_df.columns if x != id_col]:\n",
" if ufp.is_nan_or_none(static_df[col]).any():\n",
" cols_with_nans.append(col)\n",
"\n",
" if cols_with_nans:\n",
" raise ValueError(f\"Found missing values in {cols_with_nans}.\") \n",
"\n",
" def fit(self,\n",
" df: Optional[DataFrame] = None,\n",
" static_df: Optional[DataFrame] = None,\n",
Expand Down Expand Up @@ -1301,7 +1326,9 @@
"from neuralforecast.models.stemgnn import StemGNN\n",
"\n",
"from neuralforecast.losses.pytorch import MQLoss, MAE, MSE\n",
"from neuralforecast.utils import AirPassengersDF, AirPassengersPanel, AirPassengersStatic"
"from neuralforecast.utils import AirPassengersDF, AirPassengersPanel, AirPassengersStatic\n",
"\n",
"from datetime import date"
]
},
{
Expand Down Expand Up @@ -2281,6 +2308,102 @@
"assert_equal_dfs(insample_preds, insample_preds_pl)\n",
"assert_equal_dfs(cv_res, cv_res_pl)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fa887b3-4164-4758-931d-8d28a71b19b1",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# Test if any of the inputs contains NaNs with available_mask = 1, fit shall raise error\n",
"# input type is pandas.DataFrame\n",
"# available_mask is explicitly given\n",
"\n",
"n_static_features = 2\n",
"n_temporal_features = 4\n",
"temporal_df, static_df = generate_series(n_series=4,\n",
" min_length=50,\n",
" max_length=50,\n",
" n_static_features=n_static_features,\n",
" n_temporal_features=n_temporal_features, \n",
" equal_ends=False) \n",
"temporal_df[\"available_mask\"] = 1\n",
"temporal_df.loc[10:20, \"available_mask\"] = 0\n",
"models = [NHITS(h=12, input_size=24, max_steps=20)]\n",
"nf = NeuralForecast(models=models, freq='D')\n",
"\n",
"# test case 1: target has NaN values\n",
"test_df1 = temporal_df.copy()\n",
"test_df1.loc[5:7, \"y\"] = np.nan\n",
"test_fail(lambda: nf.fit(test_df1), contains=\"Found missing values in ['y']\")\n",
"\n",
"# test case 2: exogenous has NaN values that are correctly flagged with exception\n",
"test_df2 = temporal_df.copy()\n",
"# temporal_0 won't raise ValueError as available_mask = 0\n",
"test_df2.loc[15:18, \"temporal_0\"] = np.nan\n",
"test_df2.loc[5, \"temporal_1\"] = np.nan\n",
"test_df2.loc[25, \"temporal_2\"] = np.nan\n",
"test_fail(lambda: nf.fit(test_df2), contains=\"Found missing values in ['temporal_1', 'temporal_2']\")\n",
"\n",
"# test case 3: static column has NaN values\n",
"test_df3 = static_df.copy()\n",
"test_df3.loc[3, \"static_1\"] = np.nan\n",
"test_fail(lambda: nf.fit(temporal_df, static_df=test_df3), contains=\"Found missing values in ['static_1']\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a157b6b4-0943-48f9-9427-fa8cf0b15d49",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"#| polars\n",
"# Test if any of the inputs contains NaNs with available_mask = 1, fit shall raise error\n",
"# input type is polars.Dataframe\n",
"# Note that available_mask is not explicitly provided for this test\n",
"\n",
"pl_df = polars.DataFrame(\n",
" {\n",
" 'unique_id': [1]*50,\n",
" 'y': list(range(50)), \n",
" 'temporal_0': list(range(100,150)),\n",
" 'temporal_1': list(range(200,250)),\n",
" 'ds': polars.date_range(start=date(2022, 1, 1), end=date(2022, 2, 19), interval=\"1d\", eager=True), \n",
" }\n",
")\n",
"\n",
"pl_static_df = polars.DataFrame(\n",
" {\n",
" 'unique_id': [1],\n",
" 'static_0': [1.2], \n",
" 'static_1': [10.9],\n",
" }\n",
")\n",
"\n",
"models = [NHITS(h=12, input_size=24, max_steps=20)]\n",
"nf = NeuralForecast(models=models, freq='1d')\n",
"\n",
"# test case 1: target has NaN values\n",
"test_pl_df1 = pl_df.clone()\n",
"test_pl_df1[3, 'y'] = np.nan\n",
"test_pl_df1[4, 'y'] = None\n",
"test_fail(lambda: nf.fit(test_pl_df1), contains=\"Found missing values in ['y']\")\n",
"\n",
"# test case 2: exogenous has NaN values that are correctly flagged with exception\n",
"test_pl_df2 = pl_df.clone()\n",
"test_pl_df2[15, \"temporal_0\"] = np.nan\n",
"test_pl_df2[5, \"temporal_1\"] = np.nan\n",
"test_fail(lambda: nf.fit(test_pl_df2), contains=\"Found missing values in ['temporal_0', 'temporal_1']\")\n",
"\n",
"# test case 3: static column has NaN values\n",
"test_pl_df3 = pl_static_df.clone()\n",
"test_pl_df3[0, \"static_1\"] = np.nan\n",
"test_fail(lambda: nf.fit(pl_df, static_df=test_pl_df3), contains=\"Found missing values in ['static_1']\")"
]
}
],
"metadata": {
Expand Down
27 changes: 27 additions & 0 deletions neuralforecast/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def _warn_id_as_idx():

# %% ../nbs/core.ipynb 10
class NeuralForecast:

def __init__(
self,
models: List[Any],
Expand Down Expand Up @@ -248,6 +249,8 @@ def _prepare_fit(
self.id_col = id_col
self.time_col = time_col
self.target_col = target_col
self._check_nan(df, static_df, id_col, time_col, target_col)

dataset, uids, last_dates, ds = TimeSeriesDataset.from_df(
df=df,
static_df=static_df,
Expand All @@ -262,6 +265,30 @@ def _prepare_fit(
self._scalers_fit_transform(dataset)
return dataset, uids, last_dates, ds

def _check_nan(self, df, static_df, id_col, time_col, target_col):
    """Validate that the inputs contain no missing values where they matter.

    The target column and every column of ``df`` other than ``id_col`` and
    ``time_col`` are inspected. When ``df`` carries an ``available_mask``
    column, it is cast to bool and passed to ``ufp.filter_with_mask`` so
    that only the selected rows are inspected (presumably the rows whose
    mask is truthy -- confirm against utilsforecast). Without that column,
    every row is inspected. All columns of ``static_df`` except ``id_col``
    are inspected in full when ``static_df`` is provided.

    Parameters
    ----------
    df : DataFrame
        Temporal data (pandas or polars, per the notebook tests).
    static_df : DataFrame or None
        Static features; skipped entirely when ``None``.
    id_col, time_col, target_col : str
        Names of the identifier, timestamp and target columns.

    Raises
    ------
    ValueError
        If any inspected column is flagged by ``ufp.is_nan_or_none``; the
        message lists temporal columns first, then static ones.
    """
    excluded = (id_col, time_col, target_col)
    # Target goes first so it leads the error message when it is affected.
    temporal_cols = [target_col]
    temporal_cols.extend(name for name in df.columns if name not in excluded)

    if "available_mask" in temporal_cols:
        row_selector = df["available_mask"].to_numpy().astype(bool)
    else:
        # No mask supplied: treat every row as available.
        row_selector = np.full(df.shape[0], True)
    df_to_check = ufp.filter_with_mask(df, row_selector)

    cols_with_nans = [
        name
        for name in temporal_cols
        if ufp.is_nan_or_none(df_to_check[name]).any()
    ]

    if static_df is not None:
        # Static features have no availability mask; check them in full.
        cols_with_nans.extend(
            name
            for name in static_df.columns
            if name != id_col and ufp.is_nan_or_none(static_df[name]).any()
        )

    if cols_with_nans:
        raise ValueError(f"Found missing values in {cols_with_nans}.")

def fit(
self,
df: Optional[DataFrame] = None,
Expand Down
Loading