Nixtla · jmoralez · Feb 22, 2024 · Feb 18, 2024 · Feb 18, 2024 · Feb 18, 2024
diff --git a/nbs/core.ipynb b/nbs/core.ipynb
@@ -344,6 +344,8 @@
     "        self.id_col = id_col\n",
     "        self.time_col = time_col\n",
     "        self.target_col = target_col\n",
+    "        self._check_nan(df, static_df, id_col, time_col, target_col)\n",
+    "        \n",
     "        dataset, uids, last_dates, ds = TimeSeriesDataset.from_df(\n",
     "            df=df,\n",
     "            static_df=static_df,\n",
@@ -358,6 +360,29 @@
     "            self._scalers_fit_transform(dataset)\n",
     "        return dataset, uids, last_dates, ds\n",
     "\n",
+    "\n",
+    "    def _check_nan(self, df, static_df, id_col, time_col, target_col):\n",
+    "        cols_with_nans = []\n",
+    "\n",
+    "        temporal_cols = [target_col] + [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
+    "        if \"available_mask\" in temporal_cols:\n",
+    "            available_mask = df[\"available_mask\"].to_numpy().astype(bool)\n",
+    "        else:\n",
+    "            available_mask = np.full(df.shape[0], True)\n",
+    "\n",
+    "        df_to_check = ufp.filter_with_mask(df, available_mask)\n",
+    "        for col in temporal_cols:\n",
+    "            if ufp.is_nan_or_none(df_to_check[col]).any():\n",
+    "                cols_with_nans.append(col)\n",
+    "\n",
+    "        if static_df is not None:\n",
+    "            for col in [x for x in static_df.columns if x != id_col]:\n",
+    "                if ufp.is_nan_or_none(static_df[col]).any():\n",
+    "                    cols_with_nans.append(col)\n",
+    "\n",
+    "        if cols_with_nans:\n",
+    "            raise ValueError(f\"Found missing values in {cols_with_nans}.\")        \n",
+    "\n",
     "    def fit(self,\n",
     "        df: Optional[DataFrame] = None,\n",
     "        static_df: Optional[DataFrame] = None,\n",
@@ -1301,7 +1326,9 @@
     "from neuralforecast.models.stemgnn import StemGNN\n",
     "\n",
     "from neuralforecast.losses.pytorch import MQLoss, MAE, MSE\n",
-    "from neuralforecast.utils import AirPassengersDF, AirPassengersPanel, AirPassengersStatic"
+    "from neuralforecast.utils import AirPassengersDF, AirPassengersPanel, AirPassengersStatic\n",
+    "\n",
+    "from datetime import date"
    ]
   },
   {
@@ -2281,6 +2308,102 @@
     "assert_equal_dfs(insample_preds, insample_preds_pl)\n",
     "assert_equal_dfs(cv_res, cv_res_pl)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9fa887b3-4164-4758-931d-8d28a71b19b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "# Test that fit raises an error when any input contains NaNs in rows where available_mask = 1\n",
+    "# input type is pandas.DataFrame\n",
+    "# available_mask is explicitly given\n",
+    "\n",
+    "n_static_features = 2\n",
+    "n_temporal_features = 4\n",
+    "temporal_df, static_df = generate_series(n_series=4,\n",
+    "                                         min_length=50,\n",
+    "                                         max_length=50,\n",
+    "                                         n_static_features=n_static_features,\n",
+    "                                         n_temporal_features=n_temporal_features, \n",
+    "                                         equal_ends=False) \n",
+    "temporal_df[\"available_mask\"] = 1\n",
+    "temporal_df.loc[10:20, \"available_mask\"] = 0\n",
+    "models = [NHITS(h=12, input_size=24, max_steps=20)]\n",
+    "nf = NeuralForecast(models=models, freq='D')\n",
+    "\n",
+    "# test case 1: target has NaN values\n",
+    "test_df1 = temporal_df.copy()\n",
+    "test_df1.loc[5:7, \"y\"] = np.nan\n",
+    "test_fail(lambda: nf.fit(test_df1), contains=\"Found missing values in ['y']\")\n",
+    "\n",
+    "# test case 2: exogenous has NaN values that are correctly flagged with exception\n",
+    "test_df2 = temporal_df.copy()\n",
+    "# NaNs in temporal_0 are not reported because they fall in rows where available_mask = 0\n",
+    "test_df2.loc[15:18, \"temporal_0\"] = np.nan\n",
+    "test_df2.loc[5, \"temporal_1\"] = np.nan\n",
+    "test_df2.loc[25, \"temporal_2\"] = np.nan\n",
+    "test_fail(lambda: nf.fit(test_df2), contains=\"Found missing values in ['temporal_1', 'temporal_2']\")\n",
+    "\n",
+    "# test case 3: static column has NaN values\n",
+    "test_df3 = static_df.copy()\n",
+    "test_df3.loc[3, \"static_1\"] = np.nan\n",
+    "test_fail(lambda: nf.fit(temporal_df, static_df=test_df3), contains=\"Found missing values in ['static_1']\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a157b6b4-0943-48f9-9427-fa8cf0b15d49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "#| polars\n",
+    "# Test that fit raises an error when any input contains NaNs in rows where available_mask = 1\n",
+    "# input type is polars.DataFrame\n",
+    "# Note that available_mask is not explicitly provided for this test\n",
+    "\n",
+    "pl_df = polars.DataFrame(\n",
+    "    {\n",
+    "        'unique_id': [1]*50,\n",
+    "        'y': list(range(50)), \n",
+    "        'temporal_0': list(range(100,150)),\n",
+    "        'temporal_1': list(range(200,250)),\n",
+    "        'ds': polars.date_range(start=date(2022, 1, 1), end=date(2022, 2, 19), interval=\"1d\", eager=True), \n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "pl_static_df = polars.DataFrame(\n",
+    "    {\n",
+    "        'unique_id': [1],\n",
+    "        'static_0': [1.2], \n",
+    "        'static_1': [10.9],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "models = [NHITS(h=12, input_size=24, max_steps=20)]\n",
+    "nf = NeuralForecast(models=models, freq='1d')\n",
+    "\n",
+    "# test case 1: target has NaN values\n",
+    "test_pl_df1 = pl_df.clone()\n",
+    "test_pl_df1[3, 'y'] = np.nan\n",
+    "test_pl_df1[4, 'y'] = None\n",
+    "test_fail(lambda: nf.fit(test_pl_df1), contains=\"Found missing values in ['y']\")\n",
+    "\n",
+    "# test case 2: exogenous has NaN values that are correctly flagged with exception\n",
+    "test_pl_df2 = pl_df.clone()\n",
+    "test_pl_df2[15, \"temporal_0\"] = np.nan\n",
+    "test_pl_df2[5, \"temporal_1\"] = np.nan\n",
+    "test_fail(lambda: nf.fit(test_pl_df2), contains=\"Found missing values in ['temporal_0', 'temporal_1']\")\n",
+    "\n",
+    "# test case 3: static column has NaN values\n",
+    "test_pl_df3 = pl_static_df.clone()\n",
+    "test_pl_df3[0, \"static_1\"] = np.nan\n",
+    "test_fail(lambda: nf.fit(pl_df, static_df=test_pl_df3), contains=\"Found missing values in ['static_1']\")"
+   ]
   }
  ],
  "metadata": {

diff --git a/neuralforecast/core.py b/neuralforecast/core.py
@@ -166,6 +166,7 @@ def _warn_id_as_idx():
 
 # %% ../nbs/core.ipynb 10
 class NeuralForecast:
+
     def __init__(
         self,
         models: List[Any],
@@ -248,6 +249,8 @@ def _prepare_fit(
         self.id_col = id_col
         self.time_col = time_col
         self.target_col = target_col
+        self._check_nan(df, static_df, id_col, time_col, target_col)
+
         dataset, uids, last_dates, ds = TimeSeriesDataset.from_df(
             df=df,
             static_df=static_df,
@@ -262,6 +265,30 @@ def _prepare_fit(
             self._scalers_fit_transform(dataset)
         return dataset, uids, last_dates, ds
 
+    def _check_nan(self, df, static_df, id_col, time_col, target_col):
+        """Raise an error if the training inputs contain missing values.
+
+        Scans the target column and every other column of `df` except
+        `id_col` and `time_col`, plus every column of `static_df` except
+        `id_col`, using `ufp.is_nan_or_none`. When `df` has an
+        `available_mask` column, `df` is first passed through
+        `ufp.filter_with_mask` with that column cast to bool, so only the
+        rows it returns are scanned (presumably the rows where the mask is
+        truthy -- confirm against `ufp.filter_with_mask`). `static_df` is
+        never masked.
+
+        Args:
+            df: Frame holding the id, time, target and any other columns.
+            static_df: Frame of static features, or None to skip that check.
+            id_col: Name of the id column; excluded from both scans.
+            time_col: Name of the time column; excluded from the `df` scan.
+            target_col: Name of the target column; scanned first.
+
+        Raises:
+            ValueError: If any scanned column has missing values. The message
+                lists all offending columns, `df` columns before `static_df`
+                ones.
+        """
+        # Collect every offending column so a single error reports them all.
+        cols_with_nans = []
+
+        # Target first, then the remaining non-id/non-time columns in frame
+        # order. Note that an `available_mask` column, if present, is itself
+        # part of this list and is scanned like any other column.
+        temporal_cols = [target_col] + [
+            c for c in df.columns if c not in (id_col, time_col, target_col)
+        ]
+        if "available_mask" in temporal_cols:
+            available_mask = df["available_mask"].to_numpy().astype(bool)
+        else:
+            # No mask column: build an all-True mask so every row is passed on.
+            available_mask = np.full(df.shape[0], True)
+
+        df_to_check = ufp.filter_with_mask(df, available_mask)
+        for col in temporal_cols:
+            if ufp.is_nan_or_none(df_to_check[col]).any():
+                cols_with_nans.append(col)
+
+        # Static features are checked on all rows; the mask only applies to `df`.
+        if static_df is not None:
+            for col in [x for x in static_df.columns if x != id_col]:
+                if ufp.is_nan_or_none(static_df[col]).any():
+                    cols_with_nans.append(col)
+
+        if cols_with_nans:
+            raise ValueError(f"Found missing values in {cols_with_nans}.")
+
     def fit(
         self,
         df: Optional[DataFrame] = None,