[FEAT] Added TSMixerx model #921

Merged · 6 commits · Mar 11, 2024
11 changes: 10 additions & 1 deletion nbs/core.ipynb
@@ -86,7 +86,7 @@
" MLP, NHITS, NBEATS, NBEATSx, DLinear, NLinear,\n",
" TFT, VanillaTransformer,\n",
" Informer, Autoformer, FEDformer,\n",
" StemGNN, PatchTST, TimesNet, TimeLLM, TSMixer\n",
" StemGNN, PatchTST, TimesNet, TimeLLM, TSMixer, TSMixerx\n",
")"
]
},
@@ -225,6 +225,8 @@
" 'vanillatransformer': VanillaTransformer, 'autovanillatransformer': VanillaTransformer,\n",
" 'timellm': TimeLLM,\n",
" 'tsmixer': TSMixer, 'autotsmixer': TSMixer,\n",
" 'tsmixerx': TSMixerx, 'autotsmixerx': TSMixerx,\n",
"\n",
"}"
]
},
@@ -1330,6 +1332,7 @@
"\n",
"from neuralforecast.models.stemgnn import StemGNN\n",
"from neuralforecast.models.tsmixer import TSMixer\n",
"from neuralforecast.models.tsmixerx import TSMixerx\n",
"\n",
"from neuralforecast.losses.pytorch import MQLoss, MAE, MSE\n",
"from neuralforecast.utils import AirPassengersDF, AirPassengersPanel, AirPassengersStatic\n",
@@ -1775,6 +1778,8 @@
" PatchTST(h=12, input_size=24, max_steps=1),\n",
" TimesNet(h=12, input_size=24, max_steps=1),\n",
" StemGNN(h=12, input_size=24, n_series=2, max_steps=1, scaler_type='robust'),\n",
" TSMixer(h=12, input_size=24, n_series=2, max_steps=1, scaler_type='robust'),\n",
" TSMixerx(h=12, input_size=24, n_series=2, max_steps=1, scaler_type='robust'),\n",
" ],\n",
" freq='M'\n",
")\n",
@@ -1897,6 +1902,8 @@
" PatchTST(h=12, input_size=24, max_steps=1, scaler_type=None),\n",
" TimesNet(h=12, input_size=24, max_steps=1, scaler_type='standard'),\n",
" StemGNN(h=12, input_size=12, n_series=2, max_steps=1, scaler_type='robust'),\n",
" TSMixer(h=12, input_size=12, n_series=2, max_steps=1, scaler_type='robust'),\n",
" TSMixerx(h=12, input_size=12, n_series=2, max_steps=1, scaler_type='robust'),\n",
" DeepAR(h=12, input_size=24, max_steps=1,\n",
" stat_exog_list=['airline1'], futr_exog_list=['trend']),\n",
" ],\n",
@@ -1933,6 +1940,8 @@
" PatchTST(h=12, input_size=24, max_steps=1, scaler_type=None),\n",
" TimesNet(h=12, input_size=24, max_steps=1, scaler_type='standard'),\n",
" StemGNN(h=12, input_size=12, n_series=2, max_steps=1, scaler_type='robust'),\n",
" TSMixer(h=12, input_size=12, n_series=2, max_steps=1, scaler_type='robust'),\n",
" TSMixerx(h=12, input_size=12, n_series=2, max_steps=1, scaler_type='robust'),\n",
" DeepAR(h=12, input_size=24, max_steps=1,\n",
" stat_exog_list=['airline1'], futr_exog_list=['trend']),\n",
" ],\n",
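For orientation, a minimal end-to-end sketch of the newly registered model, based on the test instantiations above (the `hist_exog_list=['trend']` argument is an illustrative use of the exogenous support that distinguishes `TSMixerx` from `TSMixer`; `trend` is a column of `AirPassengersPanel`):

```python
from neuralforecast import NeuralForecast
from neuralforecast.models.tsmixerx import TSMixerx
from neuralforecast.utils import AirPassengersPanel

# Two airline series, 12-step horizon, 'trend' as a historical exogenous feature
model = TSMixerx(h=12, input_size=24, n_series=2,
                 hist_exog_list=['trend'],
                 max_steps=1, scaler_type='robust')
nf = NeuralForecast(models=[model], freq='M')
nf.fit(df=AirPassengersPanel)
forecasts = nf.predict()
```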
184 changes: 99 additions & 85 deletions nbs/examples/Multivariate_with_TSMixer.ipynb

Large diffs are not rendered by default.

Binary file added nbs/imgs_models/tsmixerx.png
103 changes: 103 additions & 0 deletions nbs/models.ipynb
@@ -63,6 +63,7 @@
"from neuralforecast.models.stemgnn import StemGNN\n",
"from neuralforecast.models.hint import HINT\n",
"from neuralforecast.models.tsmixer import TSMixer\n",
"from neuralforecast.models.tsmixerx import TSMixerx\n",
"\n",
"from neuralforecast.losses.pytorch import MAE, MQLoss, DistributionLoss"
]
@@ -2503,6 +2504,108 @@
"y_hat = model.predict(dataset=dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "640fbbc2",
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"class AutoTSMixerx(BaseAuto):\n",
"\n",
" default_config = {\n",
" \"input_size_multiplier\": [1, 2, 3, 4],\n",
" \"h\": None,\n",
" \"n_series\": None,\n",
" \"n_block\": tune.choice([1, 2, 4, 6, 8]),\n",
" \"learning_rate\": tune.loguniform(1e-4, 1e-2),\n",
" \"ff_dim\": tune.choice([32, 64, 128]),\n",
" \"scaler_type\": tune.choice(['identity', 'robust', 'standard']),\n",
" \"max_steps\": tune.choice([500, 1000, 2000]),\n",
" \"batch_size\": tune.choice([32, 64, 128, 256]),\n",
" \"dropout\": tune.uniform(0.0, 0.99),\n",
" \"loss\": None,\n",
" \"random_seed\": tune.randint(1, 20),\n",
" }\n",
"\n",
" def __init__(self,\n",
" h,\n",
" n_series,\n",
" loss=MAE(),\n",
" valid_loss=None,\n",
" config=None, \n",
" search_alg=BasicVariantGenerator(random_state=1),\n",
" num_samples=10,\n",
" refit_with_val=False,\n",
" cpus=cpu_count(),\n",
" gpus=torch.cuda.device_count(),\n",
" verbose=False,\n",
" alias=None,\n",
" backend='ray',\n",
" callbacks=None):\n",
" \n",
" # Define search space, input/output sizes\n",
" if config is None:\n",
" config = self.default_config.copy() \n",
" config['input_size'] = tune.choice([h*x \\\n",
" for x in self.default_config[\"input_size_multiplier\"]])\n",
"\n",
" # Rolling windows with step_size=1 or step_size=h\n",
" # See `BaseWindows` and `BaseRNN`'s create_windows\n",
" config['step_size'] = tune.choice([1, h])\n",
" del config[\"input_size_multiplier\"]\n",
" if backend == 'optuna':\n",
" config = self._ray_config_to_optuna(config) \n",
"\n",
" # Always use n_series from parameters\n",
" config['n_series'] = n_series\n",
"\n",
" super(AutoTSMixerx, self).__init__(\n",
" cls_model=TSMixerx, \n",
" h=h,\n",
" loss=loss,\n",
" valid_loss=valid_loss,\n",
" config=config,\n",
" search_alg=search_alg,\n",
" num_samples=num_samples, \n",
" refit_with_val=refit_with_val,\n",
" cpus=cpus,\n",
" gpus=gpus,\n",
" verbose=verbose,\n",
" alias=alias,\n",
" backend=backend,\n",
" callbacks=callbacks, \n",
" )"
]
},
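In place of `default_config`, the `config` argument also accepts a user-defined search space, as the usage cell below does with plain values. A sketch with Ray Tune samplers (hypothetical values; the keys mirror those handled in `__init__` above, and `n_series` is always overwritten from the constructor argument):

```python
from ray import tune

# Illustrative custom search space for AutoTSMixerx
custom_config = {
    "input_size": tune.choice([12, 24]),
    "n_block": tune.choice([2, 4]),
    "ff_dim": tune.choice([32, 64]),
    "learning_rate": tune.loguniform(1e-4, 1e-2),
    "dropout": tune.uniform(0.0, 0.5),
    "scaler_type": "robust",
    "max_steps": 500,
    "batch_size": 32,
    "random_seed": 1,
}
model = AutoTSMixerx(h=12, n_series=2, config=custom_config, num_samples=5)
```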
{
"cell_type": "code",
"execution_count": null,
"id": "fe839643",
"metadata": {},
"outputs": [],
"source": [
"show_doc(AutoTSMixerx, title_level=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "475c8c68",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# Use your own config or AutoTSMixer.default_config\n",
"config = dict(max_steps=1, val_check_steps=1, input_size=12)\n",
"model = AutoTSMixerx(h=12, n_series=1, config=config, num_samples=1, cpus=1)\n",
"\n",
"# Fit and predict\n",
"model.fit(dataset=dataset)\n",
"y_hat = model.predict(dataset=dataset)"
]
},
{
"attachments": {},
"cell_type": "markdown",
109 changes: 72 additions & 37 deletions nbs/models.tsmixer.ipynb
@@ -25,7 +25,8 @@
"metadata": {},
"source": [
"# TSMixer\n",
"> Time-Series Mixer (`TSMixer`) is a MLP-based multivariate time-series forecasting model. `TSMixer` jointly learns temporal and cross-sectional representations of the time-series by repeatedly combining time- and feature information using stacked mixing layers. A mixing layer consists of a sequential time- and feature Multi Layer Perceptron (`MLP`).\n",
"> Time-Series Mixer (`TSMixer`) is a MLP-based multivariate time-series forecasting model. `TSMixer` jointly learns temporal and cross-sectional representations of the time-series by repeatedly combining time- and feature information using stacked mixing layers. A mixing layer consists of a sequential time- and feature Multi Layer Perceptron (`MLP`). Note: this model cannot handle exogenous inputs. If you want to use additional exogenous inputs, use `TSMixerx`.\n",
"\n",
"<br><br>**References**<br>-[Chen, Si-An, Chun-Liang Li, Nate Yoder, Sercan O. Arik, and Tomas Pfister (2023). \"TSMixer: An All-MLP Architecture for Time Series Forecasting.\"](http://arxiv.org/abs/2303.06053)<br>"
]
},
@@ -84,49 +85,67 @@
"outputs": [],
"source": [
"#| exporti\n",
"class MixingLayer(nn.Module):\n",
" def __init__(self, n_series, input_size, dropout, ff_dim):\n",
"class TemporalMixing(nn.Module):\n",
" def __init__(self, n_series, input_size, dropout):\n",
" super().__init__()\n",
" # Normalization layers\n",
" self.temporal_norm = nn.BatchNorm1d(num_features=n_series * input_size, eps=0.001, momentum=0.01)\n",
" self.feature_norm = nn.BatchNorm1d(num_features=n_series * input_size, eps=0.001, momentum=0.01)\n",
" \n",
" # Linear layers\n",
" self.temporal_lin = nn.Linear(input_size, input_size)\n",
" self.temporal_drop = nn.Dropout(dropout)\n",
"\n",
" def forward(self, input):\n",
" # Get shapes\n",
" batch_size = input.shape[0]\n",
" input_size = input.shape[1]\n",
" n_series = input.shape[2]\n",
"\n",
" # Temporal MLP\n",
" x = input.permute(0, 2, 1) # [B, L, N] -> [B, N, L]\n",
" x = x.reshape(batch_size, -1) # [B, N, L] -> [B, N * L]\n",
" x = self.temporal_norm(x) # [B, N * L] -> [B, N * L]\n",
" x = x.reshape(batch_size, n_series, input_size) # [B, N * L] -> [B, N, L]\n",
" x = F.relu(self.temporal_lin(x)) # [B, N, L] -> [B, N, L]\n",
" x = x.permute(0, 2, 1) # [B, N, L] -> [B, L, N]\n",
" x = self.temporal_drop(x) # [B, L, N] -> [B, L, N]\n",
"\n",
" return x + input \n",
"\n",
"class FeatureMixing(nn.Module):\n",
" def __init__(self, n_series, input_size, dropout, ff_dim):\n",
" super().__init__()\n",
" self.feature_norm = nn.BatchNorm1d(num_features=n_series * input_size, eps=0.001, momentum=0.01)\n",
" self.feature_lin_1 = nn.Linear(n_series, ff_dim)\n",
" self.feature_lin_2 = nn.Linear(ff_dim, n_series)\n",
"\n",
" # Drop out layers\n",
" self.temporal_drop = nn.Dropout(dropout)\n",
" self.feature_drop_1 = nn.Dropout(dropout)\n",
" self.feature_drop_2 = nn.Dropout(dropout)\n",
"\n",
" def forward(self, input):\n",
" # Get shapes\n",
" batch_size = input.shape[0]\n",
" n_series = input.shape[1]\n",
" input_size = input.shape[2]\n",
"\n",
" # Temporal MLP\n",
" x = input.reshape(batch_size, -1)\n",
" x = self.temporal_norm(x)\n",
" x = x.reshape(batch_size, input_size, n_series)\n",
" x = F.relu(self.temporal_lin(x))\n",
" x = x.permute(0, 2, 1)\n",
" x = self.temporal_drop(x)\n",
" res = x + input\n",
" input_size = input.shape[1]\n",
" n_series = input.shape[2]\n",
"\n",
" # Feature MLP\n",
" x = res.reshape(batch_size, -1)\n",
" x = self.feature_norm(x)\n",
" x = x.reshape(batch_size, input_size, n_series)\n",
" x = x.permute(0, 2, 1)\n",
" x = F.relu(self.feature_lin_1(x))\n",
" x = self.feature_drop_1(x)\n",
" x = self.feature_lin_2(x)\n",
" x = self.feature_drop_2(x)\n",
" x = input.reshape(batch_size, -1) # [B, L, N] -> [B, L * N]\n",
" x = self.feature_norm(x) # [B, L * N] -> [B, L * N]\n",
" x = x.reshape(batch_size, input_size, n_series) # [B, L * N] -> [B, L, N]\n",
" x = F.relu(self.feature_lin_1(x)) # [B, L, N] -> [B, L, ff_dim]\n",
" x = self.feature_drop_1(x) # [B, L, ff_dim] -> [B, L, ff_dim]\n",
" x = self.feature_lin_2(x) # [B, L, ff_dim] -> [B, L, N]\n",
" x = self.feature_drop_2(x) # [B, L, N] -> [B, L, N]\n",
"\n",
" return x + input \n",
"\n",
" return x + res"
"class MixingLayer(nn.Module):\n",
" def __init__(self, n_series, input_size, dropout, ff_dim):\n",
" super().__init__()\n",
" # Mixing layer consists of a temporal and feature mixer\n",
" self.temporal_mixer = TemporalMixing(n_series, input_size, dropout)\n",
" self.feature_mixer = FeatureMixing(n_series, input_size, dropout, ff_dim)\n",
"\n",
" def forward(self, input):\n",
" x = self.temporal_mixer(input)\n",
" x = self.feature_mixer(x)\n",
" return x"
]
},
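To make the refactor concrete, a quick shape check one could run against the blocks above (a sketch; assumes `TemporalMixing` and `FeatureMixing` are in scope, with a batch size large enough for `BatchNorm1d` to train on):

```python
import torch

B, L, N = 32, 24, 2                    # batch, input_size, n_series
x = torch.randn(B, L, N)               # multivariate input window [B, L, N]

temporal = TemporalMixing(n_series=N, input_size=L, dropout=0.1)
feature = FeatureMixing(n_series=N, input_size=L, dropout=0.1, ff_dim=64)

# MixingLayer composes the two; both are residual, so the shape is preserved
out = feature(temporal(x))
assert out.shape == (B, L, N)
```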
{
@@ -145,10 +164,11 @@
"source": [
"#| exporti\n",
"class ReversibleInstanceNorm1d(nn.Module):\n",
" def __init__(self, num_features, eps=1e-5):\n",
" def __init__(self, n_series, eps=1e-5):\n",
" super().__init__()\n",
" self.weight = nn.Parameter(torch.ones(num_features))\n",
" self.bias = nn.Parameter(torch.zeros(num_features))\n",
" self.weight = nn.Parameter(torch.ones((1, 1, n_series)))\n",
" self.bias = nn.Parameter(torch.zeros((1, 1, n_series)))\n",
"\n",
" self.eps = eps\n",
"\n",
" def forward(self, x):\n",
Expand All @@ -174,6 +194,13 @@
" return x"
]
},
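The forward and reverse bodies are collapsed in this diff. For reference, a standard reversible instance-normalization sketch consistent with the `(1, 1, n_series)` parameter shapes above (an assumption about the collapsed code, not the PR's exact implementation):

```python
import torch
import torch.nn as nn

class RevINSketch(nn.Module):
    def __init__(self, n_series, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones((1, 1, n_series)))
        self.bias = nn.Parameter(torch.zeros((1, 1, n_series)))
        self.eps = eps

    def forward(self, x):  # x: [B, L, N]
        # Normalize each series within the window; keep stats for reversal
        self.mean = x.mean(dim=1, keepdim=True).detach()
        self.stdev = torch.sqrt(x.var(dim=1, keepdim=True, unbiased=False) + self.eps).detach()
        return (x - self.mean) / self.stdev * self.weight + self.bias

    def reverse(self, x):
        # Undo the affine transform, then restore the original scale and level
        return (x - self.bias) / self.weight * self.stdev + self.mean
```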
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Model"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -267,10 +294,18 @@
" num_workers_loader=num_workers_loader,\n",
" drop_last_loader=drop_last_loader,\n",
" **trainer_kwargs)\n",
" # Asserts\n",
" if stat_exog_list is not None:\n",
" raise Exception(\"TSMixer does not support static exogenous variables. Use TSMixerx if you want to use static exogenous variables.\")\n",
" if futr_exog_list is not None:\n",
" raise Exception(\"TSMixer does not support future exogenous variables. Use TSMixerx if you want to use future exogenous variables.\")\n",
" if hist_exog_list is not None:\n",
" raise Exception(\"TSMixer does not support historical exogenous variables. Use TSMixerx if you want to use historical exogenous variables.\") \n",
"\n",
" # Reversible InstanceNormalization layer\n",
" self.revin = revin\n",
" if self.revin:\n",
" self.norm = ReversibleInstanceNorm1d(num_features = n_series)\n",
" self.norm = ReversibleInstanceNorm1d(n_series = n_series)\n",
"\n",
" # Mixing layers\n",
" mixing_layers = [MixingLayer(n_series=n_series, \n",
Expand Down Expand Up @@ -313,7 +348,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage Examples"
"## 3. Usage Examples"
]
},
{
@@ -349,9 +384,9 @@
" dropout=0,\n",
" revin=True,\n",
" scaler_type='standard',\n",
" max_steps=80,\n",
" max_steps=200,\n",
" early_stop_patience_steps=-1,\n",
" val_check_steps=10,\n",
" val_check_steps=5,\n",
" learning_rate=1e-3,\n",
" loss=MAE(),\n",
" valid_loss=MAE(),\n",