Update docs, use ipy format to write documents and add dataset module…

… with example synthetic datasets
felipeangelimvieira · Dec 3, 2024 · 927613b · 927613b
1 parent a1b3815
commit 927613b
Show file tree

Hide file tree

Showing 37 changed files with 2,690 additions and 283 deletions.
diff --git a/docs/howto/composite-exogenous-effects/index copy.ipy b/docs/howto/composite-exogenous-effects/index copy.ipy
@@ -0,0 +1,351 @@
+# %% [markdown]
+#
+# # Composition of effects
+#
+# In previous examples, we saw how to create a simple custom effect,
+# which applies a simple transformation to the input data. However, the effect's
+# interface allows us to apply more complex transformations, such as using the output
+# of previous components as input for the current component, or creating a composite
+# effect that wraps an effect and applies some sort of transformation. This example
+# will cover these topics.
+#
+# ## Creating a custom effect
+#
+# The idea here is to create an effect that
+# 1. First, uses another effect and computes its output
+# 2. Scales the output of step 1 by another effect and returns it
+#
+# One classic use-case for this would be using campaign or seasonality to scale
+# the effect of another input, that might be proportional to these effects.
+# Marketing investments are a good example of this. We will implement such a composite
+# effect in this section.
+#
+# ### Example dataset
+#
+# The dataset we use is synthetic, and the relation between the exogenous variable
+# and the target is known. However, let's pretend we don't know this relation, and
+# analyze the data to find some insights that motivate the creation of a custom
+# effect.
+
+
+# %%
+
+from matplotlib import pyplot as plt
+from sktime.split import temporal_train_test_split
+from sktime.utils.plotting import plot_series
+
+from prophetverse.datasets.synthetic import load_composite_effect_example
+
+y, X = load_composite_effect_example()
+
+y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, test_size=365)
+
+display(y_train.head())
+display(X_train.head())
+
+plot_series(y_train, y_test, labels=["Train", "Test"], title="Target series")
+
+plot_series(
+    X["investment"],
+    markers=[None],
+    labels=["investment"],
+    title="Features",
+)
+plt.show()
+
+# %% [markdown]
+#
+# By plotting the years together, we can see that the target has a clear
+# yearly seasonality.
+
+# %%
+import matplotlib.dates as mdates
+
+fig, ax = plt.subplots()
+for year, g in y_train.groupby(y_train.index.year):
+    idx = g.index.to_timestamp().map(lambda t: t.replace(year=2024))
+    ax.plot(idx, g, label=year, alpha=0.7)
+ax.legend()
+# Format by month name
+ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
+ax.set(
+    title="Target series (grouped by year)",
+)
+fig.show()
+
+
+# %% [markdown]
+# In addition, we also see a lot of peaks and oscillations that seem to be related
+# to the investment variable and to the campaign variable. Below, we detrend and
+# deseasonalize the target to see if we can have an intuition of the relation between
+# the exogenous variables and these oscillations. We highlight dates where we have
+# a campaign dummy.
+# %%
+
+from sktime.forecasting.trend import PolynomialTrendForecaster
+from sktime.transformations.compose import TransformerPipeline
+from sktime.transformations.series.detrend import Deseasonalizer, Detrender
+
+transformer = TransformerPipeline(
+    steps=[
+        ("detrend", Detrender(PolynomialTrendForecaster(degree=2))),
+        ("deseasonalize", Deseasonalizer(sp=365)),
+    ]
+)
+
+cleaned = transformer.fit_transform(y_train)
+
+campaign_mask = X_train["campaign"] > 0
+
+fig, ax = plt.subplots()
+ax.scatter(X_train["investment"], cleaned, label="Data point")
+ax.scatter(
+    X_train["investment"][campaign_mask],
+    cleaned[campaign_mask],
+    s=4,
+    label="Day with campaign",
+)
+ax.set(
+    xlabel="investment",
+    ylabel="Detrended and deseasonalized target",
+    title="Scatter plot of the investment vs target (wo trend and seasonality)",
+)
+ax.legend()
+fig.show()
+
+
+# %% [markdown]
+# As we can see, the target seems to be proportional to the investment, and dates
+# with a campaign seem to have a higher slope than the dates without one.
+# This is a good motivation to try to capture the interaction between two variables.
+# We first fit a simple Prophetverse model to the data.
+
+
+# %%
+from prophetverse.effects import LinearEffect
+from prophetverse.effects.fourier import LinearFourierSeasonality
+from prophetverse.effects.trend import PiecewiseLinearTrend
+from prophetverse.engine import MAPInferenceEngine
+from prophetverse.sktime import Prophetverse
+from prophetverse.utils.regex import exact, no_input_columns
+
+model = Prophetverse(
+    trend=PiecewiseLinearTrend(
+        changepoint_interval=500,
+        changepoint_prior_scale=0.00001,
+        changepoint_range=-500,
+    ),
+    exogenous_effects=[
+        (
+            "seasonality",
+            LinearFourierSeasonality(
+                freq="D",
+                sp_list=[365.25],
+                fourier_terms_list=[5],
+                prior_scale=0.1,
+                effect_mode="multiplicative",
+            ),
+            None,
+        ),
+        (
+            "campaign",
+            LinearEffect("additive"),
+            exact("campaign"),
+        ),
+        (
+            "investment",
+            LinearEffect("additive"),
+            exact("investment"),
+        ),
+    ],
+    default_effect=LinearEffect("additive"),
+    inference_engine=MAPInferenceEngine(
+        num_steps=50_000,
+    ),
+)
+
+model.fit(y=y_train, X=X_train)
+
+y_pred = model.predict(X=X_test, fh=y_test.index)
+
+# %%
+
+plot_series(y_train, y_test, y_pred, labels=["Train", "Test", "Pred"],
+            title="Target series")
+plt.show()
+# %%
+from typing import Any, Dict, List
+
+import jax.numpy as jnp
+import pandas as pd
+
+from prophetverse.effects.base import BaseEffect
+
+
+class WrapEffectAndScaleByAnother(BaseEffect):
+    """Wrap an effect and scale it by another effect.
+
+    Parameters
+    ----------
+    effect : BaseEffect
+        The effect to wrap.
+
+    """
+
+    _tags = {"skip_predict_if_no_match": False, "supports_multivariate": False}
+
+    def __init__(
+        self,
+        effect: BaseEffect,
+        base_effect_name: str,
+    ):
+
+        self.effect = effect
+        self.base_effect_name = base_effect_name
+
+        super().__init__()
+
+        self.clone_tags(effect)
+
+    def _fit(self, y: pd.DataFrame, X: pd.DataFrame, scale: float = 1):
+        """Initialize the effect.
+
+        This method is called during `fit()` of the forecasting model.
+        It receives the Exogenous variables DataFrame and should be used to initialize
+        any necessary parameters or data structures, such as detecting the columns that
+        match the regex pattern.
+
+        This method MUST set _input_feature_columns_names to a list of column names
+
+        Parameters
+        ----------
+        y : pd.DataFrame
+            The timeseries dataframe
+
+        X : pd.DataFrame
+            The DataFrame to initialize the effect.
+
+        scale : float, optional
+            The scale of the timeseries. For multivariate timeseries, this is
+            a dataframe. For univariate, it is a simple float.
+
+        Returns
+        -------
+        None
+        """
+        self.effect.fit(X=X, y=y, scale=scale)
+
+    def _transform(self, X: pd.DataFrame, fh: pd.Index) -> Dict[str, Any]:
+        """Prepare input data to be passed to numpyro model.
+
+        Returns a dictionary with the data for the lift and for the inner effect.
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            The input DataFrame containing the exogenous variables for the training
+            time indexes, if passed during fit, or for the forecasting time indexes, if
+            passed during predict.
+
+        fh : pd.Index
+            The forecasting horizon as a pandas Index.
+
+        Returns
+        -------
+        Dict[str, Any]
+            Dictionary with data for the lift and for the inner effect
+        """
+        return self.effect.transform(X=X, fh=fh)
+
+    def _predict(
+        self, data: Dict, predicted_effects: Dict[str, jnp.ndarray]
+    ) -> jnp.ndarray:
+        """Apply and return the effect values.
+
+        Parameters
+        ----------
+        data : Any
+            Data obtained from the transformed method.
+
+        predicted_effects : Dict[str, jnp.ndarray], optional
+            A dictionary containing the predicted effects, by default None.
+
+        Returns
+        -------
+        jnp.ndarray
+            An array with shape (T,1) for univariate timeseries.
+        """
+        out = self.effect.predict(
+            data=data,  predicted_effects=predicted_effects
+        )
+
+        base_effect = predicted_effects[self.base_effect_name]
+        return base_effect * out
+
+    @property
+    def input_feature_column_names(self) -> List[str]:
+        """Return the input feature columns names."""
+        return self.effect.input_feature_column_names
+
+
+# %%
+
+
+import numpyro.distributions as dist
+
+from prophetverse.engine.optimizer import AdamOptimizer
+
+model = Prophetverse(
+    trend=PiecewiseLinearTrend(
+        changepoint_interval=500,
+        changepoint_prior_scale=0.00001,
+        changepoint_range=-500,
+    ),
+    exogenous_effects=[
+        (
+            "seasonality",
+            LinearFourierSeasonality(
+                freq="D",
+                sp_list=[365.25],
+                fourier_terms_list=[5],
+                prior_scale=0.1,
+                effect_mode="multiplicative",
+            ),
+            no_input_columns,
+        ),
+        (
+            "campaign",
+            LinearEffect("additive"),
+            exact("campaign"),
+        ),
+        (
+            "investment",
+            LinearEffect("additive"),
+            exact("investment"),
+        ),
+        (
+            "investment_campaign",
+            WrapEffectAndScaleByAnother(
+                effect=LinearEffect("additive", prior=dist.HalfNormal(10)),
+                base_effect_name="campaign",
+            ),
+            exact("investment"),
+        ),
+    ],
+    inference_engine=MAPInferenceEngine(
+        num_steps=50_000,
+    ),
+)
+
+model.fit(y=y_train, X=X_train)
+y_pred_composite = model.predict(X=X_test, fh=y_test.index)
+
+# %%
+plot_series(
+    y_train, y_test, y_pred, labels=["Train", "Test", "Pred"], title="Target series"
+)
+
+# %%
+
+plot_series(
+    y_test, y_pred, y_pred_composite, labels=["Test", "Pred", "Pred composite"], title="Target series")