Add prediction_model parameter to ICC sample function

bloebp · bloebp · commit 86fcb28ba232 · 2023-10-13T09:56:01.000-07:00
Previously, only the 'exact' model could be used in the sample-wise ICC function. Now, there is a "prediction_model" parameter, similar to the population-based ICC function, that allows for the passing of a pre-defined prediction model.

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
diff --git a/dowhy/gcm/influence.py b/dowhy/gcm/influence.py
@@ -243,10 +243,12 @@ def intrinsic_causal_influence(
     :param prediction_model: Prediction model for estimating the functional relationship between subsets of ancestor
                              noise terms and the target node. This can be an instance of a PredictionModel, the string
                              'approx' or the string 'exact'. With 'exact', the underlying causal models in the graph
-                             are utilized directly by propagating given noise inputs through the graph. This is
-                             generally more accurate but slow. With 'approx', an appropriate model is selected and
-                             trained based on sampled data from the graph, which is less accurate but faster. A more
-                             detailed treatment on why we need this parameter is also provided in :ref:`icc`.
+                             are utilized directly by propagating given noise inputs through the graph, which ensures
+                             that generated samples follow the fitted models. In contrast, the 'approx' method involves
+                             selecting and training a suitable model based on data sampled from the graph. This might
+                             lead to deviations from the outcomes of the fitted models, but is faster and can be more
+                             robust in certain settings. A more detailed treatment on why we need this parameter is
+                             also provided in :ref:`icc`.
     :param attribution_func: Optional attribution function to measure the statistical property of the target node. This
                              function expects two inputs; predictions after the randomization of certain features (i.e.
                              samples from noise nodes) and a baseline where no features were randomized. The baseline
@@ -325,9 +327,11 @@ def intrinsic_causal_influence_sample(
     target_node: Any,
     baseline_samples: pd.DataFrame,
     noise_feature_samples: Optional[pd.DataFrame] = None,
+    prediction_model: Union[PredictionModel, ClassificationModel, str] = "approx",
     subset_scoring_func: Optional[Callable[[np.ndarray, np.ndarray], Union[np.ndarray, float]]] = None,
     num_noise_feature_samples: int = 5000,
     max_batch_size: int = 100,
+    auto_assign_quality: auto.AssignmentQuality = auto.AssignmentQuality.GOOD,
     shapley_config: Optional[ShapleyConfig] = None,
 ) -> List[Dict[Any, Any]]:
     """Estimates the intrinsic causal impact of upstream nodes on a specified target_node, using the provided
@@ -342,9 +346,18 @@ def intrinsic_causal_influence_sample(
     :param causal_model: The fitted invertible structural causal model.
     :param target_node: Node of interest.
     :param baseline_samples: Samples for which the influence should be estimated.
-    :param noise_feature_samples: Optional noise samples of upstream nodes used as 'background' samples.. If None is
+    :param noise_feature_samples: Optional noise samples of upstream nodes used as 'background' samples. If None is
                                   given, new noise samples are generated based on the graph. These samples are used for
                                   randomizing features that are not in the subset.
+    :param prediction_model: Prediction model for estimating the functional relationship between subsets of ancestor
+                             noise terms and the target node. This can be an instance of a PredictionModel, the string
+                             'approx' or the string 'exact'. With 'exact', the underlying causal models in the graph
+                             are utilized directly by propagating given noise inputs through the graph, which ensures
+                             that generated samples follow the fitted models. In contrast, the 'approx' method involves
+                             selecting and training a suitable model based on data sampled from the graph. This might
+                             lead to deviations from the outcomes of the fitted models, but is faster and can be more
+                             robust in certain settings. A more detailed treatment on why we need this parameter is
+                             also provided in :ref:`icc`.
     :param subset_scoring_func: Set function for estimating the quantity of interest based. This function
                                 expects two inputs; the outcome of the model for some samples if certain features are permuted and the
                                 outcome of the model for the same samples when no features were permuted. By default,
@@ -353,6 +366,7 @@ def intrinsic_causal_influence_sample(
                                       This parameter indicates how many.
     :param max_batch_size: Maximum batch size for estimating multiple predictions at once. This has a significant influence on the
                           overall memory usage. If set to -1, all samples are used in one batch.
+    :param auto_assign_quality: Auto assign quality for the 'approx' prediction_model option.
     :param shapley_config: :class:`~dowhy.gcm.shapley.ShapleyConfig` for the Shapley estimator.
     :return: A list of dictionaries indicating the intrinsic causal influence of a node on the target for a particular
              sample. This is, each dictionary belongs to one baseline sample.
@@ -376,21 +390,32 @@ def intrinsic_causal_influence_sample(
     if subset_scoring_func is None:
         subset_scoring_func = means_difference
 
+    target_samples = feature_samples[target_node].to_numpy()
+    node_names = noise_feature_samples.columns
+    noise_feature_samples, target_samples = shape_into_2d(noise_feature_samples.to_numpy(), target_samples)
+
+    prediction_method = _get_icc_noise_function(
+        causal_model,
+        target_node,
+        prediction_model,
+        noise_feature_samples,
+        node_names,
+        target_samples,
+        auto_assign_quality,
+        False,  # Currently only supports continuous target since we need to reconstruct its noise term.
+    )
+
     shapley_vales = feature_relevance_sample(
-        _get_icc_noise_function(
-            causal_model, target_node, "exact", noise_feature_samples, noise_feature_samples.columns, None, None, False
-        ),
-        feature_samples=noise_feature_samples.to_numpy(),
-        baseline_samples=compute_noise_from_data(causal_model, baseline_samples)[
-            noise_feature_samples.columns
-        ].to_numpy(),
+        prediction_method,
+        feature_samples=noise_feature_samples,
+        baseline_samples=compute_noise_from_data(causal_model, baseline_samples)[node_names].to_numpy(),
         subset_scoring_func=subset_scoring_func,
         max_batch_size=max_batch_size,
         shapley_config=shapley_config,
     )
 
     return [
-        {(predecessor, target_node): shapley_vales[i][q] for q, predecessor in enumerate(noise_feature_samples.columns)}
+        {(predecessor, target_node): shapley_vales[i][q] for q, predecessor in enumerate(node_names)}
         for i in range(shapley_vales.shape[0])
     ]
 
@@ -432,7 +457,7 @@ def icc_set_function(subset: np.ndarray) -> Union[np.ndarray, float]:
 
 
 def _get_icc_noise_function(
-    causal_model: InvertibleStructuralCausalModel,
+    causal_model: StructuralCausalModel,
     target_node: Any,
     prediction_model: Union[PredictionModel, ClassificationModel, str],
     noise_samples: np.ndarray,
diff --git a/tests/gcm/test_intrinsic_influence.py b/tests/gcm/test_intrinsic_influence.py
@@ -17,7 +17,12 @@
 )
 from dowhy.gcm._noise import noise_samples_of_ancestors
 from dowhy.gcm.influence import intrinsic_causal_influence_sample
-from dowhy.gcm.ml import create_hist_gradient_boost_classifier, create_linear_regressor_with_given_parameters
+from dowhy.gcm.ml import (
+    create_hist_gradient_boost_classifier,
+    create_hist_gradient_boost_regressor,
+    create_linear_regressor,
+    create_linear_regressor_with_given_parameters,
+)
 from dowhy.gcm.uncertainty import estimate_entropy_of_probabilities, estimate_variance
 from dowhy.gcm.util.general import apply_one_hot_encoding, fit_one_hot_encoders
 from dowhy.graph import node_connected_subgraph_view
@@ -247,3 +252,46 @@ def test_given_linear_gaussian_data_when_estimate_sample_wise_intrinsic_causal_i
     assert shapley_values[1][("X1", "X3")] == approx(0.5, abs=0.1)
     assert shapley_values[1][("X2", "X3")] == approx(2, abs=0.1)
     assert shapley_values[1][("X3", "X3")] == approx(1, abs=0.1)
+
+
+@flaky(max_runs=3)
+def test_given_linear_gaussian_data_when_estimate_sample_wise_intrinsic_causal_influence_with_a_pre_defined_model_then_returns_expected_values():
+    causal_model = InvertibleStructuralCausalModel(nx.DiGraph([("X0", "X1"), ("X1", "X2"), ("X2", "X3")]))
+
+    causal_model.set_causal_mechanism("X0", ScipyDistribution(stats.norm, loc=0, scale=1))
+    causal_model.set_causal_mechanism(
+        "X1",
+        AdditiveNoiseModel(
+            create_linear_regressor_with_given_parameters(np.array([2])), ScipyDistribution(stats.norm, loc=0, scale=1)
+        ),
+    )
+    causal_model.set_causal_mechanism(
+        "X2",
+        AdditiveNoiseModel(
+            create_linear_regressor_with_given_parameters(np.array([1])), ScipyDistribution(stats.norm, loc=0, scale=1)
+        ),
+    )
+    causal_model.set_causal_mechanism(
+        "X3",
+        AdditiveNoiseModel(
+            create_linear_regressor_with_given_parameters(np.array([1])), ScipyDistribution(stats.norm, loc=0, scale=1)
+        ),
+    )
+    _persist_parents(causal_model.graph)
+
+    shapley_values = intrinsic_causal_influence_sample(
+        causal_model,
+        "X3",
+        pd.DataFrame({"X0": [0, 1], "X1": [0.5, 2.5], "X2": [1.5, 4.5], "X3": [1.5, 5.5]}),
+        prediction_model=create_linear_regressor(),
+    )
+
+    assert shapley_values[0][("X0", "X3")] == approx(0, abs=0.15)
+    assert shapley_values[0][("X1", "X3")] == approx(0.5, abs=0.15)
+    assert shapley_values[0][("X2", "X3")] == approx(1, abs=0.15)
+    assert shapley_values[0][("X3", "X3")] == approx(0, abs=0.15)
+
+    assert shapley_values[1][("X0", "X3")] == approx(2, abs=0.15)
+    assert shapley_values[1][("X1", "X3")] == approx(0.5, abs=0.15)
+    assert shapley_values[1][("X2", "X3")] == approx(2, abs=0.15)
+    assert shapley_values[1][("X3", "X3")] == approx(1, abs=0.15)