From d83fc3f022c9ce4bc8f3ebfc7eee099a3c5db74b Mon Sep 17 00:00:00 2001 From: Patrick Bloebaum Date: Thu, 19 Oct 2023 15:34:34 -0700 Subject: [PATCH] GCM auto assignment now returns a summary object The summary object contains information about the evaluated models and model choices. This object is printable to provide quick summary. Signed-off-by: Patrick Bloebaum --- dowhy/gcm/auto.py | 157 ++++++++++++++---- dowhy/gcm/causal_mechanisms.py | 20 ++- .../generalised_cov_measure.py | 6 +- dowhy/gcm/influence.py | 2 +- dowhy/gcm/ml/regression.py | 3 + dowhy/gcm/stochastic_models.py | 8 +- tests/gcm/test_auto.py | 118 +++++++++++++ 7 files changed, 277 insertions(+), 37 deletions(-) diff --git a/dowhy/gcm/auto.py b/dowhy/gcm/auto.py index 0ecbe58e00..9144f00cac 100644 --- a/dowhy/gcm/auto.py +++ b/dowhy/gcm/auto.py @@ -1,8 +1,9 @@ import warnings from enum import Enum, auto from functools import partial -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import networkx as nx import numpy as np import pandas as pd from joblib import Parallel, delayed @@ -53,7 +54,7 @@ from dowhy.graph import get_ordered_predecessors, is_root_node _LIST_OF_POTENTIAL_CLASSIFIERS_GOOD = [ - partial(create_logistic_regression_classifier, max_iter=1000), + partial(create_logistic_regression_classifier, max_iter=10000), create_hist_gradient_boost_classifier, ] _LIST_OF_POTENTIAL_REGRESSORS_GOOD = [ @@ -71,7 +72,6 @@ ] _LIST_OF_POTENTIAL_REGRESSORS_BETTER = _LIST_OF_POTENTIAL_REGRESSORS_GOOD + [ create_ridge_regressor, - create_polynom_regressor, partial(create_lasso_regressor, max_iter=5000), create_random_forest_regressor, create_support_vector_regressor, @@ -87,12 +87,56 @@ class AssignmentQuality(Enum): BEST = auto() +class AutoAssignmentSummary: + """Summary class for logging and storing information of the auto assignment process.""" + + def __init__(self): + self._nodes: Dict[Dict[Any, Any]] = {} + + def _init_node_entry(self, node: Any): + if node not in self._nodes: + self._nodes[node] = {"messages": [], "model_performances": []} + + def add_node_log_message(self, node: Any, message: str): + self._init_node_entry(node) + + self._nodes[node]["messages"].append(message) + + def add_model_performance(self, node, model: str, performance: str, metric_name: str): + self._nodes[node]["model_performances"].append((model, performance, metric_name)) + + def __str__(self): + summary_strings = [] + + summary_strings.append("Analyzed %d nodes." % len(list(self._nodes))) + + for node in self._nodes: + summary_strings.append("--- Node: %s" % node) + summary_strings.extend(self._nodes[node]["messages"]) + + if len(self._nodes[node]["model_performances"]) > 0: + summary_strings.append( + "For the model selection, the following models were evaluated on the %s metric:" + % self._nodes[node]["model_performances"][0][2] + ) + + for (model, performance, metric_name) in self._nodes[node]["model_performances"]: + summary_strings.append("%s: %s" % (str(model()).replace("()", ""), str(performance))) + + summary_strings.append( + "Based on the type of causal mechanism, the model with the lowest metric value " + "represents the best choice." + ) + + return "\n".join(summary_strings) + + def assign_causal_mechanisms( causal_model: ProbabilisticCausalModel, based_on: pd.DataFrame, quality: AssignmentQuality = AssignmentQuality.GOOD, override_models: bool = False, -) -> None: +) -> AutoAssignmentSummary: """Automatically assigns appropriate causal models. If causal models are already assigned to nodes and override_models is set to False, this function only validates the assignments with respect to the graph structure. Here, the validation checks whether root nodes have StochasticModels and non-root ConditionalStochasticModels @@ -124,50 +168,93 @@ def assign_causal_mechanisms( Model training speed: Slow Model inference speed: Slow-Medium Model accuracy: Best - :param override_models: If set to True, existing model assignments are replaced with automatically selected - ones. If set to False, the assigned models are only validated with respect to the graph structure. - - :return: None + :param override_models: If set to True, existing model assignments are replaced with automatically selected + ones. If set to False, the assigned models are only validated with respect to the graph + structure. + :return: A summary object containing details about the model selection process. """ - for node in causal_model.graph.nodes: + auto_assignment_summary = AutoAssignmentSummary() + + for node in nx.topological_sort(causal_model.graph): if not override_models and CAUSAL_MECHANISM in causal_model.graph.nodes[node]: + auto_assignment_summary.add_node_log_message( + node, + "Node %s already has a model assigned and the override parameter is False. Skipping this node." % node, + ) validate_causal_model_assignment(causal_model.graph, node) continue - assign_causal_mechanism_node(causal_model, node, based_on, quality) + + model_performances = assign_causal_mechanism_node(causal_model, node, based_on, quality) + + if is_root_node(causal_model.graph, node): + auto_assignment_summary.add_node_log_message( + node, + "Node %s is a root node. Assigning '%s' to the node representing the marginal distribution." + % (node, causal_model.causal_mechanism(node)), + ) + else: + auto_assignment_summary.add_node_log_message( + node, + "Node %s is a non-root node. Assigning '%s' to the node." % (node, causal_model.causal_mechanism(node)), + ) + + if isinstance(causal_model.causal_mechanism(node), AdditiveNoiseModel): + auto_assignment_summary.add_node_log_message( + node, + "This represents the causal relationship as " + + str(node) + + " := f(" + + ",".join([str(parent) for parent in get_ordered_predecessors(causal_model.graph, node)]) + + ") + N.", + ) + elif isinstance(causal_model.causal_mechanism(node), ClassifierFCM): + auto_assignment_summary.add_node_log_message( + node, + "This represents the causal relationship as " + + str(node) + + " := f(" + + ",".join([str(parent) for parent in get_ordered_predecessors(causal_model.graph, node)]) + + ",N).", + ) + + for (model, performance, metric_name) in model_performances: + auto_assignment_summary.add_model_performance(node, model, performance, metric_name) + + return auto_assignment_summary def assign_causal_mechanism_node( - causal_model: ProbabilisticCausalModel, - node: str, - based_on: pd.DataFrame, - quality: AssignmentQuality = AssignmentQuality.GOOD, -) -> None: + causal_model: ProbabilisticCausalModel, node: str, based_on: pd.DataFrame, quality: AssignmentQuality +) -> List[Tuple[Callable[[], PredictionModel], float, str]]: if is_root_node(causal_model.graph, node): causal_model.set_causal_mechanism(node, EmpiricalDistribution()) + model_performances = [] else: - prediction_model = select_model( + best_model, model_performances = select_model( based_on[get_ordered_predecessors(causal_model.graph, node)].to_numpy(), based_on[node].to_numpy(), quality, ) - if isinstance(prediction_model, ClassificationModel): - causal_model.set_causal_mechanism(node, ClassifierFCM(prediction_model)) + if isinstance(best_model, ClassificationModel): + causal_model.set_causal_mechanism(node, ClassifierFCM(best_model)) else: - causal_model.set_causal_mechanism(node, AdditiveNoiseModel(prediction_model)) + causal_model.set_causal_mechanism(node, AdditiveNoiseModel(best_model)) + + return model_performances def select_model( X: np.ndarray, Y: np.ndarray, model_selection_quality: AssignmentQuality -) -> Union[PredictionModel, ClassificationModel]: +) -> Tuple[Union[PredictionModel, ClassificationModel], List[Tuple[Callable[[], PredictionModel], float, str]]]: if model_selection_quality == AssignmentQuality.BEST: try: from dowhy.gcm.ml.autogluon import AutoGluonClassifier, AutoGluonRegressor if is_categorical(Y): - return AutoGluonClassifier() + return AutoGluonClassifier(), [] else: - return AutoGluonRegressor() + return AutoGluonRegressor(), [] except ImportError: raise RuntimeError( "AutoGluon module not found! For the BEST auto assign quality, consider installing the " @@ -187,12 +274,18 @@ def select_model( if auto_apply_encoders(X, auto_fit_encoders(X)).shape[1] <= 5: # Avoid too many features list_of_regressor += [create_polynom_regressor] - list_of_classifier += [partial(create_polynom_logistic_regression_classifier, max_iter=1000)] + list_of_classifier += [partial(create_polynom_logistic_regression_classifier, max_iter=10000)] if is_categorical(Y): - return find_best_model(list_of_classifier, X, Y, model_selection_splits=model_selection_splits)() + best_model, model_performances = find_best_model( + list_of_classifier, X, Y, model_selection_splits=model_selection_splits + ) + return best_model(), model_performances else: - return find_best_model(list_of_regressor, X, Y, model_selection_splits=model_selection_splits)() + best_model, model_performances = find_best_model( + list_of_regressor, X, Y, model_selection_splits=model_selection_splits + ) + return best_model(), model_performances def has_linear_relationship(X: np.ndarray, Y: np.ndarray, max_num_samples: int = 3000) -> bool: @@ -253,19 +346,23 @@ def find_best_model( max_samples_per_split: int = 10000, model_selection_splits: int = 5, n_jobs: Optional[int] = None, -) -> Callable[[], PredictionModel]: +) -> Tuple[Callable[[], PredictionModel], List[Tuple[Callable[[], PredictionModel], float, str]]]: n_jobs = config.default_n_jobs if n_jobs is None else n_jobs X, Y = shape_into_2d(X, Y) is_classification_problem = isinstance(prediction_model_factories[0](), ClassificationModel) + metric_name = "given" + if metric is None: + metric_name = "(negative) F1" if is_classification_problem: metric = lambda y_true, y_preds: -metrics.f1_score( y_true, y_preds, average="macro", zero_division=0 ) # Higher score is better else: + metric_name = "mean squared error (MSE)" metric = metrics.mean_squared_error labelBinarizer = None @@ -273,7 +370,7 @@ def find_best_model( labelBinarizer = MultiLabelBinarizer() labelBinarizer.fit(Y) - kfolds = list(KFold(n_splits=model_selection_splits).split(range(X.shape[0]))) + kfolds = list(KFold(n_splits=model_selection_splits, shuffle=True).split(range(X.shape[0]))) def estimate_average_score(prediction_model_factory: Callable[[], PredictionModel], random_seed: int) -> float: set_random_seed(random_seed) @@ -301,5 +398,9 @@ def estimate_average_score(prediction_model_factory: Callable[[], PredictionMode delayed(estimate_average_score)(prediction_model_factory, int(random_seed)) for prediction_model_factory, random_seed in zip(prediction_model_factories, random_seeds) ) + sorted_results = sorted( + zip(prediction_model_factories, average_metric_scores, [metric_name] * len(prediction_model_factories)), + key=lambda x: x[1], + ) - return sorted(zip(prediction_model_factories, average_metric_scores), key=lambda x: x[1])[0][0] + return sorted_results[0][0], sorted_results diff --git a/dowhy/gcm/causal_mechanisms.py b/dowhy/gcm/causal_mechanisms.py index 71e6682c92..05e8c449fc 100644 --- a/dowhy/gcm/causal_mechanisms.py +++ b/dowhy/gcm/causal_mechanisms.py @@ -10,7 +10,7 @@ import numpy as np from dowhy.gcm.ml import ClassificationModel, PredictionModel -from dowhy.gcm.ml.regression import InvertibleFunction +from dowhy.gcm.ml.regression import InvertibleFunction, SklearnRegressionModel from dowhy.gcm.util.general import is_categorical, shape_into_2d @@ -155,9 +155,14 @@ def evaluate(self, parent_samples: np.ndarray, noise_samples: np.ndarray) -> np. return self._invertible_function.evaluate(predictions + noise_samples) def __str__(self) -> str: + if isinstance(self._prediction_model, SklearnRegressionModel): + prediction_model_string = self._prediction_model.sklearn_model.__class__.__name__ + else: + prediction_model_string = self._prediction_model.__class__.__name__ + return "%s with %s and an %s" % ( self.__class__.__name__, - self._prediction_model.__class__.__name__, + prediction_model_string, self._invertible_function.__class__.__name__, ) @@ -207,6 +212,14 @@ def __init__(self, prediction_model: PredictionModel, noise_model: Optional[Stoc def clone(self): return AdditiveNoiseModel(prediction_model=self.prediction_model.clone(), noise_model=self.noise_model.clone()) + def __str__(self) -> str: + if isinstance(self._prediction_model, SklearnRegressionModel): + prediction_model_string = self._prediction_model.sklearn_model.__class__.__name__ + else: + prediction_model_string = self._prediction_model.__class__.__name__ + + return "AdditiveNoiseModel using %s" % prediction_model_string + class ProbabilityEstimatorModel(ABC): @abstractmethod @@ -291,3 +304,6 @@ def get_class_names(self, class_indices: np.ndarray) -> List[str]: @property def classifier_model(self) -> ClassificationModel: return self._classifier_model + + def __repr__(self): + return "Classifier FCM based on %s" % self.classifier_model diff --git a/dowhy/gcm/independence_test/generalised_cov_measure.py b/dowhy/gcm/independence_test/generalised_cov_measure.py index 9c0942d25d..e1d602f557 100644 --- a/dowhy/gcm/independence_test/generalised_cov_measure.py +++ b/dowhy/gcm/independence_test/generalised_cov_measure.py @@ -87,8 +87,4 @@ def _create_model( if not isinstance(model, AssignmentQuality): return model() else: - return select_model( - input_features, - target, - model, - ) + return select_model(input_features, target, model)[0] diff --git a/dowhy/gcm/influence.py b/dowhy/gcm/influence.py index e8ede42095..7fe949ed5e 100644 --- a/dowhy/gcm/influence.py +++ b/dowhy/gcm/influence.py @@ -478,7 +478,7 @@ def _get_icc_noise_function( return prediction_model.predict if prediction_model == "approx": - prediction_model = auto.select_model(noise_samples, target_samples, auto_assign_quality) + prediction_model = auto.select_model(noise_samples, target_samples, auto_assign_quality)[0] prediction_model.fit(noise_samples, target_samples) if target_is_categorical: diff --git a/dowhy/gcm/ml/regression.py b/dowhy/gcm/ml/regression.py index fdd4d2b911..7e9d1a64d6 100644 --- a/dowhy/gcm/ml/regression.py +++ b/dowhy/gcm/ml/regression.py @@ -57,6 +57,9 @@ def clone(self): """ return SklearnRegressionModel(sklearn_mdl=sklearn.clone(self._sklearn_mdl)) + def __str__(self): + return str(self._sklearn_mdl) + def create_linear_regressor_with_given_parameters( coefficients: np.ndarray, intercept: float = 0, **kwargs diff --git a/dowhy/gcm/stochastic_models.py b/dowhy/gcm/stochastic_models.py index e62291296d..ff248c962a 100644 --- a/dowhy/gcm/stochastic_models.py +++ b/dowhy/gcm/stochastic_models.py @@ -176,6 +176,9 @@ def map_scipy_distribution_parameters_to_names( return parameters_dictionary + def __str__(self) -> str: + return str(self._distribution.name) + " distribution" + class EmpiricalDistribution(StochasticModel): """An implementation of a stochastic model that uniformly samples from data samples. By randomly returning a sample @@ -202,6 +205,9 @@ def draw_samples(self, num_samples: int) -> np.ndarray: def clone(self): return EmpiricalDistribution() + def __str__(self): + return "Empirical Distribution" + class BayesianGaussianMixtureDistribution(StochasticModel): def __init__(self) -> None: @@ -247,7 +253,7 @@ def draw_samples(self, num_samples: int) -> np.ndarray: return shape_into_2d(self._gmm_model.sample(num_samples)[0]) def __str__(self) -> str: - return "Approximated data distribution" + return "Gaussian Mixture Distribution" def clone(self): return BayesianGaussianMixtureDistribution() diff --git a/tests/gcm/test_auto.py b/tests/gcm/test_auto.py index ea7b39ac94..956323b10c 100644 --- a/tests/gcm/test_auto.py +++ b/tests/gcm/test_auto.py @@ -313,3 +313,121 @@ def test_given_data_with_rare_categorical_features_when_calling_has_linear_relat Y = np.append(np.array(["Class1"] * 10), np.array(["Class2"] * 10)) assert has_linear_relationship(X, Y) + + +@flaky(max_runs=2) +def test_given_continuous_data_when_print_auto_summary_then_returns_expected_formats(): + X, Y = _generate_non_linear_regression_data() + + causal_model = ProbabilisticCausalModel( + nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y"), ("X4", "Y")]) + ) + data = {"X" + str(i): X[:, i] for i in range(X.shape[1])} + data.update({"Y": Y}) + + summary_result = assign_causal_mechanisms(causal_model, pd.DataFrame(data)) + summary_string = str(summary_result) + + assert "X0" in summary_result._nodes + assert "X1" in summary_result._nodes + assert "X2" in summary_result._nodes + assert "X3" in summary_result._nodes + assert "X4" in summary_result._nodes + assert "Y" in summary_result._nodes + + assert len(summary_result._nodes["X0"]["model_performances"]) == 0 + assert len(summary_result._nodes["X1"]["model_performances"]) == 0 + assert len(summary_result._nodes["X2"]["model_performances"]) == 0 + assert len(summary_result._nodes["X3"]["model_performances"]) == 0 + assert len(summary_result._nodes["X4"]["model_performances"]) == 0 + assert len(summary_result._nodes["Y"]["model_performances"]) > 0 + + expected_summary = """Analyzed 6 nodes. +--- Node: X0 +Node X0 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X1 +Node X1 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X2 +Node X2 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X3 +Node X3 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X4 +Node X4 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: Y +Node Y is a non-root node. Assigning 'AdditiveNoiseModel using HistGradientBoostingRegressor' to the node. +This represents the causal relationship as Y := f(X0,X1,X2,X3,X4) + N. +For the model selection, the following models were evaluated on the mean squared error (MSE) metric: +* +Based on the type of causal mechanism, the model with the lowest metric value represents the best choice.""" + + assert ( + summary_string.split( + "For the model selection, the following models were evaluated on the mean squared error (MSE) metric:" + )[0] + == expected_summary.split( + "For the model selection, the following models were evaluated on the mean squared error (MSE) metric:" + )[0] + ) + assert ( + "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice." + in summary_string + ) + + +@flaky(max_runs=2) +def test_given_categorical_data_when_print_auto_summary_then_returns_expected_formats(): + X, Y = _generate_linear_classification_data() + + causal_model = ProbabilisticCausalModel( + nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y"), ("X4", "Y")]) + ) + data = {"X" + str(i): X[:, i] for i in range(X.shape[1])} + data.update({"Y": Y}) + + summary_result = assign_causal_mechanisms(causal_model, pd.DataFrame(data)) + summary_string = str(summary_result) + + assert "X0" in summary_result._nodes + assert "X1" in summary_result._nodes + assert "X2" in summary_result._nodes + assert "X3" in summary_result._nodes + assert "X4" in summary_result._nodes + assert "Y" in summary_result._nodes + + assert len(summary_result._nodes["X0"]["model_performances"]) == 0 + assert len(summary_result._nodes["X1"]["model_performances"]) == 0 + assert len(summary_result._nodes["X2"]["model_performances"]) == 0 + assert len(summary_result._nodes["X3"]["model_performances"]) == 0 + assert len(summary_result._nodes["X4"]["model_performances"]) == 0 + assert len(summary_result._nodes["Y"]["model_performances"]) > 0 + + expected_summary = """Analyzed 6 nodes. +--- Node: X0 +Node X0 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X1 +Node X1 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X2 +Node X2 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X3 +Node X3 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: X4 +Node X4 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +--- Node: Y +Node Y is a non-root node. Assigning 'Classifier FCM based on LogisticRegression(max_iter=10000)' to the node. +This represents the causal relationship as Y := f(X0,X1,X2,X3,X4,N). +For the model selection, the following models were evaluated on the (negative) F1 metric: +* +Based on the type of causal mechanism, the model with the lowest metric value represents the best choice.""" + + assert ( + summary_string.split( + "For the model selection, the following models were evaluated on the (negative) F1 metric:" + )[0] + == expected_summary.split( + "For the model selection, the following models were evaluated on the (negative) F1 metric:" + )[0] + ) + assert ( + "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice." + in summary_string + )