From 944eb207362b9f8dc937c235db42dc4c2ecf203e Mon Sep 17 00:00:00 2001 From: Patrick Bloebaum Date: Wed, 22 Nov 2023 10:46:30 -0800 Subject: [PATCH] Add explicit support for discrete ANMs - Add new Discrete Additive Noise Model class that enforces the outputs to be discrete. This should help in generating more consistent data. - As part of this, revised the auto assignment function and revised its docstring. - Revise the auto assignment summary. - Revise the evaluation summary. Signed-off-by: Patrick Bloebaum --- .../modeling_gcm/model_evaluation.rst | 44 +++-- dowhy/gcm/__init__.py | 2 +- dowhy/gcm/auto.py | 181 +++++++++++++++--- dowhy/gcm/causal_mechanisms.py | 48 ++++- dowhy/gcm/fitting_sampling.py | 27 ++- dowhy/gcm/model_evaluation.py | 21 +- dowhy/gcm/util/general.py | 9 + tests/gcm/test_auto.py | 123 ++++++++---- tests/gcm/test_fcms.py | 37 ++++ tests/gcm/test_model_evaluation.py | 26 ++- tests/gcm/test_whatif.py | 37 +++- tests/gcm/util/test_general.py | 12 ++ 12 files changed, 464 insertions(+), 103 deletions(-) diff --git a/docs/source/user_guide/modeling_gcm/model_evaluation.rst b/docs/source/user_guide/modeling_gcm/model_evaluation.rst index c7259abbe8..c893869ce6 100644 --- a/docs/source/user_guide/modeling_gcm/model_evaluation.rst +++ b/docs/source/user_guide/modeling_gcm/model_evaluation.rst @@ -52,29 +52,49 @@ this, consider the chain structure example X→Y→Z: .. code-block:: - Analyzed 3 nodes. + When using this auto assignment function, the given data is used to automatically assign a causal mechanism to each node. Note that causal mechanisms can also be customized and assigned manually. + The following types of causal mechanisms are considered for the automatic selection: + + If root node: + An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of data modalities. 
+ + If non-root node and the data is continuous: + Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i.To select the best model for f, different regression models are evaluated and the model with the smallest mean squared error is selected.Note that minimizing the mean squared error here is equivalent to selecting the best choice of an ANM. + + If non-root node and the data is discrete: + Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional constraint for f to only return discrete values. + Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider representing them as strings to ensure proper model selection. + + If non-root node and the data is categorical: + A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). + Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected. + + In total, 3 nodes were analyzed: + --- Node: X - Node X is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. + Node X is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: Y - Node Y is a non-root node. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. + Node Y is a non-root node with continuous data. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. This represents the causal relationship as Y := f(X) + N. 
For the model selection, the following models were evaluated on the mean squared error (MSE) metric: - LinearRegression: 1.0023387259040388 + LinearRegression: 0.9978767184153945 Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(include_bias=False)), - ('linearregression', LinearRegression)]): 1.0099017476403862 - HistGradientBoostingRegressor: 1.1091403766880177 - Based on the type of causal mechanism, the model with the lowest metric value represents the best choice. + ('linearregression', LinearRegression)]): 1.00448207264867 + HistGradientBoostingRegressor: 1.1386270868995179 --- Node: Z - Node Z is a non-root node. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. + Node Z is a non-root node with continuous data. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. This represents the causal relationship as Z := f(Y) + N. For the model selection, the following models were evaluated on the mean squared error (MSE) metric: - LinearRegression: 0.9451918596711175 + LinearRegression: 1.0240822102491627 Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(include_bias=False)), - ('linearregression', LinearRegression)]): 0.9488259577453813 - HistGradientBoostingRegressor: 1.682146254853607 - Based on the type of causal mechanism, the model with the lowest metric value represents the best choice. + ('linearregression', LinearRegression)]): 1.02567150836141 + HistGradientBoostingRegressor: 1.358002751994007 + + ===Note=== + Note, based on the selected auto assignment quality, the set of evaluated models changes. + For more insights toward the quality of the fitted graphical causal model, consider using the evaluate_causal_model function after fitting the causal mechanisms. In this scenario, an empirical distribution is assigned to the root node X, while additive noise models are applied to nodes Y and Z. 
In both of these cases, a linear regression model demonstrated the best performance in terms diff --git a/dowhy/gcm/__init__.py b/dowhy/gcm/__init__.py index 735d5d40e8..4ab7758ced 100644 --- a/dowhy/gcm/__init__.py +++ b/dowhy/gcm/__init__.py @@ -10,7 +10,7 @@ MedianDeviationScorer, RescaledMedianCDFQuantileScorer, ) -from .causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, PostNonlinearModel +from .causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, DiscreteAdditiveNoiseModel, PostNonlinearModel from .causal_models import InvertibleStructuralCausalModel, ProbabilisticCausalModel, StructuralCausalModel from .confidence_intervals import confidence_intervals from .confidence_intervals_cms import bootstrap_sampling, fit_and_compute diff --git a/dowhy/gcm/auto.py b/dowhy/gcm/auto.py index 0d8021eceb..bce4639da1 100644 --- a/dowhy/gcm/auto.py +++ b/dowhy/gcm/auto.py @@ -14,7 +14,7 @@ from sklearn.preprocessing import MultiLabelBinarizer from dowhy.gcm import config -from dowhy.gcm.causal_mechanisms import AdditiveNoiseModel, ClassifierFCM +from dowhy.gcm.causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, DiscreteAdditiveNoiseModel from dowhy.gcm.causal_models import CAUSAL_MECHANISM, ProbabilisticCausalModel, validate_causal_model_assignment from dowhy.gcm.ml import ( ClassificationModel, @@ -48,6 +48,7 @@ auto_apply_encoders, auto_fit_encoders, is_categorical, + is_discrete, set_random_seed, shape_into_2d, ) @@ -108,7 +109,43 @@ def add_model_performance(self, node, model: str, performance: str, metric_name: def __str__(self): summary_strings = [] - summary_strings.append("Analyzed %d nodes." % len(list(self._nodes))) + summary_strings.append( + "When using this auto assignment function, the given data is used to automatically assign a causal " + "mechanism to each node. 
Note that causal mechanisms can also be customized and assigned manually.\n" + "The following types of causal mechanisms are considered for the automatic selection:" + ) + summary_strings.append("\nIf root node:") + summary_strings.append( + "An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided " + "data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for " + "all types of data modalities." + ) + summary_strings.append("\nIf non-root node and the data is continuous:") + summary_strings.append( + "Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the " + "parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i." + "To select the best model for f, different regression models are evaluated and the model " + "with the smallest mean squared error is selected." + "Note that minimizing the mean squared error here is equivalent to selecting the best " + "choice of an ANM." + ) + summary_strings.append("\nIf non-root node and the data is discrete:") + summary_strings.append( + "Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an " + "additional constraint for f to only return discrete values.\n" + "Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider " + "representing them as strings to ensure proper model selection." + ) + summary_strings.append("\nIf non-root node and the data is categorical:") + summary_strings.append( + "A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).\n" + "Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a " + "class (category) using the conditional probability distribution produced by a " + "classification model." + "Here, different model classes are evaluated using the (negative) F1 score and the best" + " performing model class is selected." 
+ ) + summary_strings.append("\nIn total, %d nodes were analyzed:" % len(list(self._nodes))) for node in self._nodes: summary_strings.append("\n--- Node: %s" % node) @@ -123,11 +160,13 @@ def __str__(self): for (model, performance, metric_name) in self._nodes[node]["model_performances"]: summary_strings.append("%s: %s" % (str(model()).replace("()", ""), str(performance))) - summary_strings.append( - "Based on the type of causal mechanism, the model with the lowest metric value " - "represents the best choice." - ) - + summary_strings.append( + "\n===Note===\nNote, based on the selected auto assignment quality, the set of " "evaluated models changes." + ) + summary_strings.append( + "For more insights toward the quality of the fitted graphical causal model, consider " + "using the evaluate_causal_model function after fitting the causal mechanisms." + ) return "\n".join(summary_strings) @@ -137,26 +176,86 @@ def assign_causal_mechanisms( quality: AssignmentQuality = AssignmentQuality.GOOD, override_models: bool = False, ) -> AutoAssignmentSummary: - """Automatically assigns appropriate causal models. If causal models are already assigned to nodes and - override_models is set to False, this function only validates the assignments with respect to the graph structure. - Here, the validation checks whether root nodes have StochasticModels and non-root ConditionalStochasticModels - assigned. + """Automatically assigns appropriate causal mechanisms to nodes. If causal mechanisms are already assigned to nodes + and override_models is set to False, this function only validates the assignments with respect to the graph + structure. That is, the validation checks whether root nodes have StochasticModels and non-root + ConditionalStochasticModels assigned. + + The following types of causal mechanisms are considered for the automatic selection: + + If root node: + An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. 
+ This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of + data modalities. + + If non-root node and the data is continuous: + Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved + noise N_i is assumed to be independent of PA_i. To select the best model for f, different regression models are + evaluated and the model with the smallest mean squared error is selected. Note that minimizing the mean squared + error here is equivalent to selecting the best choice of an ANM. + + If non-root node and the data is discrete: + Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional + constraint to return discrete values. Note that 'discrete' here refers to numerical values with an order. If the + data is categorical, consider representing them as strings to ensure proper model selection. + + If non-root node and the data is categorical: + A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). + Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the + conditional probability distribution produced by a classification model. Here, different model classes are evaluated + using the (negative) F1 score and the best performing model class is selected. 
+ + The current model zoo is: + + With "GOOD" quality: + Numerical: + - Linear Regressor + - Linear Regressor with polynomial features + - Histogram Gradient Boost Regressor + + Categorical: + - Logistic Regressor + - Logistic Regressor with polynomial features + - Histogram Gradient Boost Classifier + + With "BETTER" quality: + Numerical: + - Linear Regressor + - Linear Regressor with polynomial features + - Gradient Boost Regressor + - Ridge Regressor + - Lasso Regressor + - Random Forest Regressor + - Support Vector Regressor + - Extra Trees Regressor + - KNN Regressor + - Ada Boost Regressor + + Categorical: + - Logistic Regressor + - Logistic Regressor with polynomial features + - Histogram Gradient Boost Classifier + - Random Forest Classifier + - Extra Trees Classifier + - Support Vector Classifier + - KNN Classifier + - Gaussian Naive Bayes Classifier + - Ada Boost Classifier + + With "BEST" quality: + An auto ML model based on AutoGluon (optional dependency, needs to be installed). :param causal_model: The causal model to whose nodes to assign causal models. :param based_on: Jointly sampled data corresponding to the nodes of the given graph. :param quality: AssignmentQuality for the automatic model selection and model accuracy. This changes the type of - prediction model and time spent on the selection. Options are: - - AssignmentQuality.GOOD: Compares a linear, polynomial and gradient boost model on small test-training split - of the data. The best performing model is then selected. + prediction model and time spent on the selection. See the docstring for a list of potential models. + The options for the quality are: + - AssignmentQuality.GOOD: Only a small set of models are evaluated. Model selection speed: Fast Model training speed: Fast Model inference speed: Fast Model accuracy: Medium - - AssignmentQuality.BETTER: Compares multiple model types and uses the one with the best performance - averaged over multiple splits of the training data. 
By default, the model with the smallest root mean - squared error is selected for regression problems and the model with the highest F1 score is selected for - classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS_BETTER and - _LIST_OF_POTENTIAL_CLASSIFIERS_BETTER, respectively. + - AssignmentQuality.BETTER: A larger set of models are evaluated. Model selection speed: Medium Model training speed: Fast Model inference speed: Fast @@ -168,8 +267,8 @@ def assign_causal_mechanisms( Model training speed: Slow Model inference speed: Slow-Medium Model accuracy: Best - :param override_models: If set to True, existing model assignments are replaced with automatically selected - ones. If set to False, the assigned models are only validated with respect to the graph + :param override_models: If set to True, existing mechanism assignments are replaced with automatically selected + ones. If set to False, the assigned mechanisms are only validated with respect to the graph structure. :return: A summary object containing details about the model selection process. """ @@ -179,7 +278,8 @@ def assign_causal_mechanisms( if not override_models and CAUSAL_MECHANISM in causal_model.graph.nodes[node]: auto_assignment_summary.add_node_log_message( node, - "Node %s already has a model assigned and the override parameter is False. Skipping this node." % node, + "Node %s already has a causal mechanism assigned and the override parameter is False. Skipping this " + "node." % node, ) validate_causal_model_assignment(causal_model.graph, node) continue @@ -189,16 +289,36 @@ def assign_causal_mechanisms( if is_root_node(causal_model.graph, node): auto_assignment_summary.add_node_log_message( node, - "Node %s is a root node. Assigning '%s' to the node representing the marginal distribution." + "Node %s is a root node. Therefore, assigning '%s' to the node representing the marginal distribution." 
% (node, causal_model.causal_mechanism(node)), ) else: + data_type = "continuous" + if isinstance(causal_model.causal_mechanism(node), ClassifierFCM): + data_type = "categorical" + elif isinstance(causal_model.causal_mechanism(node), DiscreteAdditiveNoiseModel): + data_type = "discrete" + auto_assignment_summary.add_node_log_message( node, - "Node %s is a non-root node. Assigning '%s' to the node." % (node, causal_model.causal_mechanism(node)), + "Node %s is a non-root node with %s data. Assigning '%s' to the node." + % ( + node, + data_type, + causal_model.causal_mechanism(node), + ), ) - if isinstance(causal_model.causal_mechanism(node), AdditiveNoiseModel): + if isinstance(causal_model.causal_mechanism(node), DiscreteAdditiveNoiseModel): + auto_assignment_summary.add_node_log_message( + node, + "This represents the discrete causal relationship as " + + str(node) + + " := f(" + + ",".join([str(parent) for parent in get_ordered_predecessors(causal_model.graph, node)]) + + ") + N.", + ) + elif isinstance(causal_model.causal_mechanism(node), AdditiveNoiseModel): auto_assignment_summary.add_node_log_message( node, "This represents the causal relationship as " @@ -230,16 +350,21 @@ def assign_causal_mechanism_node( causal_model.set_causal_mechanism(node, EmpiricalDistribution()) model_performances = [] else: + node_data = based_on[node].to_numpy() + best_model, model_performances = select_model( based_on[get_ordered_predecessors(causal_model.graph, node)].to_numpy(), - based_on[node].to_numpy(), + node_data, quality, ) if isinstance(best_model, ClassificationModel): causal_model.set_causal_mechanism(node, ClassifierFCM(best_model)) else: - causal_model.set_causal_mechanism(node, AdditiveNoiseModel(best_model)) + if is_discrete(node_data): + causal_model.set_causal_mechanism(node, DiscreteAdditiveNoiseModel(best_model)) + else: + causal_model.set_causal_mechanism(node, AdditiveNoiseModel(best_model)) return model_performances @@ -263,7 +388,7 @@ def select_model( elif 
model_selection_quality == AssignmentQuality.GOOD: list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_GOOD) list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD) - model_selection_splits = 2 + model_selection_splits = 5 elif model_selection_quality == AssignmentQuality.BETTER: list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_BETTER) list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER) diff --git a/dowhy/gcm/causal_mechanisms.py b/dowhy/gcm/causal_mechanisms.py index e033a2753f..e1aba3eda2 100644 --- a/dowhy/gcm/causal_mechanisms.py +++ b/dowhy/gcm/causal_mechanisms.py @@ -8,7 +8,7 @@ from dowhy.gcm.ml import ClassificationModel, PredictionModel from dowhy.gcm.ml.regression import InvertibleFunction, SklearnRegressionModel -from dowhy.gcm.util.general import is_categorical, shape_into_2d +from dowhy.gcm.util.general import is_categorical, is_discrete, shape_into_2d class StochasticModel(ABC): @@ -218,6 +218,52 @@ def __str__(self) -> str: return "AdditiveNoiseModel using %s" % prediction_model_string +class DiscreteAdditiveNoiseModel(AdditiveNoiseModel): + """Implements a discrete ANM. That is, it follows a normal ANM of the form Y = f(X) + N, where N is assumed to be + independent of X and f is forced to output discrete values. To allow for flexible models, f can be any regression + model and the output will be rounded to a discrete value accordingly. 
Note that this remains a valid additive noise + model, but assumes that Y can take any integer value.""" + + def fit(self, X: np.ndarray, Y: np.ndarray) -> None: + if not is_discrete(Y): + raise ValueError("Cannot fit a discrete ANM to non-discrete target values!") + + X, Y = shape_into_2d(X, Y) + Y = Y.astype(np.int32) + + self._prediction_model.fit(X=X, Y=Y) + self._noise_model.fit(self._rounded_prediction(X) - Y) + + def evaluate(self, parent_samples: np.ndarray, noise_samples: np.ndarray) -> np.ndarray: + if not is_discrete(noise_samples): + raise ValueError("Noise values have to be discrete!") + + parent_samples, noise_samples = shape_into_2d(parent_samples, noise_samples) + predictions = shape_into_2d(self._rounded_prediction(parent_samples)) + + return predictions + noise_samples + + def estimate_noise(self, target_samples: np.ndarray, parent_samples: np.ndarray) -> np.ndarray: + if not is_discrete(target_samples): + raise ValueError("Target samples have to be discrete!") + + target_samples, parent_samples = shape_into_2d(target_samples, parent_samples) + + return target_samples - self._rounded_prediction(parent_samples) + + def _rounded_prediction(self, X: np.ndarray) -> np.ndarray: + return np.round(self._prediction_model.predict(X).astype(float)).astype(np.int32) + + def clone(self): + return DiscreteAdditiveNoiseModel( + prediction_model=self.prediction_model.clone(), + noise_model=self.noise_model.clone(), + ) + + def __str__(self) -> str: + return "Discrete " + super().__str__() + + class ProbabilityEstimatorModel(ABC): @abstractmethod def estimate_probabilities(self, parent_samples: np.ndarray) -> np.ndarray: diff --git a/dowhy/gcm/fitting_sampling.py b/dowhy/gcm/fitting_sampling.py index d8758a45cb..c1cf48aa93 100644 --- a/dowhy/gcm/fitting_sampling.py +++ b/dowhy/gcm/fitting_sampling.py @@ -17,11 +17,19 @@ from dowhy.graph import get_ordered_predecessors, is_root_node -def fit(causal_model: ProbabilisticCausalModel, data: pd.DataFrame): - """Learns 
generative causal models of nodes in the causal graph from data. +def fit(causal_model: ProbabilisticCausalModel, data: pd.DataFrame, return_evaluation_summary: bool = False): + """Fits the causal mechanism of each node to the data. - :param causal_model: The causal model containing the mechanisms that will be fitted. + Optionally, returns a summary of different metrics of the causal mechanisms evaluated via cross-validation. Note, + this will use the evaluate_causal_model method. For more detailed and extensive evaluations, consider using the + evaluate_causal_model method directly. + + :param causal_model: The causal model containing the mechanisms of the node that will be fitted. :param data: Observations of nodes in the causal model. + :param return_evaluation_summary: If True, returns a summary of the performances of the fitted mechanisms using the + evaluate_causal_model method. If False, nothing is returned. + :return: Optionally, a CausalModelEvaluationResult summarizing the performances of the causal mechanisms via + cross-validation. 
""" progress_bar = tqdm( causal_model.graph.nodes, @@ -41,6 +49,19 @@ def fit(causal_model: ProbabilisticCausalModel, data: pd.DataFrame): fit_causal_model_of_target(causal_model, node, data) + if return_evaluation_summary: + from dowhy.gcm import evaluate_causal_model + + return evaluate_causal_model( + causal_model, + data, + evaluate_causal_mechanisms=True, + compare_mechanism_baselines=False, + evaluate_invertibility_assumptions=False, + evaluate_overall_kl_divergence=False, + evaluate_causal_structure=False, + ) + def fit_causal_model_of_target( causal_model: ProbabilisticCausalModel, target_node: Any, training_data: pd.DataFrame diff --git a/dowhy/gcm/model_evaluation.py b/dowhy/gcm/model_evaluation.py index 604372f53d..1a1d62612f 100644 --- a/dowhy/gcm/model_evaluation.py +++ b/dowhy/gcm/model_evaluation.py @@ -183,7 +183,7 @@ def __str__(self): if self.overall_kl_divergence is not None: summary_string += " and the overall average KL divergence between generated and observed distribution" if self.graph_falsification is not None: - summary_string += " and graph structure" + summary_string += " and the graph structure" summary_string += ". The results are as follows:" summary_strings = [summary_string] @@ -191,16 +191,15 @@ def __str__(self): if self.mechanism_performances is not None: summary_strings.append("\n==== Evaluation of Causal Mechanisms ====") summary_strings.append( - "Root nodes are evaluated based on the KL divergence between the generated " - "and the observed distribution." - ) - summary_strings.append( - "Non-root nodes are mainly evaluated based on the (normalized) Continuous Ranked Probability Score " - "(CRPS), which is a generalizes the Mean Absolute Percentage Error to probabilistic " - "predictions. Since the causal mechanisms produce conditional distributions, this " - "should give some insights into their performance and calibration. 
In addition, the mean squared error " - "(MSE), the normalized MSE (NMSE), the R2 coefficient and the F1 score (for categorical nodes) is " - "reported." + "The used evaluation metrics are:\n" + "- KL divergence (only for root-nodes): Evaluates the divergence between the generated and the observed distribution.\n" + "- Mean Squared Error (MSE): Evaluates the average squared differences between the observed values and the conditional expectation of the causal mechanisms.\n" + "- Normalized MSE (NMSE): The MSE normalized by the standard deviation for better comparison.\n" + "- R2 coefficient: Indicates how much variance is explained by the conditional expectations of the mechanisms. Note, however, that this can be misleading for nonlinear relationships.\n" + "- F1 score (only for categorical non-root nodes): The harmonic mean of the precision and recall indicating the goodness of the underlying classifier model.\n" + "- (normalized) Continuous Ranked Probability Score (CRPS): The CRPS generalizes the Mean Absolute Percentage Error to probabilistic predictions. This gives insights into the accuracy and calibration of the causal mechanisms.\n" + "NOTE: Every metric focuses on different aspects and they might not consistently indicate a good or bad performance.\n" + "We will mostly utilize the CRPS for comparing and interpreting the performance of the mechanisms, since this captures the most important properties for the causal model." ) for mechanism_performance in self.mechanism_performances.values(): diff --git a/dowhy/gcm/util/general.py b/dowhy/gcm/util/general.py index 97d43c7bf8..15c6618186 100644 --- a/dowhy/gcm/util/general.py +++ b/dowhy/gcm/util/general.py @@ -196,6 +196,15 @@ def has_categorical(X: np.ndarray) -> bool: return False +def is_discrete(X: np.ndarray) -> bool: + """Checks if all values in the given array are discrete. + + :param X: Input array to check. + :return: True if all values in the input are discrete, False otherwise. 
+ """ + return np.all(X == np.floor(X)) + + def setdiff2d(ar1: np.ndarray, ar2: np.ndarray, assume_unique: bool = False) -> np.ndarray: """This method generalizes numpy's setdiff1d to 2d, i.e., it compares vectors for arbitrary length. See https://numpy.org/doc/stable/reference/generated/numpy.setdiff1d.html for more details.""" diff --git a/tests/gcm/test_auto.py b/tests/gcm/test_auto.py index db7c18e6bf..980f90f05a 100644 --- a/tests/gcm/test_auto.py +++ b/tests/gcm/test_auto.py @@ -10,7 +10,15 @@ from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline -from dowhy.gcm import ProbabilisticCausalModel, StructuralCausalModel, draw_samples, fit +from dowhy.gcm import ( + AdditiveNoiseModel, + DiscreteAdditiveNoiseModel, + EmpiricalDistribution, + ProbabilisticCausalModel, + StructuralCausalModel, + draw_samples, + fit, +) from dowhy.gcm.auto import AssignmentQuality, assign_causal_mechanisms, has_linear_relationship @@ -206,6 +214,20 @@ def test_given_polynomial_classification_data_with_categorical_input_when_auto_a assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD, override_models=True) +def test_given_continuous_and_discrete_data_when_auto_assign_then_correct_assigns_discrete_anm(): + causal_model = ProbabilisticCausalModel(nx.DiGraph([("X", "Y"), ("Y", "Z")])) + data = { + "X": np.random.normal(0, 1, 100), + "Y": np.random.choice(2, 100, replace=True), + "Z": np.random.normal(0, 1, 100), + } + + assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD) + assert isinstance(causal_model.causal_mechanism("X"), EmpiricalDistribution) + assert isinstance(causal_model.causal_mechanism("Y"), DiscreteAdditiveNoiseModel) + assert isinstance(causal_model.causal_mechanism("Z"), AdditiveNoiseModel) + + def test_when_auto_called_from_main_namespace_returns_no_attribute_error(): from dowhy import gcm @@ -343,40 +365,51 @@ def 
test_given_continuous_data_when_print_auto_summary_then_returns_expected_for assert len(summary_result._nodes["X4"]["model_performances"]) == 0 assert len(summary_result._nodes["Y"]["model_performances"]) > 0 - expected_summary = """Analyzed 6 nodes. + assert ( + """When using this auto assignment function, the given data is used to automatically assign a causal mechanism to each node. Note that causal mechanisms can also be customized and assigned manually. +The following types of causal mechanisms are considered for the automatic selection: + +If root node: +An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of data modalities. + +If non-root node and the data is continuous: +Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i.To select the best model for f, different regression models are evaluated and the model with the smallest mean squared error is selected.Note that minimizing the mean squared error here is equivalent to selecting the best choice of an ANM. + +If non-root node and the data is discrete: +Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional constraint for f to only return discrete values. +Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider representing them as strings to ensure proper model selection. + +If non-root node and the data is categorical: +A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). 
+Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected. + +In total, 6 nodes were analyzed: --- Node: X0 -Node X0 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X0 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X1 -Node X1 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X1 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X2 -Node X2 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X2 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X3 -Node X3 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X3 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X4 -Node X4 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X4 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: Y -Node Y is a non-root node. Assigning 'AdditiveNoiseModel using HistGradientBoostingRegressor' to the node. +Node Y is a non-root node with continuous data. Assigning 'AdditiveNoiseModel using HistGradientBoostingRegressor' to the node. This represents the causal relationship as Y := f(X0,X1,X2,X3,X4) + N. 
-For the model selection, the following models were evaluated on the mean squared error (MSE) metric: -* -Based on the type of causal mechanism, the model with the lowest metric value represents the best choice.""" - - assert ( - summary_string.split( - "For the model selection, the following models were evaluated on the mean squared error (MSE) metric:" - )[0] - == expected_summary.split( - "For the model selection, the following models were evaluated on the mean squared error (MSE) metric:" - )[0] +For the model selection, the following models were evaluated on the mean squared error (MSE) metric:""" + in summary_string ) assert ( - "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice." + """===Note=== +Note, based on the selected auto assignment quality, the set of evaluated models changes. +For more insights toward the quality of the fitted graphical causal model, consider using the evaluate_causal_model function after fitting the causal mechanisms.""" in summary_string ) @@ -408,40 +441,50 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo assert len(summary_result._nodes["X4"]["model_performances"]) == 0 assert len(summary_result._nodes["Y"]["model_performances"]) > 0 - expected_summary = """Analyzed 6 nodes. + assert ( + """The following types of causal mechanisms are considered for the automatic selection: + +If root node: +An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of data modalities. 
+ +If non-root node and the data is continuous: +Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i.To select the best model for f, different regression models are evaluated and the model with the smallest mean squared error is selected.Note that minimizing the mean squared error here is equivalent to selecting the best choice of an ANM. + +If non-root node and the data is discrete: +Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional constraint for f to only return discrete values. +Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider representing them as strings to ensure proper model selection. + +If non-root node and the data is categorical: +A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). +Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected. + +In total, 6 nodes were analyzed: --- Node: X0 -Node X0 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X0 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X1 -Node X1 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X1 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X2 -Node X2 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X2 is a root node. 
Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X3 -Node X3 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X3 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X4 -Node X4 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X4 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: Y -Node Y is a non-root node. Assigning 'Classifier FCM based on LogisticRegression(max_iter=10000)' to the node. -This represents the causal relationship as Y := f(X0,X1,X2,X3,X4,N). -For the model selection, the following models were evaluated on the (negative) F1 metric: -* -Based on the type of causal mechanism, the model with the lowest metric value represents the best choice.""" - - assert ( - summary_string.split( - "For the model selection, the following models were evaluated on the (negative) F1 metric:" - )[0] - == expected_summary.split( - "For the model selection, the following models were evaluated on the (negative) F1 metric:" - )[0] +Node Y is a non-root node with categorical data. Assigning 'Classifier FCM based on """ + in summary_string ) + assert "This represents the causal relationship as Y := f(X0,X1,X2,X3,X4,N)." in summary_string + assert "For the model selection, the following models were evaluated on the (negative) F1 metric:" in summary_string assert ( - "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice." + """===Note=== +Note, based on the selected auto assignment quality, the set of evaluated models changes. 
+For more insights toward the quality of the fitted graphical causal model, consider using the evaluate_causal_model function after fitting the causal mechanisms.""" in summary_string ) diff --git a/tests/gcm/test_fcms.py b/tests/gcm/test_fcms.py index 64ab7f844f..8c1eae5054 100644 --- a/tests/gcm/test_fcms.py +++ b/tests/gcm/test_fcms.py @@ -12,6 +12,7 @@ from dowhy.gcm import ( AdditiveNoiseModel, ClassifierFCM, + DiscreteAdditiveNoiseModel, EmpiricalDistribution, PostNonlinearModel, ProbabilisticCausalModel, @@ -33,6 +34,7 @@ InvertibleIdentityFunction, InvertibleLogarithmicFunction, ) +from dowhy.gcm.util.general import is_discrete def test_given_linear_data_when_fit_causal_graph_with_linear_anm_then_learns_correct_coefficients(): @@ -241,6 +243,41 @@ def test_given_logarithmic_data_when_fit_post_non_linear_sem_with_invertible_log assert sem_fitted.prediction_model.sklearn_model.coef_ == approx(np.array([2]), abs=0.05) +@flaky(max_runs=3) +def test_given_discrete_target_data_when_fit_discrete_additive_noise_model_then_behaves_as_expected(): + X = np.random.normal(0, 1, (1000, 2)) + X[X > 3] = 3 + X[X < -3] = -3 + Y = np.round(np.sum(X, axis=1)) + + danm = DiscreteAdditiveNoiseModel(create_linear_regressor()) + danm.fit(X, Y) + + test_X = np.random.normal(0, 1, (1000, 2)) + test_X[test_X > 3] = 3 + test_X[test_X < -3] = -3 + test_Y = np.round(np.sum(test_X, axis=1)).reshape(-1) + + assert danm.evaluate(test_X, np.zeros(1000)).reshape(-1) == approx(test_Y, abs=3) + assert danm.evaluate(test_X, np.zeros(1000)).reshape(-1) == approx(test_Y, abs=3) + assert set(danm.draw_samples(test_X).reshape(-1)).issubset(set(Y.reshape(-1))) + assert is_discrete(danm.draw_samples(test_X)) + + assert danm.estimate_noise(np.array([0, 1, 2]), np.array([[-1, 1], [0, 0], [0, 1]])).reshape(-1) == approx( + np.array([0, 1, 1]) + ) + + X = np.array([0.1, 10.5, 20, 30.7, 40.3]) + Y = np.floor(X) # Y has only 0, 10, 20, 30, 40 + + danm = 
DiscreteAdditiveNoiseModel(create_linear_regressor()) + danm.fit(X, Y) + + assert danm.evaluate(np.array([-100, -32.4, 0.4, 4, 9, 11, 30.1, 101.4, 0.9]), np.zeros(9)) == approx( + [-100, -32, 0, 4, 9, 11, 30, 101, 1] + ) + + def _generate_data_with_categorical_input(): X0 = np.random.normal(0, 1, 1000) X1 = np.random.choice(3, 1000).astype(str) diff --git a/tests/gcm/test_model_evaluation.py b/tests/gcm/test_model_evaluation.py index 6ce9d00a47..225e00c444 100644 --- a/tests/gcm/test_model_evaluation.py +++ b/tests/gcm/test_model_evaluation.py @@ -234,11 +234,18 @@ def test_given_continuous_data_only_when_evaluate_model_returns_expected_informa summary_string = str(summary) assert ( - """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and graph structure. The results are as follows: + """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and the graph structure. The results are as follows: ==== Evaluation of Causal Mechanisms ==== -Root nodes are evaluated based on the KL divergence between the generated and the observed distribution. -Non-root nodes are mainly evaluated based on the (normalized) Continuous Ranked Probability Score (CRPS), which is a generalizes the Mean Absolute Percentage Error to probabilistic predictions. Since the causal mechanisms produce conditional distributions, this should give some insights into their performance and calibration. In addition, the mean squared error (MSE), the normalized MSE (NMSE), the R2 coefficient and the F1 score (for categorical nodes) is reported.""" +The used evaluation metrics are: +- KL divergence (only for root-nodes): Evaluates the divergence between the generated and the observed distribution. 
+- Mean Squared Error (MSE): Evaluates the average squared differences between the observed values and the conditional expectation of the causal mechanisms. +- Normalized MSE (NMSE): The MSE normalized by the standard deviation for better comparison. +- R2 coefficient: Indicates how much variance is explained by the conditional expectations of the mechanisms. Note, however, that this can be misleading for nonlinear relationships. +- F1 score (only for categorical non-root nodes): The harmonic mean of the precision and recall indicating the goodness of the underlying classifier model. +- (normalized) Continuous Ranked Probability Score (CRPS): The CRPS generalizes the Mean Absolute Percentage Error to probabilistic predictions. This gives insights into the accuracy and calibration of the causal mechanisms. +NOTE: Every metric focuses on different aspects and they might not consistently indicate a good or bad performance. +We will mostly utilize the CRPS for comparing and interpreting the performance of the mechanisms, since this captures the most important properties for the causal model.""" in summary_string ) assert "--- Node X0\n" "- The KL divergence between generated and observed distribution is " in summary_string @@ -328,11 +335,18 @@ def test_given_categorical_data_only_when_evaluate_model_returns_expected_inform summary_string = str(summary) assert ( - """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and graph structure. The results are as follows: + """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and the graph structure. 
The results are as follows: ==== Evaluation of Causal Mechanisms ==== -Root nodes are evaluated based on the KL divergence between the generated and the observed distribution. -Non-root nodes are mainly evaluated based on the (normalized) Continuous Ranked Probability Score (CRPS), which is a generalizes the Mean Absolute Percentage Error to probabilistic predictions. Since the causal mechanisms produce conditional distributions, this should give some insights into their performance and calibration. In addition, the mean squared error (MSE), the normalized MSE (NMSE), the R2 coefficient and the F1 score (for categorical nodes) is reported.""" +The used evaluation metrics are: +- KL divergence (only for root-nodes): Evaluates the divergence between the generated and the observed distribution. +- Mean Squared Error (MSE): Evaluates the average squared differences between the observed values and the conditional expectation of the causal mechanisms. +- Normalized MSE (NMSE): The MSE normalized by the standard deviation for better comparison. +- R2 coefficient: Indicates how much variance is explained by the conditional expectations of the mechanisms. Note, however, that this can be misleading for nonlinear relationships. +- F1 score (only for categorical non-root nodes): The harmonic mean of the precision and recall indicating the goodness of the underlying classifier model. +- (normalized) Continuous Ranked Probability Score (CRPS): The CRPS generalizes the Mean Absolute Percentage Error to probabilistic predictions. This gives insights into the accuracy and calibration of the causal mechanisms. +NOTE: Every metric focuses on different aspects and they might not consistently indicate a good or bad performance. 
+We will mostly utilize the CRPS for comparing and interpreting the performance of the mechanisms, since this captures the most important properties for the causal model.""" in summary_string ) assert "--- Node X0\n" "- The KL divergence between generated and observed distribution is " in summary_string diff --git a/tests/gcm/test_whatif.py b/tests/gcm/test_whatif.py index 5dbdf90f5a..345780bef9 100644 --- a/tests/gcm/test_whatif.py +++ b/tests/gcm/test_whatif.py @@ -8,6 +8,7 @@ from dowhy.gcm import ( AdditiveNoiseModel, ClassifierFCM, + DiscreteAdditiveNoiseModel, EmpiricalDistribution, InvertibleStructuralCausalModel, ProbabilisticCausalModel, @@ -17,7 +18,11 @@ fit, interventional_samples, ) -from dowhy.gcm.ml import create_linear_regressor, create_logistic_regression_classifier +from dowhy.gcm.ml import ( + create_hist_gradient_boost_regressor, + create_linear_regressor, + create_logistic_regression_classifier, +) def _create_and_fit_simple_probabilistic_causal_model(): @@ -243,3 +248,33 @@ def test_given_binary_target_when_estimate_average_causal_effect_then_return_exp interventions_reference={"T": lambda x: 0}, num_samples_to_draw=1000, ) == approx(0.5, abs=0.1) + + +@flaky(max_runs=3) +def test_given_discrete_data_when_performing_interventions_then_returns_correct_samples(): + X = np.random.normal(0, 1, 1000) + Y = [] + for x in X: + if x < -1.5: + Y.append(-1) + elif -1.5 <= x <= 1.5: + Y.append(0) + else: + Y.append(1) + Y = np.array(Y) + Z = 2 * Y + np.random.normal(0, 0.1, 1000) + + causal_model = ProbabilisticCausalModel(nx.DiGraph([("X", "Y"), ("Y", "Z")])) + causal_model.set_causal_mechanism("X", EmpiricalDistribution()) + causal_model.set_causal_mechanism( + "Y", DiscreteAdditiveNoiseModel(prediction_model=create_hist_gradient_boost_regressor()) + ) + causal_model.set_causal_mechanism("Z", AdditiveNoiseModel(prediction_model=create_linear_regressor())) + data = pd.DataFrame({"X": X, "Y": Y, "Z": Z}) + + fit(causal_model, data) + + samples = 
interventional_samples(causal_model, {"X": lambda x: -2}, num_samples_to_draw=1000) + assert np.all(samples["X"].to_numpy() == -2) + assert np.median(samples["Y"].to_numpy()) == -1 + assert np.mean(samples["Z"].to_numpy()) == approx(-2, abs=0.05) diff --git a/tests/gcm/util/test_general.py b/tests/gcm/util/test_general.py index 74904ecaf1..f29df56585 100644 --- a/tests/gcm/util/test_general.py +++ b/tests/gcm/util/test_general.py @@ -16,6 +16,7 @@ fit_one_hot_encoders, has_categorical, is_categorical, + is_discrete, set_random_seed, setdiff2d, shape_into_2d, @@ -207,3 +208,14 @@ def test_given_categorical_data_when_using_auto_fit_and_apply_encoder_then_retur ] ).T ) + + +def test_given_discrete_data_when_calling_is_discrete_then_returns_true(): + assert is_discrete(np.array([0, -4, 5, 10])) + assert is_discrete(np.array([0, -4, 5, 10]).reshape(-1, 1)) + + +def test_given_non_discrete_data_when_calling_is_discrete_then_returns_false(): + assert not is_discrete(np.array([0, -4, 5, 10, 1.0000000001, 0.000000001, 10**-15, 99.9, 40.5])) + assert not is_discrete(np.array([10**-15])) + assert not is_discrete(np.array([0, -4, 5, 10, 1.0000000001, 0.000000001, 10**-15, 99.9, 40.5]).reshape(-1, 1))