Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a bias detector based on optimal transport #434

Merged
merged 26 commits into from
Jul 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
8e42e67
Updated all required files
Illia-Kryvoviaz Jun 7, 2023
7a85647
Some minor changes in files and updated tests
Illia-Kryvoviaz Jun 7, 2023
0d276aa
Deleted extra prints
Illia-Kryvoviaz Jun 7, 2023
a6faacd
Simplifying the ot notebook and correcting some mistypes
Illia-Kryvoviaz Jun 10, 2023
2db5f83
Added more examples to ot notebook
Illia-Kryvoviaz Jun 12, 2023
6d77b72
Update __init__.py
Illia-Kryvoviaz Jun 12, 2023
a9e4b55
Update detectors.py
Illia-Kryvoviaz Jun 12, 2023
4921f5b
Update requirements.txt
Illia-Kryvoviaz Jun 12, 2023
881ea08
Improving the notebook and adding a new feature
Illia-Kryvoviaz Jun 18, 2023
90c538f
Dev ot detector (#4)
Illia-Kryvoviaz Jun 24, 2023
7029b20
Added outputs to the notebook
Illia-Kryvoviaz Jun 24, 2023
425452b
Minor docstrings changes and update detectors.py
Illia-Kryvoviaz Jun 25, 2023
0be802e
changed demo_ot_detector to use load_preproc_data_adult
Illia-Kryvoviaz Jul 10, 2023
f96a451
ot_detector: renamed sensitive_attribute to prot_attr, minor changes
Illia-Kryvoviaz Jul 10, 2023
9c46f9a
updated comments, demo_ot_detector.ipynb
Illia-Kryvoviaz Jul 11, 2023
872f112
ot_detector: removed str arguments
Illia-Kryvoviaz Jul 11, 2023
d280b4e
ot_detector: added cost_matrix as a named parameter, minor changes
Illia-Kryvoviaz Jul 11, 2023
aa24084
ot_detector: minor changes
Illia-Kryvoviaz Jul 11, 2023
f587762
added outputs to demo_ot_detector
Illia-Kryvoviaz Jul 11, 2023
2094b47
ot_detector: changed default scoring to Wasserstein1
Illia-Kryvoviaz Jul 11, 2023
47a2678
moved OT from detector to metric
Illia-Kryvoviaz Jul 14, 2023
6acd270
renamed ot_detector to ot_metric
Illia-Kryvoviaz Jul 14, 2023
6155f99
reworked demo_ot_metric to use aif360.sklearn definition
Illia-Kryvoviaz Jul 14, 2023
3d3f039
renamed ot_bias_scan to ot_distance, minor changes
Illia-Kryvoviaz Jul 21, 2023
bb2ec68
detectors.py: reset changes
Illia-Kryvoviaz Jul 21, 2023
10c7ec8
test_ot_metric: minor changes
Illia-Kryvoviaz Jul 21, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 225 additions & 0 deletions aif360/metrics/ot_metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
from typing import Union
import pandas as pd
import numpy as np
import ot
from sklearn.preprocessing import LabelEncoder

def _normalize(distribution1, distribution2):
    """
    Rescale two distributions in place so that each one sums to 1.

    If either array contains a negative entry, both arrays are first shifted
    upward by the absolute value of the smallest entry found across the two,
    so that no entry is negative. Each array is then divided by its own total;
    an array whose total is exactly 0 is left unscaled.

    Both arrays are modified in place and nothing is returned.

    Args:
        distribution1 (numpy array): raw (unnormalized) distribution
        distribution2 (numpy array): raw (unnormalized) distribution
    """
    pair = (distribution1, distribution2)

    # Smallest entry across both arrays; a negative value triggers a common shift.
    lowest = np.minimum(np.min(distribution1), np.min(distribution2))
    if lowest < 0:
        shift = -lowest
        for dist in pair:
            dist += shift

    for dist in pair:
        mass = np.sum(dist)
        # Skip the division for an all-zero array to avoid 0/0.
        if mass != 0:
            dist /= mass

def _transform(ground_truth, classifier, cost_matrix=None):
    """
    Convert two pandas objects into normalized numpy distributions plus a cost matrix.

    Both inputs are converted to float numpy arrays (fresh copies, so the
    caller's pandas data is not modified) and passed through `_normalize`,
    which shifts them to be non-negative and rescales each to sum to 1.

    When no `cost_matrix` is supplied, the cost between cell `i` of the first
    distribution and cell `j` of the second is the absolute difference of the
    normalized values: `|initial_distribution[i] - required_distribution[j]|`.

    Args:
        ground_truth (series): ground truth (correct) target values
        classifier (series): estimated targets as returned by a model
        cost_matrix (numpy array, optional): user-supplied cost matrix. When
            given it is returned unchanged instead of the default
            absolute-difference matrix.

    Returns:
        initial_distribution, the processed ground_truth (numpy array)
        required_distribution, the processed classifier (numpy array)
        matrix_distance, the pairwise costs between cells of the two distributions (2d numpy array)
    """
    initial_distribution = ground_truth.to_numpy().astype(float)
    required_distribution = classifier.to_numpy().astype(float)

    # Works in place on the two freshly created arrays.
    _normalize(initial_distribution, required_distribution)

    if cost_matrix is not None:
        matrix_distance = cost_matrix
    else:
        # Vectorized pairwise |a_i - b_j|; same values as looping over the rows.
        matrix_distance = np.abs(
            np.subtract.outer(initial_distribution, required_distribution))
    return initial_distribution, required_distribution, matrix_distance

def _evaluate(
        ground_truth: pd.Series,
        classifier: pd.Series,
        prot_attr: pd.Series = None,
        num_iters=1e5,
        cost_matrix: np.ndarray = None,
        **kwargs):
    """Calculate the earth mover's distance between `ground_truth` and `classifier`.

    Without `prot_attr` a single distance over all samples is returned. With
    `prot_attr`, the samples are split by each distinct protected-attribute
    value and one distance is computed per group.

    Args:
        ground_truth (pd.Series): ground truth (correct) target values
        classifier (pd.Series): estimated target values
        prot_attr (pd.Series, optional): sensitive attribute values. It is used
            as a boolean mask on the other two series, so it must share their index.
        num_iters (int, optional): maximum number of iterations handed to the
            EMD solver (`numItermax` of `ot.emd2`). Should be positive.
            Float values such as the default `1e5` are converted to `int`.
        cost_matrix (np.ndarray, optional): cost matrix forwarded to `_transform`.
            The same matrix is used for every group.
        **kwargs: accepted for call compatibility; not used here.

    Returns:
        float or dict: the value returned by `ot.emd2` when `prot_attr` is None,
        otherwise a dict mapping each protected-attribute value to that value
        for the corresponding group.

    Raises:
        ValueError: if `prot_attr` is given and `ground_truth` does not contain
            exactly 2 distinct values.
    """
    # The solver's iteration cap is documented as an integer, while callers
    # (and the default) may hand over a float like 1e5.
    max_iter = int(num_iters)

    # Calculate just the EMD between ground_truth and classifier
    if prot_attr is None:
        source, target, cost = _transform(ground_truth, classifier, cost_matrix)
        return ot.emd2(a=source, b=target, M=cost, numItermax=max_iter)

    n_targets = ground_truth.nunique()
    if n_targets != 2:
        raise ValueError(f"Expected to have exactly 2 target values, got {n_targets}.")

    # Calculate EMD between ground truth and predictions within each group
    emds = {}
    for sa_val in sorted(prot_attr.unique()):
        in_group = prot_attr == sa_val
        source, target, cost = _transform(
            ground_truth[in_group], classifier[in_group], cost_matrix)
        emds[sa_val] = ot.emd2(a=source, b=target, M=cost, numItermax=max_iter)

    return emds


def ot_distance(
    ground_truth: pd.Series,
    classifier: Union[pd.Series, pd.DataFrame],
    prot_attr: pd.Series = None,
    favorable_value: Union[str, float] = None,
    scoring: str = "Wasserstein1",
    num_iters: int = 1e5,
    penalty: float = 1e-17,
    mode: str = "binary",
    cost_matrix: np.ndarray = None,
    **kwargs,
):
    """Normalize and calculate Wasserstein distance between groups defined by `prot_attr` in `ground_truth` and `classifier`.

    Args:
        ground_truth (pd.Series): ground truth (correct) target values.
        classifier (pd.Series, pd.DataFrame): estimated target values.
            Must be a `pd.Series` in binary and continuous modes and a
            `pd.DataFrame` with one column per class (columns named after the
            class labels) in nominal and ordinal modes.
            If `None`, a dummy model is used that predicts the mean of the targets,
            or 1/(number of categories) for every class in nominal/ordinal mode.
        prot_attr (pd.Series): sensitive attribute values.
            If `None`, all samples are treated as a single group.
            Its index (and the classifier's) is overwritten with the index of
            `ground_truth`, so rows are matched by position.
        favorable_value (str, float, optional): "high", "low", the minimum or
            maximum of the (encoded) ground truth, or one of the original
            ground-truth labels. Defaults to "high" for binary, ordinal and
            continuous modes. It only affects the result in binary mode, where
            the ground truth is turned into an indicator of this value.
        scoring (str): only 'Wasserstein1' is accepted.
        num_iters (int, optional): maximum number of iterations for the EMD solver. Should be positive.
        penalty (float, optional): accepted for interface compatibility; not used by this function.
        mode: one of ['binary', 'continuous', 'nominal', 'ordinal']. Defaults to binary.
        cost_matrix (np.ndarray): cost matrix for the Wasserstein distance.
            Defaults to absolute difference between samples.
        **kwargs: forwarded unchanged to `_evaluate`.

    Returns:
        float or dict: in binary and continuous modes, the result of `_evaluate`
        (a single distance when `prot_attr` is None, otherwise a dict keyed by
        protected-attribute value). In nominal and ordinal modes, a dict mapping
        each class label to such a result (one-vs-rest).

    Raises:
        ValueError: if `mode` or `scoring` is not supported, if `mode` is
            'binary' but `ground_truth` has more than 2 unique values, if
            `favorable_value` is not recognised, or if the classifier frame
            does not have one column per class in nominal/ordinal mode.
        TypeError: if `ground_truth`, `classifier`, `prot_attr` or
            `cost_matrix` has an unsupported type.
    """
    # Assert correct mode passed
    valid_modes = ['binary', 'continuous', 'nominal', 'ordinal']
    if mode not in valid_modes:
        raise ValueError(f"Expected one of {valid_modes}, got {mode}.")

    # Assert correct types passed to ground_truth, classifier and prot_attr.
    # Strings are rejected up front: everything below needs the pandas API
    # (.copy(), .index, .unique()), so a str could never have worked.
    if not isinstance(ground_truth, pd.Series):
        raise TypeError(f"ground_truth: expected pd.Series, got {type(ground_truth)}")
    if classifier is not None:
        if mode in ["binary", "continuous"] and not isinstance(classifier, pd.Series):
            raise TypeError(f"classifier: expected pd.Series for {mode} mode, got {type(classifier)}")
        if mode in ["nominal", "ordinal"] and not isinstance(classifier, pd.DataFrame):
            raise TypeError(f"classifier: expected pd.DataFrame for {mode} mode, got {type(classifier)}")
    if prot_attr is not None and not isinstance(prot_attr, pd.Series):
        raise TypeError(f"prot_attr: expected pd.Series, got {type(prot_attr)}")

    # Assert correct type passed to cost_matrix
    if cost_matrix is not None and not isinstance(cost_matrix, np.ndarray):
        raise TypeError(f"cost_matrix: expected numpy.ndarray, got {type(cost_matrix)}")

    # Assert scoring is "Wasserstein1"
    if scoring != "Wasserstein1":
        raise ValueError(f'Scoring mode can only be "Wasserstein1", got {scoring}')

    # Work on copies so the caller's objects keep their own index.
    grt = ground_truth.copy()

    if classifier is not None:
        cls = classifier.copy()
        if prot_attr is not None:
            cls.index = grt.index
    else:
        cls = None

    if prot_attr is not None:
        sat = prot_attr.copy()
        sat.index = grt.index
    else:
        sat = None

    # Keep the raw labels: class columns of `classifier` and a label passed as
    # `favorable_value` refer to these, not to the integer codes created below.
    labels = grt
    uniques = list(labels.unique())
    if mode == "binary":
        if len(uniques) > 2:
            raise ValueError(f"Only 2 unique values allowed in ground_truth for binary mode, got {uniques}")

    # Encode non-numeric targets as integers, preserving the index so that
    # boolean masks built from `sat` still line up with `grt`.
    encoded = False
    if not np.issubdtype(grt.dtype, np.number):
        grt_encoder = LabelEncoder().fit(labels)
        grt = pd.Series(grt_encoder.transform(labels), index=labels.index)
        encoded = True

    # Set correct favorable value (this tells us if higher or lower is better)
    min_val, max_val = grt.min(), grt.max()

    raw_label_given = False
    if favorable_value == 'high':
        favorable_value = max_val
    elif favorable_value == 'low':
        favorable_value = min_val
    elif favorable_value is None:
        if mode in ["binary", "ordinal", "continuous"]:
            favorable_value = max_val  # Default to higher is better
        elif mode == "nominal":
            favorable_value = "flag-all"  # Default to scan through all categories
    else:
        # The caller named a value explicitly; remember whether it is one of
        # the original (pre-encoding) labels.
        raw_label_given = encoded and favorable_value in uniques

    if favorable_value not in [min_val, max_val, "flag-all", *uniques]:
        raise ValueError(f"Favorable_value should be high, low, or one of categories {uniques}, got {favorable_value}.")

    if mode == "binary":
        # Turn the target into a 0/1 indicator of the favorable value. A raw
        # label must be compared with the raw labels, not the integer codes.
        reference = labels if raw_label_given else grt
        grt = pd.Series(reference == favorable_value, dtype=int)
        if cls is None:
            cls = pd.Series(grt.mean(), index=grt.index)
        emds = _evaluate(grt, cls, sat, num_iters, cost_matrix, **kwargs)

    elif mode == "continuous":
        if cls is None:
            cls = pd.Series(grt.mean(), index=grt.index)
        emds = _evaluate(grt, cls, sat, num_iters, cost_matrix, **kwargs)

    ## TODO: rework ordinal mode to take into account distance between pred and true
    elif mode in ["nominal", "ordinal"]:
        n_classes = grt.nunique()
        if cls is None:
            # Dummy model: uniform score for every class, one column per
            # class label and one row per sample.
            cls = pd.DataFrame(1 / n_classes, index=grt.index, columns=uniques)
        if n_classes != cls.shape[-1]:
            raise ValueError(
                f"classifier must have a column for each class. Expected shape [:, {n_classes}], got {cls.shape}")
        emds = {}
        for class_label in uniques:
            # One-vs-rest indicator built from the raw labels.
            grt_cl = labels.map({class_label: 1}).fillna(0)
            cls_cl = cls[class_label]
            emds[class_label] = _evaluate(grt_cl, cls_cl, sat, num_iters, cost_matrix, **kwargs)

    return emds
62 changes: 61 additions & 1 deletion aif360/sklearn/metrics/metrics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from itertools import permutations
from typing import Union

import numpy as np
import pandas as pd
Expand All @@ -10,10 +11,12 @@
from sklearn.utils import check_X_y
from sklearn.utils.deprecation import deprecated

from aif360.metrics import ot_metric
from aif360.sklearn.utils import check_inputs, check_groups
from aif360.detectors.mdss.ScoringFunctions import BerkJones, Bernoulli
from aif360.detectors.mdss.MDSS import MDSS


__all__ = [
# meta-metrics
'difference', 'ratio', 'intersection', 'one_vs_rest',
Expand All @@ -24,7 +27,7 @@
'specificity_score', 'base_rate', 'selection_rate', 'smoothed_base_rate',
'smoothed_selection_rate', 'generalized_fpr', 'generalized_fnr',
# group fairness
'statistical_parity_difference', 'disparate_impact_ratio',
'ot_distance', 'statistical_parity_difference', 'disparate_impact_ratio',
'equal_opportunity_difference', 'average_odds_difference', 'average_predictive_value_difference',
'average_odds_error', 'class_imbalance', 'kl_divergence',
'conditional_demographic_disparity', 'smoothed_edf',
Expand Down Expand Up @@ -499,6 +502,63 @@ def generalized_fnr(y_true, probas_pred, *, pos_label=1, sample_weight=None,


# ============================ GROUP FAIRNESS ==================================
def ot_distance(
    y_true: pd.Series,
    y_pred: Union[pd.Series, pd.DataFrame],
    prot_attr: pd.Series = None,
    pos_label: Union[str, float] = None,
    scoring: str = "Wasserstein1",
    num_iters: int = 1e5,
    penalty: float = 1e-17,
    mode: str = "binary",
    cost_matrix: np.ndarray=None,
    **kwargs,
):
    """Wasserstein (earth mover's) distance between `y_true` and `y_pred`, optionally per protected group.

    Thin adapter that maps the scikit-learn style argument names used in this
    module onto :func:`aif360.metrics.ot_metric.ot_distance` and returns its
    result unchanged.

    Args:
        y_true (pd.Series): ground truth (correct) target values. Forwarded as
            `ground_truth`.
        y_pred (pd.Series, pd.DataFrame): estimated target values. Forwarded as
            `classifier`. In nominal/ordinal mode this is expected to be a
            `pd.DataFrame` with one column of predictions per class.
            May be `None`, in which case the underlying function substitutes
            a dummy model.
        prot_attr (pd.Series): sensitive attribute values.
            If `None`, all samples are treated as one group.
        pos_label (str, float, optional): forwarded as `favorable_value`.
            Either "high", "low", or a value present in the ground truth.
        scoring (str): only 'Wasserstein1' is accepted by the underlying function.
        num_iters (int, optional): iteration limit forwarded to the EMD computation. Should be positive.
        penalty (float, optional): penalty term, forwarded as-is.
        mode: one of ['binary', 'continuous', 'nominal', 'ordinal']. Defaults to binary.
        cost_matrix (np.ndarray): cost matrix for the Wasserstein distance. Defaults to absolute difference between samples.
        **kwargs: any further keyword arguments, forwarded as-is.

    Returns:
        float or dict: whatever `ot_metric.ot_distance` returns, i.e. an earth
        mover's distance or a dictionary of distances.

    Raises:
        ValueError, TypeError: propagated from `ot_metric.ot_distance` when the
            arguments fail its validation.
    """
    # Translate this module's parameter names to the ones ot_metric expects.
    forwarded = {
        "ground_truth": y_true,
        "classifier": y_pred,
        "prot_attr": prot_attr,
        "favorable_value": pos_label,
        "scoring": scoring,
        "num_iters": num_iters,
        "penalty": penalty,
        "mode": mode,
        "cost_matrix": cost_matrix,
    }
    return ot_metric.ot_distance(**forwarded, **kwargs)

def statistical_parity_difference(y_true, y_pred=None, *, prot_attr=None,
priv_group=1, pos_label=1, sample_weight=None):
r"""Difference in selection rates.
Expand Down
Loading