Skip to content

Commit

Permalink
Issue 265 privileged class bank dataset (#449)
Browse files Browse the repository at this point in the history
* Updated readme for bank dataset
* Added age >60 to unprivileged group in bank_dataset.py
* Added tests for bank dataset
* Fixed linting errors for all tests/test_standard_datasets.py
* Added binary_age to fetch_bank
* Download bank dataset in ci.yml
  • Loading branch information
joosjegoedhart authored Jul 27, 2023
1 parent 502ff47 commit 6f9972e
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 28 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
wget ${UCI_DB}/statlog/german/german.data -P aif360/data/raw/german/
wget ${UCI_DB}/statlog/german/german.doc -P aif360/data/raw/german/
wget ${PROPUBLICA_GH}/compas-scores-two-years.csv -P aif360/data/raw/compas/
wget ${UCI_DB}/00222/bank-additional.zip -P aif360/data/raw/bank/ && unzip -j aif360/data/raw/bank/bank-additional.zip -d aif360/data/raw/bank/ && rm aif360/data/raw/bank/bank-additional.zip
(cd aif360/data/raw/meps;Rscript generate_data.R <<< y)
- name: Lint with flake8
Expand Down
2 changes: 1 addition & 1 deletion aif360/data/raw/bank/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Additional information on dataset and features is available in `bank-additional-

1. Download the file [bank-additional.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip).

2. Extract files from the downloaded archive and place the files 'bank-additional.csv' and 'bank-additional-names.txt' into the current folder.
2. Extract files from the downloaded archive and place the files 'bank-additional-full.csv' and 'bank-additional-names.txt' into the current folder.

## Relevant Papers

Expand Down
13 changes: 10 additions & 3 deletions aif360/datasets/bank_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class BankDataset(StandardDataset):

def __init__(self, label_name='y', favorable_classes=['yes'],
protected_attribute_names=['age'],
privileged_classes=[lambda x: x >= 25],
privileged_classes=[lambda x: x >= 25 and x < 60],
instance_weights_name=None,
categorical_features=['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month', 'day_of_week',
Expand All @@ -24,8 +24,15 @@ def __init__(self, label_name='y', favorable_classes=['yes'],
"""See :obj:`StandardDataset` for a description of the arguments.
By default, this code converts the 'age' attribute to a binary value
where privileged is `age >= 25` and unprivileged is `age < 25` as in
:obj:`GermanDataset`.
where privileged is `25 <= age < 60` and unprivileged is `age < 25` or `age >= 60`,
as suggested in Le Quy, Tai, et al. [1]_.
References:
.. [1] Le Quy, Tai, et al. "A survey on datasets for fairness‐aware machine
learning." Wiley Interdisciplinary Reviews: Data Mining and Knowledge
Discovery 12.3 (2022): e1452.
"""

filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
Expand Down
27 changes: 21 additions & 6 deletions aif360/sklearn/datasets/openml_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,19 @@ def fetch_german(*, data_home=None, cache=True, binary_age=True, usecols=None,
dropcols=dropcols, numeric_only=numeric_only,
dropna=dropna)

def fetch_bank(*, data_home=None, cache=True, percent10=False, usecols=None,
dropcols=['duration'], numeric_only=False, dropna=False):
def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False,
usecols=None, dropcols=['duration'], numeric_only=False, dropna=False):
"""Load the Bank Marketing Dataset.
The protected attribute is 'age' (left as continuous). The outcome variable
is 'deposit': 'yes' or 'no'.
The protected attribute is 'age' (binarized by default as suggested by [#lequy22]_:
25 <= age < 60 is considered privileged and age < 25 or age >= 60 unprivileged;
see the binary_age flag to keep this continuous). The outcome variable is 'deposit':
'yes' or 'no'.
References:
.. [#lequy22] Le Quy, Tai, et al. "A survey on datasets for fairness‐aware machine
learning." Wiley Interdisciplinary Reviews: Data Mining and Knowledge
Discovery 12.3 (2022): e1452.
Note:
By default, the data is downloaded from OpenML. See the `bank-marketing
Expand Down Expand Up @@ -228,7 +235,15 @@ def fetch_bank(*, data_home=None, cache=True, percent10=False, usecols=None,
df[col] = df[col].cat.remove_categories('unknown')
df.education = df.education.astype('category').cat.reorder_categories(
['primary', 'secondary', 'tertiary'], ordered=True)

return standardize_dataset(df, prot_attr='age', target='deposit',

# binarize protected attribute (but not corresponding feature)
age = (pd.cut(df.age, [0, 24, 60, 100], ordered=False,
labels=[0, 1, 0] if numeric_only
else ['<25 or >=60', '25-60', '<25 or >=60'])
if binary_age else 'age')
age = age.cat.reorder_categories([0, 1] if numeric_only
else ['<25 or >=60', '25-60'])

return standardize_dataset(df, prot_attr=[age], target='deposit',
usecols=usecols, dropcols=dropcols,
numeric_only=numeric_only, dropna=dropna)
5 changes: 5 additions & 0 deletions tests/notebook_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@

def notebook_run(path):
"""Execute a notebook via nbconvert and collect output.
Reset cwd after execution.
:returns (parsed nb object, execution errors)
"""
old_cwd = os.getcwd()

dirname, __ = os.path.split(path)
os.chdir(dirname)

Expand All @@ -31,5 +34,7 @@ def notebook_run(path):
errors = [output for cell in nb.cells if "outputs" in cell
for output in cell["outputs"]
if output.output_type == "error"]

os.chdir(old_cwd)

return nb, errors
66 changes: 48 additions & 18 deletions tests/test_standard_datasets.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,74 @@
""" Tests for standard dataset classes """

from unittest.mock import patch
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 200)
import os

from aif360.datasets import AdultDataset
from aif360.datasets import BankDataset
from aif360.datasets import CompasDataset
from aif360.datasets import GermanDataset
from aif360.metrics import BinaryLabelDatasetMetric

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 200)

def test_compas():
''' Test default loading for compas '''
# just test that there are no errors for default loading...
cd = CompasDataset()
# print(cd)
compas_dataset = CompasDataset()
compas_dataset.validate_dataset()

def test_german():
gd = GermanDataset()
bldm = BinaryLabelDatasetMetric(gd)
''' Test default loading for german '''
german_dataset = GermanDataset()
bldm = BinaryLabelDatasetMetric(german_dataset)
assert bldm.num_instances() == 1000

def test_adult_test_set():
ad = AdultDataset()
# test, train = ad.split([16281])
test, train = ad.split([15060])
''' Test default loading for adult, test set '''
adult_dataset = AdultDataset()
test, _ = adult_dataset.split([15060])
assert np.any(test.labels)

def test_adult():
ad = AdultDataset()
# print(ad.feature_names)
assert np.isclose(ad.labels.mean(), 0.2478, atol=5e-5)

bldm = BinaryLabelDatasetMetric(ad)
''' Test default loading for adult, mean'''
adult_dataset = AdultDataset()
assert np.isclose(adult_dataset.labels.mean(), 0.2478, atol=5e-5)
bldm = BinaryLabelDatasetMetric(adult_dataset)
assert bldm.num_instances() == 45222

def test_adult_no_drop():
ad = AdultDataset(protected_attribute_names=['sex'],
''' Test default loading for adult, number of instances '''
adult_dataset = AdultDataset(protected_attribute_names=['sex'],
privileged_classes=[['Male']], categorical_features=[],
features_to_keep=['age', 'education-num'])
bldm = BinaryLabelDatasetMetric(ad)
bldm = BinaryLabelDatasetMetric(adult_dataset)
assert bldm.num_instances() == 48842

def test_bank():
    ''' Smoke test: the bank dataset loads with default arguments and validates '''
    # Construction does the loading; validate_dataset() is the only check made.
    BankDataset().validate_dataset()

def test_bank_priviliged_attributes():
    ''' Compare the privileged-age count from BankDataset with the raw CSV.

    The sum of the dataset's protected attributes must equal the number of raw
    rows (after dropping rows containing 'unknown' values) whose age satisfies
    25 <= age < 60.
    '''
    # Count reported by the dataset class with its default arguments.
    num_priv = BankDataset().protected_attributes.sum()

    # Locate the raw file relative to this test module.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    raw_csv = os.path.join(tests_dir, '..', 'aif360', 'data', 'raw', 'bank',
                           'bank-additional-full.csv')

    # 'unknown' is read as NaN so that those rows are removed by dropna().
    raw = pd.read_csv(raw_csv, sep=";", na_values=["unknown"]).dropna()

    # Rows falling inside the privileged age range.
    in_privileged_range = (raw["age"] >= 25) & (raw["age"] < 60)
    num_priv_raw = len(raw[in_privileged_range])

    assert num_priv == num_priv_raw




0 comments on commit 6f9972e

Please sign in to comment.