Issue 265 privileged class bank dataset (#449)

* Updated readme for bank dataset
* Added age >= 60 to unprivileged group in bank_dataset.py
* Added tests for bank dataset
* Fixed linting errors in tests/test_standard_datasets.py
* Added binary_age to fetch_bank
* Download bank dataset in ci.yml
Trusted-AI · Jul 27, 2023 · 6f9972e · 6f9972e
1 parent 502ff47
commit 6f9972e
Show file tree

Hide file tree

Showing 6 changed files with 86 additions and 28 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -57,6 +57,7 @@ jobs:
           wget ${UCI_DB}/statlog/german/german.data -P aif360/data/raw/german/
           wget ${UCI_DB}/statlog/german/german.doc -P aif360/data/raw/german/
           wget ${PROPUBLICA_GH}/compas-scores-two-years.csv -P aif360/data/raw/compas/
+          wget ${UCI_DB}/00222/bank-additional.zip -P aif360/data/raw/bank/ && unzip -j aif360/data/raw/bank/bank-additional.zip -d aif360/data/raw/bank/ && rm aif360/data/raw/bank/bank-additional.zip
           (cd aif360/data/raw/meps;Rscript generate_data.R <<< y)
 
       - name: Lint with flake8

diff --git a/aif360/data/raw/bank/README.md b/aif360/data/raw/bank/README.md
@@ -12,7 +12,7 @@ Additional information on dataset and features is available in `bank-additional-
 
 1. Download the file [bank-additional.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip).
 
-2. Extract files from the downloaded archive and place the files 'bank-additional.csv' and 'bank-additional-names.txt' into the current folder.
+2. Extract files from the downloaded archive and place the files 'bank-additional-full.csv' and 'bank-additional-names.txt' into the current folder.
 
 ## Relevant Papers
 

diff --git a/aif360/datasets/bank_dataset.py b/aif360/datasets/bank_dataset.py
@@ -13,7 +13,7 @@ class BankDataset(StandardDataset):
 
     def __init__(self, label_name='y', favorable_classes=['yes'],
                  protected_attribute_names=['age'],
-                 privileged_classes=[lambda x: x >= 25],
+                 privileged_classes=[lambda x: x >= 25 and x < 60],
                  instance_weights_name=None,
                  categorical_features=['job', 'marital', 'education', 'default',
                      'housing', 'loan', 'contact', 'month', 'day_of_week',
@@ -24,8 +24,15 @@ def __init__(self, label_name='y', favorable_classes=['yes'],
         """See :obj:`StandardDataset` for a description of the arguments.
 
         By default, this code converts the 'age' attribute to a binary value
-        where privileged is `age >= 25` and unprivileged is `age < 25` as in
-        :obj:`GermanDataset`.
+        where privileged is `25 <= age < 60` and unprivileged is `age < 25` or `age >= 60`
+        as suggested in Le Quy, Tai, et al. [1]_.
+
+        References:
+            .. [1] Le Quy, Tai, et al. "A survey on datasets for fairness-aware
+               machine learning." Wiley Interdisciplinary Reviews: Data Mining
+               and Knowledge Discovery 12.3 (2022): e1452.
+
+
         """
 
         filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),

diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py
@@ -168,12 +168,19 @@ def fetch_german(*, data_home=None, cache=True, binary_age=True, usecols=None,
                                dropcols=dropcols, numeric_only=numeric_only,
                                dropna=dropna)
 
-def fetch_bank(*, data_home=None, cache=True, percent10=False, usecols=None,
-               dropcols=['duration'], numeric_only=False, dropna=False):
+def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False, 
+               usecols=None, dropcols=['duration'], numeric_only=False, dropna=False):
     """Load the Bank Marketing Dataset.
 
-    The protected attribute is 'age' (left as continuous). The outcome variable
-    is 'deposit': 'yes' or 'no'.
+    The protected attribute is 'age' (binarized by default as suggested by
+    [#lequy22]_: `25 <= age < 60` is considered privileged and `age < 25` or
+    `age >= 60` unprivileged; see the `binary_age` flag to keep this
+    continuous). The outcome variable is 'deposit': 'yes' or 'no'.
+
+    References:
+        .. [#lequy22] Le Quy, Tai, et al. "A survey on datasets for
+           fairness-aware machine learning." Wiley Interdisciplinary Reviews:
+           Data Mining and Knowledge Discovery 12.3 (2022): e1452.
 
     Note:
         By default, the data is downloaded from OpenML. See the `bank-marketing
@@ -228,7 +235,15 @@ def fetch_bank(*, data_home=None, cache=True, percent10=False, usecols=None,
             df[col] = df[col].cat.remove_categories('unknown')
     df.education = df.education.astype('category').cat.reorder_categories(
         ['primary', 'secondary', 'tertiary'], ordered=True)
-
-    return standardize_dataset(df, prot_attr='age', target='deposit',
+
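+    # binarize protected attribute (but not corresponding feature)
+    # NOTE(review): pd.cut intervals are right-closed by default, so bins
+    # [0, 24, 60, 100] put age == 60 in the '25-60' (privileged) group, while
+    # the docstring says age >= 60 is unprivileged -- confirm intended bound.
+    # NOTE(review): with binary_age=False, `age` is the str 'age', so the
+    # `.cat` call below raises AttributeError -- guard it on binary_age.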
+    age = (pd.cut(df.age, [0, 24, 60, 100], ordered=False,
+                labels=[0, 1, 0] if numeric_only 
+                else ['<25 or >=60', '25-60', '<25 or >=60'])
+        if binary_age else 'age')
+    age = age.cat.reorder_categories([0, 1] if numeric_only 
+                                    else ['<25 or >=60', '25-60'])
+
+    return standardize_dataset(df, prot_attr=[age], target='deposit',
                                usecols=usecols, dropcols=dropcols,
                                numeric_only=numeric_only, dropna=dropna)
diff --git a/tests/notebook_runner.py b/tests/notebook_runner.py
@@ -9,8 +9,11 @@
 
 def notebook_run(path):
     """Execute a notebook via nbconvert and collect output.
+    The original working directory is restored once execution completes.
        :returns (parsed nb object, execution errors)
     """
+    old_cwd = os.getcwd()
+
     dirname, __ = os.path.split(path)
     os.chdir(dirname)
 
@@ -31,5 +34,7 @@ def notebook_run(path):
     errors = [output for cell in nb.cells if "outputs" in cell
                      for output in cell["outputs"]
                      if output.output_type == "error"]
+
+    os.chdir(old_cwd)
 
     return nb, errors
diff --git a/tests/test_standard_datasets.py b/tests/test_standard_datasets.py
@@ -1,44 +1,74 @@
+""" Tests for standard dataset classes """
+
+from unittest.mock import patch
 import numpy as np
 import pandas as pd
-
-pd.set_option('display.max_rows', 50)
-pd.set_option('display.max_columns', 10)
-pd.set_option('display.width', 200)
+import os
 
 from aif360.datasets import AdultDataset
 from aif360.datasets import BankDataset
 from aif360.datasets import CompasDataset
 from aif360.datasets import GermanDataset
 from aif360.metrics import BinaryLabelDatasetMetric
 
+pd.set_option('display.max_rows', 50)
+pd.set_option('display.max_columns', 10)
+pd.set_option('display.width', 200)
 
 def test_compas():
+    ''' Test default loading for compas '''
     # just test that there are no errors for default loading...
-    cd = CompasDataset()
-    # print(cd)
+    compas_dataset = CompasDataset()
+    compas_dataset.validate_dataset()
 
 def test_german():
-    gd = GermanDataset()
-    bldm = BinaryLabelDatasetMetric(gd)
+    ''' Test default loading for german '''
+    german_dataset = GermanDataset()
+    bldm = BinaryLabelDatasetMetric(german_dataset)
     assert bldm.num_instances() == 1000
 
 def test_adult_test_set():
-    ad = AdultDataset()
-    # test, train = ad.split([16281])
-    test, train = ad.split([15060])
+    ''' Test default loading for adult, test set '''
+    adult_dataset = AdultDataset()
+    test, _ = adult_dataset.split([15060])
     assert np.any(test.labels)
 
 def test_adult():
-    ad = AdultDataset()
-    # print(ad.feature_names)
-    assert np.isclose(ad.labels.mean(), 0.2478, atol=5e-5)
-
-    bldm = BinaryLabelDatasetMetric(ad)
+    ''' Test default loading for adult, mean'''
+    adult_dataset = AdultDataset()
+    assert np.isclose(adult_dataset.labels.mean(), 0.2478, atol=5e-5)
+    bldm = BinaryLabelDatasetMetric(adult_dataset)
     assert bldm.num_instances() == 45222
 
 def test_adult_no_drop():
-    ad = AdultDataset(protected_attribute_names=['sex'],
+    ''' Test adult with custom features: no instances are dropped '''
+    adult_dataset = AdultDataset(protected_attribute_names=['sex'],
         privileged_classes=[['Male']], categorical_features=[],
         features_to_keep=['age', 'education-num'])
-    bldm = BinaryLabelDatasetMetric(ad)
+    bldm = BinaryLabelDatasetMetric(adult_dataset)
     assert bldm.num_instances() == 48842
+
+def test_bank():
+    ''' Test for errors during default loading '''
+    bank_dataset = BankDataset()
+    bank_dataset.validate_dataset()
+
+def test_bank_priviliged_attributes():
+    ''' Test if protected attribute age is correctly processed '''
+    # Bank Data Set
+    bank_dataset = BankDataset()
+    num_priv = bank_dataset.protected_attributes.sum()
+
+    # Raw data
+    # Raw CSV is expected in aif360/data/raw/bank/ (download it beforehand).
+    filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                            '..', 'aif360', 'data', 'raw', 'bank', 'bank-additional-full.csv')
+
+    bank_dataset_unpreproc = pd.read_csv(filepath, sep = ";", na_values = ["unknown"])
+    bank_dataset_unpreproc = bank_dataset_unpreproc.dropna()
+    num_priv_raw = len(bank_dataset_unpreproc[(bank_dataset_unpreproc["age"] >= 25) & (bank_dataset_unpreproc["age"] < 60)])
+    assert num_priv == num_priv_raw
+
+
+
+