Merge pull request #844 from talumbau/handle_samples

Sample from weights when not reading full dataset
PSLmodels · Aug 10, 2016 · 4fb4aba · 4fb4aba
2 parents c34ce25 + e59ad62
commit 4fb4aba
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 0 deletions.
diff --git a/taxcalc/records.py b/taxcalc/records.py
@@ -215,6 +215,12 @@ def __init__(self,
         # read extrapolation blowup factors and sample weights
         self._read_blowup(blowup_factors)
         self._read_weights(weights)
+        # weights must be same size as tax record data
+        if not self.WT.empty and self.dim != len(self.WT):
+            frac = float(self.dim) / len(self.WT)
+            self.WT = self.WT.iloc[self.index]
+            self.WT = self.WT / frac
+
         # specify current_year and FLPDYR values
         if isinstance(start_year, int):
             self._current_year = start_year
@@ -383,6 +389,7 @@ def _read_data(self, data, exact_calcs, schR_calcs):
             msg = 'data is neither a string nor a Pandas DataFrame'
             raise ValueError(msg)
         self.dim = len(taxdf)
+        self.index = taxdf.index
         # create class variables using taxdf column names
         READ_VARS = set()
         for varname in list(taxdf.columns.values):

diff --git a/taxcalc/tests/test_pufcsv.py b/taxcalc/tests/test_pufcsv.py
@@ -29,6 +29,38 @@
 import pytest
 import difflib
 import numpy as np
+import pandas as pd
+
+
+@pytest.mark.requires_pufcsv
+def test_sample():
+    """
+    Test that a 2% random sample of the data produces a combined tax
+    liability estimate close to the one from the full data set
+    """
+    # Full dataset
+    clp = Policy()
+    puf = Records(data=PUFCSV_PATH)
+    calc = Calculator(policy=clp, records=puf)
+    adt = calc.diagnostic_table(num_years=10)
+
+    # 2% sample of the dataset; fixed seed keeps any failure reproducible
+    clp2 = Policy()
+    tax_data_full = pd.read_csv(PUFCSV_PATH)
+    tax_data = tax_data_full.sample(frac=0.02, random_state=12345)
+    puf_sample = Records(data=tax_data)
+    calc_sample = Calculator(policy=clp2, records=puf_sample)
+    adt_sample = calc_sample.diagnostic_table(num_years=10)
+
+    # Get the combined tax liability row of both diagnostic tables and
+    # scale the largest yearly gap by the largest liability in either row
+    full_tax_liability = adt.loc["Combined liability ($b)"]
+    sample_tax_liability = adt_sample.loc["Combined liability ($b)"]
+    max_val = max(full_tax_liability.max(), sample_tax_liability.max())
+    rel_diff = max(abs(full_tax_liability - sample_tax_liability)) / max_val
+
+    # Fail when the largest yearly gap exceeds 5% of the largest liability
+    assert rel_diff < 0.05
 
 
 @pytest.mark.requires_pufcsv

diff --git a/taxcalc/tests/test_records.py b/taxcalc/tests/test_records.py
@@ -14,6 +14,7 @@
 # use 1991 PUF-like data to emulate current puf.csv, which is private
 TAXDATA_PATH = os.path.join(CUR_PATH, '..', 'altdata', 'puf91taxdata.csv.gz')
 TAXDATA = pd.read_csv(TAXDATA_PATH, compression='gzip')
+TAXDATA_SAMPLE = TAXDATA.sample(frac=0.10)
 WEIGHTS_PATH = os.path.join(CUR_PATH, '..', 'altdata', 'puf91weights.csv.gz')
 WEIGHTS = pd.read_csv(WEIGHTS_PATH, compression='gzip')
 
@@ -46,6 +47,22 @@ def test_correct_Records_instantiation():
     assert rec2.current_year == Records.PUF_YEAR
 
 
+def test_correct_Records_instantiation_sample():
+    """Records built from TAXDATA_SAMPLE instantiate in the PUF year."""
+    wtd = Records(data=TAXDATA_SAMPLE, blowup_factors=None, weights=WEIGHTS)
+    assert wtd
+    assert np.all(wtd.MARS != 0)
+    assert wtd.current_year == Records.PUF_YEAR
+    e00200_before = wtd.e00200.sum()
+    wtd.set_current_year(Records.PUF_YEAR + 1)
+    assert wtd.e00200.sum() == e00200_before
+    blowup = pd.read_csv(Records.BLOWUP_FACTORS_PATH)
+    blown = Records(data=TAXDATA_SAMPLE, blowup_factors=blowup, weights=None)
+    assert blown
+    assert np.all(blown.MARS != 0)
+    assert blown.current_year == Records.PUF_YEAR
+
+
 def test_read_data():
     funit1 = (
         u'RECID,MARS,e00200,e00200p,e00200s\n'