Skip to content

Commit

Permalink
Merge pull request #844 from talumbau/handle_samples
Browse files Browse the repository at this point in the history
Sample from weights when not reading full dataset
  • Loading branch information
talumbau authored Aug 10, 2016
2 parents c34ce25 + e59ad62 commit 4fb4aba
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 0 deletions.
7 changes: 7 additions & 0 deletions taxcalc/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ def __init__(self,
# read extrapolation blowup factors and sample weights
self._read_blowup(blowup_factors)
self._read_weights(weights)
# weights must be same size as tax record data
if not self.WT.empty and self.dim != len(self.WT):
frac = float(self.dim) / len(self.WT)
self.WT = self.WT.iloc[self.index]
self.WT = self.WT / frac

# specify current_year and FLPDYR values
if isinstance(start_year, int):
self._current_year = start_year
Expand Down Expand Up @@ -383,6 +389,7 @@ def _read_data(self, data, exact_calcs, schR_calcs):
msg = 'data is neither a string nor a Pandas DataFrame'
raise ValueError(msg)
self.dim = len(taxdf)
self.index = taxdf.index
# create class variables using taxdf column names
READ_VARS = set()
for varname in list(taxdf.columns.values):
Expand Down
32 changes: 32 additions & 0 deletions taxcalc/tests/test_pufcsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,38 @@
import pytest
import difflib
import numpy as np
import pandas as pd


@pytest.mark.requires_pufcsv
def test_sample():
    """
    Test if reading in a sample of the data produces a reasonable estimate
    relative to the full data set.

    Builds one Calculator from the full puf.csv file and another from a 2%
    random sample of the same file, produces a ten-year diagnostic table
    from each, and requires the "Combined liability ($b)" rows to agree
    within 5% (relative to the largest value in either row) in every year.
    """
    # Full dataset
    clp = Policy()
    puf = Records(data=PUFCSV_PATH)
    calc = Calculator(policy=clp, records=puf)
    adt = calc.diagnostic_table(num_years=10)

    # Sample dataset; the seed is fixed so that the same rows are drawn on
    # every run and a tolerance failure can be reproduced rather than
    # appearing intermittently
    clp2 = Policy()
    tax_data_full = pd.read_csv(PUFCSV_PATH)
    tax_data = tax_data_full.sample(frac=0.02, random_state=12345)
    puf_sample = Records(data=tax_data)
    calc_sample = Calculator(policy=clp2, records=puf_sample)
    adt_sample = calc_sample.diagnostic_table(num_years=10)

    # Get the final combined tax liability for the budget period
    # in the sample and the full dataset and make sure they are close
    full_tax_liability = adt.loc["Combined liability ($b)"]
    sample_tax_liability = adt_sample.loc["Combined liability ($b)"]
    max_val = max(full_tax_liability.max(), sample_tax_liability.max())
    rel_diff = max(abs(full_tax_liability - sample_tax_liability)) / max_val

    # Fail on greater than 5% relative difference in any budget year
    assert rel_diff < 0.05


@pytest.mark.requires_pufcsv
Expand Down
17 changes: 17 additions & 0 deletions taxcalc/tests/test_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# use 1991 PUF-like data to emulate current puf.csv, which is private
TAXDATA_PATH = os.path.join(CUR_PATH, '..', 'altdata', 'puf91taxdata.csv.gz')
TAXDATA = pd.read_csv(TAXDATA_PATH, compression='gzip')
# 10% random subset of TAXDATA; seeded so that every test run draws the
# same rows and any failure involving the sample is reproducible
TAXDATA_SAMPLE = TAXDATA.sample(frac=0.10, random_state=12345)
WEIGHTS_PATH = os.path.join(CUR_PATH, '..', 'altdata', 'puf91weights.csv.gz')
WEIGHTS = pd.read_csv(WEIGHTS_PATH, compression='gzip')

Expand Down Expand Up @@ -46,6 +47,22 @@ def test_correct_Records_instantiation():
assert rec2.current_year == Records.PUF_YEAR


def test_correct_Records_instantiation_sample():
    """
    Check that Records can be constructed from a sampled subset of the
    tax data, once with weights but no blowup factors and once with
    blowup factors but no weights.
    """
    # sampled data, weights supplied, no blowup factors
    weighted = Records(data=TAXDATA_SAMPLE,
                       blowup_factors=None,
                       weights=WEIGHTS)
    assert weighted
    assert np.all(weighted.MARS != 0)
    assert weighted.current_year == Records.PUF_YEAR
    # the e00200 total is expected to be the same after advancing one year
    wages_before = weighted.e00200.sum()
    weighted.set_current_year(Records.PUF_YEAR + 1)
    wages_after = weighted.e00200.sum()
    assert wages_after == wages_before

    # sampled data, blowup factors supplied, no weights
    blowup = pd.read_csv(Records.BLOWUP_FACTORS_PATH)
    blown_up = Records(data=TAXDATA_SAMPLE,
                       blowup_factors=blowup,
                       weights=None)
    assert blown_up
    assert np.all(blown_up.MARS != 0)
    assert blown_up.current_year == Records.PUF_YEAR


def test_read_data():
funit1 = (
u'RECID,MARS,e00200,e00200p,e00200s\n'
Expand Down

0 comments on commit 4fb4aba

Please sign in to comment.