Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sample from weights when not reading full dataset #844

Merged
merged 5 commits into from
Aug 10, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions taxcalc/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ def __init__(self,
# read extrapolation blowup factors and sample weights
self._read_blowup(blowup_factors)
self._read_weights(weights)
# weights must be same size as tax record data
if not self.WT.empty and self.dim != len(self.WT):
frac = float(self.dim) / len(self.WT)
self.WT = self.WT.iloc[self.index]
self.WT = self.WT / frac

# specify current_year and FLPDYR values
if isinstance(start_year, int):
self._current_year = start_year
Expand Down Expand Up @@ -383,6 +389,7 @@ def _read_data(self, data, exact_calcs, schR_calcs):
msg = 'data is neither a string nor a Pandas DataFrame'
raise ValueError(msg)
self.dim = len(taxdf)
self.index = taxdf.index
# create class variables using taxdf column names
READ_VARS = set()
for varname in list(taxdf.columns.values):
Expand Down
32 changes: 32 additions & 0 deletions taxcalc/tests/test_pufcsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,38 @@
import pytest
import difflib
import numpy as np
import pandas as pd


@pytest.mark.requires_pufcsv
def test_sample():
    """
    Test if reading in a sample of the data produces a reasonable estimate
    relative to the full data set.

    Builds one Calculator from the complete puf.csv file and another from
    a two-percent subsample of the same file, then compares the
    "Combined liability ($b)" row of their ten-year diagnostic tables.
    The test fails if the largest absolute difference in any year,
    divided by the largest liability value in either table, is 5% or more.
    """
    # Fixed seed so the subsample (and therefore the test outcome) is the
    # same on every run; an unseeded draw makes the tolerance check flaky.
    sample_seed = 12345

    # Full dataset
    clp = Policy()
    puf = Records(data=PUFCSV_PATH)
    calc = Calculator(policy=clp, records=puf)
    adt = calc.diagnostic_table(num_years=10)

    # Sampled dataset
    clp2 = Policy()
    tax_data_full = pd.read_csv(PUFCSV_PATH)
    tax_data = tax_data_full.sample(frac=0.02, random_state=sample_seed)
    puf_sample = Records(data=tax_data)
    calc_sample = Calculator(policy=clp2, records=puf_sample)
    adt_sample = calc_sample.diagnostic_table(num_years=10)

    # Get the final combined tax liability for the budget period
    # in the sample and the full dataset and make sure they are close
    full_tax_liability = adt.loc["Combined liability ($b)"]
    sample_tax_liability = adt_sample.loc["Combined liability ($b)"]
    max_val = max(full_tax_liability.max(), sample_tax_liability.max())
    rel_diff = max(abs(full_tax_liability - sample_tax_liability)) / max_val

    # Fail on greater than 5% relative difference in any budget year
    assert rel_diff < 0.05


@pytest.mark.requires_pufcsv
Expand Down
17 changes: 17 additions & 0 deletions taxcalc/tests/test_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# use 1991 PUF-like data to emulate current puf.csv, which is private
TAXDATA_PATH = os.path.join(CUR_PATH, '..', 'altdata', 'puf91taxdata.csv.gz')
TAXDATA = pd.read_csv(TAXDATA_PATH, compression='gzip')
# ten-percent random subsample of TAXDATA rows; no random_state is passed,
# so the rows drawn differ from one test session to the next
TAXDATA_SAMPLE = TAXDATA.sample(frac=0.10)
# weights data read from the gzip-compressed puf91weights CSV file
WEIGHTS_PATH = os.path.join(CUR_PATH, '..', 'altdata', 'puf91weights.csv.gz')
WEIGHTS = pd.read_csv(WEIGHTS_PATH, compression='gzip')

Expand Down Expand Up @@ -46,6 +47,22 @@ def test_correct_Records_instantiation():
assert rec2.current_year == Records.PUF_YEAR


def test_correct_Records_instantiation_sample():
    """
    Check that Records can be constructed from the sampled tax data.

    First builds a Records object with weights but no blowup factors and
    checks that the e00200 total is unchanged after advancing the current
    year by one.  Then builds a second Records object with blowup factors
    read from Records.BLOWUP_FACTORS_PATH but no weights.  Both objects
    must be truthy, have no zero MARS values, and start in
    Records.PUF_YEAR.
    """
    # case 1: weights supplied, no blowup factors
    recs_with_weights = Records(data=TAXDATA_SAMPLE,
                                blowup_factors=None,
                                weights=WEIGHTS)
    assert recs_with_weights
    assert np.all(recs_with_weights.MARS != 0)
    assert recs_with_weights.current_year == Records.PUF_YEAR
    e00200_total_before = recs_with_weights.e00200.sum()
    recs_with_weights.set_current_year(Records.PUF_YEAR + 1)
    e00200_total_after = recs_with_weights.e00200.sum()
    assert e00200_total_after == e00200_total_before

    # case 2: blowup factors supplied, no weights
    blowup_df = pd.read_csv(Records.BLOWUP_FACTORS_PATH)
    recs_with_blowup = Records(data=TAXDATA_SAMPLE,
                               blowup_factors=blowup_df,
                               weights=None)
    assert recs_with_blowup
    assert np.all(recs_with_blowup.MARS != 0)
    assert recs_with_blowup.current_year == Records.PUF_YEAR


def test_read_data():
funit1 = (
u'RECID,MARS,e00200,e00200p,e00200s\n'
Expand Down