diff --git a/RELEASES.md b/RELEASES.md index caae2b936..85edb9acb 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -10,11 +10,15 @@ Release 0.9.1 on 2017-??-?? [#XXXX](https://github.com/open-source-economics/Tax-Calculator/pull/XXXX)) **API Changes** +- None **New Features** +- Improve calculation of sub-sample weights + [[#1441](https://github.com/open-source-economics/Tax-Calculator/pull/1441) + by Hank Doupe] **Bug Fixes** - +- None Release 0.9.0 on 2017-06-14 --------------------------- diff --git a/taxcalc/records.py b/taxcalc/records.py index 28362dcd0..b860335a6 100644 --- a/taxcalc/records.py +++ b/taxcalc/records.py @@ -5,6 +5,7 @@ # pep8 --ignore=E402 records.py # pylint --disable=locally-disabled records.py +from __future__ import division import os import json import six @@ -146,9 +147,12 @@ def __init__(self, self._read_adjust(adjust_ratios) # weights must be same size as tax record data if not self.WT.empty and self.dim != len(self.WT): - frac = float(self.dim) / len(self.WT) + # scale-up sub-sample weights by year-specific factor + sum_full_weights = self.WT.sum() self.WT = self.WT.iloc[self.index] - self.WT = self.WT / frac + sum_sub_weights = self.WT.sum() + factor = sum_full_weights / sum_sub_weights + self.WT = self.WT * factor # specify current_year and FLPDYR values if isinstance(start_year, int): self._current_year = start_year diff --git a/taxcalc/tests/test_pufcsv.py b/taxcalc/tests/test_pufcsv.py index 03cc6fcc8..e1e130335 100644 --- a/taxcalc/tests/test_pufcsv.py +++ b/taxcalc/tests/test_pufcsv.py @@ -36,7 +36,7 @@ def puf_path(tests_path): def test_agg(tests_path, puf_path): # pylint: disable=redefined-outer-name """ Test Tax-Calculator aggregate taxes with no policy reform using - the full-sample puf.csv and a two-percent sub-sample of puf.csv + the full-sample puf.csv and a small sub-sample of puf.csv """ # pylint: disable=too-many-locals,too-many-statements nyrs = 10 @@ -78,10 +78,10 @@ def test_agg(tests_path, puf_path): # pylint: disable=redefined-outer-name msg += '--- and rerun test. ---\n' msg += '-------------------------------------------------\n' raise ValueError(msg) - # create aggregate diagnostic table using sub sample of records + # create aggregate diagnostic table using unweighted sub-sample of records fullsample = pd.read_csv(puf_path) - rn_seed = 80 # to ensure sub-sample is always the same - subfrac = 0.02 # sub-sample fraction + rn_seed = 180 # to ensure sub-sample is always the same + subfrac = 0.05 # sub-sample fraction subsample = fullsample.sample(frac=subfrac, # pylint: disable=no-member random_state=rn_seed) rec_subsample = Records(data=subsample) @@ -89,12 +89,13 @@ def test_agg(tests_path, puf_path): # pylint: disable=redefined-outer-name adt_subsample = multiyear_diagnostic_table(calc_subsample, num_years=nyrs) # compare combined tax liability from full and sub samples for each year taxes_subsample = adt_subsample.loc["Combined Liability ($b)"] - reltol = 0.04 # maximum allowed relative difference in tax liability + reltol = 0.01 # maximum allowed relative difference in tax liability if not np.allclose(taxes_subsample, taxes_fullsample, atol=0.0, rtol=reltol): msg = 'PUFCSV AGG RESULTS DIFFER IN SUB-SAMPLE AND FULL-SAMPLE\n' - msg += 'WHEN subfrac = {:.3f} and reltol = {:.4f}\n'.format(subfrac, - reltol) + msg += 'WHEN subfrac={:.3f}, rtol={:.4f}, seed={}\n'.format(subfrac, + reltol, + rn_seed) it_sub = np.nditer(taxes_subsample, flags=['f_index']) it_all = np.nditer(taxes_fullsample, flags=['f_index']) while not it_sub.finished: @@ -103,7 +104,7 @@ def test_agg(tests_path, puf_path): # pylint: disable=redefined-outer-name tax_all = float(it_all[0]) reldiff = abs(tax_sub - tax_all) / abs(tax_all) if reldiff > reltol: - msgstr = ' year,sub,full,reldif= {}\t{:.2f}\t{:.2f}\t{:.4f}\n' + msgstr = ' year,sub,full,reldiff= {}\t{:.2f}\t{:.2f}\t{:.4f}\n' msg += msgstr.format(cyr, tax_sub, tax_all, reldiff) it_sub.iternext() it_all.iternext()