Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct WEBBIN_ROW_NAMES for bottom bin shown by TaxBrain #1889

Merged
merged 1 commit into from
Feb 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions taxcalc/tbi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from taxcalc.tbi.tbi import (run_nth_year_tax_calc_model,
WEBBIN_ROW_NAMES,
run_nth_year_gdp_elast_model,
reform_warnings_errors)
10 changes: 8 additions & 2 deletions taxcalc/tbi/tbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,14 @@
create_dict_table,
AGGR_ROW_NAMES)
from taxcalc import (DIST_VARIABLES, DIST_TABLE_LABELS, DIFF_TABLE_LABELS,
DECILE_ROW_NAMES, WEBBIN_ROW_NAMES,
proportional_change_in_gdp, Growdiff, Growfactors, Policy)
proportional_change_in_gdp, Growdiff, Growfactors, Policy,
DECILE_ROW_NAMES)

WEBBIN_ROW_NAMES = ['$0-10K', '$10-20K', '$20-30K', '$30-40K',
'$40-50K', '$50-75K', '$75-100K',
'$100-200K', '$200-500K',
'$500-1000K', '>$1000K', 'all']
# the negative-income bin is removed in the summary() function

AGG_ROW_NAMES = AGGR_ROW_NAMES

Expand Down
14 changes: 7 additions & 7 deletions taxcalc/tbi/tbi_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from taxcalc.utils import (add_income_bins, add_quantile_bins,
create_difference_table, create_distribution_table,
DIST_VARIABLES, DIST_TABLE_COLUMNS,
WEBAPP_INCOME_BINS, read_egg_csv)
STANDARD_INCOME_BINS, read_egg_csv)


def check_years_return_first_year(year_n, start_year, use_puf_not_cps):
Expand Down Expand Up @@ -330,7 +330,7 @@ def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
if bin_type == 'dec':
df2 = add_quantile_bins(df2, imeasure, 10)
elif bin_type == 'bin':
df2 = add_income_bins(df2, imeasure, bins=WEBAPP_INCOME_BINS)
df2 = add_income_bins(df2, imeasure, bins=STANDARD_INCOME_BINS)
else:
df2 = add_quantile_bins(df2, imeasure, 1)
gdf2 = df2.groupby('bins')
Expand Down Expand Up @@ -427,7 +427,7 @@ def summary(df1, df2, mask):
df2['iitax'] = df2['iitax_xbin']
diff_itax_xbin = \
create_difference_table(df1, df2,
groupby='webapp_income_bins',
groupby='standard_income_bins',
income_measure='expanded_income',
tax_to_diff='iitax')
diff_itax_xbin.drop(diff_itax_xbin.index[0], inplace=True)
Expand All @@ -436,7 +436,7 @@ def summary(df1, df2, mask):
df2['payrolltax'] = df2['payrolltax_xbin']
diff_ptax_xbin = \
create_difference_table(df1, df2,
groupby='webapp_income_bins',
groupby='standard_income_bins',
income_measure='expanded_income',
tax_to_diff='payrolltax')
diff_ptax_xbin.drop(diff_ptax_xbin.index[0], inplace=True)
Expand All @@ -445,7 +445,7 @@ def summary(df1, df2, mask):
df2['combined'] = df2['combined_xbin']
diff_comb_xbin = \
create_difference_table(df1, df2,
groupby='webapp_income_bins',
groupby='standard_income_bins',
income_measure='expanded_income',
tax_to_diff='combined')
diff_comb_xbin.drop(diff_comb_xbin.index[0], inplace=True)
Expand All @@ -470,7 +470,7 @@ def summary(df1, df2, mask):

# create distribution tables grouped by xbin (removing negative-income bin)
dist1_xbin = \
create_distribution_table(df1, groupby='webapp_income_bins',
create_distribution_table(df1, groupby='standard_income_bins',
income_measure='expanded_income',
result_type='weighted_sum')
dist1_xbin.drop(dist1_xbin.index[0], inplace=True)
Expand All @@ -483,7 +483,7 @@ def summary(df1, df2, mask):
df2[root_col_name] = df2[col]
df2['expanded_income_baseline'] = df1['expanded_income']
dist2_xbin = \
create_distribution_table(df2, groupby='webapp_income_bins',
create_distribution_table(df2, groupby='standard_income_bins',
income_measure='expanded_income_baseline',
result_type='weighted_sum')
dist2_xbin.drop(dist2_xbin.index[0], inplace=True)
Expand Down
1 change: 0 additions & 1 deletion taxcalc/tests/test_cpscsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import os
import sys
import json
import pytest
import numpy as np
import pandas as pd
# pylint: disable=import-error
Expand Down
3 changes: 3 additions & 0 deletions taxcalc/tests/test_taxcalcio.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,9 @@ def test_no_tables_or_graphs(reformfile1):
fname = output_filename.replace('.csv', '-mtr.html')
if os.path.isfile(fname):
os.remove(fname)
fname = output_filename.replace('.csv', '-qin.html')
if os.path.isfile(fname):
os.remove(fname)


def test_tables(reformfile1):
Expand Down
48 changes: 24 additions & 24 deletions taxcalc/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_create_tables(cps_subsample):

diff = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
calc2.dataframe(DIFF_VARIABLES),
groupby='webapp_income_bins',
groupby='standard_income_bins',
income_measure='expanded_income',
tax_to_diff='iitax')
assert isinstance(diff, pd.DataFrame)
Expand Down Expand Up @@ -351,7 +351,7 @@ def test_create_tables(cps_subsample):
print('{:.0f},'.format(val))

dist = create_distribution_table(calc2.dataframe(DIST_VARIABLES),
groupby='webapp_income_bins',
groupby='standard_income_bins',
income_measure='expanded_income',
result_type='weighted_sum')
assert isinstance(dist, pd.DataFrame)
Expand Down Expand Up @@ -424,7 +424,7 @@ def test_diff_count_precision():
9 61733 <--- largest unweighted bin count
A 215525

WEBAPP BINS:
STANDARD BINS:
0 7081 <--- negative income bin is dropped in TaxBrain display
1 19355
2 22722
Expand All @@ -441,27 +441,27 @@ def test_diff_count_precision():

Background information on Trump2017.json reform used in TaxBrain run 16649:

WEBAPP bin 10 ($500-1000 thousand) has weighted count of 1179 thousand;
weighted count of units with tax increase is 32 thousand.
STANDARD bin 10 ($500-1000 thousand) has weighted count of 1179 thousand;
weighted count of units with tax increase is 32 thousand.

So, the mean weight for all units in WEBAPP bin 10 is 111.5421 and the
So, the mean weight for all units in STANDARD bin 10 is 111.5421 and the
unweighted number with a tax increase is 287 assuming all units in that
bin have the same weight. (Note that 287 * 111.5421 is about 32,012.58,
which rounds to the 32 thousand shown in the TaxBrain difference table.)

WEBAPP bin 11 ($1000+ thousand) has weighted count of 636 thousand;
weighted count of units with tax increase is 27 thousand.
STANDARD bin 11 ($1000+ thousand) has weighted count of 636 thousand;
weighted count of units with tax increase is 27 thousand.

So, the mean weight for all units in WEBAPP bin 11 is about 27.517 and the
unweighted number with a tax increase is 981 assuming all units in that
bin have the same weight. (Note that 981 * 27.517 is about 26,994.18,
So, the mean weight for all units in STANDARD bin 11 is about 27.517 and
the unweighted number with a tax increase is 981 assuming all units in
that bin have the same weight. (Note that 981 * 27.517 is about 26,994.18,
which rounds to the 27 thousand shown in the TaxBrain difference table.)
"""
dump = False # setting to True implies results printed and test fails
seed = 123456789
bs_samples = 1000
alpha = 0.025 # implies 95% confidence interval
# compute stderr and confidence interval for WEBAPP bin 10 increase count
# compute stderr and confidence interval for STANDARD bin 10 increase count
data_list = [111.5421] * 287 + [0.0] * (10570 - 287)
assert len(data_list) == 10570
data = np.array(data_list)
Expand All @@ -475,26 +475,26 @@ def test_diff_count_precision():
if dump:
res = '{}EST={:.1f} B={} alpha={:.3f} se={:.2f} ci=[ {:.2f} , {:.2f} ]'
print(
res.format('WEBAPP-BIN10: ',
res.format('STANDARD-BIN10: ',
data_estimate, bs_samples, alpha, stderr, cilo, cihi)
)
assert abs((stderr / 1.90) - 1) < 0.0008
# NOTE: a se of 1.90 thousand implies that when comparing the difference
# in the weighted number of filing units in WEBAPP bin 10 with a
# in the weighted number of filing units in STANDARD bin 10 with a
# tax increase, the difference statistic has a bigger se (because
# the variance of the difference is the sum of the variances of the
# two point estimates). So, in WEBAPP bin 10 if the point estimates
# both had se = 1.90, then the difference in the point estimates has
# has a se = 2.687. This means that the difference would have to be
# over 5 thousand in order for there to be high confidence that the
# difference was different from zero in a statistically significant
# manner.
# two point estimates). So, in STANDARD bin 10 if the point
# estimates both had se = 1.90, then the difference in the point
# estimates has a se = 2.687. This means that the difference
# would have to be over 5 thousand in order for there to be high
# confidence that the difference was different from zero in a
# statistically significant manner.
# Or put a different way, a difference of 1 thousand cannot be
# accurately detected while a difference of 10 thousand can be
# accurately detected.
assert abs((cilo / 28.33) - 1) < 0.0012
assert abs((cihi / 35.81) - 1) < 0.0012
# compute stderr and confidence interval for WEBAPP bin 11 increase count
# compute stderr and confidence interval for STANDARD bin 11 increase count
data_list = [27.517] * 981 + [0.0] * (23113 - 981)
assert len(data_list) == 23113
data = np.array(data_list)
Expand All @@ -508,15 +508,15 @@ def test_diff_count_precision():
if dump:
res = '{}EST={:.1f} B={} alpha={:.3f} se={:.2f} ci=[ {:.2f} , {:.2f} ]'
print(
res.format('WEBAPP-BIN11: ',
res.format('STANDARD-BIN11: ',
data_estimate, bs_samples, alpha, stderr, cilo, cihi)
)
assert abs((stderr / 0.85) - 1) < 0.0040
# NOTE: a se of 0.85 thousand implies that when comparing the difference
# in the weighted number of filing units in WEBAPP bin 11 with a
# in the weighted number of filing units in STANDARD bin 11 with a
# tax increase, the difference statistic has a bigger se (because
# the variance of the difference is the sum of the variances of the
# two point estimates). So, in WEBAPP bin 11 if the point estimates
# two point estimates). So, in STANDARD bin 11 if point estimates
# both had se = 0.85, then the difference in the point estimates
# has a se = 1.20. This means that the difference would have to be
# over 2.5 thousand in order for there to be high confidence that the
Expand Down
38 changes: 19 additions & 19 deletions taxcalc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,16 @@
'all',
'80-90', '90-95', '95-99', 'Top 1%']

WEBAPP_INCOME_BINS = [-9e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999,
199999, 499999, 1000000, 9e99]
STANDARD_ROW_NAMES = ['<$0K', '$0-10K', '$10-20K', '$20-30K', '$30-40K',
'$40-50K', '$50-75K', '$75-100K',
'$100-200K', '$200-500K',
'$500-1000K', '>$1000K', 'all']

WEBBIN_ROW_NAMES = ['<$10K', '$10-20K', '$20-30K', '$30-40K',
'$40-50K', '$50-75K', '$75-100K',
'$100-200K', '$200-500K',
'$500-1000K', '>$1000K', 'all']
STANDARD_INCOME_BINS = [-9e99, 0, 9999, 19999, 29999, 39999, 49999,
74999, 99999, 199999, 499999, 1000000, 9e99]

LARGE_INCOME_BINS = [-9e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999,
200000, 9e99]
LARGE_INCOME_BINS = [-9e99, 0, 9999, 19999, 29999, 39999, 49999,
74999, 99999, 200000, 9e99]

SMALL_INCOME_BINS = [-9e99, 0, 4999, 9999, 14999, 19999, 24999, 29999, 39999,
49999, 74999, 99999, 199999, 499999, 999999, 1499999,
Expand Down Expand Up @@ -194,7 +194,7 @@ def add_income_bins(pdf, income_measure,
specifies income variable used to construct bins

bin_type: String, optional
options for input: 'webapp', 'tpc', 'soi'
options for input: 'standard', 'tpc', 'soi'
default: 'soi'

bins: iterable of scalars, optional income breakpoints
Expand All @@ -212,8 +212,8 @@ def add_income_bins(pdf, income_measure,
the original input plus the added 'bin' column
"""
if not bins:
if bin_type == 'webapp':
bins = WEBAPP_INCOME_BINS
if bin_type == 'standard':
bins = STANDARD_INCOME_BINS
elif bin_type == 'tpc':
bins = LARGE_INCOME_BINS
elif bin_type == 'soi':
Expand Down Expand Up @@ -252,7 +252,7 @@ def create_distribution_table(vdf, groupby, income_measure, result_type):
call like this: vdf = calc.dataframe(STATS_VARIABLES)

groupby : String object
options for input: 'weighted_deciles', 'webapp_income_bins',
options for input: 'weighted_deciles', 'standard_income_bins',
'large_income_bins', 'small_income_bins';
determines how the columns in the resulting Pandas DataFrame are sorted
NOTE: when groupby is 'weighted_deciles', the returned table has three
Expand Down Expand Up @@ -329,7 +329,7 @@ def stat_dataframe(gpdf):
# main logic of create_distribution_table
assert isinstance(vdf, pd.DataFrame)
assert (groupby == 'weighted_deciles' or
groupby == 'webapp_income_bins' or
groupby == 'standard_income_bins' or
groupby == 'large_income_bins' or
groupby == 'small_income_bins')
assert result_type == 'weighted_sum' or result_type == 'weighted_avg'
Expand All @@ -344,8 +344,8 @@ def stat_dataframe(gpdf):
# sort the data given specified groupby and income_measure
if groupby == 'weighted_deciles':
pdf = add_quantile_bins(res, income_measure, 10)
elif groupby == 'webapp_income_bins':
pdf = add_income_bins(res, income_measure, bin_type='webapp')
elif groupby == 'standard_income_bins':
pdf = add_income_bins(res, income_measure, bin_type='standard')
elif groupby == 'large_income_bins':
pdf = add_income_bins(res, income_measure, bin_type='tpc')
elif groupby == 'small_income_bins':
Expand Down Expand Up @@ -396,7 +396,7 @@ def create_difference_table(vdf1, vdf2, groupby, income_measure, tax_to_diff):
Calculator.dataframe method

groupby : String object
options for input: 'weighted_deciles', 'webapp_income_bins',
options for input: 'weighted_deciles', 'standard_income_bins',
'large_income_bins', 'small_income_bins'
specifies kind of bins used to group filing units
NOTE: when groupby is 'weighted_deciles', the returned table has three
Expand Down Expand Up @@ -468,8 +468,8 @@ def weighted_share_of_total(gpdf, colname, total):
# add bin column to res2 given specified groupby and income_measure
if groupby == 'weighted_deciles':
pdf = add_quantile_bins(res2, income_measure, 10)
elif groupby == 'webapp_income_bins':
pdf = add_income_bins(res2, income_measure, bin_type='webapp')
elif groupby == 'standard_income_bins':
pdf = add_income_bins(res2, income_measure, bin_type='standard')
elif groupby == 'large_income_bins':
pdf = add_income_bins(res2, income_measure, bin_type='tpc')
elif groupby == 'small_income_bins':
Expand Down Expand Up @@ -515,7 +515,7 @@ def weighted_share_of_total(gpdf, colname, total):
assert isinstance(vdf1, pd.DataFrame)
assert isinstance(vdf2, pd.DataFrame)
assert (groupby == 'weighted_deciles' or
groupby == 'webapp_income_bins' or
groupby == 'standard_income_bins' or
groupby == 'large_income_bins' or
groupby == 'small_income_bins')
assert (income_measure == 'expanded_income' or
Expand Down