From 23889b3defc3af8e30c82a6eabdc345430aacb75 Mon Sep 17 00:00:00 2001 From: Martin Holmer Date: Tue, 17 Apr 2018 10:58:43 -0400 Subject: [PATCH 1/3] Refactor create_difference_table utility function --- taxcalc/utils.py | 235 ++++++++++++++++++++--------------------------- 1 file changed, 99 insertions(+), 136 deletions(-) diff --git a/taxcalc/utils.py b/taxcalc/utils.py index 452c91b97..276fb5ffe 100644 --- a/taxcalc/utils.py +++ b/taxcalc/utils.py @@ -24,8 +24,8 @@ weighted_count, weighted_mean, wage_weighted, agi_weighted, expanded_income_weighted, - weighted_perc_inc, weighted_perc_cut, - EPSILON) + weighted_perc_cut, + weighted_perc_inc) # Items in the DIST_TABLE_COLUMNS list below correspond to the items in the @@ -453,127 +453,25 @@ def create_difference_table(vdf1, vdf2, groupby, income_measure, tax_to_diff): positive (denoted by a 0-10p row label) values of the specified income_measure. """ - # pylint: disable=too-many-statements - # nested function that actually creates the difference table - def diff_table_stats(resd, groupby, income_measure): + # pylint: disable=too-many-statements,too-many-locals + # nested function that creates dataframe containing additive statistics + def additive_stats_dataframe(gpdf): """ - Return new Pandas DataFrame containing difference table statistics - based on grouped values of specified col_name in the specified resd. - - resd: reform difference results Pandas DataFrame - groupby: string naming type of table rows - income_measure: string naming column used to create resd table rows + Nested function that returns additive stats DataFrame derived from gpdf """ - # pylint: disable=too-many-locals - def stat_dataframe(gpdf): - """ - Nested function that returns statistics DataFrame derived from gpdf - """ - def weighted_share_of_total(gpdf, colname, total): - """ - Nested function that returns the ratio of the - weighted_sum(pdf, colname) and specified total - """ - return weighted_sum(gpdf, colname) / (total + EPSILON) - # main logic of stat_dataframe function - # construct basic stat_dataframe columns - sdf = pd.DataFrame() - sdf['count'] = gpdf.apply(weighted_count) - sdf['tax_cut'] = gpdf.apply(weighted_count_lt_zero, 'tax_diff') - sdf['perc_cut'] = gpdf.apply(weighted_perc_cut, 'tax_diff') - sdf['tax_inc'] = gpdf.apply(weighted_count_gt_zero, 'tax_diff') - sdf['perc_inc'] = gpdf.apply(weighted_perc_inc, 'tax_diff') - sdf['mean'] = gpdf.apply(weighted_mean, 'tax_diff') - sdf['tot_change'] = gpdf.apply(weighted_sum, 'tax_diff') - wtotal = (resd['tax_diff'] * resd['s006']).sum() - sdf['share_of_change'] = gpdf.apply(weighted_share_of_total, - 'tax_diff', wtotal) - sdf['atinc1'] = gpdf.apply(weighted_sum, 'atinc1') - sdf['atinc2'] = gpdf.apply(weighted_sum, 'atinc2') - sdf['ubi'] = gpdf.apply(weighted_sum, 'ubi') - sdf['benefit_cost_total'] = gpdf.apply(weighted_sum, - 'benefit_cost_total') - sdf['benefit_value_total'] = gpdf.apply(weighted_sum, - 'benefit_value_total') - return sdf - - # main logic of diff_table_stats function - # calculate whole-sample perc_cut and perc_inc statistics - sums_perc_cut = weighted_perc_cut(resd, 'tax_diff') - sums_perc_inc = weighted_perc_inc(resd, 'tax_diff') - # add column to resd given specified groupby and income_measure - if groupby == 'weighted_deciles': - pdf = add_quantile_table_row_variable(resd, income_measure, 10) - elif groupby == 'standard_income_bins': - pdf = add_income_table_row_variable(resd, income_measure, - bin_type='standard') - elif groupby == 'large_income_bins': - pdf = add_income_table_row_variable(resd, income_measure, - bin_type='tpc') - elif groupby == 'small_income_bins': - pdf = add_income_table_row_variable(resd, income_measure, - bin_type='soi') - min_income_measure = pdf[income_measure].min() - # create grouped Pandas DataFrame - gpdf = pdf.groupby('table_row', as_index=False) - del pdf - # create difference table statistics from gpdf in a new DataFrame - diffs_without_sums = stat_dataframe(gpdf) - # calculate sums row - row = get_sums(diffs_without_sums)[diffs_without_sums.columns] - row['mean'] = 0 - if row['count'] > 0: - row['mean'] = row['tot_change'] / row['count'] - row['perc_cut'] = sums_perc_cut - row['perc_inc'] = sums_perc_inc - row['share_of_change'] = 1.0 # avoid rounding error - diffs = diffs_without_sums.append(row) - del row - # replace bottom decile row with non-positive and positive rows - if groupby == 'weighted_deciles' and min_income_measure <= 0: - # bottom decile as its own DataFrame - pdf = copy.deepcopy(gpdf.get_group(1)) - pdf['table_row'] = pd.cut(pdf[income_measure], - bins=[-9e99, -1e-9, 1e-9, 9e99], - labels=[1, 2, 3]) - gpdfx = pdf.groupby('table_row', as_index=False) - rows = stat_dataframe(gpdfx) - diffs = pd.concat([rows, diffs.iloc[1:11]]) - del rows - del pdf - del gpdfx - # append top-decile-detail rows - if groupby == 'weighted_deciles': - # top decile as its own DataFrame - pdf = copy.deepcopy(gpdf.get_group(10)) - pdf = add_quantile_table_row_variable(pdf, income_measure, 10) - # TODO: following statement generates this IGNORED error: - # ValueError: Buffer dtype mismatch, - # expected 'Python object' but got 'long' - # Exception ValueError: "Buffer dtype mismatch, - # expected 'Python object' but got 'long'" - # in 'pandas._libs.lib.is_bool_array' ignored - # ^^^^^^^ - # It is hoped that Pandas PR#18252, which is scheduled for - # inclusion in Pandas version 0.23.0 (Apr 2018), will fix this. - # See discussion at the following URL: - # https://github.com/pandas-dev/pandas/issues/19037 - pdf['table_row'].replace(to_replace=[1, 2, 3, 4, 5], - value=[0, 0, 0, 0, 0], inplace=True) - pdf['table_row'].replace(to_replace=[6, 7, 8, 9], - value=[1, 1, 1, 1], inplace=True) - pdf['table_row'].replace(to_replace=[10], - value=[2], inplace=True) - gpdfx = pdf.groupby('table_row', as_index=False) - sdf = stat_dataframe(gpdfx) - diffs = diffs.append(sdf, ignore_index=True) - del sdf - del pdf - del gpdfx - # delete intermediate Pandas DataFrame objects - del gpdf - # return difference statistics - return diffs + sdf = pd.DataFrame() + sdf['count'] = gpdf.apply(weighted_count) + sdf['tax_cut'] = gpdf.apply(weighted_count_lt_zero, 'tax_diff') + sdf['tax_inc'] = gpdf.apply(weighted_count_gt_zero, 'tax_diff') + sdf['tot_change'] = gpdf.apply(weighted_sum, 'tax_diff') + sdf['ubi'] = gpdf.apply(weighted_sum, 'ubi') + sdf['benefit_cost_total'] = gpdf.apply(weighted_sum, + 'benefit_cost_total') + sdf['benefit_value_total'] = gpdf.apply(weighted_sum, + 'benefit_value_total') + sdf['atinc1'] = gpdf.apply(weighted_sum, 'atinc1') + sdf['atinc2'] = gpdf.apply(weighted_sum, 'atinc2') + return sdf # main logic of create_difference_table assert isinstance(vdf1, pd.DataFrame) assert isinstance(vdf2, pd.DataFrame) @@ -594,23 +492,88 @@ def weighted_share_of_total(gpdf, colname, total): res2['tax_diff'] = res2[tax_to_diff] - res1[tax_to_diff] res2['atinc1'] = res1['aftertax_income'] res2['atinc2'] = res2['aftertax_income'] - diffs = diff_table_stats(res2, groupby, baseline_income_measure) - diffs['pc_aftertaxinc'] = (diffs['atinc2'] / diffs['atinc1']) - 1.0 - # delete intermediate atinc1 and atinc2 columns - del diffs['atinc1'] - del diffs['atinc2'] + # add table_row column to res2 given specified groupby and income_measure + if 'table_row' in res2: + print "already have table_row" # TODO: remove + print "min(table_row)=", res2['table_row'].min() # TODO: remove + print "max(table_row)=", res2['table_row'].max() # TODO: remove + pdf = res2 + else: + if groupby == 'weighted_deciles': + pdf = add_quantile_table_row_variable(res2, income_measure, 10) + elif groupby == 'standard_income_bins': + pdf = add_income_table_row_variable(res2, income_measure, + bin_type='standard') + elif groupby == 'large_income_bins': + pdf = add_income_table_row_variable(res2, income_measure, + bin_type='tpc') + elif groupby == 'small_income_bins': + pdf = add_income_table_row_variable(res2, income_measure, + bin_type='soi') + # create grouped Pandas DataFrame + gpdf = pdf.groupby('table_row', as_index=False) + del pdf + # create additive difference table statistics from gpdf + diff_stats = additive_stats_dataframe(gpdf) + # calculate additive statistics on sums row + sum_row_diff_stats = get_sums(diff_stats)[diff_stats.columns] + # optionally create bottom decile details + if groupby == 'weighted_deciles': + pdf = copy.deepcopy(gpdf.get_group(1)) + pdf['table_row'] = pd.cut(pdf[income_measure], + bins=[-9e99, -1e-9, 1e-9, 9e99], + labels=[1, 2, 3]) + gpdfx = pdf.groupby('table_row', as_index=False) + rows = additive_stats_dataframe(gpdfx) + diff_stats = pd.concat([rows, diff_stats.iloc[1:10]]) + del rows + del pdf + del gpdfx + # append sum_row_additive_stats + diff_stats = diff_stats.append(sum_row_diff_stats) + # optionally create top decile details + if groupby == 'weighted_deciles': + pdf = copy.deepcopy(gpdf.get_group(10)) + pdf = add_quantile_table_row_variable(pdf, income_measure, 10) + pdf['table_row'].replace(to_replace=[1, 2, 3, 4, 5], + value=[0, 0, 0, 0, 0], inplace=True) + pdf['table_row'].replace(to_replace=[6, 7, 8, 9], + value=[1, 1, 1, 1], inplace=True) + pdf['table_row'].replace(to_replace=[10], + value=[2], inplace=True) + gpdfx = pdf.groupby('table_row', as_index=False) + sdf = additive_stats_dataframe(gpdfx) + diff_stats = diff_stats.append(sdf, ignore_index=True) + del sdf + del pdf + del gpdfx + # delete intermediate Pandas DataFrame objects + del gpdf + # compute non-additive stats in each table cell + count = diff_stats['count'] + diff_stats['perc_cut'] = np.where(count > 0, + 100 * diff_stats['tax_cut'] / count, 0) + diff_stats['perc_inc'] = np.where(count > 0, + 100 * diff_stats['tax_inc'] / count, 0) + diff_stats['mean'] = np.where(count > 0, + diff_stats['tot_change'] / count, 0) + total_change = sum_row_diff_stats['tot_change'] + diff_stats['share_of_change'] = np.where(total_change == 0, np.nan, + (100 * diff_stats['tot_change'] / + total_change)) + diff_stats['pc_aftertaxinc'] = (100 * (diff_stats['atinc2'] / + diff_stats['atinc1'] - 1)) + del diff_stats['atinc1'] + del diff_stats['atinc2'] + del count + del sum_row_diff_stats # delete intermediate Pandas DataFrame objects del res1 del res2 - # convert some columns to percentages - percent_columns = ['perc_inc', 'perc_cut', - 'share_of_change', 'pc_aftertaxinc'] - for col in percent_columns: - diffs[col] *= 100.0 # set print display format for float table elements pd.options.display.float_format = '{:10,.2f}'.format - # ensure diffs columns are in correct order - assert diffs.columns.values.tolist() == DIFF_TABLE_COLUMNS + # put diff_stats columns in correct order + diff_stats = diff_stats.reindex(columns=DIFF_TABLE_COLUMNS) # add row names to table if using weighted_deciles or standard_income_bins if groupby == 'weighted_deciles': rownames = DECILE_ROW_NAMES @@ -619,11 +582,11 @@ def weighted_share_of_total(gpdf, colname, total): else: rownames = None if rownames: - assert len(diffs.index) == len(rownames) - diffs.index = rownames + assert len(diff_stats.index) == len(rownames) + diff_stats.index = rownames del rownames # return table as Pandas DataFrame - return diffs + return diff_stats def create_diagnostic_table(vdf, year): From 5ba12cb89559333ec6d01b5ce411fd76fae74e21 Mon Sep 17 00:00:00 2001 From: Martin Holmer Date: Tue, 17 Apr 2018 13:01:39 -0400 Subject: [PATCH 2/3] Use decile_details in refactored create_difference_table utility --- taxcalc/tests/tbi_cps_expect.txt | 60 ++++---- taxcalc/tests/tbi_puf_expect.txt | 244 +++++++++++++++---------------- taxcalc/tests/test_calculate.py | 8 +- taxcalc/tests/test_utils.py | 8 +- taxcalc/utils.py | 98 ++++++------- 5 files changed, 202 insertions(+), 216 deletions(-) diff --git a/taxcalc/tests/tbi_cps_expect.txt b/taxcalc/tests/tbi_cps_expect.txt index 6730e308c..debcad0de 100644 --- a/taxcalc/tests/tbi_cps_expect.txt +++ b/taxcalc/tests/tbi_cps_expect.txt @@ -218,7 +218,6 @@ TABLE diff_comb_xbin RESULTS: TABLE diff_comb_xdec RESULTS: { "0-10n_2": [ - "61182.54", "0.00", "0.00", "0.00", @@ -227,12 +226,13 @@ TABLE diff_comb_xdec RESULTS: "0.00", "0.00", "0.00", - "806259441.96", - "806259441.96", - "0.00" + "0.00", + "0.00", + "0.00", + "nan" ], "0-10p_2": [ - "16293923.70", + "17448112.04", "0.00", "0.00", "0.00", @@ -241,12 +241,12 @@ TABLE diff_comb_xdec RESULTS: "0.00", "0.00", "0.00", - "33249997320.64", - "33249997320.64", + "34056256762.60", + "34056256762.60", "0.00" ], "0-10z_2": [ - "1093005.80", + "0.00", "0.00", "0.00", "0.00", @@ -423,8 +423,8 @@ TABLE diff_comb_xdec RESULTS: "178473645069.52", "100.00", "0.00", - "3028591969682.75", - "3028591969682.75", + "3028591969682.74", + "3028591969682.74", "-3.08" ], "Top 1%_2": [ @@ -644,7 +644,6 @@ TABLE diff_itax_xbin RESULTS: TABLE diff_itax_xdec RESULTS: { "0-10n_2": [ - "61182.54", "0.00", "0.00", "0.00", @@ -653,12 +652,13 @@ TABLE diff_itax_xdec RESULTS: "0.00", "0.00", "0.00", - "806259441.96", - "806259441.96", - "0.00" + "0.00", + "0.00", + "0.00", + "nan" ], "0-10p_2": [ - "16293923.70", + "17448112.04", "0.00", "0.00", "0.00", @@ -667,12 +667,12 @@ TABLE diff_itax_xdec RESULTS: "0.00", "0.00", "0.00", - "33249997320.64", - "33249997320.64", + "34056256762.60", + "34056256762.60", "0.00" ], "0-10z_2": [ - "1093005.80", + "0.00", "0.00", "0.00", "0.00", @@ -849,8 +849,8 @@ TABLE diff_itax_xdec RESULTS: "199863277627.17", "100.00", "0.00", - "3028591969682.75", - "3028591969682.75", + "3028591969682.74", + "3028591969682.74", "-3.08" ], "Top 1%_2": [ @@ -1070,7 +1070,7 @@ TABLE diff_ptax_xbin RESULTS: TABLE diff_ptax_xdec RESULTS: { "0-10n_2": [ - "61182.54", + "0.00", "0.00", "0.00", "0.00", @@ -1079,12 +1079,12 @@ TABLE diff_ptax_xdec RESULTS: "0.00", "-0.00", "0.00", - "806259441.96", - "806259441.96", - "0.00" + "0.00", + "0.00", + "nan" ], "0-10p_2": [ - "16293923.70", + "17448112.04", "0.00", "0.00", "0.00", @@ -1093,12 +1093,12 @@ TABLE diff_ptax_xdec RESULTS: "0.00", "-0.00", "0.00", - "33249997320.64", - "33249997320.64", + "34056256762.60", + "34056256762.60", "0.00" ], "0-10z_2": [ - "1093005.80", + "0.00", "0.00", "0.00", "0.00", @@ -1275,8 +1275,8 @@ TABLE diff_ptax_xdec RESULTS: "-21389632557.65", "100.00", "0.00", - "3028591969682.75", - "3028591969682.75", + "3028591969682.74", + "3028591969682.74", "-3.08" ], "Top 1%_2": [ diff --git a/taxcalc/tests/tbi_puf_expect.txt b/taxcalc/tests/tbi_puf_expect.txt index 9edbbdb67..266df154f 100644 --- a/taxcalc/tests/tbi_puf_expect.txt +++ b/taxcalc/tests/tbi_puf_expect.txt @@ -218,35 +218,34 @@ TABLE diff_comb_xbin RESULTS: TABLE diff_comb_xdec RESULTS: { "0-10n_2": [ - "1291545.69", - "902.66", - "0.07", - "30783.77", - "2.38", - "331.11", - "427640887.02", - "0.21", "0.00", - "8349168320.57", - "8349168320.57", - "0.28" - ], - "0-10p_2": [ - "12825822.88", "0.00", "0.00", - "31166.81", - "0.24", - "2.08", - "26673032.12", + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "nan" + ], + "0-10p_2": [ + "17756353.90", + "902.66", "0.01", + "61950.58", + "0.35", + "25.59", + "454313919.14", + "0.23", "0.00", - "5952150789.38", - "5952150789.38", - "-0.30" + "14301319109.95", + "14301319109.95", + "0.41" ], "0-10z_2": [ - "3638985.32", "0.00", "0.00", "0.00", @@ -257,7 +256,8 @@ TABLE diff_comb_xdec RESULTS: "0.00", "0.00", "0.00", - "0.00" + "0.00", + "nan" ], "10-20_2": [ "17740190.50", @@ -400,17 +400,17 @@ TABLE diff_comb_xdec RESULTS: "-7.07" ], "95-99_2": [ - "7111665.44", + "7115702.98", "171628.45", "2.41", - "6467370.69", - "90.94", - "12303.33", - "87497192807.86", - "43.82", - "0.00", - "50725308987.29", - "50725308987.29", + "6471408.23", + "90.95", + "12309.79", + "87592837243.48", + "43.86", + "0.00", + "50761464897.83", + "50761464897.83", "-6.90" ], "ALL_2": [ @@ -428,18 +428,18 @@ TABLE diff_comb_xdec RESULTS: "-3.32" ], "Top 1%_2": [ - "1780889.07", + "1776851.53", "47668.39", "2.68", - "1594172.23", - "89.52", - "18271.64", - "32539763073.09", - "16.29", - "0.00", - "13160093211.25", - "13160093211.25", - "-1.65" + "1590134.69", + "89.49", + "18259.33", + "32444118637.47", + "16.25", + "0.00", + "13123937300.71", + "13123937300.71", + "-1.64" ] } TABLE diff_itax_xbin RESULTS: @@ -644,35 +644,34 @@ TABLE diff_itax_xbin RESULTS: TABLE diff_itax_xdec RESULTS: { "0-10n_2": [ - "1291545.69", - "902.66", - "0.07", - "30783.77", - "2.38", - "333.82", - "431147466.71", - "0.20", "0.00", - "8349168320.57", - "8349168320.57", - "0.28" - ], - "0-10p_2": [ - "12825822.88", "0.00", "0.00", - "31166.81", - "0.24", - "2.94", - "37752726.03", - "0.02", "0.00", - "5952150789.38", - "5952150789.38", - "-0.30" + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "nan" + ], + "0-10p_2": [ + "17756353.90", + "902.66", + "0.01", + "61950.58", + "0.35", + "26.41", + "468900192.75", + "0.21", + "0.00", + "14301319109.95", + "14301319109.95", + "0.41" ], "0-10z_2": [ - "3638985.32", "0.00", "0.00", "0.00", @@ -683,7 +682,8 @@ TABLE diff_itax_xdec RESULTS: "0.00", "0.00", "0.00", - "0.00" + "0.00", + "nan" ], "10-20_2": [ "17740190.50", @@ -826,17 +826,17 @@ TABLE diff_itax_xdec RESULTS: "-7.07" ], "95-99_2": [ - "7111665.44", + "7115702.98", "149022.53", - "2.10", - "6489976.61", + "2.09", + "6494014.15", "91.26", - "12687.47", - "90229046150.91", - "41.32", + "12693.71", + "90324690586.68", + "41.37", "0.00", - "50725308987.29", - "50725308987.29", + "50761464897.83", + "50761464897.83", "-6.90" ], "ALL_2": [ @@ -854,18 +854,18 @@ TABLE diff_itax_xdec RESULTS: "-3.32" ], "Top 1%_2": [ - "1780889.07", + "1776851.53", "46699.72", - "2.62", - "1595140.90", - "89.57", - "18334.16", - "32651100099.27", - "14.95", - "0.00", - "13160093211.25", - "13160093211.25", - "-1.65" + "2.63", + "1591103.36", + "89.55", + "18321.99", + "32555455663.50", + "14.91", + "0.00", + "13123937300.71", + "13123937300.71", + "-1.64" ] } TABLE diff_ptax_xbin RESULTS: @@ -1070,35 +1070,35 @@ TABLE diff_ptax_xbin RESULTS: TABLE diff_ptax_xdec RESULTS: { "0-10n_2": [ - "1291545.69", - "7933.17", - "0.61", - "289.36", - "0.02", - "-2.72", - "-3506579.69", - "0.02", "0.00", - "8349168320.57", - "8349168320.57", - "0.28" + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "0.00", + "-0.00", + "0.00", + "0.00", + "0.00", + "nan" ], "0-10p_2": [ - "12825822.88", - "30866.19", - "0.24", - "0.00", + "17756353.90", + "38799.36", + "0.22", + "289.36", "0.00", - "-0.86", - "-11079693.91", - "0.06", + "-0.82", + "-14586273.60", + "0.08", "0.00", - "5952150789.38", - "5952150789.38", - "-0.30" + "14301319109.95", + "14301319109.95", + "0.41" ], "0-10z_2": [ - "3638985.32", + "0.00", "0.00", "0.00", "0.00", @@ -1109,7 +1109,7 @@ TABLE diff_ptax_xdec RESULTS: "0.00", "0.00", "0.00", - "0.00" + "nan" ], "10-20_2": [ "17740190.50", @@ -1252,17 +1252,17 @@ TABLE diff_ptax_xdec RESULTS: "-7.07" ], "95-99_2": [ - "7111665.44", + "7115702.98", "4203474.42", - "59.11", + "59.07", "1287.47", "0.02", - "-384.14", - "-2731853343.06", + "-383.92", + "-2731853343.21", "14.64", "0.00", - "50725308987.29", - "50725308987.29", + "50761464897.83", + "50761464897.83", "-6.90" ], "ALL_2": [ @@ -1280,18 +1280,18 @@ TABLE diff_ptax_xdec RESULTS: "-3.32" ], "Top 1%_2": [ - "1780889.07", + "1776851.53", "228185.04", - "12.81", + "12.84", "34259.60", - "1.92", - "-62.52", - "-111337026.17", + "1.93", + "-62.66", + "-111337026.02", "0.60", "0.00", - "13160093211.25", - "13160093211.25", - "-1.65" + "13123937300.71", + "13123937300.71", + "-1.64" ] } TABLE dist1_xbin RESULTS: diff --git a/taxcalc/tests/test_calculate.py b/taxcalc/tests/test_calculate.py index 29fd9d4ca..7b06ba04b 100644 --- a/taxcalc/tests/test_calculate.py +++ b/taxcalc/tests/test_calculate.py @@ -905,14 +905,16 @@ def test_distribution_tables(cps_subsample): def test_difference_table(cps_subsample): + cyr = 2014 pol = Policy() recs = Records.cps_constructor(data=cps_subsample) calc1 = Calculator(policy=pol, records=recs) - assert calc1.current_year == 2014 - reform = {2014: {'_SS_Earnings_c': [9e99]}} + assert calc1.current_year == cyr + reform = {cyr: {'_SS_Earnings_c': [9e99]}} pol.implement_reform(reform) - assert not pol.parameter_errors calc2 = Calculator(policy=pol, records=recs) + assert calc2.current_year == cyr + calc1.calc_all() calc2.calc_all() diff = calc1.difference_table(calc2) assert isinstance(diff, pd.DataFrame) diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py index 609e3c119..e515e1b59 100644 --- a/taxcalc/tests/test_utils.py +++ b/taxcalc/tests/test_utils.py @@ -194,8 +194,8 @@ def test_create_tables(cps_subsample): 116996252, 102458801, 580961247, - 62524760, - 34296230, + 63156380, + 33664610, 5637811] tabcol = 'tot_change' if not np.allclose(diff[tabcol].values, expected, @@ -217,8 +217,8 @@ def test_create_tables(cps_subsample): 20.14, 17.64, 100.00, - 10.76, - 5.90, + 10.87, + 5.79, 0.97] tabcol = 'share_of_change' if not np.allclose(diff[tabcol].values, expected, diff --git a/taxcalc/utils.py b/taxcalc/utils.py index 291567afe..39f7bdb5f 100644 --- a/taxcalc/utils.py +++ b/taxcalc/utils.py @@ -498,13 +498,11 @@ def additive_stats_dataframe(gpdf): res2['atinc2'] = res2['aftertax_income'] # add table_row column to res2 given specified groupby and income_measure if 'table_row' in res2: - print "already have table_row" # TODO: remove - print "min(table_row)=", res2['table_row'].min() # TODO: remove - print "max(table_row)=", res2['table_row'].max() # TODO: remove pdf = res2 else: if groupby == 'weighted_deciles': - pdf = add_quantile_table_row_variable(res2, income_measure, 10) + pdf = add_quantile_table_row_variable(res2, income_measure, + 10, decile_details=True) elif groupby == 'standard_income_bins': pdf = add_income_table_row_variable(res2, income_measure, bin_type='standard') @@ -516,68 +514,54 @@ def additive_stats_dataframe(gpdf): bin_type='soi') # create grouped Pandas DataFrame gpdf = pdf.groupby('table_row', as_index=False) - del pdf # create additive difference table statistics from gpdf - diff_stats = additive_stats_dataframe(gpdf) + diff_table = additive_stats_dataframe(gpdf) # calculate additive statistics on sums row - sum_row_diff_stats = get_sums(diff_stats)[diff_stats.columns] - # optionally create bottom decile details - if groupby == 'weighted_deciles': - pdf = copy.deepcopy(gpdf.get_group(1)) - pdf['table_row'] = pd.cut(pdf[income_measure], - bins=[-9e99, -1e-9, 1e-9, 9e99], - labels=[1, 2, 3]) - gpdfx = pdf.groupby('table_row', as_index=False) - rows = additive_stats_dataframe(gpdfx) - diff_stats = pd.concat([rows, diff_stats.iloc[1:10]]) - del rows - del pdf - del gpdfx - # append sum_row_additive_stats - diff_stats = diff_stats.append(sum_row_diff_stats) - # optionally create top decile details + sum_row = get_sums(diff_table)[diff_table.columns] + # handle placement of sum_row in table if groupby == 'weighted_deciles': - pdf = copy.deepcopy(gpdf.get_group(10)) - pdf = add_quantile_table_row_variable(pdf, income_measure, 10) - pdf['table_row'].replace(to_replace=[1, 2, 3, 4, 5], - value=[0, 0, 0, 0, 0], inplace=True) - pdf['table_row'].replace(to_replace=[6, 7, 8, 9], - value=[1, 1, 1, 1], inplace=True) - pdf['table_row'].replace(to_replace=[10], - value=[2], inplace=True) - gpdfx = pdf.groupby('table_row', as_index=False) - sdf = additive_stats_dataframe(gpdfx) - diff_stats = diff_stats.append(sdf, ignore_index=True) - del sdf - del pdf - del gpdfx + # compute top-decile row + lenindex = len(diff_table.index) + assert lenindex == 14 # rows should be indexed from 0 to 13 + topdec_row = get_sums(diff_table[11:lenindex])[diff_table.columns] + # move top-decile detail rows to make room for topdec_row and sum_row + diff_table = diff_table.reindex(index=range(0, lenindex + 2)) + diff_table.iloc[15] = diff_table.iloc[13] + diff_table.iloc[14] = diff_table.iloc[12] + diff_table.iloc[13] = diff_table.iloc[11] + diff_table.iloc[12] = sum_row + diff_table.iloc[11] = topdec_row + del topdec_row + else: + diff_table = diff_table.append(sum_row) # delete intermediate Pandas DataFrame objects del gpdf + del pdf # compute non-additive stats in each table cell - count = diff_stats['count'] - diff_stats['perc_cut'] = np.where(count > 0, - 100 * diff_stats['tax_cut'] / count, 0) - diff_stats['perc_inc'] = np.where(count > 0, - 100 * diff_stats['tax_inc'] / count, 0) - diff_stats['mean'] = np.where(count > 0, - diff_stats['tot_change'] / count, 0) - total_change = sum_row_diff_stats['tot_change'] - diff_stats['share_of_change'] = np.where(total_change == 0, np.nan, - (100 * diff_stats['tot_change'] / + count = diff_table['count'] + diff_table['perc_cut'] = np.where(count > 0, + 100 * diff_table['tax_cut'] / count, 0) + diff_table['perc_inc'] = np.where(count > 0, + 100 * diff_table['tax_inc'] / count, 0) + diff_table['mean'] = np.where(count > 0, + diff_table['tot_change'] / count, 0) + total_change = sum_row['tot_change'] + diff_table['share_of_change'] = np.where(total_change == 0, np.nan, + (100 * diff_table['tot_change'] / total_change)) - diff_stats['pc_aftertaxinc'] = (100 * (diff_stats['atinc2'] / - diff_stats['atinc1'] - 1)) - del diff_stats['atinc1'] - del diff_stats['atinc2'] - del count - del sum_row_diff_stats + diff_table['pc_aftertaxinc'] = (100 * (diff_table['atinc2'] / + diff_table['atinc1'] - 1)) # delete intermediate Pandas DataFrame objects + del diff_table['atinc1'] + del diff_table['atinc2'] + del count + del sum_row del res1 del res2 # set print display format for float table elements pd.options.display.float_format = '{:10,.2f}'.format - # put diff_stats columns in correct order - diff_stats = diff_stats.reindex(columns=DIFF_TABLE_COLUMNS) + # put diff_table columns in correct order + diff_table = diff_table.reindex(columns=DIFF_TABLE_COLUMNS) # add row names to table if using weighted_deciles or standard_income_bins if groupby == 'weighted_deciles': rownames = DECILE_ROW_NAMES @@ -586,11 +570,11 @@ def additive_stats_dataframe(gpdf): else: rownames = None if rownames: - assert len(diff_stats.index) == len(rownames) - diff_stats.index = rownames + assert len(diff_table.index) == len(rownames) + diff_table.index = rownames del rownames # return table as Pandas DataFrame - return diff_stats + return diff_table def create_diagnostic_table(vdf, year): From b92cacb8a06261cc8c6fc4a4f3de518a54c2d773 Mon Sep 17 00:00:00 2001 From: Martin Holmer Date: Tue, 17 Apr 2018 14:09:38 -0400 Subject: [PATCH 3/3] Add test to maintain complete code coverage --- taxcalc/tests/test_utils.py | 28 ++++++---------------------- taxcalc/utils.py | 4 +--- taxcalc/utilsprvt.py | 18 ------------------ 3 files changed, 7 insertions(+), 43 deletions(-) diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py index e515e1b59..391bf1e51 100644 --- a/taxcalc/tests/test_utils.py +++ b/taxcalc/tests/test_utils.py @@ -26,7 +26,6 @@ weighted_count, weighted_sum, weighted_mean, wage_weighted, agi_weighted, expanded_income_weighted, - weighted_perc_inc, weighted_perc_cut, add_income_table_row_variable, add_quantile_table_row_variable, mtr_graph_data, atr_graph_data, dec_graph_data, @@ -170,13 +169,16 @@ def test_create_tables(cps_subsample): tabcol = 'pc_aftertaxinc' if not np.allclose(diff[tabcol].values, expected, atol=0.005, rtol=0.0, equal_nan=True): - test_failure = True + test_failure print('diff', tabcol) for val in diff[tabcol].values: print('{:.2f},'.format(val)) - diff = create_difference_table(calc1.dataframe(DIFF_VARIABLES), - calc2.dataframe(DIFF_VARIABLES), + vdf1 = calc1.dataframe(DIFF_VARIABLES) + vdf2 = calc2.dataframe(DIFF_VARIABLES) + add_quantile_table_row_variable(vdf2, 'expanded_income', + 10, decile_details=True) + diff = create_difference_table(vdf1, vdf2, groupby='weighted_deciles', income_measure='expanded_income', tax_to_diff='combined') @@ -637,24 +639,6 @@ def test_weighted_sum(): pd.util.testing.assert_series_equal(exp, diffs) -def test_weighted_perc_inc(): - dfx = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label']) - grouped = dfx.groupby('label') - diffs = grouped.apply(weighted_perc_inc, 'tax_diff') - exp = pd.Series(data=[8. / 12., 1.0], index=['a', 'b']) - exp.index.name = 'label' - pd.util.testing.assert_series_equal(exp, diffs) - - -def test_weighted_perc_cut(): - dfx = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label']) - grouped = dfx.groupby('label') - diffs = grouped.apply(weighted_perc_cut, 'tax_diff') - exp = pd.Series(data=[4. / 12., 0.0], index=['a', 'b']) - exp.index.name = 'label' - pd.util.testing.assert_series_equal(exp, diffs) - - EPSILON = 1e-5 diff --git a/taxcalc/utils.py b/taxcalc/utils.py index 39f7bdb5f..e38d4189f 100644 --- a/taxcalc/utils.py +++ b/taxcalc/utils.py @@ -23,9 +23,7 @@ weighted_count_gt_zero, weighted_count, weighted_mean, wage_weighted, agi_weighted, - expanded_income_weighted, - weighted_perc_cut, - weighted_perc_inc) + expanded_income_weighted) # Items in the DIST_TABLE_COLUMNS list below correspond to the items in the diff --git a/taxcalc/utilsprvt.py b/taxcalc/utilsprvt.py index 2001ef531..26944c68b 100644 --- a/taxcalc/utilsprvt.py +++ b/taxcalc/utilsprvt.py @@ -70,21 +70,3 @@ def expanded_income_weighted(pdf, col_name): expinc = 'expanded_income' return ((pdf[col_name] * pdf[swght] * pdf[expinc]).sum() / ((pdf[swght] * pdf[expinc]).sum() + EPSILON)) - - -def weighted_perc_inc(pdf, col_name): - """ - Return weighted fraction (not percent) of positive values for the - variable with col_name in the specified Pandas DataFrame. - """ - return (weighted_count_gt_zero(pdf, col_name) / - (weighted_count(pdf) + EPSILON)) - - -def weighted_perc_cut(pdf, col_name): - """ - Return weighted fraction (not percent) of negative values for the - variable with col_name in the specified Pandas DataFrame. - """ - return (weighted_count_lt_zero(pdf, col_name) / - (weighted_count(pdf) + EPSILON))