From 63869a7759c0a63be2bae86fbe82ecce36486814 Mon Sep 17 00:00:00 2001
From: martinholmer
Date: Thu, 7 Sep 2017 06:09:14 -0400
Subject: [PATCH 1/5] Standardize names for table column and row lists

---
 taxcalc/dropq/dropq.py       | 35 ++++++++++---------
 taxcalc/dropq/dropq_utils.py | 17 ++++++---
 taxcalc/tests/test_utils.py  |  5 +--
 taxcalc/utils.py             | 68 +++++++++++++++++++++++++-----------
 4 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/taxcalc/dropq/dropq.py b/taxcalc/dropq/dropq.py
index b649b21a9..a65a3ec33 100644
--- a/taxcalc/dropq/dropq.py
+++ b/taxcalc/dropq/dropq.py
@@ -17,26 +17,27 @@
 import pandas as pd
 from taxcalc.dropq.dropq_utils import (dropq_calculate, random_seed,
-                                       dropq_summary)
-from taxcalc import (results, TABLE_LABELS, proportional_change_gdp,
-                     Growdiff, Growfactors, Policy)
+                                       dropq_summary,
+                                       AGGR_ROW_NAMES)
+from taxcalc import (results, DIST_TABLE_LABELS,
+                     proportional_change_gdp, Growdiff, Growfactors, Policy)
 
 # specify constants
-PLAN_COLUMN_TYPES = [float] * len(TABLE_LABELS)
+DIST_COLUMN_TYPES = [float] * len(DIST_TABLE_LABELS)
 
 DIFF_COLUMN_TYPES = [int, int, int, float, float, str, str, str, str]
 
-DECILE_ROW_NAMES = ['perc0-10', 'perc10-20', 'perc20-30', 'perc30-40',
-                    'perc40-50', 'perc50-60', 'perc60-70', 'perc70-80',
-                    'perc80-90', 'perc90-100', 'all']
+DEC_ROW_NAMES = ['perc0-10', 'perc10-20', 'perc20-30', 'perc30-40',
+                 'perc40-50', 'perc50-60', 'perc60-70', 'perc70-80',
+                 'perc80-90', 'perc90-100', 'all']
 
 BIN_ROW_NAMES = ['less_than_10', 'ten_twenty', 'twenty_thirty',
                  'thirty_forty', 'forty_fifty', 'fifty_seventyfive',
                  'seventyfive_hundred', 'hundred_twohundred',
                  'twohundred_fivehundred', 'fivehundred_thousand',
                  'thousand_up', 'all']
 
-TOTAL_ROW_NAMES = ['ind_tax', 'payroll_tax', 'combined_tax']
+AGG_ROW_NAMES = AGGR_ROW_NAMES
 
 GDP_ELAST_ROW_NAMES = ['gdp_elasticity']
@@ -120,13 +121,13 @@ def run_nth_year_tax_calc_model(year_n, start_year,
     # construct DataFrames containing aggregate tax totals
     # ... for reform-minus-baseline difference
    aggrd = [aggr_itax_d, aggr_ptax_d, aggr_comb_d]
-    aggr_d = pd.DataFrame(data=aggrd, index=TOTAL_ROW_NAMES)
+    aggr_d = pd.DataFrame(data=aggrd, index=AGGR_ROW_NAMES)
     # ... for baseline
     aggr1 = [aggr_itax_1, aggr_ptax_1, aggr_comb_1]
-    aggr_1 = pd.DataFrame(data=aggr1, index=TOTAL_ROW_NAMES)
+    aggr_1 = pd.DataFrame(data=aggr1, index=AGGR_ROW_NAMES)
     # ... for reform
     aggr2 = [aggr_itax_2, aggr_ptax_2, aggr_comb_2]
-    aggr_2 = pd.DataFrame(data=aggr2, index=TOTAL_ROW_NAMES)
+    aggr_2 = pd.DataFrame(data=aggr2, index=AGGR_ROW_NAMES)
 
     elapsed_time = time.time() - start_time
     print('elapsed time for this run: ', elapsed_time)
@@ -155,13 +156,13 @@ def append_year(pdf):
             append_year(aggr_2))
 
     # optionally construct JSON results tables for year n
-    dec_row_names_n = [x + '_' + str(year_n) for x in DECILE_ROW_NAMES]
+    dec_row_names_n = [x + '_' + str(year_n) for x in DEC_ROW_NAMES]
     dist2_dec_table_n = create_json_table(dist2_dec,
                                           row_names=dec_row_names_n,
-                                          column_types=PLAN_COLUMN_TYPES)
+                                          column_types=DIST_COLUMN_TYPES)
     dist1_dec_table_n = create_json_table(dist1_dec,
                                           row_names=dec_row_names_n,
-                                          column_types=PLAN_COLUMN_TYPES)
+                                          column_types=DIST_COLUMN_TYPES)
     diff_itax_dec_table_n = create_json_table(diff_itax_dec,
                                               row_names=dec_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
@@ -174,10 +175,10 @@ def append_year(pdf):
     bin_row_names_n = [x + '_' + str(year_n) for x in BIN_ROW_NAMES]
     dist2_bin_table_n = create_json_table(dist2_bin,
                                           row_names=bin_row_names_n,
-                                          column_types=PLAN_COLUMN_TYPES)
+                                          column_types=DIST_COLUMN_TYPES)
     dist1_bin_table_n = create_json_table(dist1_bin,
                                           row_names=bin_row_names_n,
-                                          column_types=PLAN_COLUMN_TYPES)
+                                          column_types=DIST_COLUMN_TYPES)
     diff_itax_bin_table_n = create_json_table(diff_itax_bin,
                                               row_names=bin_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
@@ -187,7 +188,7 @@ def append_year(pdf):
     diff_comb_bin_table_n = create_json_table(diff_comb_bin,
                                               row_names=bin_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
-    total_row_names_n = [x + '_' + str(year_n) for x in TOTAL_ROW_NAMES]
+    total_row_names_n = [x + '_' + str(year_n) for x in AGGR_ROW_NAMES]
     aggr_d_table_n = create_json_table(aggr_d,
                                        row_names=total_row_names_n)
     aggr_d_table_n = dict((k, v[0]) for k, v in aggr_d_table_n.items())
diff --git a/taxcalc/dropq/dropq_utils.py b/taxcalc/dropq/dropq_utils.py
index 8646b2290..64d519d84 100644
--- a/taxcalc/dropq/dropq_utils.py
+++ b/taxcalc/dropq/dropq_utils.py
@@ -12,7 +12,8 @@
                      Consumption, Behavior, Growfactors, Growdiff)
 from taxcalc.utils import (add_income_bins, add_quantile_bins, results,
                            create_difference_table, create_distribution_table,
-                           STATS_COLUMNS, TABLE_COLUMNS, WEBAPP_INCOME_BINS)
+                           STATS_COLUMNS, DIST_TABLE_COLUMNS,
+                           WEBAPP_INCOME_BINS)
 
 
 def check_years(start_year, year_n):
@@ -279,9 +280,11 @@ def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz):
         df2[col + suffix] = (df2[col] * df2['nofuzz'] -
                              df1[col] * df2['nofuzz'] + df1[col])
     # main logic of fuzz_df2_records
-    cols_to_skip = set(['num_returns_ItemDed', 'num_returns_StandardDed',
-                        'num_returns_AMT', 's006'])
-    columns_to_fuzz = (set(TABLE_COLUMNS) | set(STATS_COLUMNS)) - cols_to_skip
+    skips = set(['num_returns_ItemDed',
+                 'num_returns_StandardDed',
+                 'num_returns_AMT',
+                 's006'])
+    columns_to_fuzz = (set(DIST_TABLE_COLUMNS) | set(STATS_COLUMNS)) - skips
     df2['mask'] = mask
     # always use expanded income in df1 baseline to groupby into bins
     df2['expanded_income_baseline'] = df1['expanded_income']
@@ -291,11 +294,15 @@ def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz):
     return df2
 
 
+AGGR_ROW_NAMES = ['ind_tax', 'payroll_tax', 'combined_tax']
+
+
 def dropq_summary(df1, df2, mask):
     """
     df1 contains raw results for baseline plan
     df2 contains raw results for reform plan
-    mask is the boolean array specifying which rows might be fuzzed
+    mask is the boolean array specifying which records might be fuzzed
+    returns dictionary of summary results DataFrames
     """
     # pylint: disable=too-many-locals
diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py
index be5f46b74..ca4ea3eec 100644
--- a/taxcalc/tests/test_utils.py
+++ b/taxcalc/tests/test_utils.py
@@ -15,7 +15,8 @@
 import pytest
 # pylint: disable=import-error
 from taxcalc import Policy, Records, Behavior, Calculator
-from taxcalc.utils import (TABLE_COLUMNS, TABLE_LABELS, STATS_COLUMNS,
+from taxcalc.utils import (STATS_COLUMNS,
+                           DIST_TABLE_COLUMNS, DIST_TABLE_LABELS,
                            create_distribution_table, create_difference_table,
                            weighted_count_lt_zero, weighted_count_gt_zero,
                            weighted_count, weighted_sum, weighted_mean,
@@ -52,7 +53,7 @@ def test_validity_of_name_lists():
-    assert len(TABLE_COLUMNS) == len(TABLE_LABELS)
+    assert len(DIST_TABLE_COLUMNS) == len(DIST_TABLE_LABELS)
     assert set(STATS_COLUMNS).issubset(Records.CALCULATED_VARS | {'s006'})
diff --git a/taxcalc/utils.py b/taxcalc/utils.py
index f57e5bf74..039c403ed 100644
--- a/taxcalc/utils.py
+++ b/taxcalc/utils.py
@@ -32,22 +32,49 @@
                  'c05800', 'othertaxes', 'refund', 'c07100', 'iitax',
                  'payrolltax', 'combined', 's006']
 
-# Items in the TABLE_COLUMNS list below correspond to the items in the
-# TABLE_LABELS list below; this correspondence allows us to use TABLE_LABELS
-# to map a label to the correct column in our distribution tables.
-TABLE_COLUMNS = ['s006', 'c00100', 'num_returns_StandardDed', 'standard',
-                 'num_returns_ItemDed', 'c04470', 'c04600', 'c04800', 'taxbc',
-                 'c62100', 'num_returns_AMT', 'c09600', 'c05800', 'c07100',
-                 'othertaxes', 'refund', 'iitax', 'payrolltax', 'combined']
-
-TABLE_LABELS = ['Returns', 'AGI', 'Standard Deduction Filers',
-                'Standard Deduction', 'Itemizers',
-                'Itemized Deduction', 'Personal Exemption',
-                'Taxable Income', 'Regular Tax', 'AMTI', 'AMT Filers', 'AMT',
-                'Tax before Credits', 'Non-refundable Credits',
-                'Other Taxes', 'Refundable Credits',
-                'Individual Income Tax Liabilities', 'Payroll Tax Liablities',
-                'Combined Payroll and Individual Income Tax Liabilities']
+# Items in the DIST_TABLE_COLUMNS list below correspond to the items in the
+# DIST_TABLE_LABELS list below; this correspondence allows us to use
+# DIST_TABLE_LABELS to map a label to the correct column in the distribution
+# tables.
+DIST_TABLE_COLUMNS = ['s006',
+                      'c00100',
+                      'num_returns_StandardDed',
+                      'standard',
+                      'num_returns_ItemDed',
+                      'c04470',
+                      'c04600',
+                      'c04800',
+                      'taxbc',
+                      'c62100',
+                      'num_returns_AMT',
+                      'c09600',
+                      'c05800',
+                      'c07100',
+                      'othertaxes',
+                      'refund',
+                      'iitax',
+                      'payrolltax',
+                      'combined']
+
+DIST_TABLE_LABELS = ['Returns',
+                     'AGI',
+                     'Standard Deduction Filers',
+                     'Standard Deduction',
+                     'Itemizers',
+                     'Itemized Deduction',
+                     'Personal Exemption',
+                     'Taxable Income',
+                     'Regular Tax',
+                     'AMTI',
+                     'AMT Filers',
+                     'AMT',
+                     'Tax before Credits',
+                     'Non-refundable Credits',
+                     'Other Taxes',
+                     'Refundable Credits',
+                     'Individual Income Tax Liabilities',
+                     'Payroll Tax Liablities',
+                     'Combined Payroll and Individual Income Tax Liabilities']
 
 # Following list is used in our difference table to label its columns.
 DIFF_TABLE_LABELS = ['Tax Units with Tax Cut',
@@ -325,13 +352,14 @@ def add_columns(pdf):
     # manipulate the data given specified result_type
     if result_type == 'weighted_sum':
         pdf = weighted(pdf, STATS_COLUMNS)
-        gpdf_mean = pdf.groupby('bins', as_index=False)[TABLE_COLUMNS].sum()
+        gpdf = pdf.groupby('bins', as_index=False)
+        gpdf_mean = gpdf[DIST_TABLE_COLUMNS].sum()
         gpdf_mean.drop('bins', axis=1, inplace=True)
-        sum_row = get_sums(pdf)[TABLE_COLUMNS]
+        sum_row = get_sums(pdf)[DIST_TABLE_COLUMNS]
     elif result_type == 'weighted_avg':
-        gpdf_mean = weighted_avg_allcols(pdf, TABLE_COLUMNS,
+        gpdf_mean = weighted_avg_allcols(pdf, DIST_TABLE_COLUMNS,
                                          income_measure=income_measure)
-        sum_row = get_sums(pdf, not_available=True)[TABLE_COLUMNS]
+        sum_row = get_sums(pdf, not_available=True)[DIST_TABLE_COLUMNS]
     else:
         msg = "result_type must be either 'weighted_sum' or 'weighted_avg'"
         raise ValueError(msg)

From 82f835c040631a9c68662929bc30529b361c8977 Mon Sep 17 00:00:00 2001
From: martinholmer
Date: Thu, 7 Sep 2017 06:48:35 -0400
Subject: [PATCH 2/5] Make dropq_summary return dictionary-of-DataFrame results

---
 taxcalc/dropq/dropq.py       |  73 ++++++++-------------
 taxcalc/dropq/dropq_utils.py | 122 +++++++++++++++++++++--------------
 2 files changed, 100 insertions(+), 95 deletions(-)

diff --git a/taxcalc/dropq/dropq.py b/taxcalc/dropq/dropq.py
index a65a3ec33..1d0c0b19f 100644
--- a/taxcalc/dropq/dropq.py
+++ b/taxcalc/dropq/dropq.py
@@ -108,26 +108,7 @@ def run_nth_year_tax_calc_model(year_n, start_year,
     np.random.seed(seed)  # pylint: disable=no-member
 
     # construct dropq summary results from raw results
-    (dist1_dec, dist2_dec,
-     diff_itax_dec, diff_ptax_dec, diff_comb_dec,
-     dist1_bin, dist2_bin,
-     diff_itax_bin, diff_ptax_bin, diff_comb_bin,
-     aggr_itax_d, aggr_ptax_d, aggr_comb_d,
-     aggr_itax_1, aggr_ptax_1, aggr_comb_1,
-     aggr_itax_2, aggr_ptax_2, aggr_comb_2) = dropq_summary(rawres1,
-                                                            rawres2,
-                                                            mask)
-
-    # construct DataFrames containing aggregate tax totals
-    # ... for reform-minus-baseline difference
-    aggrd = [aggr_itax_d, aggr_ptax_d, aggr_comb_d]
-    aggr_d = pd.DataFrame(data=aggrd, index=AGGR_ROW_NAMES)
-    # ... for baseline
-    aggr1 = [aggr_itax_1, aggr_ptax_1, aggr_comb_1]
-    aggr_1 = pd.DataFrame(data=aggr1, index=AGGR_ROW_NAMES)
-    # ... for reform
-    aggr2 = [aggr_itax_2, aggr_ptax_2, aggr_comb_2]
-    aggr_2 = pd.DataFrame(data=aggr2, index=AGGR_ROW_NAMES)
+    summ = dropq_summary(rawres1, rawres2, mask)
 
     elapsed_time = time.time() - start_time
     print('elapsed time for this run: ', elapsed_time)
@@ -141,61 +122,61 @@ def append_year(pdf):
 
     # optionally return non-JSON results
     if not return_json:
-        return (append_year(dist2_dec),
-                append_year(dist1_dec),
-                append_year(diff_itax_dec),
-                append_year(diff_ptax_dec),
-                append_year(diff_comb_dec),
-                append_year(dist2_bin),
-                append_year(dist1_bin),
-                append_year(diff_itax_bin),
-                append_year(diff_ptax_bin),
-                append_year(diff_comb_bin),
-                append_year(aggr_d),
-                append_year(aggr_1),
-                append_year(aggr_2))
+        return (append_year(summ['dist2_dec']),
+                append_year(summ['dist1_dec']),
+                append_year(summ['diff_itax_dec']),
+                append_year(summ['diff_ptax_dec']),
+                append_year(summ['diff_comb_dec']),
+                append_year(summ['dist2_bin']),
+                append_year(summ['dist1_bin']),
+                append_year(summ['diff_itax_bin']),
+                append_year(summ['diff_ptax_bin']),
+                append_year(summ['diff_comb_bin']),
+                append_year(summ['aggr_d']),
+                append_year(summ['aggr_1']),
+                append_year(summ['aggr_2']))
 
     # optionally construct JSON results tables for year n
     dec_row_names_n = [x + '_' + str(year_n) for x in DEC_ROW_NAMES]
-    dist2_dec_table_n = create_json_table(dist2_dec,
+    dist2_dec_table_n = create_json_table(summ['dist2_dec'],
                                           row_names=dec_row_names_n,
                                           column_types=DIST_COLUMN_TYPES)
-    dist1_dec_table_n = create_json_table(dist1_dec,
+    dist1_dec_table_n = create_json_table(summ['dist1_dec'],
                                           row_names=dec_row_names_n,
                                           column_types=DIST_COLUMN_TYPES)
-    diff_itax_dec_table_n = create_json_table(diff_itax_dec,
+    diff_itax_dec_table_n = create_json_table(summ['diff_itax_dec'],
                                               row_names=dec_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
-    diff_ptax_dec_table_n = create_json_table(diff_ptax_dec,
+    diff_ptax_dec_table_n = create_json_table(summ['diff_ptax_dec'],
                                               row_names=dec_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
-    diff_comb_dec_table_n = create_json_table(diff_comb_dec,
+    diff_comb_dec_table_n = create_json_table(summ['diff_comb_dec'],
                                               row_names=dec_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
     bin_row_names_n = [x + '_' + str(year_n) for x in BIN_ROW_NAMES]
-    dist2_bin_table_n = create_json_table(dist2_bin,
+    dist2_bin_table_n = create_json_table(summ['dist2_bin'],
                                           row_names=bin_row_names_n,
                                           column_types=DIST_COLUMN_TYPES)
-    dist1_bin_table_n = create_json_table(dist1_bin,
+    dist1_bin_table_n = create_json_table(summ['dist1_bin'],
                                           row_names=bin_row_names_n,
                                           column_types=DIST_COLUMN_TYPES)
-    diff_itax_bin_table_n = create_json_table(diff_itax_bin,
+    diff_itax_bin_table_n = create_json_table(summ['diff_itax_bin'],
                                               row_names=bin_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
-    diff_ptax_bin_table_n = create_json_table(diff_ptax_bin,
+    diff_ptax_bin_table_n = create_json_table(summ['diff_ptax_bin'],
                                               row_names=bin_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
-    diff_comb_bin_table_n = create_json_table(diff_comb_bin,
+    diff_comb_bin_table_n = create_json_table(summ['diff_comb_bin'],
                                               row_names=bin_row_names_n,
                                               column_types=DIFF_COLUMN_TYPES)
     total_row_names_n = [x + '_' + str(year_n) for x in AGGR_ROW_NAMES]
-    aggr_d_table_n = create_json_table(aggr_d,
+    aggr_d_table_n = create_json_table(summ['aggr_d'],
                                        row_names=total_row_names_n)
     aggr_d_table_n = dict((k, v[0]) for k, v in aggr_d_table_n.items())
-    aggr_1_table_n = create_json_table(aggr_1,
+    aggr_1_table_n = create_json_table(summ['aggr_1'],
                                        row_names=total_row_names_n)
     aggr_1_table_n = dict((k, v[0]) for k, v in aggr_1_table_n.items())
-    aggr_2_table_n = create_json_table(aggr_2,
+    aggr_2_table_n = create_json_table(summ['aggr_2'],
                                        row_names=total_row_names_n)
     aggr_2_table_n = dict((k, v[0]) for k, v in aggr_2_table_n.items())
diff --git a/taxcalc/dropq/dropq_utils.py b/taxcalc/dropq/dropq_utils.py
index 64d519d84..aed7b8ff8 100644
--- a/taxcalc/dropq/dropq_utils.py
+++ b/taxcalc/dropq/dropq_utils.py
@@ -8,6 +8,7 @@
 import copy
 import hashlib
 import numpy as np
+import pandas as pd
 from taxcalc import (Policy, Records, Calculator,
                      Consumption, Behavior, Growfactors, Growdiff)
 from taxcalc.utils import (add_income_bins, add_quantile_bins, results,
@@ -304,10 +305,12 @@ def dropq_summary(df1, df2, mask):
     mask is the boolean array specifying which records might be fuzzed
     returns dictionary of summary results DataFrames
     """
-    # pylint: disable=too-many-locals
+    # pylint: disable=too-many-statements,too-many-locals
 
     df2 = fuzz_df2_records(df1, df2, mask)
 
+    summ = dict()
+
     # tax difference totals between reform and baseline
     tdiff = df2['iitax_agg'] - df1['iitax']
     aggr_itax_d = (tdiff * df2['s006']).sum()
@@ -315,66 +318,98 @@ def dropq_summary(df1, df2, mask):
     aggr_ptax_d = (tdiff * df2['s006']).sum()
     tdiff = df2['combined_agg'] - df1['combined']
     aggr_comb_d = (tdiff * df2['s006']).sum()
+    aggrd = [aggr_itax_d, aggr_ptax_d, aggr_comb_d]
+    summ['aggr_d'] = pd.DataFrame(data=aggrd, index=AGGR_ROW_NAMES)
 
     # totals for baseline
     aggr_itax_1 = (df1['iitax'] * df1['s006']).sum()
     aggr_ptax_1 = (df1['payrolltax'] * df1['s006']).sum()
     aggr_comb_1 = (df1['combined'] * df1['s006']).sum()
+    aggr1 = [aggr_itax_1, aggr_ptax_1, aggr_comb_1]
+    summ['aggr_1'] = pd.DataFrame(data=aggr1, index=AGGR_ROW_NAMES)
 
     # totals for reform
     aggr_itax_2 = (df2['iitax_agg'] * df2['s006']).sum()
     aggr_ptax_2 = (df2['payrolltax_agg'] * df2['s006']).sum()
     aggr_comb_2 = (df2['combined_agg'] * df2['s006']).sum()
+    aggr2 = [aggr_itax_2, aggr_ptax_2, aggr_comb_2]
+    summ['aggr_2'] = pd.DataFrame(data=aggr2, index=AGGR_ROW_NAMES)
 
-    # create difference tables grouped by deciles and bins
+    # create difference tables grouped by deciles
     df2['iitax'] = df2['iitax_xdec']
-    diff_itax_dec = create_difference_table(df1, df2,
-                                            groupby='weighted_deciles',
-                                            income_measure='expanded_income',
-                                            tax_to_diff='iitax')
+    summ['diff_itax_dec'] = \
+        create_difference_table(df1, df2,
+                                groupby='weighted_deciles',
+                                income_measure='expanded_income',
+                                tax_to_diff='iitax')
+
     df2['payrolltax'] = df2['payrolltax_xdec']
-    diff_ptax_dec = create_difference_table(df1, df2,
-                                            groupby='weighted_deciles',
-                                            income_measure='expanded_income',
-                                            tax_to_diff='payrolltax')
+    summ['diff_ptax_dec'] = \
+        create_difference_table(df1, df2,
+                                groupby='weighted_deciles',
+                                income_measure='expanded_income',
+                                tax_to_diff='payrolltax')
+
     df2['combined'] = df2['combined_xdec']
-    diff_comb_dec = create_difference_table(df1, df2,
-                                            groupby='weighted_deciles',
-                                            income_measure='expanded_income',
-                                            tax_to_diff='combined')
+    summ['diff_comb_dec'] = \
+        create_difference_table(df1, df2,
+                                groupby='weighted_deciles',
+                                income_measure='expanded_income',
+                                tax_to_diff='combined')
+
+    # create difference tables grouped by bins (removing negative-income bin)
     df2['iitax'] = df2['iitax_xbin']
-    diff_itax_bin = create_difference_table(df1, df2,
-                                            groupby='webapp_income_bins',
-                                            income_measure='expanded_income',
-                                            tax_to_diff='iitax')
+    diff_itax_bin = \
+        create_difference_table(df1, df2,
+                                groupby='webapp_income_bins',
+                                income_measure='expanded_income',
+                                tax_to_diff='iitax')
+    diff_itax_bin.drop(diff_itax_bin.index[0], inplace=True)
+    summ['diff_itax_bin'] = diff_itax_bin
+
     df2['payrolltax'] = df2['payrolltax_xbin']
-    diff_ptax_bin = create_difference_table(df1, df2,
-                                            groupby='webapp_income_bins',
-                                            income_measure='expanded_income',
-                                            tax_to_diff='iitax')
+    diff_ptax_bin = \
+        create_difference_table(df1, df2,
+                                groupby='webapp_income_bins',
+                                income_measure='expanded_income',
+                                tax_to_diff='iitax')
+    diff_ptax_bin.drop(diff_ptax_bin.index[0], inplace=True)
+    summ['diff_ptax_bin'] = diff_ptax_bin
+
     df2['combined'] = df2['combined_xbin']
-    diff_comb_bin = create_difference_table(df1, df2,
-                                            groupby='webapp_income_bins',
-                                            income_measure='expanded_income',
-                                            tax_to_diff='combined')
-
-    # create distribution tables grouped by deciles and bins
-    dist1_dec = create_distribution_table(df1, groupby='weighted_deciles',
-                                          income_measure='expanded_income',
-                                          result_type='weighted_sum')
-    dist1_bin = create_distribution_table(df1, groupby='webapp_income_bins',
-                                          income_measure='expanded_income',
-                                          result_type='weighted_sum')
+    diff_comb_bin = \
+        create_difference_table(df1, df2,
+                                groupby='webapp_income_bins',
+                                income_measure='expanded_income',
+                                tax_to_diff='combined')
+    diff_comb_bin.drop(diff_comb_bin.index[0], inplace=True)
+    summ['diff_comb_bin'] = diff_comb_bin
+
+    # create distribution tables grouped by deciles
+    summ['dist1_dec'] = \
+        create_distribution_table(df1, groupby='weighted_deciles',
+                                  income_measure='expanded_income',
+                                  result_type='weighted_sum')
+
     suffix = '_xdec'
     df2_cols_with_suffix = [c for c in list(df2) if c.endswith(suffix)]
     for col in df2_cols_with_suffix:
         root_col_name = col.replace(suffix, '')
         df2[root_col_name] = df2[col]
     df2['expanded_income_baseline'] = df1['expanded_income']
-    dist2_dec = \
+    summ['dist2_dec'] = \
         create_distribution_table(df2, groupby='weighted_deciles',
                                   income_measure='expanded_income_baseline',
                                   result_type='weighted_sum')
+
+    # create distribution tables grouped by bins (removing negative-income bin)
+    dist1_bin = \
+        create_distribution_table(df1, groupby='webapp_income_bins',
+                                  income_measure='expanded_income',
+                                  result_type='weighted_sum')
+    dist1_bin.drop(dist1_bin.index[0], inplace=True)
+    summ['dist1_bin'] = dist1_bin
+
     suffix = '_xbin'
     df2_cols_with_suffix = [c for c in list(df2) if c.endswith(suffix)]
     for col in df2_cols_with_suffix:
@@ -385,19 +420,8 @@ def dropq_summary(df1, df2, mask):
         create_distribution_table(df2, groupby='webapp_income_bins',
                                   income_measure='expanded_income_baseline',
                                   result_type='weighted_sum')
-
-    # remove negative-income bin from each bin result
-    dist1_bin.drop(dist1_bin.index[0], inplace=True)
     dist2_bin.drop(dist2_bin.index[0], inplace=True)
-    diff_itax_bin.drop(diff_itax_bin.index[0], inplace=True)
-    diff_ptax_bin.drop(diff_ptax_bin.index[0], inplace=True)
-    diff_comb_bin.drop(diff_comb_bin.index[0], inplace=True)
+    summ['dist2_bin'] = dist2_bin
 
-    # return tupl of summary results
-    return (dist1_dec, dist2_dec,
-            diff_itax_dec, diff_ptax_dec, diff_comb_dec,
-            dist1_bin, dist2_bin,
-            diff_itax_bin, diff_ptax_bin, diff_comb_bin,
-            aggr_itax_d, aggr_ptax_d, aggr_comb_d,
-            aggr_itax_1, aggr_ptax_1, aggr_comb_1,
-            aggr_itax_2, aggr_ptax_2, aggr_comb_2)
+    # return dictionary of summary results
+    return summ

From d9fff0fb3851091c3aceba7188b622019e609ff6 Mon Sep 17 00:00:00 2001
From: martinholmer
Date: Thu, 7 Sep 2017 09:09:21 -0400
Subject: [PATCH 3/5] Make run_nth_year_tax_calc_model return dict (not tuple)

---
 taxcalc/dropq/dropq.py      | 97 +++++++++++--------------------------
 taxcalc/tests/test_dropq.py | 14 +++---
 2 files changed, 36 insertions(+), 75 deletions(-)

diff --git a/taxcalc/dropq/dropq.py b/taxcalc/dropq/dropq.py
index 1d0c0b19f..a3816a6b4 100644
--- a/taxcalc/dropq/dropq.py
+++ b/taxcalc/dropq/dropq.py
@@ -122,78 +122,39 @@ def append_year(pdf):
 
     # optionally return non-JSON results
     if not return_json:
-        return (append_year(summ['dist2_dec']),
-                append_year(summ['dist1_dec']),
-                append_year(summ['diff_itax_dec']),
-                append_year(summ['diff_ptax_dec']),
-                append_year(summ['diff_comb_dec']),
-                append_year(summ['dist2_bin']),
-                append_year(summ['dist1_bin']),
-                append_year(summ['diff_itax_bin']),
-                append_year(summ['diff_ptax_bin']),
-                append_year(summ['diff_comb_bin']),
-                append_year(summ['aggr_d']),
-                append_year(summ['aggr_1']),
-                append_year(summ['aggr_2']))
+        res = dict()
+        for tbl in summ:
+            res[tbl] = append_year(summ[tbl])
+        return res
 
     # optionally construct JSON results tables for year n
     dec_row_names_n = [x + '_' + str(year_n) for x in DEC_ROW_NAMES]
-    dist2_dec_table_n = create_json_table(summ['dist2_dec'],
-                                          row_names=dec_row_names_n,
-                                          column_types=DIST_COLUMN_TYPES)
-    dist1_dec_table_n = create_json_table(summ['dist1_dec'],
-                                          row_names=dec_row_names_n,
-                                          column_types=DIST_COLUMN_TYPES)
-    diff_itax_dec_table_n = create_json_table(summ['diff_itax_dec'],
-                                              row_names=dec_row_names_n,
-                                              column_types=DIFF_COLUMN_TYPES)
-    diff_ptax_dec_table_n = create_json_table(summ['diff_ptax_dec'],
-                                              row_names=dec_row_names_n,
-                                              column_types=DIFF_COLUMN_TYPES)
-    diff_comb_dec_table_n = create_json_table(summ['diff_comb_dec'],
-                                              row_names=dec_row_names_n,
-                                              column_types=DIFF_COLUMN_TYPES)
     bin_row_names_n = [x + '_' + str(year_n) for x in BIN_ROW_NAMES]
-    dist2_bin_table_n = create_json_table(summ['dist2_bin'],
-                                          row_names=bin_row_names_n,
-                                          column_types=DIST_COLUMN_TYPES)
-    dist1_bin_table_n = create_json_table(summ['dist1_bin'],
-                                          row_names=bin_row_names_n,
-                                          column_types=DIST_COLUMN_TYPES)
-    diff_itax_bin_table_n = create_json_table(summ['diff_itax_bin'],
-                                              row_names=bin_row_names_n,
-                                              column_types=DIFF_COLUMN_TYPES)
-    diff_ptax_bin_table_n = create_json_table(summ['diff_ptax_bin'],
-                                              row_names=bin_row_names_n,
-                                              column_types=DIFF_COLUMN_TYPES)
-    diff_comb_bin_table_n = create_json_table(summ['diff_comb_bin'],
-                                              row_names=bin_row_names_n,
-                                              column_types=DIFF_COLUMN_TYPES)
-    total_row_names_n = [x + '_' + str(year_n) for x in AGGR_ROW_NAMES]
-    aggr_d_table_n = create_json_table(summ['aggr_d'],
-                                       row_names=total_row_names_n)
-    aggr_d_table_n = dict((k, v[0]) for k, v in aggr_d_table_n.items())
-    aggr_1_table_n = create_json_table(summ['aggr_1'],
-                                       row_names=total_row_names_n)
-    aggr_1_table_n = dict((k, v[0]) for k, v in aggr_1_table_n.items())
-    aggr_2_table_n = create_json_table(summ['aggr_2'],
-                                       row_names=total_row_names_n)
-    aggr_2_table_n = dict((k, v[0]) for k, v in aggr_2_table_n.items())
-
-    # return JSON results
-    return (dist2_dec_table_n,
-            dist1_dec_table_n,
-            diff_itax_dec_table_n,
-            diff_ptax_dec_table_n,
-            diff_comb_dec_table_n,
-            dist2_bin_table_n,
-            dist1_bin_table_n,
-            diff_itax_bin_table_n,
-            diff_ptax_bin_table_n,
-            diff_comb_bin_table_n,
-            aggr_d_table_n,
-            aggr_1_table_n,
-            aggr_2_table_n)
+    agg_row_names_n = [x + '_' + str(year_n) for x in AGG_ROW_NAMES]
+    info = dict()
+    for tbl in summ:
+        info[tbl] = {'row_names': [], 'col_types': []}
+        if 'dec' in tbl:
+            info[tbl]['row_names'] = dec_row_names_n
+        elif 'bin' in tbl:
+            info[tbl]['row_names'] = bin_row_names_n
+        else:
+            info[tbl]['row_names'] = agg_row_names_n
+        if 'dist' in tbl:
+            info[tbl]['col_types'] = DIST_COLUMN_TYPES
+        elif 'diff' in tbl:
+            info[tbl]['col_types'] = DIFF_COLUMN_TYPES
+    res = dict()
+    for tbl in summ:
+        if 'aggr' in tbl:
+            res_table = create_json_table(summ[tbl],
+                                          row_names=info[tbl]['row_names'])
+            res[tbl] = dict((k, v[0]) for k, v in res_table.items())
+        else:
+            res[tbl] = create_json_table(summ[tbl],
+                                         row_names=info[tbl]['row_names'],
+                                         column_types=info[tbl]['col_types'])
+    return res
 
 
 def run_nth_year_gdp_elast_model(year_n, start_year,
diff --git a/taxcalc/tests/test_dropq.py b/taxcalc/tests/test_dropq.py
index c8792e9ca..0f81c70a9 100644
--- a/taxcalc/tests/test_dropq.py
+++ b/taxcalc/tests/test_dropq.py
@@ -78,19 +78,19 @@ def test_run_tax_calc_model(puf_subsample, resjson):
                                         return_json=resjson)
     assert len(res) == 13
     dump = False  # set to True in order to dump returned results and fail test
-    for idx in range(0, len(res)):
+    for tbl in sorted(res.keys()):
         if resjson:
-            assert isinstance(res[idx], dict)
+            assert isinstance(res[tbl], dict)
         else:
-            assert isinstance(res[idx], pd.DataFrame)
+            assert isinstance(res[tbl], pd.DataFrame)
         if dump:
             if resjson:
-                cols = sorted(res[idx].keys())
+                cols = sorted(res[tbl].keys())
             else:
-                cols = sorted(list(res[idx]))
+                cols = sorted(list(res[tbl]))
             for col in cols:
-                print('<>'.format(idx, col))
-                print(res[idx][col])
+                print('<>'.format(tbl, col))
+                print(res[tbl][col])
     assert not dump

From c6b09fa19346b493079fc783e8a9cf6e49d7410e Mon Sep 17 00:00:00 2001
From: martinholmer
Date: Thu, 7 Sep 2017 09:37:51 -0400
Subject: [PATCH 4/5] Fix test to handle returned dict (not tuple)

---
 taxcalc/tests/test_dropq.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/taxcalc/tests/test_dropq.py b/taxcalc/tests/test_dropq.py
index 0f81c70a9..8a3f29d59 100644
--- a/taxcalc/tests/test_dropq.py
+++ b/taxcalc/tests/test_dropq.py
@@ -186,12 +186,11 @@ def test_with_pufcsv(puf_fullsample):
     # create a Public Use File object
     tax_data = puf_fullsample
     # call run_nth_year_tax_calc_model function
-    restuple = run_nth_year_tax_calc_model(year_n, start_year,
-                                           tax_data, usermods,
-                                           return_json=True)
-    total = restuple[len(restuple) - 1]  # the last of element of the tuple
-    dropq_reform_revenue = float(total['combined_tax_9'])
-    dropq_reform_revenue *= 1e-9  # convert to billions of dollars
+    resdict = run_nth_year_tax_calc_model(year_n, start_year,
+                                          tax_data, usermods,
+                                          return_json=True)
+    total = resdict['aggr_2']
+    dropq_reform_revenue = float(total['combined_tax_9']) * 1e-9
     # assert that dropq revenue is similar to the fullsample calculation
     diff = abs(fulls_reform_revenue - dropq_reform_revenue)
     proportional_diff = diff / fulls_reform_revenue

From 22a0ce36c3f83bae7e36745f5ea7bba751097fc1 Mon Sep 17 00:00:00 2001
From: martinholmer
Date: Thu, 7 Sep 2017 10:05:16 -0400
Subject: [PATCH 5/5] Update RELEASES.md info

---
 RELEASES.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/RELEASES.md b/RELEASES.md
index 2371b4e10..40db6a665 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -13,6 +13,9 @@ Release 0.11.0 on 2017-??-??
 - Revise dropq distribution and difference tables used by TaxBrain
   [[#1537](https://github.com/open-source-economics/Tax-Calculator/pull/1537)
   by Anderson Frailey and Martin Holmer]
+- Make dropq run_nth_year_tax_calc_model return a dictionary of results
+  [[#1543](https://github.com/open-source-economics/Tax-Calculator/pull/1543)
+  by Martin Holmer]
 
 **New Features**
 
 - None
@@ -43,7 +46,6 @@ Release 0.10.0 on 2017-08-28
   [[#1524](https://github.com/open-source-economics/Tax-Calculator/pull/1524)
   by Martin Holmer]
 
-
 **Bug Fixes**
 
 - None
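
Usage note (illustrative sketch, not part of the five patches above): the revised
test_with_pufcsv already shows the intended calling pattern -- index the returned
dictionary by table name instead of by tuple position. The standalone Python below
restates that pattern; the `from taxcalc.dropq import ...` path and the
reform_revenue_billions wrapper are assumptions made for illustration, while the
return_json=True call, the 'aggr_2' key, and the 'combined_tax_<year_n>' row name
come directly from the code changed in these patches.

    # Hypothetical caller sketch; only run_nth_year_tax_calc_model, 'aggr_2',
    # and the 'combined_tax_<year_n>' row name are taken from the patches.
    from taxcalc.dropq import run_nth_year_tax_calc_model

    def reform_revenue_billions(year_n, start_year, tax_data, usermods):
        """Return reform combined-tax revenue in billions of dollars."""
        res = run_nth_year_tax_calc_model(year_n, start_year,
                                          tax_data, usermods,
                                          return_json=True)
        # before these patches: total = res[len(res) - 1]  (last tuple element)
        total = res['aggr_2']  # aggregate reform-plan tax totals
        return float(total['combined_tax_' + str(year_n)]) * 1e-9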