From 23889b3defc3af8e30c82a6eabdc345430aacb75 Mon Sep 17 00:00:00 2001
From: Martin Holmer <martin.holmer@gmail.com>
Date: Tue, 17 Apr 2018 10:58:43 -0400
Subject: [PATCH 1/3] Refactor create_difference_table utility function

---
 taxcalc/utils.py | 235 ++++++++++++++++++++---------------------------
 1 file changed, 99 insertions(+), 136 deletions(-)

diff --git a/taxcalc/utils.py b/taxcalc/utils.py
index 452c91b97..276fb5ffe 100644
--- a/taxcalc/utils.py
+++ b/taxcalc/utils.py
@@ -24,8 +24,8 @@
                                weighted_count, weighted_mean,
                                wage_weighted, agi_weighted,
                                expanded_income_weighted,
-                               weighted_perc_inc, weighted_perc_cut,
-                               EPSILON)
+                               weighted_perc_cut,
+                               weighted_perc_inc)
 
 
 # Items in the DIST_TABLE_COLUMNS list below correspond to the items in the
@@ -453,127 +453,25 @@ def create_difference_table(vdf1, vdf2, groupby, income_measure, tax_to_diff):
           positive (denoted by a 0-10p row label) values of the
           specified income_measure.
     """
-    # pylint: disable=too-many-statements
-    # nested function that actually creates the difference table
-    def diff_table_stats(resd, groupby, income_measure):
+    # pylint: disable=too-many-statements,too-many-locals
+    # nested function that creates dataframe containing additive statistics
+    def additive_stats_dataframe(gpdf):
         """
-        Return new Pandas DataFrame containing difference table statistics
-        based on grouped values of specified col_name in the specified resd.
-
-        resd: reform difference results Pandas DataFrame
-        groupby: string naming type of table rows
-        income_measure: string naming column used to create resd table rows
+        Nested function that returns additive stats DataFrame derived from gpdf
         """
-        # pylint: disable=too-many-locals
-        def stat_dataframe(gpdf):
-            """
-            Nested function that returns statistics DataFrame derived from gpdf
-            """
-            def weighted_share_of_total(gpdf, colname, total):
-                """
-                Nested function that returns the ratio of the
-                weighted_sum(pdf, colname) and specified total
-                """
-                return weighted_sum(gpdf, colname) / (total + EPSILON)
-            # main logic of stat_dataframe function
-            # construct basic stat_dataframe columns
-            sdf = pd.DataFrame()
-            sdf['count'] = gpdf.apply(weighted_count)
-            sdf['tax_cut'] = gpdf.apply(weighted_count_lt_zero, 'tax_diff')
-            sdf['perc_cut'] = gpdf.apply(weighted_perc_cut, 'tax_diff')
-            sdf['tax_inc'] = gpdf.apply(weighted_count_gt_zero, 'tax_diff')
-            sdf['perc_inc'] = gpdf.apply(weighted_perc_inc, 'tax_diff')
-            sdf['mean'] = gpdf.apply(weighted_mean, 'tax_diff')
-            sdf['tot_change'] = gpdf.apply(weighted_sum, 'tax_diff')
-            wtotal = (resd['tax_diff'] * resd['s006']).sum()
-            sdf['share_of_change'] = gpdf.apply(weighted_share_of_total,
-                                                'tax_diff', wtotal)
-            sdf['atinc1'] = gpdf.apply(weighted_sum, 'atinc1')
-            sdf['atinc2'] = gpdf.apply(weighted_sum, 'atinc2')
-            sdf['ubi'] = gpdf.apply(weighted_sum, 'ubi')
-            sdf['benefit_cost_total'] = gpdf.apply(weighted_sum,
-                                                   'benefit_cost_total')
-            sdf['benefit_value_total'] = gpdf.apply(weighted_sum,
-                                                    'benefit_value_total')
-            return sdf
-
-        # main logic of diff_table_stats function
-        # calculate whole-sample perc_cut and perc_inc statistics
-        sums_perc_cut = weighted_perc_cut(resd, 'tax_diff')
-        sums_perc_inc = weighted_perc_inc(resd, 'tax_diff')
-        # add column to resd given specified groupby and income_measure
-        if groupby == 'weighted_deciles':
-            pdf = add_quantile_table_row_variable(resd, income_measure, 10)
-        elif groupby == 'standard_income_bins':
-            pdf = add_income_table_row_variable(resd, income_measure,
-                                                bin_type='standard')
-        elif groupby == 'large_income_bins':
-            pdf = add_income_table_row_variable(resd, income_measure,
-                                                bin_type='tpc')
-        elif groupby == 'small_income_bins':
-            pdf = add_income_table_row_variable(resd, income_measure,
-                                                bin_type='soi')
-        min_income_measure = pdf[income_measure].min()
-        # create grouped Pandas DataFrame
-        gpdf = pdf.groupby('table_row', as_index=False)
-        del pdf
-        # create difference table statistics from gpdf in a new DataFrame
-        diffs_without_sums = stat_dataframe(gpdf)
-        # calculate sums row
-        row = get_sums(diffs_without_sums)[diffs_without_sums.columns]
-        row['mean'] = 0
-        if row['count'] > 0:
-            row['mean'] = row['tot_change'] / row['count']
-        row['perc_cut'] = sums_perc_cut
-        row['perc_inc'] = sums_perc_inc
-        row['share_of_change'] = 1.0  # avoid rounding error
-        diffs = diffs_without_sums.append(row)
-        del row
-        # replace bottom decile row with non-positive and positive rows
-        if groupby == 'weighted_deciles' and min_income_measure <= 0:
-            # bottom decile as its own DataFrame
-            pdf = copy.deepcopy(gpdf.get_group(1))
-            pdf['table_row'] = pd.cut(pdf[income_measure],
-                                      bins=[-9e99, -1e-9, 1e-9, 9e99],
-                                      labels=[1, 2, 3])
-            gpdfx = pdf.groupby('table_row', as_index=False)
-            rows = stat_dataframe(gpdfx)
-            diffs = pd.concat([rows, diffs.iloc[1:11]])
-            del rows
-            del pdf
-            del gpdfx
-        # append top-decile-detail rows
-        if groupby == 'weighted_deciles':
-            # top decile as its own DataFrame
-            pdf = copy.deepcopy(gpdf.get_group(10))
-            pdf = add_quantile_table_row_variable(pdf, income_measure, 10)
-            # TODO: following statement generates this IGNORED error:
-            # ValueError: Buffer dtype mismatch,
-            #             expected 'Python object' but got 'long'
-            # Exception ValueError: "Buffer dtype mismatch,
-            #              expected 'Python object' but got 'long'"
-            #              in 'pandas._libs.lib.is_bool_array' ignored
-            #                                                  ^^^^^^^
-            # It is hoped that Pandas PR#18252, which is scheduled for
-            # inclusion in Pandas version 0.23.0 (Apr 2018), will fix this.
-            # See discussion at the following URL:
-            # https://github.com/pandas-dev/pandas/issues/19037
-            pdf['table_row'].replace(to_replace=[1, 2, 3, 4, 5],
-                                     value=[0, 0, 0, 0, 0], inplace=True)
-            pdf['table_row'].replace(to_replace=[6, 7, 8, 9],
-                                     value=[1, 1, 1, 1], inplace=True)
-            pdf['table_row'].replace(to_replace=[10],
-                                     value=[2], inplace=True)
-            gpdfx = pdf.groupby('table_row', as_index=False)
-            sdf = stat_dataframe(gpdfx)
-            diffs = diffs.append(sdf, ignore_index=True)
-            del sdf
-            del pdf
-            del gpdfx
-        # delete intermediate Pandas DataFrame objects
-        del gpdf
-        # return difference statistics
-        return diffs
+        sdf = pd.DataFrame()
+        sdf['count'] = gpdf.apply(weighted_count)
+        sdf['tax_cut'] = gpdf.apply(weighted_count_lt_zero, 'tax_diff')
+        sdf['tax_inc'] = gpdf.apply(weighted_count_gt_zero, 'tax_diff')
+        sdf['tot_change'] = gpdf.apply(weighted_sum, 'tax_diff')
+        sdf['ubi'] = gpdf.apply(weighted_sum, 'ubi')
+        sdf['benefit_cost_total'] = gpdf.apply(weighted_sum,
+                                               'benefit_cost_total')
+        sdf['benefit_value_total'] = gpdf.apply(weighted_sum,
+                                                'benefit_value_total')
+        sdf['atinc1'] = gpdf.apply(weighted_sum, 'atinc1')
+        sdf['atinc2'] = gpdf.apply(weighted_sum, 'atinc2')
+        return sdf
     # main logic of create_difference_table
     assert isinstance(vdf1, pd.DataFrame)
     assert isinstance(vdf2, pd.DataFrame)
@@ -594,23 +492,88 @@ def weighted_share_of_total(gpdf, colname, total):
     res2['tax_diff'] = res2[tax_to_diff] - res1[tax_to_diff]
     res2['atinc1'] = res1['aftertax_income']
     res2['atinc2'] = res2['aftertax_income']
-    diffs = diff_table_stats(res2, groupby, baseline_income_measure)
-    diffs['pc_aftertaxinc'] = (diffs['atinc2'] / diffs['atinc1']) - 1.0
-    # delete intermediate atinc1 and atinc2 columns
-    del diffs['atinc1']
-    del diffs['atinc2']
+    # add table_row column to res2 given specified groupby and income_measure
+    if 'table_row' in res2:
+        print "already have table_row"  # TODO: remove
+        print "min(table_row)=", res2['table_row'].min()  # TODO: remove
+        print "max(table_row)=", res2['table_row'].max()  # TODO: remove
+        pdf = res2
+    else:
+        if groupby == 'weighted_deciles':
+            pdf = add_quantile_table_row_variable(res2, income_measure, 10)
+        elif groupby == 'standard_income_bins':
+            pdf = add_income_table_row_variable(res2, income_measure,
+                                                bin_type='standard')
+        elif groupby == 'large_income_bins':
+            pdf = add_income_table_row_variable(res2, income_measure,
+                                                bin_type='tpc')
+        elif groupby == 'small_income_bins':
+            pdf = add_income_table_row_variable(res2, income_measure,
+                                                bin_type='soi')
+    # create grouped Pandas DataFrame
+    gpdf = pdf.groupby('table_row', as_index=False)
+    del pdf
+    # create additive difference table statistics from gpdf
+    diff_stats = additive_stats_dataframe(gpdf)
+    # calculate additive statistics on sums row
+    sum_row_diff_stats = get_sums(diff_stats)[diff_stats.columns]
+    # optionally create bottom decile details
+    if groupby == 'weighted_deciles':
+        pdf = copy.deepcopy(gpdf.get_group(1))
+        pdf['table_row'] = pd.cut(pdf[income_measure],
+                                  bins=[-9e99, -1e-9, 1e-9, 9e99],
+                                  labels=[1, 2, 3])
+        gpdfx = pdf.groupby('table_row', as_index=False)
+        rows = additive_stats_dataframe(gpdfx)
+        diff_stats = pd.concat([rows, diff_stats.iloc[1:10]])
+        del rows
+        del pdf
+        del gpdfx
+    # append sum_row_additive_stats
+    diff_stats = diff_stats.append(sum_row_diff_stats)
+    # optionally create top decile details
+    if groupby == 'weighted_deciles':
+        pdf = copy.deepcopy(gpdf.get_group(10))
+        pdf = add_quantile_table_row_variable(pdf, income_measure, 10)
+        pdf['table_row'].replace(to_replace=[1, 2, 3, 4, 5],
+                                 value=[0, 0, 0, 0, 0], inplace=True)
+        pdf['table_row'].replace(to_replace=[6, 7, 8, 9],
+                                 value=[1, 1, 1, 1], inplace=True)
+        pdf['table_row'].replace(to_replace=[10],
+                                 value=[2], inplace=True)
+        gpdfx = pdf.groupby('table_row', as_index=False)
+        sdf = additive_stats_dataframe(gpdfx)
+        diff_stats = diff_stats.append(sdf, ignore_index=True)
+        del sdf
+        del pdf
+        del gpdfx
+    # delete intermediate Pandas DataFrame objects
+    del gpdf
+    # compute non-additive stats in each table cell
+    count = diff_stats['count']
+    diff_stats['perc_cut'] = np.where(count > 0,
+                                      100 * diff_stats['tax_cut'] / count, 0)
+    diff_stats['perc_inc'] = np.where(count > 0,
+                                      100 * diff_stats['tax_inc'] / count, 0)
+    diff_stats['mean'] = np.where(count > 0,
+                                  diff_stats['tot_change'] / count, 0)
+    total_change = sum_row_diff_stats['tot_change']
+    diff_stats['share_of_change'] = np.where(total_change == 0, np.nan,
+                                             (100 * diff_stats['tot_change'] /
+                                              total_change))
+    diff_stats['pc_aftertaxinc'] = (100 * (diff_stats['atinc2'] /
+                                           diff_stats['atinc1'] - 1))
+    del diff_stats['atinc1']
+    del diff_stats['atinc2']
+    del count
+    del sum_row_diff_stats
     # delete intermediate Pandas DataFrame objects
     del res1
     del res2
-    # convert some columns to percentages
-    percent_columns = ['perc_inc', 'perc_cut',
-                       'share_of_change', 'pc_aftertaxinc']
-    for col in percent_columns:
-        diffs[col] *= 100.0
     # set print display format for float table elements
     pd.options.display.float_format = '{:10,.2f}'.format
-    # ensure diffs columns are in correct order
-    assert diffs.columns.values.tolist() == DIFF_TABLE_COLUMNS
+    # put diff_stats columns in correct order
+    diff_stats = diff_stats.reindex(columns=DIFF_TABLE_COLUMNS)
     # add row names to table if using weighted_deciles or standard_income_bins
     if groupby == 'weighted_deciles':
         rownames = DECILE_ROW_NAMES
@@ -619,11 +582,11 @@ def weighted_share_of_total(gpdf, colname, total):
     else:
         rownames = None
     if rownames:
-        assert len(diffs.index) == len(rownames)
-        diffs.index = rownames
+        assert len(diff_stats.index) == len(rownames)
+        diff_stats.index = rownames
         del rownames
     # return table as Pandas DataFrame
-    return diffs
+    return diff_stats
 
 
 def create_diagnostic_table(vdf, year):

From 5ba12cb89559333ec6d01b5ce411fd76fae74e21 Mon Sep 17 00:00:00 2001
From: Martin Holmer <martin.holmer@gmail.com>
Date: Tue, 17 Apr 2018 13:01:39 -0400
Subject: [PATCH 2/3] Use decile_details in refactored create_difference_table
 utility

---
 taxcalc/tests/tbi_cps_expect.txt |  60 ++++----
 taxcalc/tests/tbi_puf_expect.txt | 244 +++++++++++++++----------------
 taxcalc/tests/test_calculate.py  |   8 +-
 taxcalc/tests/test_utils.py      |   8 +-
 taxcalc/utils.py                 |  98 ++++++-------
 5 files changed, 202 insertions(+), 216 deletions(-)

diff --git a/taxcalc/tests/tbi_cps_expect.txt b/taxcalc/tests/tbi_cps_expect.txt
index 6730e308c..debcad0de 100644
--- a/taxcalc/tests/tbi_cps_expect.txt
+++ b/taxcalc/tests/tbi_cps_expect.txt
@@ -218,7 +218,6 @@ TABLE diff_comb_xbin RESULTS:
 TABLE diff_comb_xdec RESULTS:
 {
     "0-10n_2": [
-        "61182.54",
         "0.00",
         "0.00",
         "0.00",
@@ -227,12 +226,13 @@ TABLE diff_comb_xdec RESULTS:
         "0.00",
         "0.00",
         "0.00",
-        "806259441.96",
-        "806259441.96",
-        "0.00"
+        "0.00",
+        "0.00",
+        "0.00",
+        "nan"
     ],
     "0-10p_2": [
-        "16293923.70",
+        "17448112.04",
         "0.00",
         "0.00",
         "0.00",
@@ -241,12 +241,12 @@ TABLE diff_comb_xdec RESULTS:
         "0.00",
         "0.00",
         "0.00",
-        "33249997320.64",
-        "33249997320.64",
+        "34056256762.60",
+        "34056256762.60",
         "0.00"
     ],
     "0-10z_2": [
-        "1093005.80",
+        "0.00",
         "0.00",
         "0.00",
         "0.00",
@@ -423,8 +423,8 @@ TABLE diff_comb_xdec RESULTS:
         "178473645069.52",
         "100.00",
         "0.00",
-        "3028591969682.75",
-        "3028591969682.75",
+        "3028591969682.74",
+        "3028591969682.74",
         "-3.08"
     ],
     "Top 1%_2": [
@@ -644,7 +644,6 @@ TABLE diff_itax_xbin RESULTS:
 TABLE diff_itax_xdec RESULTS:
 {
     "0-10n_2": [
-        "61182.54",
         "0.00",
         "0.00",
         "0.00",
@@ -653,12 +652,13 @@ TABLE diff_itax_xdec RESULTS:
         "0.00",
         "0.00",
         "0.00",
-        "806259441.96",
-        "806259441.96",
-        "0.00"
+        "0.00",
+        "0.00",
+        "0.00",
+        "nan"
     ],
     "0-10p_2": [
-        "16293923.70",
+        "17448112.04",
         "0.00",
         "0.00",
         "0.00",
@@ -667,12 +667,12 @@ TABLE diff_itax_xdec RESULTS:
         "0.00",
         "0.00",
         "0.00",
-        "33249997320.64",
-        "33249997320.64",
+        "34056256762.60",
+        "34056256762.60",
         "0.00"
     ],
     "0-10z_2": [
-        "1093005.80",
+        "0.00",
         "0.00",
         "0.00",
         "0.00",
@@ -849,8 +849,8 @@ TABLE diff_itax_xdec RESULTS:
         "199863277627.17",
         "100.00",
         "0.00",
-        "3028591969682.75",
-        "3028591969682.75",
+        "3028591969682.74",
+        "3028591969682.74",
         "-3.08"
     ],
     "Top 1%_2": [
@@ -1070,7 +1070,7 @@ TABLE diff_ptax_xbin RESULTS:
 TABLE diff_ptax_xdec RESULTS:
 {
     "0-10n_2": [
-        "61182.54",
+        "0.00",
         "0.00",
         "0.00",
         "0.00",
@@ -1079,12 +1079,12 @@ TABLE diff_ptax_xdec RESULTS:
         "0.00",
         "-0.00",
         "0.00",
-        "806259441.96",
-        "806259441.96",
-        "0.00"
+        "0.00",
+        "0.00",
+        "nan"
     ],
     "0-10p_2": [
-        "16293923.70",
+        "17448112.04",
         "0.00",
         "0.00",
         "0.00",
@@ -1093,12 +1093,12 @@ TABLE diff_ptax_xdec RESULTS:
         "0.00",
         "-0.00",
         "0.00",
-        "33249997320.64",
-        "33249997320.64",
+        "34056256762.60",
+        "34056256762.60",
         "0.00"
     ],
     "0-10z_2": [
-        "1093005.80",
+        "0.00",
         "0.00",
         "0.00",
         "0.00",
@@ -1275,8 +1275,8 @@ TABLE diff_ptax_xdec RESULTS:
         "-21389632557.65",
         "100.00",
         "0.00",
-        "3028591969682.75",
-        "3028591969682.75",
+        "3028591969682.74",
+        "3028591969682.74",
         "-3.08"
     ],
     "Top 1%_2": [
diff --git a/taxcalc/tests/tbi_puf_expect.txt b/taxcalc/tests/tbi_puf_expect.txt
index 9edbbdb67..266df154f 100644
--- a/taxcalc/tests/tbi_puf_expect.txt
+++ b/taxcalc/tests/tbi_puf_expect.txt
@@ -218,35 +218,34 @@ TABLE diff_comb_xbin RESULTS:
 TABLE diff_comb_xdec RESULTS:
 {
     "0-10n_2": [
-        "1291545.69",
-        "902.66",
-        "0.07",
-        "30783.77",
-        "2.38",
-        "331.11",
-        "427640887.02",
-        "0.21",
         "0.00",
-        "8349168320.57",
-        "8349168320.57",
-        "0.28"
-    ],
-    "0-10p_2": [
-        "12825822.88",
         "0.00",
         "0.00",
-        "31166.81",
-        "0.24",
-        "2.08",
-        "26673032.12",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "nan"
+    ],
+    "0-10p_2": [
+        "17756353.90",
+        "902.66",
         "0.01",
+        "61950.58",
+        "0.35",
+        "25.59",
+        "454313919.14",
+        "0.23",
         "0.00",
-        "5952150789.38",
-        "5952150789.38",
-        "-0.30"
+        "14301319109.95",
+        "14301319109.95",
+        "0.41"
     ],
     "0-10z_2": [
-        "3638985.32",
         "0.00",
         "0.00",
         "0.00",
@@ -257,7 +256,8 @@ TABLE diff_comb_xdec RESULTS:
         "0.00",
         "0.00",
         "0.00",
-        "0.00"
+        "0.00",
+        "nan"
     ],
     "10-20_2": [
         "17740190.50",
@@ -400,17 +400,17 @@ TABLE diff_comb_xdec RESULTS:
         "-7.07"
     ],
     "95-99_2": [
-        "7111665.44",
+        "7115702.98",
         "171628.45",
         "2.41",
-        "6467370.69",
-        "90.94",
-        "12303.33",
-        "87497192807.86",
-        "43.82",
-        "0.00",
-        "50725308987.29",
-        "50725308987.29",
+        "6471408.23",
+        "90.95",
+        "12309.79",
+        "87592837243.48",
+        "43.86",
+        "0.00",
+        "50761464897.83",
+        "50761464897.83",
         "-6.90"
     ],
     "ALL_2": [
@@ -428,18 +428,18 @@ TABLE diff_comb_xdec RESULTS:
         "-3.32"
     ],
     "Top 1%_2": [
-        "1780889.07",
+        "1776851.53",
         "47668.39",
         "2.68",
-        "1594172.23",
-        "89.52",
-        "18271.64",
-        "32539763073.09",
-        "16.29",
-        "0.00",
-        "13160093211.25",
-        "13160093211.25",
-        "-1.65"
+        "1590134.69",
+        "89.49",
+        "18259.33",
+        "32444118637.47",
+        "16.25",
+        "0.00",
+        "13123937300.71",
+        "13123937300.71",
+        "-1.64"
     ]
 }
 TABLE diff_itax_xbin RESULTS:
@@ -644,35 +644,34 @@ TABLE diff_itax_xbin RESULTS:
 TABLE diff_itax_xdec RESULTS:
 {
     "0-10n_2": [
-        "1291545.69",
-        "902.66",
-        "0.07",
-        "30783.77",
-        "2.38",
-        "333.82",
-        "431147466.71",
-        "0.20",
         "0.00",
-        "8349168320.57",
-        "8349168320.57",
-        "0.28"
-    ],
-    "0-10p_2": [
-        "12825822.88",
         "0.00",
         "0.00",
-        "31166.81",
-        "0.24",
-        "2.94",
-        "37752726.03",
-        "0.02",
         "0.00",
-        "5952150789.38",
-        "5952150789.38",
-        "-0.30"
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "nan"
+    ],
+    "0-10p_2": [
+        "17756353.90",
+        "902.66",
+        "0.01",
+        "61950.58",
+        "0.35",
+        "26.41",
+        "468900192.75",
+        "0.21",
+        "0.00",
+        "14301319109.95",
+        "14301319109.95",
+        "0.41"
     ],
     "0-10z_2": [
-        "3638985.32",
         "0.00",
         "0.00",
         "0.00",
@@ -683,7 +682,8 @@ TABLE diff_itax_xdec RESULTS:
         "0.00",
         "0.00",
         "0.00",
-        "0.00"
+        "0.00",
+        "nan"
     ],
     "10-20_2": [
         "17740190.50",
@@ -826,17 +826,17 @@ TABLE diff_itax_xdec RESULTS:
         "-7.07"
     ],
     "95-99_2": [
-        "7111665.44",
+        "7115702.98",
         "149022.53",
-        "2.10",
-        "6489976.61",
+        "2.09",
+        "6494014.15",
         "91.26",
-        "12687.47",
-        "90229046150.91",
-        "41.32",
+        "12693.71",
+        "90324690586.68",
+        "41.37",
         "0.00",
-        "50725308987.29",
-        "50725308987.29",
+        "50761464897.83",
+        "50761464897.83",
         "-6.90"
     ],
     "ALL_2": [
@@ -854,18 +854,18 @@ TABLE diff_itax_xdec RESULTS:
         "-3.32"
     ],
     "Top 1%_2": [
-        "1780889.07",
+        "1776851.53",
         "46699.72",
-        "2.62",
-        "1595140.90",
-        "89.57",
-        "18334.16",
-        "32651100099.27",
-        "14.95",
-        "0.00",
-        "13160093211.25",
-        "13160093211.25",
-        "-1.65"
+        "2.63",
+        "1591103.36",
+        "89.55",
+        "18321.99",
+        "32555455663.50",
+        "14.91",
+        "0.00",
+        "13123937300.71",
+        "13123937300.71",
+        "-1.64"
     ]
 }
 TABLE diff_ptax_xbin RESULTS:
@@ -1070,35 +1070,35 @@ TABLE diff_ptax_xbin RESULTS:
 TABLE diff_ptax_xdec RESULTS:
 {
     "0-10n_2": [
-        "1291545.69",
-        "7933.17",
-        "0.61",
-        "289.36",
-        "0.02",
-        "-2.72",
-        "-3506579.69",
-        "0.02",
         "0.00",
-        "8349168320.57",
-        "8349168320.57",
-        "0.28"
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "-0.00",
+        "0.00",
+        "0.00",
+        "0.00",
+        "nan"
     ],
     "0-10p_2": [
-        "12825822.88",
-        "30866.19",
-        "0.24",
-        "0.00",
+        "17756353.90",
+        "38799.36",
+        "0.22",
+        "289.36",
         "0.00",
-        "-0.86",
-        "-11079693.91",
-        "0.06",
+        "-0.82",
+        "-14586273.60",
+        "0.08",
         "0.00",
-        "5952150789.38",
-        "5952150789.38",
-        "-0.30"
+        "14301319109.95",
+        "14301319109.95",
+        "0.41"
     ],
     "0-10z_2": [
-        "3638985.32",
+        "0.00",
         "0.00",
         "0.00",
         "0.00",
@@ -1109,7 +1109,7 @@ TABLE diff_ptax_xdec RESULTS:
         "0.00",
         "0.00",
         "0.00",
-        "0.00"
+        "nan"
     ],
     "10-20_2": [
         "17740190.50",
@@ -1252,17 +1252,17 @@ TABLE diff_ptax_xdec RESULTS:
         "-7.07"
     ],
     "95-99_2": [
-        "7111665.44",
+        "7115702.98",
         "4203474.42",
-        "59.11",
+        "59.07",
         "1287.47",
         "0.02",
-        "-384.14",
-        "-2731853343.06",
+        "-383.92",
+        "-2731853343.21",
         "14.64",
         "0.00",
-        "50725308987.29",
-        "50725308987.29",
+        "50761464897.83",
+        "50761464897.83",
         "-6.90"
     ],
     "ALL_2": [
@@ -1280,18 +1280,18 @@ TABLE diff_ptax_xdec RESULTS:
         "-3.32"
     ],
     "Top 1%_2": [
-        "1780889.07",
+        "1776851.53",
         "228185.04",
-        "12.81",
+        "12.84",
         "34259.60",
-        "1.92",
-        "-62.52",
-        "-111337026.17",
+        "1.93",
+        "-62.66",
+        "-111337026.02",
         "0.60",
         "0.00",
-        "13160093211.25",
-        "13160093211.25",
-        "-1.65"
+        "13123937300.71",
+        "13123937300.71",
+        "-1.64"
     ]
 }
 TABLE dist1_xbin RESULTS:
diff --git a/taxcalc/tests/test_calculate.py b/taxcalc/tests/test_calculate.py
index 29fd9d4ca..7b06ba04b 100644
--- a/taxcalc/tests/test_calculate.py
+++ b/taxcalc/tests/test_calculate.py
@@ -905,14 +905,16 @@ def test_distribution_tables(cps_subsample):
 
 
 def test_difference_table(cps_subsample):
+    cyr = 2014
     pol = Policy()
     recs = Records.cps_constructor(data=cps_subsample)
     calc1 = Calculator(policy=pol, records=recs)
-    assert calc1.current_year == 2014
-    reform = {2014: {'_SS_Earnings_c': [9e99]}}
+    assert calc1.current_year == cyr
+    reform = {cyr: {'_SS_Earnings_c': [9e99]}}
     pol.implement_reform(reform)
-    assert not pol.parameter_errors
     calc2 = Calculator(policy=pol, records=recs)
+    assert calc2.current_year == cyr
+    calc1.calc_all()
     calc2.calc_all()
     diff = calc1.difference_table(calc2)
     assert isinstance(diff, pd.DataFrame)
diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py
index 609e3c119..e515e1b59 100644
--- a/taxcalc/tests/test_utils.py
+++ b/taxcalc/tests/test_utils.py
@@ -194,8 +194,8 @@ def test_create_tables(cps_subsample):
                 116996252,
                 102458801,
                 580961247,
-                62524760,
-                34296230,
+                63156380,
+                33664610,
                 5637811]
     tabcol = 'tot_change'
     if not np.allclose(diff[tabcol].values, expected,
@@ -217,8 +217,8 @@ def test_create_tables(cps_subsample):
                 20.14,
                 17.64,
                 100.00,
-                10.76,
-                5.90,
+                10.87,
+                5.79,
                 0.97]
     tabcol = 'share_of_change'
     if not np.allclose(diff[tabcol].values, expected,
diff --git a/taxcalc/utils.py b/taxcalc/utils.py
index 291567afe..39f7bdb5f 100644
--- a/taxcalc/utils.py
+++ b/taxcalc/utils.py
@@ -498,13 +498,11 @@ def additive_stats_dataframe(gpdf):
     res2['atinc2'] = res2['aftertax_income']
     # add table_row column to res2 given specified groupby and income_measure
     if 'table_row' in res2:
-        print "already have table_row"  # TODO: remove
-        print "min(table_row)=", res2['table_row'].min()  # TODO: remove
-        print "max(table_row)=", res2['table_row'].max()  # TODO: remove
         pdf = res2
     else:
         if groupby == 'weighted_deciles':
-            pdf = add_quantile_table_row_variable(res2, income_measure, 10)
+            pdf = add_quantile_table_row_variable(res2, income_measure,
+                                                  10, decile_details=True)
         elif groupby == 'standard_income_bins':
             pdf = add_income_table_row_variable(res2, income_measure,
                                                 bin_type='standard')
@@ -516,68 +514,54 @@ def additive_stats_dataframe(gpdf):
                                                 bin_type='soi')
     # create grouped Pandas DataFrame
     gpdf = pdf.groupby('table_row', as_index=False)
-    del pdf
     # create additive difference table statistics from gpdf
-    diff_stats = additive_stats_dataframe(gpdf)
+    diff_table = additive_stats_dataframe(gpdf)
     # calculate additive statistics on sums row
-    sum_row_diff_stats = get_sums(diff_stats)[diff_stats.columns]
-    # optionally create bottom decile details
-    if groupby == 'weighted_deciles':
-        pdf = copy.deepcopy(gpdf.get_group(1))
-        pdf['table_row'] = pd.cut(pdf[income_measure],
-                                  bins=[-9e99, -1e-9, 1e-9, 9e99],
-                                  labels=[1, 2, 3])
-        gpdfx = pdf.groupby('table_row', as_index=False)
-        rows = additive_stats_dataframe(gpdfx)
-        diff_stats = pd.concat([rows, diff_stats.iloc[1:10]])
-        del rows
-        del pdf
-        del gpdfx
-    # append sum_row_additive_stats
-    diff_stats = diff_stats.append(sum_row_diff_stats)
-    # optionally create top decile details
+    sum_row = get_sums(diff_table)[diff_table.columns]
+    # handle placement of sum_row in table
     if groupby == 'weighted_deciles':
-        pdf = copy.deepcopy(gpdf.get_group(10))
-        pdf = add_quantile_table_row_variable(pdf, income_measure, 10)
-        pdf['table_row'].replace(to_replace=[1, 2, 3, 4, 5],
-                                 value=[0, 0, 0, 0, 0], inplace=True)
-        pdf['table_row'].replace(to_replace=[6, 7, 8, 9],
-                                 value=[1, 1, 1, 1], inplace=True)
-        pdf['table_row'].replace(to_replace=[10],
-                                 value=[2], inplace=True)
-        gpdfx = pdf.groupby('table_row', as_index=False)
-        sdf = additive_stats_dataframe(gpdfx)
-        diff_stats = diff_stats.append(sdf, ignore_index=True)
-        del sdf
-        del pdf
-        del gpdfx
+        # compute top-decile row
+        lenindex = len(diff_table.index)
+        assert lenindex == 14  # rows should be indexed from 0 to 13
+        topdec_row = get_sums(diff_table[11:lenindex])[diff_table.columns]
+        # move top-decile detail rows to make room for topdec_row and sum_row
+        diff_table = diff_table.reindex(index=range(0, lenindex + 2))
+        diff_table.iloc[15] = diff_table.iloc[13]
+        diff_table.iloc[14] = diff_table.iloc[12]
+        diff_table.iloc[13] = diff_table.iloc[11]
+        diff_table.iloc[12] = sum_row
+        diff_table.iloc[11] = topdec_row
+        del topdec_row
+    else:
+        diff_table = diff_table.append(sum_row)
     # delete intermediate Pandas DataFrame objects
     del gpdf
+    del pdf
     # compute non-additive stats in each table cell
-    count = diff_stats['count']
-    diff_stats['perc_cut'] = np.where(count > 0,
-                                      100 * diff_stats['tax_cut'] / count, 0)
-    diff_stats['perc_inc'] = np.where(count > 0,
-                                      100 * diff_stats['tax_inc'] / count, 0)
-    diff_stats['mean'] = np.where(count > 0,
-                                  diff_stats['tot_change'] / count, 0)
-    total_change = sum_row_diff_stats['tot_change']
-    diff_stats['share_of_change'] = np.where(total_change == 0, np.nan,
-                                             (100 * diff_stats['tot_change'] /
+    count = diff_table['count']
+    diff_table['perc_cut'] = np.where(count > 0,
+                                      100 * diff_table['tax_cut'] / count, 0)
+    diff_table['perc_inc'] = np.where(count > 0,
+                                      100 * diff_table['tax_inc'] / count, 0)
+    diff_table['mean'] = np.where(count > 0,
+                                  diff_table['tot_change'] / count, 0)
+    total_change = sum_row['tot_change']
+    diff_table['share_of_change'] = np.where(total_change == 0, np.nan,
+                                             (100 * diff_table['tot_change'] /
                                               total_change))
-    diff_stats['pc_aftertaxinc'] = (100 * (diff_stats['atinc2'] /
-                                           diff_stats['atinc1'] - 1))
-    del diff_stats['atinc1']
-    del diff_stats['atinc2']
-    del count
-    del sum_row_diff_stats
+    diff_table['pc_aftertaxinc'] = (100 * (diff_table['atinc2'] /
+                                           diff_table['atinc1'] - 1))
     # delete intermediate Pandas DataFrame objects
+    del diff_table['atinc1']
+    del diff_table['atinc2']
+    del count
+    del sum_row
     del res1
     del res2
     # set print display format for float table elements
     pd.options.display.float_format = '{:10,.2f}'.format
-    # put diff_stats columns in correct order
-    diff_stats = diff_stats.reindex(columns=DIFF_TABLE_COLUMNS)
+    # put diff_table columns in correct order
+    diff_table = diff_table.reindex(columns=DIFF_TABLE_COLUMNS)
     # add row names to table if using weighted_deciles or standard_income_bins
     if groupby == 'weighted_deciles':
         rownames = DECILE_ROW_NAMES
@@ -586,11 +570,11 @@ def additive_stats_dataframe(gpdf):
     else:
         rownames = None
     if rownames:
-        assert len(diff_stats.index) == len(rownames)
-        diff_stats.index = rownames
+        assert len(diff_table.index) == len(rownames)
+        diff_table.index = rownames
         del rownames
     # return table as Pandas DataFrame
-    return diff_stats
+    return diff_table
 
 
 def create_diagnostic_table(vdf, year):

From b92cacb8a06261cc8c6fc4a4f3de518a54c2d773 Mon Sep 17 00:00:00 2001
From: Martin Holmer <martin.holmer@gmail.com>
Date: Tue, 17 Apr 2018 14:09:38 -0400
Subject: [PATCH 3/3] Add test to maintain complete code coverage

---
 taxcalc/tests/test_utils.py | 28 ++++++----------------------
 taxcalc/utils.py            |  4 +---
 taxcalc/utilsprvt.py        | 18 ------------------
 3 files changed, 7 insertions(+), 43 deletions(-)

diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py
index e515e1b59..391bf1e51 100644
--- a/taxcalc/tests/test_utils.py
+++ b/taxcalc/tests/test_utils.py
@@ -26,7 +26,6 @@
                            weighted_count, weighted_sum, weighted_mean,
                            wage_weighted, agi_weighted,
                            expanded_income_weighted,
-                           weighted_perc_inc, weighted_perc_cut,
                            add_income_table_row_variable,
                            add_quantile_table_row_variable,
                            mtr_graph_data, atr_graph_data, dec_graph_data,
@@ -170,13 +169,16 @@ def test_create_tables(cps_subsample):
     tabcol = 'pc_aftertaxinc'
     if not np.allclose(diff[tabcol].values, expected,
                        atol=0.005, rtol=0.0, equal_nan=True):
-        test_failure = True
+        test_failure
         print('diff', tabcol)
         for val in diff[tabcol].values:
             print('{:.2f},'.format(val))
 
-    diff = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
-                                   calc2.dataframe(DIFF_VARIABLES),
+    vdf1 = calc1.dataframe(DIFF_VARIABLES)
+    vdf2 = calc2.dataframe(DIFF_VARIABLES)
+    add_quantile_table_row_variable(vdf2, 'expanded_income',
+                                    10, decile_details=True)
+    diff = create_difference_table(vdf1, vdf2,
                                    groupby='weighted_deciles',
                                    income_measure='expanded_income',
                                    tax_to_diff='combined')
@@ -637,24 +639,6 @@ def test_weighted_sum():
     pd.util.testing.assert_series_equal(exp, diffs)
 
 
-def test_weighted_perc_inc():
-    dfx = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label'])
-    grouped = dfx.groupby('label')
-    diffs = grouped.apply(weighted_perc_inc, 'tax_diff')
-    exp = pd.Series(data=[8. / 12., 1.0], index=['a', 'b'])
-    exp.index.name = 'label'
-    pd.util.testing.assert_series_equal(exp, diffs)
-
-
-def test_weighted_perc_cut():
-    dfx = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label'])
-    grouped = dfx.groupby('label')
-    diffs = grouped.apply(weighted_perc_cut, 'tax_diff')
-    exp = pd.Series(data=[4. / 12., 0.0], index=['a', 'b'])
-    exp.index.name = 'label'
-    pd.util.testing.assert_series_equal(exp, diffs)
-
-
 EPSILON = 1e-5
 
 
diff --git a/taxcalc/utils.py b/taxcalc/utils.py
index 39f7bdb5f..e38d4189f 100644
--- a/taxcalc/utils.py
+++ b/taxcalc/utils.py
@@ -23,9 +23,7 @@
                                weighted_count_gt_zero,
                                weighted_count, weighted_mean,
                                wage_weighted, agi_weighted,
-                               expanded_income_weighted,
-                               weighted_perc_cut,
-                               weighted_perc_inc)
+                               expanded_income_weighted)
 
 
 # Items in the DIST_TABLE_COLUMNS list below correspond to the items in the
diff --git a/taxcalc/utilsprvt.py b/taxcalc/utilsprvt.py
index 2001ef531..26944c68b 100644
--- a/taxcalc/utilsprvt.py
+++ b/taxcalc/utilsprvt.py
@@ -70,21 +70,3 @@ def expanded_income_weighted(pdf, col_name):
     expinc = 'expanded_income'
     return ((pdf[col_name] * pdf[swght] * pdf[expinc]).sum() /
             ((pdf[swght] * pdf[expinc]).sum() + EPSILON))
-
-
-def weighted_perc_inc(pdf, col_name):
-    """
-    Return weighted fraction (not percent) of positive values for the
-    variable with col_name in the specified Pandas DataFrame.
-    """
-    return (weighted_count_gt_zero(pdf, col_name) /
-            (weighted_count(pdf) + EPSILON))
-
-
-def weighted_perc_cut(pdf, col_name):
-    """
-    Return weighted fraction (not percent) of negative values for the
-    variable with col_name in the specified Pandas DataFrame.
-    """
-    return (weighted_count_lt_zero(pdf, col_name) /
-            (weighted_count(pdf) + EPSILON))