Merge pull request #2322 from martinholmer/popquant

Allow population (rather than filing-unit) quantiles in tables and graphs
PSLmodels · May 18, 2019 · 540b7c5 · 540b7c5
2 parents 18d8867 + b0c3002
commit 540b7c5
Show file tree

Hide file tree

Showing 10 changed files with 303 additions and 513 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -1,8 +1,7 @@
 [run]
 omit =
     taxcalc/calcfunctions.py
-    taxcalc/*.json
     taxcalc/cli/*
-    taxcalc/tbi/*
     taxcalc/tests/*
     taxcalc/validation/*
+    docs/cookbook/*
diff --git a/docs/make_uguide.py b/docs/make_uguide.py
@@ -3,21 +3,20 @@
 containing information from several JSON files.
 """
 # CODING-STYLE CHECKS:
-# pycodestyle --ignore=E402 make_uguide.py
+# pycodestyle make_uguide.py
 # pylint --disable=locally-disabled make_uguide.py
 
 import os
 import sys
 from collections import OrderedDict
-CURDIR_PATH = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(os.path.join(CURDIR_PATH, '..'))
-# pylint: disable=import-error,wrong-import-position
 from taxcalc import Policy, json_to_dict
 
 
 INPUT_FILENAME = 'uguide.htmx'
 OUTPUT_FILENAME = 'uguide.html'
 
+CURDIR_PATH = os.path.abspath(os.path.dirname(__file__))
+
 TAXCALC_PATH = os.path.join(CURDIR_PATH, '..', 'taxcalc')
 
 INPUT_PATH = os.path.join(CURDIR_PATH, INPUT_FILENAME)

diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,9 @@
+[pytest]
+markers =
+    requires_pufcsv
+    pre_release
+    local
+    compatible_data
+    benefits
+    itmded_vars
+    pep8
diff --git a/taxcalc/calculator.py b/taxcalc/calculator.py
diff --git a/taxcalc/taxcalcio.py b/taxcalc/taxcalcio.py
@@ -557,6 +557,7 @@ def write_decile_table(dfx, tfile, tkind='Totals'):
         """
         dfx = add_quantile_table_row_variable(dfx, 'expanded_income', 10,
                                               decile_details=False,
+                                              pop_quantiles=False,
                                               weight_by_income_measure=False)
         gdfx = dfx.groupby('table_row', as_index=False)
         rtns_series = gdfx.apply(unweighted_sum, 's006')
@@ -613,14 +614,15 @@ def write_decile_table(dfx, tfile, tkind='Totals'):
     def write_graph_files(self):
         """
         Write graphs to HTML files.
+        All graphs contain the same number of filing units in each quantile.
         """
         pos_wght_sum = self.calc.total_weight() > 0.0
         fig = None
         # average-tax-rate graph
         atr_fname = self._output_filename.replace('.csv', '-atr.html')
         atr_title = 'ATR by Income Percentile'
         if pos_wght_sum:
-            fig = self.calc_base.atr_graph(self.calc)
+            fig = self.calc_base.atr_graph(self.calc, pop_quantiles=False)
             write_graph_file(fig, atr_fname, atr_title)
         else:
             reason = 'No graph because sum of weights is not positive'
@@ -630,7 +632,10 @@ def write_graph_files(self):
         mtr_title = 'MTR by Income Percentile'
         if pos_wght_sum:
             fig = self.calc_base.mtr_graph(
-                self.calc, alt_e00200p_text='Taxpayer Earnings')
+                self.calc,
+                alt_e00200p_text='Taxpayer Earnings',
+                pop_quantiles=False
+            )
             write_graph_file(fig, mtr_fname, mtr_title)
         else:
             reason = 'No graph because sum of weights is not positive'
@@ -639,7 +644,7 @@ def write_graph_files(self):
         pch_fname = self._output_filename.replace('.csv', '-pch.html')
         pch_title = 'PCH by Income Percentile'
         if pos_wght_sum:
-            fig = self.calc_base.pch_graph(self.calc)
+            fig = self.calc_base.pch_graph(self.calc, pop_quantiles=False)
             write_graph_file(fig, pch_fname, pch_title)
         else:
             reason = 'No graph because sum of weights is not positive'

diff --git a/taxcalc/tests/test_calculator.py b/taxcalc/tests/test_calculator.py
@@ -672,11 +672,13 @@ def test_mtr_graph(cps_subsample):
     fig = calc.mtr_graph(calc,
                          mars=2,
                          income_measure='wages',
-                         mtr_measure='ptax')
+                         mtr_measure='ptax',
+                         pop_quantiles=False)
     assert fig
     fig = calc.mtr_graph(calc,
                          income_measure='agi',
-                         mtr_measure='itax')
+                         mtr_measure='itax',
+                         pop_quantiles=True)
     assert fig
 
 

diff --git a/taxcalc/tests/test_taxcalcio.py b/taxcalc/tests/test_taxcalcio.py
@@ -611,6 +611,7 @@ def test_graphs(reformfile1):
     idict = dict()
     idict['RECID'] = [i for i in range(1, nobs + 1)]
     idict['MARS'] = [2 for i in range(1, nobs + 1)]
+    idict['XTOT'] = [3 for i in range(1, nobs + 1)]
     idict['s006'] = [10.0 for i in range(1, nobs + 1)]
     idict['e00300'] = [10000 * i for i in range(1, nobs + 1)]
     idict['expanded_income'] = idict['e00300']

diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py
@@ -20,14 +20,13 @@
                            DIFF_VARIABLES,
                            DIFF_TABLE_COLUMNS, DIFF_TABLE_LABELS,
                            SOI_AGI_BINS,
-                           create_distribution_table, create_difference_table,
-                           weighted_count_lt_zero, weighted_count_gt_zero,
-                           weighted_count, weighted_sum, weighted_mean,
+                           create_difference_table,
+                           weighted_sum, weighted_mean,
                            wage_weighted, agi_weighted,
                            expanded_income_weighted,
                            add_income_table_row_variable,
                            add_quantile_table_row_variable,
-                           mtr_graph_data, atr_graph_data, dec_graph_data,
+                           mtr_graph_data, atr_graph_data,
                            xtr_graph_plot, write_graph_file,
                            read_egg_csv, read_egg_json, delete_file,
                            bootstrap_se_ci,
@@ -59,10 +58,12 @@
 def test_validity_of_name_lists():
     assert len(DIST_TABLE_COLUMNS) == len(DIST_TABLE_LABELS)
     Records.read_var_info()
-    assert set(DIST_VARIABLES).issubset(Records.CALCULATED_VARS | {'s006'})
-    extra_vars_set = set(['num_returns_StandardDed',
-                          'num_returns_ItemDed',
-                          'num_returns_AMT'])
+    assert set(DIST_VARIABLES).issubset(Records.CALCULATED_VARS |
+                                        {'s006', 'XTOT'})
+    extra_vars_set = set(['count',
+                          'count_StandardDed',
+                          'count_ItemDed',
+                          'count_AMT'])
     assert (set(DIST_TABLE_COLUMNS) - set(DIST_VARIABLES)) == extra_vars_set
 
 
@@ -213,7 +214,7 @@ def test_create_tables(cps_subsample):
         for val in dist[tabcol].values:
             print('{:.1f},'.format(val))
 
-    tabcol = 'num_returns_ItemDed'
+    tabcol = 'count_ItemDed'
     expected = [0.0,
                 0.0,
                 0.4,
@@ -309,7 +310,7 @@ def test_create_tables(cps_subsample):
         for val in dist[tabcol].values:
             print('{:.1f},'.format(val))
 
-    tabcol = 'num_returns_ItemDed'
+    tabcol = 'count_ItemDed'
     expected = [0.0,
                 0.0,
                 0.1,
@@ -466,45 +467,6 @@ def test_diff_count_precision():
     assert not dump
 
 
-def test_weighted_count_lt_zero():
-    df1 = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label'])
-    grped = df1.groupby('label')
-    diffs = grped.apply(weighted_count_lt_zero, 'tax_diff')
-    exp = pd.Series(data=[4, 0], index=['a', 'b'])
-    exp.index.name = 'label'
-    pd.util.testing.assert_series_equal(exp, diffs)
-    df2 = pd.DataFrame(data=DATA_FLOAT, columns=['tax_diff', 's006', 'label'])
-    grped = df2.groupby('label')
-    diffs = grped.apply(weighted_count_lt_zero, 'tax_diff')
-    exp = pd.Series(data=[4, 0], index=['a', 'b'])
-    exp.index.name = 'label'
-    pd.util.testing.assert_series_equal(exp, diffs)
-
-
-def test_weighted_count_gt_zero():
-    df1 = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label'])
-    grped = df1.groupby('label')
-    diffs = grped.apply(weighted_count_gt_zero, 'tax_diff')
-    exp = pd.Series(data=[8, 10], index=['a', 'b'])
-    exp.index.name = 'label'
-    pd.util.testing.assert_series_equal(exp, diffs)
-    df2 = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label'])
-    grped = df2.groupby('label')
-    diffs = grped.apply(weighted_count_gt_zero, 'tax_diff')
-    exp = pd.Series(data=[8, 10], index=['a', 'b'])
-    exp.index.name = 'label'
-    pd.util.testing.assert_series_equal(exp, diffs)
-
-
-def test_weighted_count():
-    dfx = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label'])
-    grouped = dfx.groupby('label')
-    diffs = grouped.apply(weighted_count)
-    exp = pd.Series(data=[12, 10], index=['a', 'b'])
-    exp.index.name = 'label'
-    pd.util.testing.assert_series_equal(exp, diffs)
-
-
 def test_weighted_mean():
     dfx = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label'])
     grouped = dfx.groupby('label')
@@ -578,11 +540,26 @@ def test_dist_table_sum_row(cps_subsample):
     rec = Records.cps_constructor(data=cps_subsample)
     calc = Calculator(policy=Policy(), records=rec)
     calc.calc_all()
-    tb1 = create_distribution_table(calc.distribution_table_dataframe(),
-                                    'standard_income_bins', 'expanded_income')
-    tb2 = create_distribution_table(calc.distribution_table_dataframe(),
-                                    'soi_agi_bins', 'expanded_income')
-    assert np.allclose(tb1[-1:], tb2[-1:])
+    # create three distribution tables and compare the ALL row contents
+    tb1, _ = calc.distribution_tables(None, 'standard_income_bins')
+    tb2, _ = calc.distribution_tables(None, 'soi_agi_bins')
+    tb3, _ = calc.distribution_tables(None, 'weighted_deciles')
+    tb4, _ = calc.distribution_tables(None, 'weighted_deciles',
+                                      pop_quantiles=True)
+    assert np.allclose(tb1.loc['ALL'], tb2.loc['ALL'])
+    assert np.allclose(tb1.loc['ALL'], tb3.loc['ALL'])
+    # make sure population count is larger than filing-unit count
+    assert tb4.at['ALL', 'count'] > tb1.at['ALL', 'count']
+    # population table ALL row must equal filing-unit ALL row except for counts
+    for col in ['count', 'count_StandardDed', 'count_ItemDed', 'count_AMT']:
+        tb4.at['ALL', col] = tb1.at['ALL', col]
+    assert np.allclose(tb1.loc['ALL'], tb4.loc['ALL'])
+    # population table ALL tax liabilities must match the diagnostic table
+    dgt = calc.diagnostic_table(1)
+    assert np.allclose([tb4.at['ALL', 'iitax'],
+                        tb4.at['ALL', 'payrolltax']],
+                       [dgt.at['Ind Income Tax ($b)', calc.current_year],
+                        dgt.at['Payroll Taxes ($b)', calc.current_year]])
 
 
 def test_diff_table_sum_row(cps_subsample):
@@ -596,19 +573,19 @@ def test_diff_table_sum_row(cps_subsample):
     pol.implement_reform(reform)
     calc2 = Calculator(policy=pol, records=rec)
     calc2.calc_all()
-    # create two difference tables and compare their content
-    tdiff1 = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
-                                     calc2.dataframe(DIFF_VARIABLES),
-                                     'standard_income_bins', 'iitax')
-    tdiff2 = create_difference_table(calc1.dataframe(DIFF_VARIABLES),
-                                     calc2.dataframe(DIFF_VARIABLES),
-                                     'soi_agi_bins', 'iitax')
-    non_digit_cols = ['perc_inc', 'perc_cut']
-    digit_cols = [c for c in list(tdiff1) if c not in non_digit_cols]
-    assert np.allclose(tdiff1[digit_cols][-1:],
-                       tdiff2[digit_cols][-1:])
-    np.allclose(tdiff1[non_digit_cols][-1:],
-                tdiff2[non_digit_cols][-1:])
+    # create three difference tables and compare their content
+    dv1 = calc1.dataframe(DIFF_VARIABLES)
+    dv2 = calc2.dataframe(DIFF_VARIABLES)
+    dt1 = create_difference_table(dv1, dv2, 'standard_income_bins', 'iitax')
+    dt2 = create_difference_table(dv1, dv2, 'soi_agi_bins', 'iitax')
+    dt3 = create_difference_table(dv1, dv2, 'weighted_deciles', 'iitax',
+                                  pop_quantiles=False)
+    dt4 = create_difference_table(dv1, dv2, 'weighted_deciles', 'iitax',
+                                  pop_quantiles=True)
+    assert np.allclose(dt1.loc['ALL'], dt2.loc['ALL'])
+    assert np.allclose(dt1.loc['ALL'], dt3.loc['ALL'])
+    # make sure population count is larger than filing-unit count
+    assert dt4.at['ALL', 'count'] > dt1.at['ALL', 'count']
 
 
 def test_mtr_graph_data(cps_subsample):
@@ -807,36 +784,3 @@ def test_table_columns_labels():
     # check that length of two lists are the same
     assert len(DIST_TABLE_COLUMNS) == len(DIST_TABLE_LABELS)
     assert len(DIFF_TABLE_COLUMNS) == len(DIFF_TABLE_LABELS)
-
-
-def test_dec_graph_plots(cps_subsample):
-    pol = Policy()
-    rec = Records.cps_constructor(data=cps_subsample)
-    calc1 = Calculator(policy=pol, records=rec)
-    year = 2020
-    calc1.advance_to_year(year)
-    reform = {
-        'SS_Earnings_c': {year: 9e99},  # OASDI FICA tax on all earnings
-        'FICA_ss_trt': {year: 0.107484}  # lower rate to keep revenue unchanged
-    }
-    pol.implement_reform(reform)
-    calc2 = Calculator(policy=pol, records=rec)
-    calc2.advance_to_year(year)
-    assert calc1.current_year == calc2.current_year
-    calc1.calc_all()
-    calc2.calc_all()
-    fig = calc1.decile_graph(calc2)
-    assert fig
-    dt1, dt2 = calc1.distribution_tables(calc2, 'weighted_deciles')
-    dta = dec_graph_data(dt1, dt2, year,
-                         include_zero_incomes=True,
-                         include_negative_incomes=False)
-    assert isinstance(dta, dict)
-    dta = dec_graph_data(dt1, dt2, year,
-                         include_zero_incomes=False,
-                         include_negative_incomes=True)
-    assert isinstance(dta, dict)
-    dta = dec_graph_data(dt1, dt2, year,
-                         include_zero_incomes=False,
-                         include_negative_incomes=False)
-    assert isinstance(dta, dict)