Skip to content

Commit

Permalink
Merge pull request #114 from andersonfrailey/updatepuf
Browse files Browse the repository at this point in the history
2011 PUF and 2016 CPS
  • Loading branch information
andersonfrailey authored Feb 8, 2018
2 parents adfa484 + c70f3b4 commit 282f771
Show file tree
Hide file tree
Showing 20 changed files with 249,571 additions and 216,003 deletions.
66 changes: 66 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
name: taxdata-dev
channels:
- anaconda
- https://conda.anaconda.org/t/Op-d8036d4f-1ea8-475e-bc39-311eaaddfd86/opensourcepolicycenter
- anaconda-fusion
- defaults
dependencies:
- appnope=0.1.0=py27hb466136_0
- backports=1.0=py27hb4f9756_1
- backports.shutil_get_terminal_size=1.0.0=py27hc9115de_2
- backports_abc=0.5=py27h6972548_0
- certifi=2017.7.27.1=py27h482ffc0_0
- decorator=4.1.2=py27h9f877ea_0
- enum34=1.1.6=py27hf475452_1
- ipykernel=4.6.1=py27h1e70a78_0
- ipython=5.4.1=py27h2b3d779_1
- ipython_genutils=0.2.0=py27h8b9a179_0
- jupyter_client=5.1.0=py27hfaf569a_0
- jupyter_core=4.3.0=py27hd5161ba_0
- libcxx=4.0.1=h579ed51_0
- libcxxabi=4.0.1=hebd6815_0
- libsodium=1.0.13=hba5e272_2
- pathlib2=2.3.0=py27he09da1e_0
- pexpect=4.2.1=py27hc4e4961_0
- pickleshare=0.7.4=py27h37e3d41_0
- prompt_toolkit=1.0.15=py27h4a7b9c2_0
- ptyprocess=0.5.2=py27h70f6364_0
- pygments=2.2.0=py27h1a556bb_0
- pyzmq=16.0.2=py27he61c07e_2
- scandir=1.6=py27h97aa1ee_0
- simplegeneric=0.8.1=py27_1
- singledispatch=3.4.0.3=py27he22c18d_0
- ssl_match_hostname=3.5.0.1=py27h8780752_2
- tornado=4.5.2=py27h29aec9e_0
- traitlets=4.3.2=py27hcf08151_0
- wcwidth=0.1.7=py27h817c265_0
- zeromq=4.2.2=hf974341_2
- intel-openmp=2018.0.0=h68bdfb3_7
- libgfortran=3.0.1=h93005f0_2
- mkl=2018.0.0=h5ef208c_6
- numpy=1.13.3=py27h62f9060_0
- openssl=1.0.2l=0
- pandas=0.20.3=py27_0
- patsy=0.4.1=py27h40ed276_0
- pip=9.0.1=py27_1
- python=2.7.13=0
- python-dateutil=2.6.1=py27_0
- pytz=2017.2=py27_0
- readline=6.2=2
- scipy=0.18.1=py27h793f721_0
- setuptools=27.2.0=py27_0
- six=1.10.0=py27_0
- sqlite=3.13.0=0
- statsmodels=0.8.0=py27h6d68dbf_0
- tk=8.5.18=0
- wheel=0.29.0=py27_0
- zlib=1.2.11=0
- pip:
- backports-abc==0.5
- backports.shutil-get-terminal-size==1.0.0
- backports.ssl-match-hostname==3.5.0.1
- cylp==0.7.1
- ipython-genutils==0.2.0
- jupyter-client==5.1.0
- jupyter-core==4.3.0
- prompt-toolkit==1.0.15
7 changes: 5 additions & 2 deletions puf_data/StatMatch/Matching/add_cps_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@ def add_cps(cps_recs, match, puffile):
# cps_recs = pd.read_csv('cpsrets14.csv')
cpsfile = cps_recs.filter(regex='jcps\d{1,2}$|icps\d{1}$|jcps100|cpsseq|' +
'nu\d{1,2}|nu18_dep|n1821|n21|' +
'elderly_dependent|wasp|wass')
'elderly_dependent|wasp|wass|xstate')
# cpsfile = cps_recs
# match = pd.read_csv('match.csv')
# puffile = pd.read_sas('puf2009.sas7bdat')
puffile = puffile[puffile['recid'] != 999999]
puffile = puffile[(puffile['recid'] != 999999) &
(puffile['recid'] != 999998) &
(puffile['recid'] != 999997) &
(puffile['recid'] != 999996)]
puffile['filer'] = 1
puffile['wt'] = puffile['s006'] / 100
puffile['soiseq'] = puffile.index + 1
Expand Down
2 changes: 1 addition & 1 deletion puf_data/StatMatch/Matching/add_nonfilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def add_nonfiler(cpsrets, nonfiler):

nonfiler = nonfiler.filter(regex='jcps\d{1,2}$|icps\d{1}$|jcps100|' +
'cpsseq|nu\d{1,2}|nu18_dep|n1821|n21|' +
'elderly_dependent|wasp|wass')
'elderly_dependent|wasp|wass|xstate')

nonfiler['filer'] = 0
nonfiler['soiseq'] = 0
Expand Down
23 changes: 12 additions & 11 deletions puf_data/StatMatch/Matching/cps_rets.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def computation(self):
self.cps['alm_val'] = 0
for index, row in self.cps.iterrows():
if row['oi_off'] == 20:
row['alm_val'] = row['oi_off']
row['alm_val'] = row['oi_val']

for num in tqdm(self.h_nums):
self.nunits = 0
Expand Down Expand Up @@ -210,13 +210,13 @@ def create(self, record, house):
record['nu13'] = 0 # Only checked for dependents
record['nu18_dep'] = 0
record['nu18'] = 0
record['n1821'] = 0
record['n1820'] = 0
record['n21'] = 0
record['elderly_dependent'] = 0
if record['a_age'] < 18:
if 0 < record['a_age'] < 18:
record['nu18'] += 1
if 18 <= record['a_age'] < 21:
record['n1821'] += 1
record['n1820'] += 1
if record['a_age'] >= 21:
record['n21'] += 1
depne = 0
Expand Down Expand Up @@ -250,10 +250,10 @@ def create(self, record, house):
agede += 1
# Income items
# Determine spouse's age bracket
if spouse['a_age'] < 18:
if 0 < spouse['a_age'] < 18:
record['nu18'] += 1
if 18 <= spouse['a_age'] < 21:
record['n1821'] += 1
record['n1820'] += 1
if spouse['a_age'] >= 21:
record['n21'] += 1
wass = spouse['wsal_val']
Expand Down Expand Up @@ -703,11 +703,11 @@ def create(self, record, house):
record['nu05'] += 1
if individual['a_age'] <= 13:
record['nu13'] += 1
if individual['a_age'] < 18:
if 0 < individual['a_age'] < 18:
record['nu18'] += 1
record['nu18_dep'] += 1
if 18 <= individual['a_age'] < 21:
record['n1821'] += 1
record['n1820'] += 1
if individual['a_age'] >= 21:
record['n21'] += 1
if individual['a_age'] >= 65:
Expand Down Expand Up @@ -844,7 +844,7 @@ def convert(self, ix, iy):
self.house_units[iy]['nu13'] += self.house_units[ix]['nu13']
self.house_units[iy]['nu18_dep'] += self.house_units[ix]['nu18_dep']
self.house_units[iy]['nu18'] += self.house_units[ix]['nu18']
self.house_units[iy]['n1821'] += self.house_units[ix]['n1821']
self.house_units[iy]['n1820'] += self.house_units[ix]['n1820']
self.house_units[iy]['n21'] += self.house_units[ix]['n21']
elderly = self.house_units[ix]['elderly_dependent']
self.house_units[iy]['elderly_dependent'] += elderly
Expand Down Expand Up @@ -927,6 +927,7 @@ def filst(self, unit):
amount = self.joint - self.depExempt * unit['depne']
if unit['agede'] == 1:
amount = self.joint65one - self.depExempt * unit['depne']
elif unit['agede'] == 2:
amount = self.joint65both - self.depExempt * unit['depne']
if income >= amount:
unit['filst'] = 1
Expand Down Expand Up @@ -966,7 +967,7 @@ def output(self, unit, house):
if unit['js'] == 2:
txpye = 2
else:
txpye = 2
txpye = 1
xxtot = txpye + depne
# Check relationship codes among dependents
xxoodep = 0
Expand Down Expand Up @@ -1020,7 +1021,7 @@ def output(self, unit, house):
'zagesp', 'zoldes', 'zyoung', 'zworkc', 'zsocse',
'zssinc', 'zpubas', 'zvetbe', 'zfinas', 'zowner',
'zwaspt', 'zwassp', 'wasp', 'wass', 'nu05', 'nu13',
'nu18_dep', 'nu18', 'n1821', 'n21',
'nu18_dep', 'nu18', 'n1820', 'n21',
'elderly_dependent']
for var in repeated_vars:
record[var] = unit[var]
Expand Down
41 changes: 12 additions & 29 deletions puf_data/StatMatch/Matching/cpsmar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Read in raw CPS data file and structure to be used in future scripts
Input file: asec2014_pubuse_tax)fix_5x8.dat
Input file: asec2016_pubuse_v3.dat
Run time is approximately two hours
"""
from collections import OrderedDict
Expand Down Expand Up @@ -44,7 +44,6 @@ def h_recs(rec):
record['h_telavl'] = [int(rec[36])]
record['h_telint'] = [int(rec[37])]
record['gereg'] = [int(rec[38])]
record['gestcen'] = [int(rec[39:41])]
record['gestfips'] = [int(rec[41:43])]
record['gtcbsa'] = [int(rec[43:48])]
record['gtco'] = [int(rec[48:51])]
Expand Down Expand Up @@ -82,9 +81,9 @@ def h_recs(rec):
record['hssival'] = [int(rec[138:144])]
record['hpaw_yn'] = [int(rec[144])]
record['hpawval'] = [int(rec[145:151])]
record['hvet_yn '] = [int(rec[151])]
record['hvet_yn'] = [int(rec[151])]
record['hvetval'] = [int(rec[152:159])]
record['hsur_yn '] = [int(rec[159])]
record['hsur_yn'] = [int(rec[159])]
record['hsurval'] = [int(rec[160:167])]
record['hdis_yn'] = [int(rec[167])]
record['hdisval'] = [int(rec[168:175])]
Expand All @@ -100,8 +99,6 @@ def h_recs(rec):
record['hedval'] = [int(rec[208:215])]
record['hcsp_yn'] = [int(rec[215])]
record['hcspval'] = [int(rec[216:223])]
record['halm_yn'] = [int(rec[223])]
record['halmval'] = [int(rec[224:231])]
record['hfin_yn'] = [int(rec[231])]
record['hfinval'] = [int(rec[232:239])]
record['hoi_yn'] = [int(rec[239])]
Expand Down Expand Up @@ -228,8 +225,6 @@ def f_recs(rec):
record['fedval'] = [int(rec[165:172])]
record['finc_csp'] = [int(rec[172])]
record['fcspval'] = [int(rec[173:180])]
record['finc_alm'] = [int(rec[180])]
record['falmval'] = [int(rec[181:188])]
record['finc_fin'] = [int(rec[188])]
record['ffinval'] = [int(rec[189:196])]
record['finc_oi'] = [int(rec[196])]
Expand All @@ -241,13 +236,9 @@ def f_recs(rec):
record['fspanish'] = [int(rec[230])]
record['fsup_wgt'] = [float(rec[232:238] + '.' + rec[238:240])]
record['ffposold'] = [int(rec[240:242])]
record['f_mv_fs'] = [int(rec[242:246])]
record['f_mv_sl'] = [int(rec[246:250])]
record['ffngcare'] = [int(rec[250:255])]
record['ffngcaid'] = [int(rec[255:260])]
record['f_mv_fs'] = [int(rec[242:247])]
record['f_mv_sl'] = [int(rec[247:251])]
record['fhoussub'] = [int(rec[260:263])]
record['ffoodreq'] = [int(rec[263:267])]
record['fhousreq'] = [int(rec[267:271])]
record['fhip_val'] = [int(rec[271:278])]
record['fmoop'] = [int(rec[278:285])]
record['fotc_val'] = [int(rec[285:291])]
Expand All @@ -259,7 +250,7 @@ def f_recs(rec):

def p_recs(rec):
"""
Process a person record from the raw CPS file and
Process a person record from the raw CPS file.
Parameters
----------
Expand Down Expand Up @@ -484,7 +475,6 @@ def p_recs(rec):
record['int_yn'] = [int(rec[524])]
record['int_val'] = [int(rec[525:530])]
record['div_yn'] = [int(rec[530])]
record['div_non'] = [int(rec[531])]
record['div_val'] = [int(rec[532:538])]
record['rnt_yn'] = [int(rec[538])]
record['rnt_val'] = [int(rec[539:544])]
Expand All @@ -495,8 +485,6 @@ def p_recs(rec):
record['ed_val'] = [int(rec[548:553])]
record['csp_yn'] = [int(rec[553])]
record['csp_val'] = [int(rec[554:559])]
record['alm_yn'] = [int(rec[559])]
record['alm_val'] = [int(rec[560:565])]
record['fin_yn'] = [int(rec[565])]
record['fin_val'] = [int(rec[566:571])]
record['oi_off'] = [int(rec[571:573])]
Expand All @@ -510,9 +498,7 @@ def p_recs(rec):
record['pov_univ'] = [int(rec[606])]
record['wicyn'] = [int(rec[607])]
record['mcare'] = [int(rec[628])]
record['p_mvcare'] = [int(rec[629:634])]
record['mcaid'] = [int(rec[634])]
record['p_mvcaid'] = [int(rec[635:640])]
record['champ'] = [int(rec[640])]
record['hi_yn'] = [int(rec[641])]
record['hiown'] = [int(rec[642])]
Expand Down Expand Up @@ -574,9 +560,9 @@ def p_recs(rec):
record['fica'] = [int(rec[743:748])]
record['fed_ret'] = [int(rec[748:754])]
record['agi'] = [int(rec[754:761])]
record['tax_inc'] = [int(rec[764:771])]
record['fedtax_bc'] = [int(rec[771:777])]
record['fedtax_ac'] = [int(rec[777:783])]
record['tax_inc'] = [int(rec[762:769])]
record['fedtax_bc'] = [int(rec[769:776])]
record['fedtax_ac'] = [int(rec[776:783])]
record['statetax_bc'] = [int(rec[783:789])]
record['statetax_ac'] = [int(rec[789:795])]
record['prswkxpns'] = [int(rec[795:799])]
Expand Down Expand Up @@ -713,8 +699,6 @@ def p_recs(rec):
record['i_oedval'] = [int(rec[993])]
record['i_cspyn'] = [int(rec[994])]
record['i_cspval'] = [int(rec[995])]
record['i_almyn'] = [int(rec[996])]
record['i_almval'] = [int(rec[997])]
record['i_finyn'] = [int(rec[998])]
record['i_finval'] = [int(rec[999])]
record['i_oival'] = [int(rec[1000])]
Expand Down Expand Up @@ -763,7 +747,6 @@ def p_recs(rec):
record['trnt_val'] = [int(rec[1063])]
record['ted_val'] = [int(rec[1064])]
record['tcsp_val'] = [int(rec[1065])]
record['talm_val'] = [int(rec[1066])]
record['tfin_val'] = [int(rec[1067])]
record['toi_val'] = [int(rec[1068])]
record['tphip_val'] = [int(rec[1069])]
Expand Down Expand Up @@ -792,7 +775,7 @@ def create_cps(raw_cps):
open(raw_cps).readlines()]

# Empty list to hold the completed records
cps_list = list()
cps_list = []
print ('Creating Records')
for record in tqdm(cps):
# Find the type of record
Expand All @@ -804,7 +787,7 @@ def create_cps(raw_cps):
# If it's a family record, concat to household record and store
house_fam = pd.concat([house_rec, f_recs(record[0])], axis=1)
else:
# If it's a person record concat to household and family record
# If it's a person record, concat to household and family record
final_rec = pd.concat([house_fam, p_recs(record[0])], axis=1)
# Append final record to the list of records
cps_list.append(final_rec)
Expand All @@ -813,5 +796,5 @@ def create_cps(raw_cps):
cps_mar = pd.concat(cps_list)
# Export the data
print ('Exporting Data')
cps_mar.to_csv('cpsmar2014.csv', index=False)
cps_mar.to_csv('cpsmar2016.csv', index=False)
return cps_mar
2 changes: 1 addition & 1 deletion puf_data/StatMatch/Matching/phase2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def phasetwo(SOI, CPS):
# CPS = pd.read_csv('cpsrets14_ph1.csv',
# usecols=['cellid', 'cpsseq', 'wt', 'factor', 'yhat'])

CPS.loc[:,'wt_adj'] = CPS['wt'] * CPS['factor']
CPS.loc[:, 'wt_adj'] = CPS['wt'] * CPS['factor']
factor = 1.
if CPS['wt'].sum() > 0:
factor = SOI['wt'].sum() / CPS['wt'].sum()
Expand Down
17 changes: 13 additions & 4 deletions puf_data/StatMatch/Matching/runmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
"""


def match(mar_cps_path='asec2014_pubuse_tax_fix_5x8.dat',
puf_path='puf2009.csv'):
def match(mar_cps_path='asec2016_pubuse_v3.dat',
puf_path='puf2011.csv'):
# Add arguments for specifying the path to a CPS file in CSV format.
# This allows the program to skip the process of creating the CPS from
# a .DAT file.
Expand All @@ -41,7 +41,11 @@ def match(mar_cps_path='asec2014_pubuse_tax_fix_5x8.dat',
if args.puf is not None:
puf_path = args.puf
puf = pd.read_csv(puf_path)
puf = puf[puf['recid'] != 999999]
# Change PUF columns to lowercase
puf.columns = map(str.lower, puf.columns)
# Remove the aggregated records (recid 999996-999999) from the PUF
puf = puf[(puf['recid'] != 999996) & (puf['recid'] != 999997) &
(puf['recid'] != 999998) & (puf['recid'] != 999999)]

print('CPS Created')
rets = Returns(mar_cps)
Expand All @@ -54,6 +58,9 @@ def match(mar_cps_path='asec2014_pubuse_tax_fix_5x8.dat',
soi = create_soi(puf.copy())

print('PUF Created')
print ('Start Phase One')
filers = filers.fillna(0)
soi = soi.fillna(0)
soi_final, cps_final, counts = phaseone(filers, soi)

print('Start Phase Two')
Expand All @@ -64,6 +71,8 @@ def match(mar_cps_path='asec2014_pubuse_tax_fix_5x8.dat',
print('Creating final file')
cpsrets = add_cps(filers, match, puf)
cps_matched = add_nonfiler(cpsrets, nonfilers)
# add age range variable
cps_matched['agerange'] = 0
# Rename variables for use in PUF data prep
renames = {'icps1': 'age_head',
'icps2': 'age_spouse',
Expand All @@ -76,4 +85,4 @@ def match(mar_cps_path='asec2014_pubuse_tax_fix_5x8.dat',

if __name__ == "__main__":
cps_matched = match()
cps_matched.to_csv('../cps-matched-puf.csv', index=False)
cps_matched.to_csv('../../cps-matched-puf.csv', index=False)
Loading

0 comments on commit 282f771

Please sign in to comment.