diff --git a/.gitignore b/.gitignore index e755d63f..fd490641 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ __pycache__/ # IRS-SOI PUF and related CPS matching data files puf*.csv +demographics*.csv *puf.csv puf.csv* cps-matched-puf.csv diff --git a/docs/book/content/data/puf_file_doc.md b/docs/book/content/data/puf_file_doc.md index 0d44ad0f..e2948a68 100644 --- a/docs/book/content/data/puf_file_doc.md +++ b/docs/book/content/data/puf_file_doc.md @@ -17,4 +17,6 @@ Coming soon... ## Final Prep -Coming soon... \ No newline at end of file +### Imputing Pension Contributions + +Target data source: [IRS SOI W-2 statistics](https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics) \ No newline at end of file diff --git a/taxdata/cps/cpsmar.py b/taxdata/cps/cpsmar.py index eb295ab4..1c15808c 100644 --- a/taxdata/cps/cpsmar.py +++ b/taxdata/cps/cpsmar.py @@ -162,6 +162,9 @@ def create_cps( # person record elif rec_type == "3": person = parse(record, parsing_dict["person"]) + # add housing subsidy to person record because it's needed in person_details + if year < 2016: + person['fhoussub'] = family['fhoussub'] person = person_details( person, benefits, diff --git a/taxdata/cps/create.py b/taxdata/cps/create.py index 58127998..7ecaccf5 100644 --- a/taxdata/cps/create.py +++ b/taxdata/cps/create.py @@ -106,7 +106,7 @@ def create( _units = [] for year in cps_files: print(f"Creating Tax Units for {year}") - _yr_units = pycps(cps_dfs[year], year, verbose) + _yr_units = pycps(cps_dfs[year], year, benefits, verbose) if validate: validate_cps_units(cps_dfs[year], _yr_units, year) _units.append(_yr_units) diff --git a/taxdata/cps/pycps.py b/taxdata/cps/pycps.py index 13e7096f..7b147dac 100644 --- a/taxdata/cps/pycps.py +++ b/taxdata/cps/pycps.py @@ -321,19 +321,21 @@ def _create_units(data, year, verbose=False, ctam_benefits=False): return [unit.output() for unit in units.values()] -def pycps(cps: list, year: int, verbose: bool) -> pd.DataFrame: +def pycps(cps: list, year: int, ctam_benefits: bool, verbose: bool) -> pd.DataFrame: """ Core code for iterating through the households Parameters ---------- cps: List where each element is a household in the CPS + year: CPS year to use + ctam_benefits: If true, attach C-TAM benefits to the CPS + verbose """ tax_units = [] - ctam_benefits = True - if year not in C_TAM_YEARS: - ctam_benefits = False + if year not in C_TAM_YEARS and ctam_benefits: + raise ValueError(f'C-TAM Benefits not available for year {year}') for hh in tqdm(cps): - tax_units += create_units(hh, year - 1, ctam_benefits=ctam_benefits) + tax_units += create_units(hh, year - 1, ctam_benefits=ctam_benefits, verbose=verbose) # create a DataFrame of tax units with the new tax_units_df = pd.DataFrame(tax_units) diff --git a/taxdata/puf/finalprep.py b/taxdata/puf/finalprep.py index b3b78345..dae93c35 100644 --- a/taxdata/puf/finalprep.py +++ b/taxdata/puf/finalprep.py @@ -241,6 +241,8 @@ def split_earnings_variables(data, data_year): mte = 106800 elif data_year == 2011: mte = 106800 + elif data_year == 2015: + mte = 118500 else: raise ValueError("illegal SOI PUF data year {}".format(data_year)) # total self-employment earnings subject to SECA taxation diff --git a/taxdata/puf/impute_pencon.py b/taxdata/puf/impute_pencon.py index 40cbdea4..271dfa6c 100644 --- a/taxdata/puf/impute_pencon.py +++ b/taxdata/puf/impute_pencon.py @@ -39,16 +39,10 @@ for details. """ from __future__ import print_function -import sys import numpy as np import pandas as pd from pathlib import Path -if sys.version_info[0] < 3: - from StringIO import StringIO -else: - from io import StringIO - CURPATH = Path(__file__).resolve().parent DUMP0 = False @@ -96,7 +90,7 @@ def targets(year): 1e6, 2e6, 5e6, - 30e6, + 124e6, ] @@ -124,7 +118,7 @@ def wage_group(row): for grp, underwage in enumerate(UNDER_WAGE): if row["wage"] < underwage: return grp - raise ValueError("illegal value of wage") + raise ValueError(f"illegal value of wage: {row['wage']}") # end of wage_group() function @@ -156,11 +150,12 @@ def wage_group(row): # several times each with a different value of HIWAGE_PROB_SF. -# specify maximum legal elective deferral amount for DC pensions in 2011 -MAX_PENCON_AMT = 16500 +# specify maximum legal elective deferral amount for DC pensions in each year +# the PUF is supported +MAX_PENCON_AMT = {2011: 16500, 2015: 1800} -def impute(idata, target_cnt, target_amt): +def impute(idata, target_cnt, target_amt, year): """ Impute idata[pencon] given other idata variables and targets. """ @@ -202,7 +197,7 @@ def impute(idata, target_cnt, target_amt): num_iterations = 10 for itr in range(0, num_iterations): uncapped_amt = np.where(pos_pc, np.round(wage * rate0).astype(int), 0) - capped_amt = np.minimum(uncapped_amt, MAX_PENCON_AMT) + capped_amt = np.minimum(uncapped_amt, MAX_PENCON_AMT[year]) over_amt = uncapped_amt - capped_amt over_tot = (over_amt * wgt).sum() * 1e-9 rate1 = min(1.0, (cell_target_amt + over_tot) / wgt_pos_pc_wages) @@ -298,12 +293,12 @@ def impute_pension_contributions(alldata, year): # do two imputations to construct gross wages for PUF records idata["wage"] = idata["e00200"] idata["wagegrp"] = idata.apply(wage_group, axis=1) - impute(idata, target_cnt, target_amt) + impute(idata, target_cnt, target_amt, year) idata["wage"] = np.where( idata["filer"] == 1, idata["e00200"] + idata["pencon"], idata["e00200"] ) idata["wagegrp"] = idata.apply(wage_group, axis=1) # gross wage group - impute(idata, target_cnt, target_amt) + impute(idata, target_cnt, target_amt, year) if DUMP0: cnt = (idata["weight"] * (idata["pencon"] > 0)).sum() * 1e-6 print("wgt_pencon_cnt(#M)= {:.3f}".format(cnt)) diff --git a/taxdata/puf/preppuf.py b/taxdata/puf/preppuf.py index f721c756..c931ef09 100644 --- a/taxdata/puf/preppuf.py +++ b/taxdata/puf/preppuf.py @@ -8,6 +8,7 @@ 2009: [999999], 2010: [999998, 999999], 2011: [999996, 999997, 999998, 999999], + 2015: [999996, 999997, 999998, 999999], }