From 4d1bd7f1e0bd1b45a8dd12e48378568c02f7396b Mon Sep 17 00:00:00 2001 From: andersonfrailey Date: Wed, 13 Sep 2023 10:04:25 -0400 Subject: [PATCH 1/3] initial work --- .gitignore | 1 + taxdata/cps/cpsmar.py | 3 +++ taxdata/cps/create.py | 2 +- taxdata/cps/pycps.py | 6 +++--- taxdata/puf/finalprep.py | 2 ++ taxdata/puf/impute_pencon.py | 3 ++- taxdata/puf/preppuf.py | 1 + 7 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index e755d63f..fd490641 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ __pycache__/ # IRS-SOI PUF and related CPS matching data files puf*.csv +demographics*.csv *puf.csv puf.csv* cps-matched-puf.csv diff --git a/taxdata/cps/cpsmar.py b/taxdata/cps/cpsmar.py index eb295ab4..1c15808c 100644 --- a/taxdata/cps/cpsmar.py +++ b/taxdata/cps/cpsmar.py @@ -162,6 +162,9 @@ def create_cps( # person record elif rec_type == "3": person = parse(record, parsing_dict["person"]) + # add housing subsidy to person record because it's needed in person_details + if year < 2016: + person['fhoussub'] = family['fhoussub'] person = person_details( person, benefits, diff --git a/taxdata/cps/create.py b/taxdata/cps/create.py index 58127998..7ecaccf5 100644 --- a/taxdata/cps/create.py +++ b/taxdata/cps/create.py @@ -106,7 +106,7 @@ def create( _units = [] for year in cps_files: print(f"Creating Tax Units for {year}") - _yr_units = pycps(cps_dfs[year], year, verbose) + _yr_units = pycps(cps_dfs[year], year, benefits, verbose) if validate: validate_cps_units(cps_dfs[year], _yr_units, year) _units.append(_yr_units) diff --git a/taxdata/cps/pycps.py b/taxdata/cps/pycps.py index 13e7096f..d8dd8ef3 100644 --- a/taxdata/cps/pycps.py +++ b/taxdata/cps/pycps.py @@ -321,7 +321,7 @@ def _create_units(data, year, verbose=False, ctam_benefits=False): return [unit.output() for unit in units.values()] -def pycps(cps: list, year: int, verbose: bool) -> pd.DataFrame: +def pycps(cps: list, year: int, ctam_benefits: bool, verbose: bool) -> pd.DataFrame: """ Core code for iterating through the households Parameters @@ -330,8 +330,8 @@ def pycps(cps: list, year: int, verbose: bool) -> pd.DataFrame: """ tax_units = [] ctam_benefits = True - if year not in C_TAM_YEARS: - ctam_benefits = False + if year not in C_TAM_YEARS and ctam_benefits: + raise ValueError(f'C-TAM Benefits not available for year {year}') for hh in tqdm(cps): tax_units += create_units(hh, year - 1, ctam_benefits=ctam_benefits) # create a DataFrame of tax units with the new diff --git a/taxdata/puf/finalprep.py b/taxdata/puf/finalprep.py index aa41ec92..96614d5d 100644 --- a/taxdata/puf/finalprep.py +++ b/taxdata/puf/finalprep.py @@ -238,6 +238,8 @@ def split_earnings_variables(data, data_year): mte = 106800 elif data_year == 2011: mte = 106800 + elif data_year == 2015: + mte = 118500 else: raise ValueError("illegal SOI PUF data year {}".format(data_year)) # total self-employment earnings subject to SECA taxation diff --git a/taxdata/puf/impute_pencon.py b/taxdata/puf/impute_pencon.py index 35c1c0c4..dd127004 100644 --- a/taxdata/puf/impute_pencon.py +++ b/taxdata/puf/impute_pencon.py @@ -176,7 +176,7 @@ def wage_group(row): for grp, underwage in enumerate(UNDER_WAGE): if row["wage"] < underwage: return grp - raise ValueError("illegal value of wage") + raise ValueError(f"illegal value of wage: {row['wage']}") # end of wage_group() function @@ -283,6 +283,7 @@ def impute_pension_contributions(alldata): """ # specify target DataFrames with total column and total row removed target_cnt, target_amt = targets() + print(target_cnt) target_cnt.drop(labels="total", axis="index", inplace=True) target_cnt.drop(labels="total", axis="columns", inplace=True) target_amt.drop(labels="total", axis="index", inplace=True) diff --git a/taxdata/puf/preppuf.py b/taxdata/puf/preppuf.py index f721c756..c931ef09 100644 --- a/taxdata/puf/preppuf.py +++ b/taxdata/puf/preppuf.py @@ -8,6 +8,7 @@ 2009: [999999], 2010: [999998, 999999], 2011: [999996, 999997, 999998, 999999], + 2015: [999996, 999997, 999998, 999999], } From e35bc6cded12909e647d0825564783b2654bedec Mon Sep 17 00:00:00 2001 From: andersonfrailey Date: Wed, 13 Sep 2023 10:10:04 -0400 Subject: [PATCH 2/3] update impute_pencon --- taxdata/puf/impute_pencon.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/taxdata/puf/impute_pencon.py b/taxdata/puf/impute_pencon.py index dd127004..0e2ee43c 100644 --- a/taxdata/puf/impute_pencon.py +++ b/taxdata/puf/impute_pencon.py @@ -209,10 +209,10 @@ def wage_group(row): # specify maximum legal elective deferral amount for DC pensions in 2011 -MAX_PENCON_AMT = 16500 +MAX_PENCON_AMT = {2011: 16500, 2015: 1800} -def impute(idata, target_cnt, target_amt): +def impute(idata, target_cnt, target_amt, year): """ Impute idata[pencon] given other idata variables and targets. """ @@ -254,7 +254,7 @@ def impute(idata, target_cnt, target_amt): num_iterations = 10 for itr in range(0, num_iterations): uncapped_amt = np.where(pos_pc, np.round(wage * rate0).astype(int), 0) - capped_amt = np.minimum(uncapped_amt, MAX_PENCON_AMT) + capped_amt = np.minimum(uncapped_amt, MAX_PENCON_AMT[year]) over_amt = uncapped_amt - capped_amt over_tot = (over_amt * wgt).sum() * 1e-9 rate1 = min(1.0, (cell_target_amt + over_tot) / wgt_pos_pc_wages) @@ -275,7 +275,7 @@ def impute(idata, target_cnt, target_amt): # end of impute() function -def impute_pension_contributions(alldata): +def impute_pension_contributions(alldata, year): """ Main function in impute_pencon.py file. Argument: puf.csv DataFrame just before imputation is done. @@ -351,12 +351,12 @@ def impute_pension_contributions(alldata): # do two imputations to construct gross wages for PUF records idata["wage"] = idata["e00200"] idata["wagegrp"] = idata.apply(wage_group, axis=1) - impute(idata, target_cnt, target_amt) + impute(idata, target_cnt, target_amt, year) idata["wage"] = np.where( idata["filer"] == 1, idata["e00200"] + idata["pencon"], idata["e00200"] ) idata["wagegrp"] = idata.apply(wage_group, axis=1) # gross wage group - impute(idata, target_cnt, target_amt) + impute(idata, target_cnt, target_amt, year) if DUMP0: cnt = (idata["weight"] * (idata["pencon"] > 0)).sum() * 1e-6 print("wgt_pencon_cnt(#M)= {:.3f}".format(cnt)) From 632533718d45c5d77f09897499c4e77454f2aa45 Mon Sep 17 00:00:00 2001 From: andersonfrailey Date: Sun, 17 Sep 2023 13:57:47 -0400 Subject: [PATCH 3/3] update pycps docstring. update wage groups, add 2015 targets for pension contribution imputations. --- docs/book/content/data/puf_file_doc.md | 4 +++- taxdata/cps/pycps.py | 6 ++++-- taxdata/puf/impute_pencon.py | 11 +++-------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/book/content/data/puf_file_doc.md b/docs/book/content/data/puf_file_doc.md index 0d44ad0f..e2948a68 100644 --- a/docs/book/content/data/puf_file_doc.md +++ b/docs/book/content/data/puf_file_doc.md @@ -17,4 +17,6 @@ Coming soon... ## Final Prep -Coming soon... \ No newline at end of file +### Imputing Pension Contributions + +Target data source: [IRS SOI W-2 statistics](https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics) \ No newline at end of file diff --git a/taxdata/cps/pycps.py b/taxdata/cps/pycps.py index d8dd8ef3..7b147dac 100644 --- a/taxdata/cps/pycps.py +++ b/taxdata/cps/pycps.py @@ -327,13 +327,15 @@ def pycps(cps: list, year: int, ctam_benefits: bool, verbose: bool) -> pd.DataFr Parameters ---------- cps: List where each element is a household in the CPS + year: CPS year to use + ctam_benefits: If true, attach C-TAM benefits to the CPS + verbose """ tax_units = [] - ctam_benefits = True if year not in C_TAM_YEARS and ctam_benefits: raise ValueError(f'C-TAM Benefits not available for year {year}') for hh in tqdm(cps): - tax_units += create_units(hh, year - 1, ctam_benefits=ctam_benefits) + tax_units += create_units(hh, year - 1, ctam_benefits=ctam_benefits, verbose=verbose) # create a DataFrame of tax units with the new tax_units_df = pd.DataFrame(tax_units) diff --git a/taxdata/puf/impute_pencon.py b/taxdata/puf/impute_pencon.py index 488857b4..271dfa6c 100644 --- a/taxdata/puf/impute_pencon.py +++ b/taxdata/puf/impute_pencon.py @@ -39,16 +39,10 @@ for details. """ from __future__ import print_function -import sys import numpy as np import pandas as pd from pathlib import Path -if sys.version_info[0] < 3: - from StringIO import StringIO -else: - from io import StringIO - CURPATH = Path(__file__).resolve().parent DUMP0 = False @@ -96,7 +90,7 @@ def targets(year): 1e6, 2e6, 5e6, - 30e6, + 124e6, ] @@ -156,7 +150,8 @@ def wage_group(row): # several times each with a different value of HIWAGE_PROB_SF. -# specify maximum legal elective deferral amount for DC pensions in 2011 +# specify maximum legal elective deferral amount for DC pensions in each year +# the PUF is supported MAX_PENCON_AMT = {2011: 16500, 2015: 1800}