diff --git a/cps_stage3/aggregates.txt b/cps_stage3/aggregates.txt new file mode 100644 index 00000000..30e4a7a0 --- /dev/null +++ b/cps_stage3/aggregates.txt @@ -0,0 +1,27 @@ +Total benefits (billions) +programs 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 + ss 849.2 901.7 935.5 976.7 1,049.1 1,124.0 1,203.6 1,283.0 1,372.0 1,468.1 1,570.3 + ssi 54.1 54.8 54.9 55.3 57.0 58.9 60.9 62.8 64.8 66.9 69.1 +medicaid 368.6 412.8 384.1 380.7 392.3 391.6 391.7 391.2 391.1 390.9 391.0 +medicare 576.1 602.5 629.9 667.9 718.3 777.9 841.7 907.1 978.5 1,056.2 1,135.8 + vb 146.8 152.3 158.2 159.0 159.8 160.7 161.7 162.7 163.8 164.9 166.2 + snap 83.0 82.6 79.0 79.0 79.0 79.0 79.0 79.0 79.0 79.0 79.0 + +Total participating tax units (millions) +programs 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 + ss 44.6 45.2 45.8 46.5 47.1 47.8 48.5 49.1 49.9 50.6 51.3 + ssi 6.8 6.8 6.8 6.9 6.9 7.0 6.9 7.0 7.0 7.1 7.1 +medicaid 27.9 29.7 30.8 31.2 31.7 32.5 32.9 33.4 33.8 34.3 34.8 +medicare 38.6 39.6 41.0 42.1 43.3 44.6 46.1 47.6 49.1 50.5 51.9 + vb 4.9 4.9 4.6 4.6 4.6 4.6 4.6 4.6 4.6 4.6 4.6 + snap 28.5 28.0 26.8 26.7 26.6 26.4 26.3 26.2 26.1 26.0 25.8 + +Total participants (millions) +programs 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 + ss 56.5 57.8 59.4 60.9 62.5 64.0 65.5 66.4 69.0 70.9 72.9 + ssi 7.6 7.6 7.5 7.5 7.5 7.6 7.6 7.7 7.7 7.8 7.8 +medicaid 52.4 56.4 58.1 58.9 59.7 61.2 62.0 62.9 63.7 64.6 65.4 +medicare 49.6 50.7 52.4 53.8 55.4 57.0 58.7 60.4 62.2 63.8 65.5 + vb 5.0 5.0 4.6 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 + snap 43.0 42.2 40.8 40.8 40.8 40.8 40.8 40.8 40.8 40.8 40.8 + diff --git a/cps_stage3/decile2015.csv b/cps_stage3/decile2015.csv new file mode 100644 index 00000000..6d10e0bd --- /dev/null +++ b/cps_stage3/decile2015.csv @@ -0,0 +1,11 @@ 
+2015_decile,ss_benefits,ss_taxunits,ss_average,ssi_benefits,ssi_taxunits,ssi_average,medicaid_benefits,medicaid_taxunits,medicaid_average,medicare_benefits,medicare_taxunits,medicare_average,vb_benefits,vb_taxunits,vb_average,snap_benefits,snap_taxunits,snap_average +1.0,235648164574.3,11828872.7,19921.4,15446420820.6,1905326.1,8107.0,59859073657.8,4222520.3,14176.1,172522666333.8,10921985.5,15795.9,36577757385.0,1213400.6,30144.8,14599547226.0,5844984.0,2497.8 +2.0,241140803199.8,12047991.5,20015.0,15752573846.5,1994613.0,7897.6,82066261681.9,4168809.1,19685.8,186804345030.4,10958997.0,17045.8,32955416319.7,1025216.7,32144.8,13892541220.2,5804971.9,2393.2 +3.0,178323885867.7,9105893.6,19583.3,12778730438.7,1620057.0,7887.8,68752039985.8,4510126.1,15243.9,130756967800.6,8295777.6,15761.9,24826044904.9,787832.6,31511.8,12771442403.3,5127523.1,2490.8 +4.0,51941541201.6,2553402.2,20342.1,3151031056.1,392073.0,8036.8,49749892872.5,4130106.4,12045.7,28925090583.7,2106380.3,13732.1,6579280519.5,218581.0,30100.0,15534412690.7,4532945.8,3427.0 +5.0,40677288594.5,2082786.3,19530.2,2897197915.0,348485.8,8313.7,48182460457.7,3888821.7,12390.0,19615969305.1,1608914.5,12192.1,6757385855.2,226861.1,29786.4,13560422068.9,3822778.9,3547.3 +6.0,34212236748.2,1790789.2,19104.6,2368829570.4,289770.5,8174.8,37913747915.4,2989568.3,12682.0,16896754076.9,1367227.1,12358.4,7237855735.1,240994.4,30033.3,7453550856.0,1809022.0,4120.2 +7.0,34063165570.0,1693676.4,20112.0,1751105764.4,215732.5,8117.0,26736187395.1,2220411.7,12041.1,15458622689.6,1256250.2,12305.4,8096538973.5,268389.7,30167.1,3285332871.7,740924.7,4434.1 +8.0,33557347102.4,1669427.0,20101.1,600805574.6,69494.9,8645.3,18087766855.8,1583274.5,11424.3,14803643416.6,1247236.2,11869.2,10020136903.8,329978.9,30366.0,1260181184.6,247498.8,5091.7 +9.0,27446719012.6,1306899.4,21001.4,18709682.5,2015.5,9282.9,12981209061.1,1149838.7,11289.6,9036414388.8,996337.3,9069.6,10341242490.4,321022.8,32213.4,209709592.0,36699.4,5714.2 
+10.0,24662603933.5,1082297.0,22787.3,0.0,0.0,0.0,8435236248.0,837370.0,10073.5,7633994376.3,853738.3,8941.8,8905775130.9,276498.0,32209.2,3008985.8,1575.0,1910.5 diff --git a/cps_stage3/tabs.txt b/cps_stage3/tabs.txt new file mode 100644 index 00000000..4a473f83 --- /dev/null +++ b/cps_stage3/tabs.txt @@ -0,0 +1,68 @@ +vb + 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 +0.0 440860 440860 441980 442081 442213 442327 442487 442570 442685 442755 442864 +1.0 15394 15394 14322 14222 14091 13978 13822 13741 13627 13557 13450 +2.0 210 210 162 161 160 159 156 154 153 153 151 +3.0 1 1 1 1 1 1 0 0 0 0 0 + +ss + 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 +0.0 328317 326525 326111 325606 325403 325331 325242 325242 324970 324803 324728 +1.0 83781 84533 84328 84234 84211 84192 84185 84185 84139 84113 84097 +2.0 44108 44607 44462 44418 44394 44391 44389 44389 44366 44361 44340 +3.0 258 473 472 474 475 474 473 473 473 473 473 +4.0 1 68 68 68 69 69 69 69 66 65 65 + +medicaid + 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 +0.0 392005 388485 387573 387573 387573 386957 386957 386957 386957 386957 386957 +1.0 32353 34347 34984 34984 34984 35390 35390 35390 35390 35390 35390 +2.0 16027 16313 16503 16503 16503 16602 16602 16602 16602 16602 16602 +3.0 7810 8183 8240 8240 8240 8271 8271 8271 8271 8271 8271 +4.0 4910 5337 5358 5358 5358 5408 5408 5408 5408 5408 5408 +5.0 2085 2346 2352 2352 2352 2371 2371 2371 2371 2371 2371 +6.0 775 871 872 872 872 877 877 877 877 877 877 +7.0 351 419 419 419 419 425 425 425 425 425 425 +8.0 86 97 97 97 97 96 96 96 96 96 96 +9.0 30 33 33 33 33 34 34 34 34 34 34 +11.0 20 21 21 21 21 21 21 21 21 21 21 +10.0 10 9 9 9 9 9 9 9 9 9 9 +12.0 2 3 3 3 3 3 3 3 3 3 3 +14.0 1 1 1 1 1 1 1 1 1 1 1 + +medicare + 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 +0.0 344758 342573 339832 337965 336186 334385 332111 329739 327392 325338 323332 +1.0 71106 73109 75094 76324 77319 78511 80682 82909 85152 87003 88823 +2.0 40148 40310 41060 
41693 42471 43073 43172 43311 43412 43605 43784 +3.0 349 369 359 363 369 376 380 386 388 395 402 +4.0 49 49 65 64 64 64 64 64 65 67 67 +7.0 31 31 31 31 31 31 31 31 31 32 32 +5.0 16 16 16 17 17 16 16 16 16 16 16 +6.0 7 7 7 7 7 8 8 8 8 8 8 +8.0 1 1 1 1 1 1 1 1 1 1 1 + +snap + 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 +0.0 408953 410079 412791 413483 414187 414915 415626 416229 416892 417594 418304 +1.0 34020 33272 31160 30597 29967 29322 28736 28205 27631 26976 26322 +2.0 6932 6675 6251 6168 6113 6054 5952 5903 5853 5830 5796 +3.0 2811 2719 2608 2575 2563 2550 2534 2518 2503 2493 2477 +4.0 1919 1901 1859 1852 1848 1840 1836 1830 1810 1802 1798 +5.0 1083 1076 1056 1051 1050 1047 1044 1044 1042 1039 1037 +6.0 479 475 472 472 471 471 471 470 469 467 467 +7.0 175 175 175 174 174 174 174 174 173 172 172 +8.0 58 58 58 58 57 57 57 57 57 57 57 +9.0 26 26 26 26 26 26 26 26 26 26 26 +11.0 5 5 5 5 5 5 5 5 5 5 5 +10.0 3 3 3 3 3 3 3 3 3 3 3 +12.0 1 1 1 1 1 1 1 1 1 1 1 + +ssi + 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 +0.0 443520 443520 443771 443995 444219 444219 444463 444463 444705 444705 444851 +1.0 11643 11643 11443 11262 11080 11080 10876 10876 10646 10646 10507 +2.0 1206 1206 1164 1134 1099 1099 1059 1059 1048 1048 1042 +3.0 89 89 81 69 62 62 62 62 61 61 60 +4.0 7 7 6 5 5 5 5 5 5 5 5 + diff --git a/cps_stage3/test_cps_benefits.py b/cps_stage3/test_cps_benefits.py new file mode 100644 index 00000000..7c783b6f --- /dev/null +++ b/cps_stage3/test_cps_benefits.py @@ -0,0 +1,204 @@ +import sys +import pandas as pd +import numpy as np +from pandas.util.testing import assert_frame_equal + + +''' +The tests in this script check distribution and aggregates for the benefit data. +Whenever the benefit data gets updated, the tests would create new statistics +and compare with the previous version. + +This file needs three inputs: CPS weights (cps_weights.csv.gz), CPS tax unit +database (cps.csv.gz), and CPS benefit (cps_benefits_extrap_full.csv.gz). 
+The first two input files are in their corresponding folders as indicated +in the code, but the third benefit file is different from the current version +in that it also includes recipient information for each tax unit. Because this +recipient per tax unit information is not used in the tax-calculator, we have +dropped those variables to save space. But the recipient information is essential for +checking aggregates and distribution. So anyone who wants to run the tests needs +to recreate the full version of benefit data. + +It is relatively simple to generate the full version: + +1. Find extrapolation.py in the current folder and comment out the six +lines of code (line 310 - 315) that drop all recipients +2. Rename the output to cps_benefits_extrap_full.csv.gz +3. Run the extrapolation script with 'python extrapolation.py' + +The tests will create three new files of summary statistics: + +1. decile2015_new.csv: participation, total benefit and average benefit +by wage decile +2. aggregates_new.txt: total participation and benefits for each program +from 2014 to 2024 +3. tabs_new.txt: tabulations of tax unit participation for each program +from 2014 to 2024 + +If all three files are exactly the same as the previous version, then the tests +will pass. If the tests fail, compare the new version with the previous version +carefully and then replace the previous with the new version if the difference is +reasonable. 
+
+'''
+
+
+programs = ['ss', 'ssi', 'medicaid', 'medicare', 'vb', 'snap']
+# years covered by the tests (2014 through 2024 inclusive)
+years = list(range(2014, 2025))
+billion = 1e09
+million = 1e06
+# small offset so the per-decile average does not divide by zero
+# when a decile has no participating tax units
+delta = 1e-06
+
+
+def read_files():
+    '''
+    Import weights, benefit, and raw cps file and merge them.
+
+    Returns a DataFrame sorted by wage (e00200) that holds the benefit and
+    recipient columns, the yearly weight columns, and a WT2015_decile
+    column ranking each record into a weighted wage decile.
+    '''
+
+    # import from taxdata repo
+    # weights and wage are for 10-year and decile tables
+    weights = pd.read_csv('../cps_stage2/cps_weights.csv.gz', compression='gzip')
+    cps_income = pd.read_csv('../cps_data/cps.csv.gz',
+                             compression='gzip')[['e00200', 's006', 'RECID']]
+    # the benefit file that includes both benefits and recipients
+    cps_benefit = pd.read_csv('cps_benefits_extrap_full.csv.gz')
+
+    # the weights are joined by row position, so both files must line up
+    assert len(cps_income) == len(weights)
+
+    # merge all essential variables
+    cps = cps_income.merge(cps_benefit, on='RECID', how='left')
+    cps.fillna(0, inplace=True)
+    # NOTE(review): weights look like they are stored scaled by 100 - confirm
+    cps = cps.join(weights/100)
+
+    # rename to facilitate for loops
+    cps.rename(columns={'s006': 'WT2014'}, inplace=True)
+
+    # create decile ranks by wage
+    cps = cps.sort_values(by='e00200')
+    cps['WT2015_cumsum'] = cps.WT2015.cumsum()
+    # scaling the cumulative weight into (0, 9.99] makes ceil() yield 1..10;
+    # Series.max() replaces the builtin max(), which iterates element-wise
+    total_weight = cps.WT2015_cumsum.max()
+    cps['WT2015_decile'] = np.ceil(cps.WT2015_cumsum/(total_weight/9.99))
+
+    return cps
+
+
+cps = read_files()
+
+
+def test_decile_dist():
+    '''
+    Total participation, total benefits and average benefits by 2015 wage
+    decile. Writes decile2015_new.csv and compares it with decile2015.csv.
+    '''
+    decile2015 = pd.DataFrame(np.linspace(1, 10, num=10), columns=['2015_decile'])
+
+    # loop over the program list itself rather than a hard-coded count
+    for program in programs:
+        benefit_var = program + '_benefits_2015'
+        recipient_var = program + '_recipients_2015'
+        weighted_var = benefit_var + '_weighted'
+
+        # create weighted benefit
+        cps[weighted_var] = cps[benefit_var] * cps['WT2015']
+
+        # temporary variable for weighted participation
+        cps['dummy'] = np.where(cps[recipient_var] != 0, cps['WT2015'], 0)
+
+        # calculate total benefits, participation (# tax units), and average per decile
+        variables = [weighted_var, 'dummy']
+        bp = cps[variables].groupby(cps.WT2015_decile, as_index=False).sum()
+
+        bp['average'] = bp[weighted_var]/(bp['dummy'] + delta)
+
+        # rename and save
+        bp.columns = [program+'_benefits', program+'_taxunits', program+'_average']
+        decile2015 = pd.concat([decile2015, bp], axis=1)
+
+    decile2015.to_csv('decile2015_new.csv', float_format='%.1f', index=False)
+
+    decile_old = pd.read_csv('decile2015.csv')
+    assert_frame_equal(decile2015.round(1), decile_old)
+
+
+def test_aggregates():
+    '''
+    Total individual & taxunit participation, total benefits from 2014-2024.
+    Writes aggregates_new.txt and compares each table with aggregates.txt.
+    '''
+
+    benefits = pd.DataFrame(programs, columns=['programs'])
+    taxunits = pd.DataFrame(programs, columns=['programs'])
+    participants = pd.DataFrame(programs, columns=['programs'])
+
+    for year in years:
+        weight = cps['WT' + str(year)]
+
+        # benefits
+        benefits_vars = [x + '_benefits_' + str(year) for x in programs]
+        weighted_benefits = cps.loc[:, benefits_vars].multiply(weight, axis='index')
+        benefits[year] = (weighted_benefits.sum()/billion).values
+
+        # participants
+        p_vars = [x + '_recipients_' + str(year) for x in programs]
+        raw_participants = cps.loc[:, p_vars]
+        weighted_par = raw_participants.multiply(weight, axis='index')
+        participants[year] = (weighted_par.sum()/million).values
+
+        # tax units: a unit counts once if it has any recipient
+        has_recipient = raw_participants.astype(bool)
+        weighted_taxunits = has_recipient.multiply(weight, axis='index')
+        taxunits[year] = (weighted_taxunits.sum()/million).values
+
+    pd.options.display.float_format = '{:,.1f}'.format
+    # 'out' instead of 'file', which shadows a builtin name
+    with open('aggregates_new.txt', 'w') as out:
+        out.write("Total benefits (billions)\n" + benefits.to_string(index=False) + '\n\n')
+        out.write('Total participating tax units (millions)\n' + taxunits.to_string(index=False) + '\n\n')
+        out.write('Total participants (millions)\n' + participants.to_string(index=False) + '\n\n')
+
+    # import the current version; the three title lines are skipped, the
+    # repeated 'programs ...' header lines stay in as rows 6 and 13
+    agg_old = pd.read_csv('aggregates.txt', delim_whitespace=True, skiprows=[0,9,18], thousands=',')
+    agg_old.columns = ['programs'] + years
+
+    benefits_old = agg_old.loc[0:5]
+    assert_frame_equal(benefits.round(1), benefits_old)
+
+    taxunits_old = agg_old.loc[7:12].reset_index(drop=True)
+    assert_frame_equal(taxunits.round(1), taxunits_old)
+
+    participants_old = agg_old.loc[14:19].reset_index(drop=True)
+    assert_frame_equal(participants.round(1), participants_old)
+
+
+def test_tabs():
+    '''
+    Tabulation of number of participants per tax unit from 2014 to 2024.
+    Writes tabs_new.txt and compares each program table with tabs.txt.
+    '''
+
+    tabs = {}
+
+    for program in programs:
+        # one value_counts() column per year
+        program_tab = {}
+        for year in years:
+            program_tab[year] = cps[program+"_recipients_"+str(year)].value_counts()
+        program_tab = pd.DataFrame(program_tab)
+        program_tab.fillna(0, inplace=True)
+        tabs[program] = program_tab.astype(int)
+
+    with open('tabs_new.txt', 'w') as out:
+        # dict.iteritems() only exists on Python 2; walking the program
+        # list works on both versions and gives a stable output order
+        for program in programs:
+            out.write(program + '\n')
+            out.write(tabs[program].to_string() + '\n\n')
+
+    tabs_old = pd.read_csv('tabs.txt', delim_whitespace=True,
+                           names=['index'] + years)
+    # drop the year header rows; the remaining rows keep their row labels
+    tabs_old = tabs_old[tabs_old['index']!='2014']
+
+    for program in programs:
+
+        unitmax = len(tabs[program])
+        start_row = (tabs_old.index[tabs_old['index']==program] + 1).values[0]
+        end_row = start_row + unitmax
+
+        # .loc slices by label and includes both ends; the label at
+        # start_row was the dropped header row, so unitmax rows remain
+        participation_old = tabs_old.loc[start_row: end_row]
+        participation_old = participation_old.reset_index(drop=True)
+
+        assert_frame_equal(participation_old.astype(float),
+                           tabs[program].reset_index().astype(float),
+                           check_column_type=False, check_index_type=False)
+