diff --git a/.gitignore b/.gitignore index 5739c0a9..c70762d3 100644 --- a/.gitignore +++ b/.gitignore @@ -22,5 +22,9 @@ cpsmar*.sas cpsmar*.csv *.dat + # pickle cps*.pkl + +# .npz numpy files +*.npz diff --git a/Makefile b/Makefile index 57e73a7d..c1a33779 100644 --- a/Makefile +++ b/Makefile @@ -106,7 +106,8 @@ puf_stage1/growfactors.csv: puf_stage1/factors_finalprep.py \ cd puf_stage1 ; python factors_finalprep.py puf_stage2/puf_weights.csv.gz: puf_stage2/stage2.py \ - puf_stage2/solve_lp_for_year.py \ + puf_stage2/dataprep.py \ + puf_stage2/solver.jl \ puf_data/cps-matched-puf.csv \ puf_stage1/Stage_I_factors.csv \ puf_stage1/Stage_II_targets.csv diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 00000000..193dd5ee --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,332 @@ +# This file is machine-generated - editing it directly is not advised + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BenchmarkTools]] +deps = ["JSON", "Logging", "Printf", "Statistics", "UUIDs"] +git-tree-sha1 = "9e62e66db34540a0c919d72172cc2f642ac71260" +uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +version = "0.5.0" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "5ccb0770e3d1c185a52e6d36e3ffb830639ed3d2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.6+3" + +[[CEnum]] +git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.3.0" + +[[Calculus]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f641eb0a4f00c343bbc32346e1217b86f3ce9dad" +uuid = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9" +version = "0.5.1" + +[[Cbc]] +deps = ["BinaryProvider", "CEnum", "Cbc_jll", "Libdl", "MathOptInterface", "SparseArrays"] +git-tree-sha1 = "72e4299de0995a60a6230079adc7e47580870815" +uuid = "9961bab8-2fa3-5c5a-9d89-47fab24efd76" +version = "0.7.0" + +[[Cbc_jll]] +deps = ["Cgl_jll", "Clp_jll", "CoinUtils_jll", "CompilerSupportLibraries_jll", "Libdl", "OpenBLAS32_jll", "Osi_jll", "Pkg"] +git-tree-sha1 = "16b8ffa56b3ded6b201aa2f50623f260448aa205" +uuid = "38041ee0-ae04-5750-a4d2-bb4d0d83d27d" +version = "2.10.3+4" + +[[Cgl_jll]] +deps = ["Clp_jll", "CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "32be20ec1e4c40e5c5d1bbf949ba9918a92a7569" +uuid = "3830e938-1dd0-5f3e-8b8e-b3ee43226782" +version = "0.60.2+5" + +[[Clp_jll]] +deps = ["CoinUtils_jll", "CompilerSupportLibraries_jll", "Libdl", "OpenBLAS32_jll", "Osi_jll", "Pkg"] +git-tree-sha1 = "79263d9383ca89b35f31c33ab5b880536a8413a4" +uuid = "06985876-5285-5a41-9fcb-8948a742cc53" +version = "1.17.6+6" + +[[CodecBzip2]] +deps = ["Bzip2_jll", "Libdl", "TranscodingStreams"] +git-tree-sha1 = "2e62a725210ce3c3c2e1a3080190e7ca491f18d7" +uuid = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" +version = "0.7.2" + +[[CodecZlib]] +deps = ["TranscodingStreams", "Zlib_jll"] +git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.7.0" + +[[CoinUtils_jll]] +deps = ["CompilerSupportLibraries_jll", "Libdl", "OpenBLAS32_jll", "Pkg"] +git-tree-sha1 = "ee1f06ab89337b7f194c29377ab174e752cdf60d" +uuid = "be027038-0da8-5614-b30d-e42594cb92df" +version = "2.11.3+3" + +[[CommonSubexpressions]] +deps = ["MacroTools", "Test"] +git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "083e7e5ec3ef443e9dcb6dd3fbcb815879823bfa" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.14.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.3.3+0" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "88d48e133e6d3dd68183309877eac74393daa7eb" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.17.20" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[DiffResults]] +deps = ["StaticArrays"] +git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc" +uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +version = "1.0.2" + +[[DiffRules]] +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1" +uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" +version = "1.0.1" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[ForwardDiff]] +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "1d090099fb82223abc48f7ce176d3f7696ede36d" +uuid = "f6369f11-7733-5829-9624-2563aa707210" +version = "0.10.12" + +[[HTTP]] +deps = ["Base64", "Dates", "IniFile", "MbedTLS", "Sockets"] +git-tree-sha1 = "2ac03263ce44be4222342bca1c51c36ce7566161" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "0.8.17" + +[[IniFile]] +deps = ["Test"] +git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8" +uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" +version = "0.5.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + +[[JSONSchema]] +deps = ["HTTP", "JSON", "ZipFile"] +git-tree-sha1 = "2646fb3efdb60f35a8c053250a2647358ae0629f" +uuid = "7d188eb4-7ad8-530c-ae41-71a32a6d4692" +version = "0.3.1" + +[[JuMP]] +deps = ["Calculus", "DataStructures", "ForwardDiff", "LinearAlgebra", "MathOptInterface", "MutableArithmetics", "NaNMath", "Random", "SparseArrays", "Statistics"] +git-tree-sha1 = "cbab42e2e912109d27046aa88f02a283a9abac7c" +uuid = "4076af6c-e467-56ae-b986-b466b2749572" +version = "0.21.3" + +[[LibGit2]] +deps = ["Printf"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.5" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MathOptInterface]] +deps = ["BenchmarkTools", "CodecBzip2", "CodecZlib", "JSON", "JSONSchema", "LinearAlgebra", "MutableArithmetics", "OrderedCollections", "SparseArrays", "Test", "Unicode"] +git-tree-sha1 = "cd2049c055c7d192a235670d50faa375361624ba" +uuid = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" +version = "0.9.14" + +[[MbedTLS]] +deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] +git-tree-sha1 = "426a6978b03a97ceb7ead77775a1da066343ec6e" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = "1.0.2" + +[[MbedTLS_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "a0cb0d489819fa7ea5f9fa84c7e7eba19d8073af" +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.16.6+1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MutableArithmetics]] +deps = ["LinearAlgebra", "SparseArrays", "Test"] +git-tree-sha1 = "6cf09794783b9de2e662c4e8b60d743021e338d0" +uuid = "d8a4904e-b15c-11e9-3269-09a3773c0cb0" +version = "0.2.10" + +[[NPZ]] +deps = ["Compat", "ZipFile"] +git-tree-sha1 = "a9837d6dc62c8a159f8e788d51c81ccee248db0e" +uuid = "15e1cf62-19b3-5cfa-8e77-841668bca605" +version = "0.4.0" + +[[NaNMath]] +git-tree-sha1 = "c84c576296d0e2fbb3fc134d3e09086b3ea617cd" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.4" + +[[OpenBLAS32_jll]] +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "793b33911239d2651c356c823492b58d6490d36a" +uuid = "656ef2d0-ae68-5445-9ca0-591084a874a2" +version = "0.3.9+4" + +[[OpenSpecFun_jll]] +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "d51c416559217d974a1113522d5919235ae67a87" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+3" + +[[OrderedCollections]] +git-tree-sha1 = "293b70ac1780f9584c89268a6e2a560d938a7065" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.3.0" + +[[Osi_jll]] +deps = ["CoinUtils_jll", "CompilerSupportLibraries_jll", "Libdl", "OpenBLAS32_jll", "Pkg"] +git-tree-sha1 = "bd436a97280df40938e66ae8d18e57aceb072856" +uuid = "7da25872-d9ce-5375-a4d3-7a845f58efdd" +version = "0.108.5+3" + +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "8077624b3c450b15c087944363606a6ba12f925e" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "1.0.10" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["OpenSpecFun_jll"] +git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.10.3" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "016d1e1a00fabc556473b07161da3d39726ded35" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.12.4" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.5" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[ZipFile]] +deps = ["Libdl", "Printf", "Zlib_jll"] +git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f" +uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" +version = "0.9.2" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "d5bba6485811931e4b8958e2d7ca3738273ac468" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+15" diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..f8e32228 --- /dev/null +++ b/Project.toml @@ -0,0 +1,4 @@ +[deps] +Cbc = "9961bab8-2fa3-5c5a-9d89-47fab24efd76" +JuMP = "4076af6c-e467-56ae-b986-b466b2749572" +NPZ = "15e1cf62-19b3-5cfa-8e77-841668bca605" diff --git a/README.md b/README.md index 47121964..ff9de28c 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,10 @@ conda env create -f environment.yml To run the scripts that produce `puf.csv` and `cps.csv.gz`, activate the `taxdata-dev` conda environment and follow the workflow laid out below. +`Julia` must also be installed to solve for the PUF and CPS weights. You +can download `Julia` from their [website](https://julialang.org/downloads/) +or by using `homebrew`. + Data-Preparation Documentation and Workflow ------------------------------------------- diff --git a/cps_stage2/cps_weights.csv.gz b/cps_stage2/cps_weights.csv.gz index 81775e30..702eee38 100644 Binary files a/cps_stage2/cps_weights.csv.gz and b/cps_stage2/cps_weights.csv.gz differ diff --git a/cps_stage2/solve_lp_for_year.py b/cps_stage2/dataprep.py similarity index 85% rename from cps_stage2/solve_lp_for_year.py rename to cps_stage2/dataprep.py index 03de886d..45a40b01 100644 --- a/cps_stage2/solve_lp_for_year.py +++ b/cps_stage2/dataprep.py @@ -1,9 +1,8 @@ import numpy as np import pandas as pd -import cvxopt -def solve_lp_for_year(data, factors, targets, year, tol, weights=None): +def dataprep(data, factors, targets, year, weights=None): """ Parameters ---------- @@ -11,7 +10,6 @@ def solve_lp_for_year(data, factors, targets, year, tol, weights=None): factors: growth factors targets: aggregate targets year: year LP is being solved for - tol: tolerance """ def target(target_val, pop, factor, value): return (target_val * pop / factor * 1000 - value) @@ -201,36 +199,9 @@ def target(target_val, pop, factor, value): one_half_lhs = np.vstack(vstack_vars) # coefficients for r and s - a1 = np.array(one_half_lhs) - a2 = np.array(-one_half_lhs) + A1 = np.array(one_half_lhs) + A2 = np.array(-one_half_lhs) - # set up LP model - print('Constructing LP Model') - N = len(data.index) - c = cvxopt.matrix(np.ones(2 * N).tolist()) - - # tolerance and non-negativity constraints - G_values = np.append(np.ones(2 * N), -np.ones(2 * N)).tolist() - G_row = np.concatenate((list(range(N)), list(range(N)), - [i + N for i in list(range(2 * N))])).tolist() - G_row = [int(i) for i in G_row] - G_col = np.concatenate((list(range(2 * N)), list(range(2 * N)))).tolist() - G_col = [int(i) for i in G_col] - - G = cvxopt.spmatrix(G_values, G_row, G_col) - h = cvxopt.matrix(np.append(tol * np.ones(N), np.zeros(2 * N)).tolist()) - - # targets - A = cvxopt.matrix(np.hstack([a1, a2])) - b = cvxopt.matrix(b) - - print("Solving model") - sol_cvxopt = cvxopt.solvers.lp(c=c, G=G, h=h, A=A, b=b, solver=None) - - # extract results and construct weights - rs_val_cvxopt = np.array(sol_cvxopt["x"]).reshape((2 * N,)) - r_val_cvxopt = rs_val_cvxopt[:N] - s_val_cvxopt = rs_val_cvxopt[N:] - z = r_val_cvxopt - s_val_cvxopt - - return (1 + z) * s006 * 100 + + # save arrays as .npz files + np.savez(str(str(year) + "_input.npz"), A1=A1, A2=A2, b=b) diff --git a/cps_stage2/solver.jl b/cps_stage2/solver.jl new file mode 100644 index 00000000..8e5c903f --- /dev/null +++ b/cps_stage2/solver.jl @@ -0,0 +1,50 @@ +using JuMP, Cbc, NPZ + +function Solve_func(year, tol) + + println("Solving weights for $year ...\n\n") + + array = npzread(string(year, "_input.npz")) + + A1 = array["A1"] + A2 = array["A2"] + b = array["b"] + + model = Model(Cbc.Optimizer) + set_optimizer_attribute(model, "logLevel", 1) + N = size(A1)[2] + + @variable(model, r[1:N] >= 0) + @variable(model, s[1:N] >= 0) + + @objective(model, Min, sum(r[i] + s[i] for i in 1:N)) + + # bound on top by tolerance + @constraint(model, [i in 1:N], r[i] + s[i] <= tol) + + # Ax = b + @constraint(model, [i in 1:length(b)], sum(A1[i,j] * r[j] + A2[i,j] * s[j] + for j in 1:N) == b[i]) + + + optimize!(model) + termination_status(model) + + r_vec = value.(r) + s_vec = value.(s) + + npzwrite(string(year, "_output.npz"), Dict("r" => r_vec, "s" => s_vec)) + + println("\n") + +end + + + +year_list = [x for x in 2014:2030] +tol = 0.70 + +# Run solver function for all years and tolerances (in order) +for i in year_list + Solve_func(i, tol) +end diff --git a/cps_stage2/stage2.py b/cps_stage2/stage2.py index e42e69c9..725737fc 100644 --- a/cps_stage2/stage2.py +++ b/cps_stage2/stage2.py @@ -1,6 +1,8 @@ +import os, glob +import numpy as np import pandas as pd from pathlib import Path -from solve_lp_for_year import solve_lp_for_year +from dataprep import dataprep CUR_PATH = Path(__file__).resolve().parent STAGE_1_PATH = Path(CUR_PATH, "..", "puf_stage1", "Stage_I_factors.csv") @@ -26,17 +28,42 @@ def main(): ) # DataFrame for holding each year's weights weights = pd.DataFrame() + + # write .npz input files for solver + for year in range(START_YEAR, END_YEAR + 1): + dataprep( + cps, stage_1_factors, stage_2_targets, year) + + # Solver (in Julia) + env_path = os.path.join(CUR_PATH, "../Project.toml") + os.system(f'julia --project={env_path} solver.jl') + + # write output files to dataframe columns for year in range(START_YEAR, END_YEAR + 1): - print(f"Solving for {year}") - weights[f"WT{year}"] = solve_lp_for_year( - cps, stage_1_factors, stage_2_targets, year, .70 - ) + + s006 = np.where( + cps.e02400 > 0, + cps.s006 * stage_1_factors["APOPSNR"][year], + cps.s006 * stage_1_factors["ARETS"][year] + ) + + array = np.load(str(str(year) + "_output.npz")) + r_val = array['r'] + s_val = array['s'] + + z_val = (1 + r_val - s_val) * s006 * 100 + weights[str("WT" + str(year))] = z_val + weights = weights.round(0).astype("int64") weights.to_csv( Path(CUR_PATH, "cps_weights.csv.gz"), compression="gzip", index=False ) + # remove all .npz (numpy array) files + for file in glob.glob("*.npz"): + os.remove(file) + if __name__ == '__main__': main() diff --git a/environment.yml b/environment.yml index 0ed33d9d..cd0f7010 100644 --- a/environment.yml +++ b/environment.yml @@ -16,4 +16,3 @@ dependencies: - xlrd - pulp - paramtools -- cvxopt diff --git a/puf_stage2/README.md b/puf_stage2/README.md index 8a7b4d90..551e3b36 100644 --- a/puf_stage2/README.md +++ b/puf_stage2/README.md @@ -4,7 +4,7 @@ About puf_stage2 This directory contains the following script: * Python script **stage2.py**, which calls a function in the -`solve_lp_for_year.py` file, reads/writes: +`dataprep.py` and `solver.jl` files, reads/writes: Input files: - ../puf_data/cps-matched-puf.csv (not in repo because is restricted) diff --git a/puf_stage2/solve_lp_for_year.py b/puf_stage2/dataprep.py similarity index 89% rename from puf_stage2/solve_lp_for_year.py rename to puf_stage2/dataprep.py index f58c3fd5..0bf00822 100644 --- a/puf_stage2/solve_lp_for_year.py +++ b/puf_stage2/dataprep.py @@ -1,8 +1,7 @@ import numpy as np -import pulp -def solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, year, tol): +def dataprep(puf, Stage_I_factors, Stage_II_targets, year): print("Preparing coefficient matrix for year {} .....".format(year)) @@ -174,27 +173,5 @@ def solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, year, tol): for m in temp: b.append(m) - print('Constructing LP Model') - LP = pulp.LpProblem('PUF Stage 2', pulp.LpMinimize) - r = pulp.LpVariable.dicts('r', puf.index, lowBound=0) - s = pulp.LpVariable.dicts('s', puf.index, lowBound=0) - # add objective functoin - LP += pulp.lpSum([r[i] + s[i] for i in puf.index]) - # add constraints - for i in puf.index: - LP += r[i] + s[i] <= tol - for i in range(len(b)): - LP += pulp.lpSum([(A1[i][j] * r[j] + A2[i][j] * s[j]) - for j in puf.index]) == b[i] - - print('Solving Model...') - pulp.LpSolverDefault.msg = 1 # ensure there is a command line output - LP.solve() - print(pulp.LpStatus[LP.status]) - - # apply r and s values to the weights - r_val = np.array([r[i].varValue for i in r]) - s_val = np.array([s[i].varValue for i in s]) - z = (1. + r_val - s_val) * s006 * 100 - - return z + # export to .npz file + np.savez(str(str(year) + "_input.npz"), A1=A1, A2=A2, b=b) diff --git a/puf_stage2/puf_weights.csv.gz b/puf_stage2/puf_weights.csv.gz index 3daee86d..dd284dbf 100644 Binary files a/puf_stage2/puf_weights.csv.gz and b/puf_stage2/puf_weights.csv.gz differ diff --git a/puf_stage2/solver.jl b/puf_stage2/solver.jl new file mode 100644 index 00000000..a69b67ab --- /dev/null +++ b/puf_stage2/solver.jl @@ -0,0 +1,52 @@ +using JuMP, Cbc, NPZ + +function Solve_func(year, tol) + + println("Solving weights for $year ...\n\n") + + array = npzread(string(year, "_input.npz")) + + A1 = array["A1"] + A2 = array["A2"] + b = array["b"] + + model = Model(Cbc.Optimizer) + set_optimizer_attribute(model, "logLevel", 1) + N = size(A1)[2] + + @variable(model, r[1:N] >= 0) + @variable(model, s[1:N] >= 0) + + @objective(model, Min, sum(r[i] + s[i] for i in 1:N)) + + # bound on top by tolerance + @constraint(model, [i in 1:N], r[i] + s[i] <= tol) + + # Ax = b + @constraint(model, [i in 1:length(b)], sum(A1[i,j] * r[j] + A2[i,j] * s[j] + for j in 1:N) == b[i]) + + + optimize!(model) + termination_status(model) + + r_vec = value.(r) + s_vec = value.(s) + + npzwrite(string(year, "_output.npz"), Dict("r" => r_vec, "s" => s_vec)) + + println("\n") + +end + + + +year_list = [x for x in 2012:2030] +tol_list = [0.40, 0.38, 0.35, 0.33, 0.30, 0.37, 0.38, + 0.38, 0.39, 0.39, 0.38, 0.40, 0.39, 0.41, + 0.41, 0.42, 0.42, 0.42, 0.42] + +# Run solver function for all years and tolerances (in order) +for i in zip(year_list, tol_list) + Solve_func(i[1], i[2]) +end diff --git a/puf_stage2/stage2.py b/puf_stage2/stage2.py index afc35e1e..9a3995f3 100644 --- a/puf_stage2/stage2.py +++ b/puf_stage2/stage2.py @@ -1,9 +1,11 @@ import os import numpy as np import pandas as pd -from solve_lp_for_year import solve_lp_for_year +from dataprep import dataprep + CUR_PATH = os.path.abspath(os.path.dirname(__file__)) + # Read private CPS-matched-PUF file into a Pandas DataFrame puf = pd.read_csv(os.path.join(CUR_PATH, "../puf_data/cps-matched-puf.csv")) @@ -19,51 +21,39 @@ puf.s006 = puf.matched_weight * 100 +# Dataprep +year_list = [x for x in range(2012, 2030+1)] +for i in year_list: + dataprep(puf, Stage_I_factors, Stage_II_targets, + year=i) + +# Solver (in Julia) +env_path = os.path.join(CUR_PATH, "../Project.toml") +os.system(f'julia --project={env_path} solver.jl') + + +# Initialize weights dataframe z = pd.DataFrame() z["WT2011"] = puf.s006 +# write solution to dataframe +for i in year_list: + s006 = np.where(puf.e02400 > 0, + puf.s006 * Stage_I_factors[i]["APOPSNR"] / 100, + puf.s006 * Stage_I_factors[i]["ARETS"] / 100) + + array = np.load(str(str(i) + "_output.npz")) + r_val = array['r'] + s_val = array['s'] -# Execute stage2 logic for each year using a year-specific LP tolerance -z["WT2012"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2012, tol=0.40) -z['WT2013'] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2013, tol=0.38) -z['WT2014'] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2014, tol=0.35) -z["WT2015"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2015, tol=0.33) -z["WT2016"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2016, tol=0.30) -z["WT2017"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2017, tol=0.37) -z["WT2018"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2018, tol=0.38) -z["WT2019"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2019, tol=0.38) -z["WT2020"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2020, tol=0.39) -z["WT2021"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2021, tol=0.39) -z["WT2022"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2022, tol=0.38) -z["WT2023"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2023, tol=0.40) -z["WT2024"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2024, tol=0.39) -z["WT2025"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2025, tol=0.41) -z["WT2026"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2026, tol=0.41) -z["WT2027"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2027, tol=0.42) -z["WT2028"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2028, tol=0.42) -z["WT2029"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2029, tol=0.42) -z["WT2030"] = solve_lp_for_year(puf, Stage_I_factors, Stage_II_targets, - year=2030, tol=0.42) + z_val = (1. + r_val - s_val) * s006 * 100 + z[str("WT" + str(i))] = z_val # Write all weights (rounded to nearest integer) to puf_weights.csv file z = z.round(0).astype('int64') z.to_csv(os.path.join(CUR_PATH, 'puf_weights.csv.gz'), index=False, compression='gzip') + +# remove all .npz (numpy array) files +for file in glob.glob("*.npz"): + os.remove(file)