From 9cefa7f843fcd21e4965caac3beb381aac5d92ff Mon Sep 17 00:00:00 2001 From: JohannesNaegele <55059387+JohannesNaegele@users.noreply.github.com> Date: Wed, 25 Mar 2020 16:00:13 +0100 Subject: [PATCH 1/6] New CUDA version in Julia --- Julia_CUDAnative.jl | 189 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 Julia_CUDAnative.jl diff --git a/Julia_CUDAnative.jl b/Julia_CUDAnative.jl new file mode 100644 index 0000000..2f725b5 --- /dev/null +++ b/Julia_CUDAnative.jl @@ -0,0 +1,189 @@ +using Distributions +using Dates +using CUDAnative, CuArrays, CUDAdrv, BenchmarkTools +using Crayons + + +struct params + ne::Int64 + nx::Int64 + T::Int64 + ssigma::Float64 + bbeta::Float64 + w::Float64 + r::Float64 +end + +# Function that computes value function, given vector of state variables +function value(params::params, age::Int64, xgrid, egrid, P, V) + + ix = (blockIdx().x-1) * blockDim().x + threadIdx().x + ie = threadIdx().y + + ne = params.ne + nx = params.nx + T = params.T + ssigma = params.ssigma + bbeta = params.bbeta + w = params.w + r = params.r + + VV = -10.0^3; + ixpopt = 0; + + + @inbounds for ixp = 1:nx + expected = 0.0; + if(age < T) + for iep = 1:ne + expected = expected + P[ie, iep]*V[age + 1, ixp, iep]; + end + end + + cons = (1 + r)*xgrid[ix] + egrid[ie]*w - xgrid[ixp]; + + # utility = (cons^(1-ssigma))/(1-ssigma) + bbeta*expected; + utility = (CUDAnative.pow.(cons,1-ssigma))/(1-ssigma) + bbeta*expected; + + if(cons <= 0) + utility = -10.0^(5); + end + + if(utility >= VV) + VV = utility; + ixpopt = ixp; + end + + utility = 0.0; + end + + V[age, ix, ie] = VV + + return nothing +end + +function main() + + println(CUDAdrv.name(CuDevice(0))) + + # Grid for x + nx = 1500; + xmin = 0.1; + xmax = 4.0; + + # Grid for e: parameters for Tauchen + ne = 15; + ssigma_eps = 0.02058; + llambda_eps = 0.99; + m = 1.5; + + # Utility function + ssigma = 2; + bbeta = 0.97; + T = 10; + + # Prices + r = 0.07; + w = 5; + + # Initialize the grid for X + xgrid = CuArray{Float64,1}(zeros(nx)) + + # Initialize the grid for E and the transition probability matrix + egrid = CuArray{Float64,1}(zeros(ne)) + P = CuArray{Float64,2}(zeros(ne, ne)) + + # Initialize value function V + V = CuArray{Float64,3}(zeros(T, nx, ne)) + V_tomorrow = CuArray{Float64,2}(zeros(nx, ne)) + + # Initialize value function as a shared array + tempV = CuArray{Float64,1}(zeros(ne*nx)) + + println("vor grid") + + #--------------------------------# + # Grid creation # + #--------------------------------# + + # Grid for capital (x) + size = nx; + xstep = (xmax - xmin) /(size - 1); + for i = 1:nx + xgrid[i] = xmin + (i-1)*xstep; + end + + # Grid for productivity (e) with Tauchen (1986) + size = ne; + ssigma_y = sqrt((ssigma_eps^2) / (1 - (llambda_eps^2))); + estep = 2*ssigma_y*m / (size-1); + for i = 1:ne + egrid[i] = (-m*sqrt((ssigma_eps^2) / (1 - (llambda_eps^2))) + (i-1)*estep); + end + + # Transition probability matrix (P) Tauchen (1986) + mm = egrid[2] - egrid[1]; + for j = 1:ne + for k = 1:ne + if(k == 1) + P[j, k] = cdf(Normal(), (egrid[k] - llambda_eps*egrid[j] + (mm/2))/ssigma_eps); + elseif(k == ne) + P[j, k] = 1 - cdf(Normal(), (egrid[k] - llambda_eps*egrid[j] - (mm/2))/ssigma_eps); + else + P[j, k] = cdf(Normal(), (egrid[k] - llambda_eps*egrid[j] + (mm/2))/ssigma_eps) - cdf(Normal(), (egrid[k] - llambda_eps*egrid[j] - (mm/2))/ssigma_eps); + end + end + end + + # Exponential of the grid e + for i = 1:ne + egrid[i] = exp(egrid[i]); + end + + println("nach grid") + + #--------------------------------# + # Life-cycle computation # + #--------------------------------# + + print(" \n") + print("Life cycle computation: \n") + print(" \n") + + start = Dates.unix2datetime(time()) + ######################## + currentState = params(ne,nx,T,ssigma,bbeta,w,r) + @inbounds for age = T:-1:1 + CuArrays.@sync @cuda blocks=50 threads=(30, 15) value(currentState, age, xgrid, egrid, P, V) + finish = convert(Int, Dates.value(Dates.unix2datetime(time())- start))/1000; + # print("Age: ", age, ". Time: ", finish, " seconds. \n") + end + V_neu = Array(V) + print("\n") + finish = convert(Int, Dates.value(Dates.unix2datetime(time())- start))/1000; + print("TOTAL ELAPSED TIME: ", finish, " seconds. \n") + + #---------------------# + # Some checks # + #---------------------# + + print(" \n") + print(" - - - - - - - - - - - - - - - - - - - - - \n") + print(" \n") + print("The first entries of the value function: \n") + print(" \n") + + # I print the first entries of the value function, to check + for i = 1:3 + print(round(V[1, 1, i], digits=5), "\n") + end +end + +println(Crayon(foreground = :red), "\nWarmup call -- slower!", Crayon(foreground = :white)) +@time main() +println(Crayon(foreground = :green), "\nProper call -- correct time measurement.", Crayon(foreground = :white)) +@time main() + +# main() + +# @btime(CuArrays.@sync main()) \ No newline at end of file From 75b239f40b15eb77418c61159830d40fb6d286e0 Mon Sep 17 00:00:00 2001 From: JohannesNaegele <55059387+JohannesNaegele@users.noreply.github.com> Date: Wed, 25 Mar 2020 16:02:03 +0100 Subject: [PATCH 2/6] Include Julia_CUDAnative.jl in makefile --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index e374282..e34719f 100644 --- a/Makefile +++ b/Makefile @@ -68,6 +68,8 @@ julia_parallel: julia_pmap: julia -p$(CORES) Julia_main_pmap.jl +julia_CUDA: + julia -p$(CORES) -O3 --color=yes Julia_CUDAnative.jl Rcpp: export OMP_NUM_THREADS=$(CORES); Rscript Rcpp_main.R; From 57bde6940dfa37e7a01818422d192a8e1c57a6c7 Mon Sep 17 00:00:00 2001 From: JohannesNaegele <55059387+JohannesNaegele@users.noreply.github.com> Date: Wed, 25 Mar 2020 16:14:22 +0100 Subject: [PATCH 3/6] Minimal fixes --- Julia_CUDAnative.jl | 4 ---- Makefile | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/Julia_CUDAnative.jl b/Julia_CUDAnative.jl index 2f725b5..5aec42f 100644 --- a/Julia_CUDAnative.jl +++ b/Julia_CUDAnative.jl @@ -100,8 +100,6 @@ function main() # Initialize value function as a shared array tempV = CuArray{Float64,1}(zeros(ne*nx)) - println("vor grid") - #--------------------------------# # Grid creation # #--------------------------------# @@ -140,8 +138,6 @@ function main() egrid[i] = exp(egrid[i]); end - println("nach grid") - #--------------------------------# # Life-cycle computation # #--------------------------------# diff --git a/Makefile b/Makefile index e34719f..133ece6 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,7 @@ julia_pmap: julia_CUDA: julia -p$(CORES) -O3 --color=yes Julia_CUDAnative.jl + Rcpp: export OMP_NUM_THREADS=$(CORES); Rscript Rcpp_main.R; From cdccf6931206f6827560b73f2b3cec146ddbd234 Mon Sep 17 00:00:00 2001 From: JohannesNaegele <55059387+JohannesNaegele@users.noreply.github.com> Date: Wed, 25 Mar 2020 16:28:41 +0100 Subject: [PATCH 4/6] Minimal fixes --- Julia_CUDAnative.jl | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/Julia_CUDAnative.jl b/Julia_CUDAnative.jl index 5aec42f..dbe5434 100644 --- a/Julia_CUDAnative.jl +++ b/Julia_CUDAnative.jl @@ -1,8 +1,9 @@ using Distributions using Dates -using CUDAnative, CuArrays, CUDAdrv, BenchmarkTools +using CUDAnative, CuArrays, CUDAdrv using Crayons +println(Crayon(foreground = :yellow), "\nYour CUDA-device: " * CUDAdrv.name(CuDevice(0)), Crayon(foreground = :white)) struct params ne::Int64 @@ -42,7 +43,7 @@ function value(params::params, age::Int64, xgrid, egrid, P, V) cons = (1 + r)*xgrid[ix] + egrid[ie]*w - xgrid[ixp]; - # utility = (cons^(1-ssigma))/(1-ssigma) + bbeta*expected; + # The ^ operator seems to broken on the GPU. One can always use the CUDAnative internals instead. utility = (CUDAnative.pow.(cons,1-ssigma))/(1-ssigma) + bbeta*expected; if(cons <= 0) @@ -62,9 +63,7 @@ function value(params::params, age::Int64, xgrid, egrid, P, V) return nothing end -function main() - - println(CUDAdrv.name(CuDevice(0))) +function main() # Grid for x nx = 1500; @@ -147,13 +146,12 @@ function main() print(" \n") start = Dates.unix2datetime(time()) - ######################## - currentState = params(ne,nx,T,ssigma,bbeta,w,r) - @inbounds for age = T:-1:1 - CuArrays.@sync @cuda blocks=50 threads=(30, 15) value(currentState, age, xgrid, egrid, P, V) - finish = convert(Int, Dates.value(Dates.unix2datetime(time())- start))/1000; - # print("Age: ", age, ". Time: ", finish, " seconds. \n") - end + currentState = params(ne,nx,T,ssigma,bbeta,w,r) + @inbounds for age = T:-1:1 + CuArrays.@sync @cuda blocks=50 threads=(30, 15) value(currentState, age, xgrid, egrid, P, V) + finish = convert(Int, Dates.value(Dates.unix2datetime(time())- start))/1000; + print("Age: ", age, ". Time: ", finish, " seconds. \n") + end V_neu = Array(V) print("\n") finish = convert(Int, Dates.value(Dates.unix2datetime(time())- start))/1000; @@ -175,11 +173,7 @@ function main() end end -println(Crayon(foreground = :red), "\nWarmup call -- slower!", Crayon(foreground = :white)) -@time main() +println(Crayon(foreground = :red), "\nWarmup call -- slow!", Crayon(foreground = :white)) +main() println(Crayon(foreground = :green), "\nProper call -- correct time measurement.", Crayon(foreground = :white)) -@time main() - -# main() - -# @btime(CuArrays.@sync main()) \ No newline at end of file +main() \ No newline at end of file From 19431d7385c328fb942cc587c8197e4111ec1ebe Mon Sep 17 00:00:00 2001 From: JohannesNaegele <55059387+JohannesNaegele@users.noreply.github.com> Date: Wed, 25 Mar 2020 19:08:21 +0100 Subject: [PATCH 5/6] Added some comments and minor fixes --- Julia_CUDAnative.jl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Julia_CUDAnative.jl b/Julia_CUDAnative.jl index dbe5434..7088439 100644 --- a/Julia_CUDAnative.jl +++ b/Julia_CUDAnative.jl @@ -32,7 +32,7 @@ function value(params::params, age::Int64, xgrid, egrid, P, V) VV = -10.0^3; ixpopt = 0; - + # @inbounds disables bounds checking @inbounds for ixp = 1:nx expected = 0.0; if(age < T) @@ -85,6 +85,8 @@ function main() r = 0.07; w = 5; + # The following arrays are CUDA-arrays. + # Initialize the grid for X xgrid = CuArray{Float64,1}(zeros(nx)) @@ -94,15 +96,14 @@ function main() # Initialize value function V V = CuArray{Float64,3}(zeros(T, nx, ne)) - V_tomorrow = CuArray{Float64,2}(zeros(nx, ne)) - - # Initialize value function as a shared array - tempV = CuArray{Float64,1}(zeros(ne*nx)) #--------------------------------# # Grid creation # #--------------------------------# + # The following operations on the CUDA-arrays trigger the warning "Performing scalar operations on GPU arrays: This is very slow..." + # One could as well create regular arrays first and transfer them to the GPU later. However, this doesn't change the runtime significantly. + # Grid for capital (x) size = nx; xstep = (xmax - xmin) /(size - 1); @@ -148,11 +149,15 @@ function main() start = Dates.unix2datetime(time()) currentState = params(ne,nx,T,ssigma,bbeta,w,r) @inbounds for age = T:-1:1 + # I need to sync for correct time measurement. Caution: Just @sync refers to Base.@sync CuArrays.@sync @cuda blocks=50 threads=(30, 15) value(currentState, age, xgrid, egrid, P, V) finish = convert(Int, Dates.value(Dates.unix2datetime(time())- start))/1000; + # One should keep in mind, that printing in some environments is pretty slow in Julia. + # Without printing, on my machine Julia runs roughly as fast as C++-CUDA (ca. 1-2% slower) print("Age: ", age, ". Time: ", finish, " seconds. \n") end - V_neu = Array(V) + # transfer V back to the CPU to get a fair comparison with C++-CUDA (one can print the results without transfering them back manually, however I don't know what happens under the hood) + results = Array(V) print("\n") finish = convert(Int, Dates.value(Dates.unix2datetime(time())- start))/1000; print("TOTAL ELAPSED TIME: ", finish, " seconds. \n") @@ -169,7 +174,7 @@ function main() # I print the first entries of the value function, to check for i = 1:3 - print(round(V[1, 1, i], digits=5), "\n") + print(round(results[1, 1, i], digits=5), "\n") end end From 65db8d1ca6aee383168d79406c1b1441916c7dc8 Mon Sep 17 00:00:00 2001 From: JohannesNaegele <55059387+JohannesNaegele@users.noreply.github.com> Date: Sat, 28 Mar 2020 18:54:26 +0100 Subject: [PATCH 6/6] Corrected grid size nx --- Cpp_main_OpenMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cpp_main_OpenMP.cpp b/Cpp_main_OpenMP.cpp index 1180b30..820fb91 100644 --- a/Cpp_main_OpenMP.cpp +++ b/Cpp_main_OpenMP.cpp @@ -147,7 +147,7 @@ int main() //--------------------------------// // Grid for x - const int nx = 15000; + const int nx = 1500; const float xmin = 0.1; const float xmax = 4.0;