From 224f6c7545e628ba4f7cd67dfd739614359f926d Mon Sep 17 00:00:00 2001 From: golosio Date: Wed, 22 Jun 2022 23:02:11 +0200 Subject: [PATCH 1/2] Revert "Faster version of nested loop for spike delivery" --- Makefile.am | 3 +- src/get_spike.cu | 16 +- src/get_spike.h | 3 - src/locate.cu | 14 - src/nested_loop.cu | 178 ++++++++++++ src/nested_loop.cu.full | 591 ++++++++++++++++++++++++++++++++++++++++ src/nested_loop.h | 41 +++ src/nestgpu.cu | 12 +- src/rev_spike.cu | 15 +- src/rev_spike.h | 2 - 10 files changed, 820 insertions(+), 55 deletions(-) delete mode 100644 src/locate.cu create mode 100644 src/nested_loop.cu create mode 100644 src/nested_loop.cu.full create mode 100644 src/nested_loop.h diff --git a/Makefile.am b/Makefile.am index d3745b2f7..9105a1636 100644 --- a/Makefile.am +++ b/Makefile.am @@ -42,6 +42,7 @@ $(top_srcdir)/src/dir_connect.h \ $(top_srcdir)/src/getRealTime.h \ $(top_srcdir)/src/get_spike.h \ $(top_srcdir)/src/multimeter.h \ +$(top_srcdir)/src/nested_loop.h \ $(top_srcdir)/src/neuron_models.h \ $(top_srcdir)/src/nestgpu.h \ $(top_srcdir)/src/nestgpu_C.h \ @@ -98,7 +99,7 @@ $(top_srcdir)/src/dummyfile.cpp \ $(top_srcdir)/src/getRealTime.cu \ $(top_srcdir)/src/get_spike.cu \ $(top_srcdir)/src/multimeter.cu \ -$(top_srcdir)/src/locate.cu \ +$(top_srcdir)/src/nested_loop.cu \ $(top_srcdir)/src/neuron_models.cu \ $(top_srcdir)/src/nestgpu.cu \ $(top_srcdir)/src/nestgpu_C.cpp \ diff --git a/src/get_spike.cu b/src/get_spike.cu index 24c3f42b2..a3032e0a8 100644 --- a/src/get_spike.cu +++ b/src/get_spike.cu @@ -55,7 +55,7 @@ __device__ double atomicAddDouble(double* address, double val) ////////////////////////////////////////////////////////////////////// // This is the function called by the nested loop // that collects the spikes -__device__ void CollectSpikeFunction(int i_spike, int i_syn) +__device__ void NestedLoopFunction0(int i_spike, int i_syn) { int i_source = SpikeSourceIdx[i_spike]; int i_conn = SpikeConnIdx[i_spike]; @@ -93,20 +93,6 @@ __device__ void CollectSpikeFunction(int i_spike, int i_syn) } //////////////////////////////////////////////////////////////// } - -__global__ void CollectSpikeKernel(int n_spikes, int *SpikeTargetNum) -{ - const int i_spike = blockIdx.x; - if (i_spike1) { - if (data[i] > val) i_right = i; - else if (data[i]. + * + */ + + + + + +#include +#include +#include + + //#include "cuda_error_nl.h" +#include "cuda_error.h" +#include "nested_loop.h" + +//TMP +#include "getRealTime.h" +// + +////////////////////////////////////////////////////////////////////// +// declare here the functions called by the nested loop +__device__ void NestedLoopFunction0(int ix, int iy); +__device__ void NestedLoopFunction1(int ix, int iy); +////////////////////////////////////////////////////////////////////// + +namespace NestedLoop +{ + PrefixScan prefix_scan_; + int *d_Ny_cumul_sum_; +} + +__device__ int locate(int val, int *data, int n) +{ + int i_left = 0; + int i_right = n-1; + int i = (i_left+i_right)/2; + while(i_right-i_left>1) { + if (data[i] > val) i_right = i; + else if (data[i]0) { + int grid_dim_x, grid_dim_y; + if (Ny_sum<65536*1024) { // max grid dim * max block dim + grid_dim_x = (Ny_sum+1023)/1024; + grid_dim_y = 1; + } + else { + grid_dim_x = 32; // I think it's not necessary to increase it + if (Ny_sum>grid_dim_x*1024*65535) { + throw ngpu_exception(std::string("Ny sum ") + std::to_string(Ny_sum) + + " larger than threshold " + + std::to_string(grid_dim_x*1024*65535)); + } + grid_dim_y = (Ny_sum + grid_dim_x*1024 -1) / (grid_dim_x*1024); + } + dim3 numBlocks(grid_dim_x, grid_dim_y); + //TMP + //double time_mark=getRealTime(); + // + switch (i_func) { + case 0: + CumulSumNestedLoopKernel0<<>> + (Nx, d_Ny_cumul_sum_, Ny_sum); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + break; + case 1: + CumulSumNestedLoopKernel1<<>> + (Nx, d_Ny_cumul_sum_, Ny_sum); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + break; + default: + throw ngpu_exception("unknown nested loop function"); + } + + //TMP + //printf("cst: %lf\n", getRealTime()-time_mark); + // + } + + return 0; +} + diff --git a/src/nested_loop.cu.full b/src/nested_loop.cu.full new file mode 100644 index 000000000..94b8594fe --- /dev/null +++ b/src/nested_loop.cu.full @@ -0,0 +1,591 @@ +/* + * This file is part of NESTGPU. + * + * Copyright (C) 2021 The NEST Initiative + * + * NESTGPU is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * NESTGPU is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with NESTGPU. If not, see . + * + */ + + + + + +#include +#include + +#include +#include +//#include + +#include "cuda_error_nl.h" +#include "nested_loop.h" + +//TMP +#include "getRealTime.h" +// + +////////////////////////////////////////////////////////////////////// +// declare here the function called by the nested loop +__device__ void NestedLoopFunction(int ix, int iy); +////////////////////////////////////////////////////////////////////// + +namespace NestedLoop +{ + #include "Ny_th.h" + void *d_sort_storage_; + size_t sort_storage_bytes_; + void *d_reduce_storage_; + size_t reduce_storage_bytes_; + + int Nx_max_; + int *d_max_Ny_; + int *d_sorted_Ny_; + + int *d_idx_; + int *d_sorted_idx_; + + int block_dim_x_; + int block_dim_y_; + int frame_area_; + float x_lim_; + +#ifdef WITH_CUMUL_SUM + PrefixScan prefix_scan_; + int *d_Ny_cumul_sum_; +#endif + +} + +////////////////////////////////////////////////////////////////////// +__global__ void SimpleNestedLoopKernel(int Nx, int *Ny) +{ + int ix = (blockIdx.x * blockDim.x) + threadIdx.x; + int iy = (blockIdx.y * blockDim.y) + threadIdx.y; + if (ix1) { + if (data[i] > val) i_right = i; + else if (data[i]>>(Nx, d_Ny); + cudaDeviceSynchronize(); + CudaCheckError(); + + return 0; +} + +////////////////////////////////////////////////////////////////////// +int NestedLoop::ParallelInnerNestedLoop(int Nx, int *d_Ny) +{ + for (int ix=0; ix>>(ix, Ny); + // CudaCheckError(); // uncomment only for debugging + } + cudaDeviceSynchronize(); + CudaCheckError(); + + return 0; +} + +////////////////////////////////////////////////////////////////////// +int NestedLoop::ParallelOuterNestedLoop(int Nx, int *d_Ny) +{ + ParallelOuterNestedLoopKernel<<<(Nx+1023)/1024, 1024>>>(Nx, d_Ny); + cudaDeviceSynchronize(); + CudaCheckError(); + + return 0; +} + +////////////////////////////////////////////////////////////////////// +int NestedLoop::Frame1DNestedLoop(int Nx, int *d_Ny) +{ + if (Nx <= 0) return 0; + int dim_x, dim_y; + + // Run sorting operation + cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_, + d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, + Nx); + + int ix0 = Nx; + while(ix0>0) { + CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), + cudaMemcpyDeviceToHost)); + if (dim_y < 1) dim_y = 1; + dim_x = (frame_area_ - 1) / dim_y + 1; + ix0 -= dim_x; + if (ix0<0) { + dim_x += ix0; + ix0 = 0; + } + Frame1DNestedLoopKernel<<<(dim_x*dim_y+1023)/1024, 1024>>> + (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); + } + cudaDeviceSynchronize(); + CudaCheckError(); + + return 0; +} + +////////////////////////////////////////////////////////////////////// +int NestedLoop::Frame2DNestedLoop(int Nx, int *d_Ny) +{ + if (Nx <= 0) return 0; + // Sort the pairs (ix, Ny) with ix=0,..,Nx-1 in ascending order of Ny. + // After the sorting operation, d_sorted_idx_ are the reordered indexes ix + // and d_sorted_Ny_ are the sorted values of Ny + cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_, + d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, + Nx); + int ix0 = Nx; // proceeds from right to left + while(ix0>0) { + int dim_x, dim_y; // width and height of the rectangular frame + CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), + cudaMemcpyDeviceToHost)); + if (dim_y < 1) dim_y = 1; + // frame_area_ is the fixed value of the the rectangular frame area + dim_x = (frame_area_ - 1) / dim_y + 1; // width of the rectangular frame + ix0 -= dim_x; // update the index value + if (ix0<0) { + dim_x += ix0; // adjust the width if ix0<0 + ix0 = 0; + } + dim3 threadsPerBlock(block_dim_x_, block_dim_y_); // block size + dim3 numBlocks((dim_x - 1)/threadsPerBlock.x + 1, + (dim_y - 1)/threadsPerBlock.y + 1); + // run a nested loop kernel on the rectangular frame + Frame2DNestedLoopKernel <<>> + (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); + + } + cudaDeviceSynchronize(); + CudaCheckError(); + + return 0; +} + +////////////////////////////////////////////////////////////////////// +int NestedLoop::Smart1DNestedLoop(int Nx, int *d_Ny) +{ + // Find max value of Ny + cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny, + d_max_Ny_, Nx); + int max_Ny; + CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int), + cudaMemcpyDeviceToHost)); + if (Nx <= 0) return 0; + float f_Nx = 2.0*log((float)Nx)-5; + int i_Nx = (int)floor(f_Nx); + int Ny_th; + if (i_Nx<0) { + Ny_th = Ny_th_arr_[0]; + } + else if (i_Nx>=Ny_arr_size_-1) { + Ny_th = Ny_th_arr_[Ny_arr_size_-1]; + } + else { + float t = f_Nx - (float)i_Nx; + Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t; + } + if (max_Ny>>(Nx, d_Ny); + //CudaCheckError(); // uncomment only for debugging + + int ix0 = Nx; + while(ix0>ix1) { + CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), + cudaMemcpyDeviceToHost)); + dim_y -= Ny1; + if (dim_y<=0) break; + dim_x = (frame_area_ - 1) / dim_y + 1; + ix0 -= dim_x; + if (ix0>> + (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); + //CudaCheckError(); // uncomment only for debugging + } + cudaDeviceSynchronize(); + CudaCheckError(); + + return 0; +} + +////////////////////////////////////////////////////////////////////// +int NestedLoop::Smart2DNestedLoop(int Nx, int *d_Ny) +{ + // Find max value of Ny + cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny, + d_max_Ny_, Nx); + int max_Ny; + CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int), + cudaMemcpyDeviceToHost)); + if (Nx <= 0) return 0; + float f_Nx = 2.0*log((float)Nx)-5; + int i_Nx = (int)floor(f_Nx); + int Ny_th; + if (i_Nx<0) { + Ny_th = Ny_th_arr_[0]; + } + else if (i_Nx>=Ny_arr_size_-1) { + Ny_th = Ny_th_arr_[Ny_arr_size_-1]; + } + else { + float t = f_Nx - (float)i_Nx; + Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t; + } + if (max_Ny>>(Nx, d_Ny); + //CudaCheckError(); // uncomment only for debugging + + int ix0 = Nx; + while(ix0>ix1) { + CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), + cudaMemcpyDeviceToHost)); + dim_y -= Ny1; + if (dim_y<=0) break; + dim_x = (frame_area_ - 1) / dim_y + 1; + ix0 -= dim_x; + if (ix0>> + (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); + //CudaCheckError(); // uncomment only for debugging + } + cudaDeviceSynchronize(); + CudaCheckError(); + + return 0; +} + +////////////////////////////////////////////////////////////////////// +#ifdef WITH_CUMUL_SUM +int NestedLoop::CumulSumNestedLoop(int Nx, int *d_Ny) +{ + //TMP + //double time_mark=getRealTime(); + // + prefix_scan_.Scan(d_Ny_cumul_sum_, d_Ny, Nx+1); + //TMP + //printf("pst: %lf\n", getRealTime()-time_mark); + // + int Ny_sum; + CudaSafeCall(cudaMemcpy(&Ny_sum, &d_Ny_cumul_sum_[Nx], + sizeof(int), cudaMemcpyDeviceToHost)); + + //printf("CSNL: %d %d\n", Nx, Ny_sum); + + //printf("Ny_sum %u\n", Ny_sum); + //temporary - remove + /* + if (Ny_sum==0) { + printf("Nx %d\n", Nx); + for (int i=0; i0) { + int grid_dim_x, grid_dim_y; + if (Ny_sum<65536*1024) { // max grid dim * max block dim + grid_dim_x = (Ny_sum+1023)/1024; + grid_dim_y = 1; + } + else { + grid_dim_x = 64; // I think it's not necessary to increase it + if (Ny_sum>grid_dim_x*1024*65535) { + printf("Ny sum %d larger than threshold %d\n", + Ny_sum, grid_dim_x*1024*65535); + exit(-1); + } + grid_dim_y = (Ny_sum + grid_dim_x*1024 -1) / (grid_dim_x*1024); + } + dim3 numBlocks(grid_dim_x, grid_dim_y); + //TMP + //double time_mark=getRealTime(); + // + CumulSumNestedLoopKernel<<>>(Nx, d_Ny_cumul_sum_, Ny_sum); + + cudaDeviceSynchronize(); + CudaCheckError(); + //TMP + //printf("cst: %lf\n", getRealTime()-time_mark); + // + } + + return 0; +} +#endif diff --git a/src/nested_loop.h b/src/nested_loop.h new file mode 100644 index 000000000..14d398d35 --- /dev/null +++ b/src/nested_loop.h @@ -0,0 +1,41 @@ +/* + * This file is part of NESTGPU. + * + * Copyright (C) 2021 The NEST Initiative + * + * NESTGPU is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * NESTGPU is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with NESTGPU. If not, see . + * + */ + + + + + +#ifndef NESTEDLOOP_H +#define NESTEDLOOP_H + +#include "prefix_scan.h" + +namespace NestedLoop +{ + extern PrefixScan prefix_scan_; + + int Init(); + int Run(int Nx, int *d_Ny, int i_func); + int CumulSumNestedLoop(int Nx, int *d_Ny, int i_func); + + int Free(); +} + +#endif diff --git a/src/nestgpu.cu b/src/nestgpu.cu index 7b04661f9..d929787dc 100644 --- a/src/nestgpu.cu +++ b/src/nestgpu.cu @@ -42,6 +42,7 @@ #include "getRealTime.h" #include "random.h" #include "nestgpu.h" +#include "nested_loop.h" #include "dir_connect.h" #include "rev_spike.h" #include "spike_mpi.h" @@ -131,6 +132,8 @@ NESTGPU::NESTGPU() connect_mpi_->remote_spike_height_ = false; #endif + NestedLoop::Init(); + SpikeBufferUpdate_time_ = 0; poisson_generator_time_ = 0; neuron_Update_time_ = 0; @@ -535,9 +538,7 @@ int NESTGPU::SimulationStep() ClearGetSpikeArrays(); if (n_spikes > 0) { time_mark = getRealTime(); - CollectSpikeKernel<<>>(n_spikes, d_SpikeTargetNum); - gpuErrchk(cudaPeekAtLastError()); - + NestedLoop::Run(n_spikes, d_SpikeTargetNum, 0); NestedLoop_time_ += (getRealTime() - time_mark); } time_mark = getRealTime(); @@ -604,10 +605,7 @@ int NESTGPU::SimulationStep() gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int), cudaMemcpyDeviceToHost)); if (n_rev_spikes > 0) { - SynapseUpdateKernel<<>>(n_rev_spikes, d_RevSpikeNConn); - gpuErrchk(cudaPeekAtLastError()); - gpuErrchk(cudaDeviceSynchronize()); - + NestedLoop::Run(n_rev_spikes, d_RevSpikeNConn, 1); } //RevSpikeBufferUpdate_time_ += (getRealTime() - time_mark); } diff --git a/src/rev_spike.cu b/src/rev_spike.cu index c459a5685..4078b4ea0 100644 --- a/src/rev_spike.cu +++ b/src/rev_spike.cu @@ -48,7 +48,7 @@ __device__ int *RevSpikeNConn; ////////////////////////////////////////////////////////////////////// // This is the function called by the nested loop // that makes use of positive post-pre spike time difference -__device__ void SynapseUpdateFunction(int i_spike, int i_target_rev_conn) +__device__ void NestedLoopFunction1(int i_spike, int i_target_rev_conn) { unsigned int target = RevSpikeTarget[i_spike]; unsigned int i_conn = TargetRevConnection[target][i_target_rev_conn]; @@ -63,18 +63,7 @@ __device__ void SynapseUpdateFunction(int i_spike, int i_target_rev_conn) } } } - -__global__ void SynapseUpdateKernel(int n_rev_spikes, int *RevSpikeNConn) -{ - const int i_spike = blockIdx.x; - if (i_spike Date: Wed, 11 Dec 2024 17:20:56 +0100 Subject: [PATCH 2/2] Updated CMakeLists.txt --- src/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b721b22db..72c224a54 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -64,6 +64,7 @@ set ( nestgpu_sources izhikevich_psc_exp.h locate.h multimeter.h + nested_loop.h nestgpu_C.h nestgpu.h neuron_models.h @@ -120,9 +121,9 @@ set ( nestgpu_sources izhikevich_psc_exp_2s.cu izhikevich_psc_exp_5s.cu izhikevich_psc_exp.cu - locate.cu multimeter.cu nestgpu.cu + nested_loop.cu neuron_models.cu node_group.cu parrot_neuron.cu