From 224f6c7545e628ba4f7cd67dfd739614359f926d Mon Sep 17 00:00:00 2001
From: golosio <golosio@unica.it>
Date: Wed, 22 Jun 2022 23:02:11 +0200
Subject: [PATCH 1/2] Revert "Faster version of nested loop for spike delivery"

---
 Makefile.am             |   3 +-
 src/get_spike.cu        |  16 +-
 src/get_spike.h         |   3 -
 src/locate.cu           |  14 -
 src/nested_loop.cu      | 178 ++++++++++++
 src/nested_loop.cu.full | 591 ++++++++++++++++++++++++++++++++++++++++
 src/nested_loop.h       |  41 +++
 src/nestgpu.cu          |  12 +-
 src/rev_spike.cu        |  15 +-
 src/rev_spike.h         |   2 -
 10 files changed, 820 insertions(+), 55 deletions(-)
 delete mode 100644 src/locate.cu
 create mode 100644 src/nested_loop.cu
 create mode 100644 src/nested_loop.cu.full
 create mode 100644 src/nested_loop.h

diff --git a/Makefile.am b/Makefile.am
index d3745b2f7..9105a1636 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -42,6 +42,7 @@ $(top_srcdir)/src/dir_connect.h \
 $(top_srcdir)/src/getRealTime.h \
 $(top_srcdir)/src/get_spike.h \
 $(top_srcdir)/src/multimeter.h \
+$(top_srcdir)/src/nested_loop.h \
 $(top_srcdir)/src/neuron_models.h \
 $(top_srcdir)/src/nestgpu.h \
 $(top_srcdir)/src/nestgpu_C.h \
@@ -98,7 +99,7 @@ $(top_srcdir)/src/dummyfile.cpp \
 $(top_srcdir)/src/getRealTime.cu \
 $(top_srcdir)/src/get_spike.cu \
 $(top_srcdir)/src/multimeter.cu \
-$(top_srcdir)/src/locate.cu \
+$(top_srcdir)/src/nested_loop.cu \
 $(top_srcdir)/src/neuron_models.cu \
 $(top_srcdir)/src/nestgpu.cu \
 $(top_srcdir)/src/nestgpu_C.cpp \
diff --git a/src/get_spike.cu b/src/get_spike.cu
index 24c3f42b2..a3032e0a8 100644
--- a/src/get_spike.cu
+++ b/src/get_spike.cu
@@ -55,7 +55,7 @@ __device__ double atomicAddDouble(double* address, double val)
 //////////////////////////////////////////////////////////////////////
 // This is the function called by the nested loop
 // that collects the spikes
-__device__ void CollectSpikeFunction(int i_spike, int i_syn)
+__device__ void NestedLoopFunction0(int i_spike, int i_syn)
 {
   int i_source = SpikeSourceIdx[i_spike];
   int i_conn = SpikeConnIdx[i_spike];
@@ -93,20 +93,6 @@ __device__ void CollectSpikeFunction(int i_spike, int i_syn)
   }
   ////////////////////////////////////////////////////////////////
 }
-
-__global__ void CollectSpikeKernel(int n_spikes, int *SpikeTargetNum)
-{
-  const int i_spike = blockIdx.x;
-  if (i_spike<n_spikes) {
-    const int n_spike_targets = SpikeTargetNum[i_spike];
-    for (int i_syn = threadIdx.x; i_syn < n_spike_targets; i_syn += blockDim.x){
-      CollectSpikeFunction(i_spike, i_syn);
-    }
-  }
-}
-
-
-
 ///////////////
 
 // improve using a grid
diff --git a/src/get_spike.h b/src/get_spike.h
index dfd50eb00..9857a640d 100644
--- a/src/get_spike.h
+++ b/src/get_spike.h
@@ -34,7 +34,4 @@ __global__ void GetSpikes(double *spike_array, int array_size, int n_port,
 			  int port_input_arr_step,
 			  int port_input_port_step);
 
-
-__global__ void CollectSpikeKernel(int n_spikes, int *SpikeTargetNum);
-
 #endif
diff --git a/src/locate.cu b/src/locate.cu
deleted file mode 100644
index 74cde28e6..000000000
--- a/src/locate.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-__device__ int locate(int val, int *data, int n)
-{
-  int i_left = 0;
-  int i_right = n-1;
-  int i = (i_left+i_right)/2;
-  while(i_right-i_left>1) {
-    if (data[i] > val) i_right = i;
-    else if (data[i]<val) i_left = i;
-    else break;
-    i=(i_left+i_right)/2;
-  }
-
-  return i;
-}
diff --git a/src/nested_loop.cu b/src/nested_loop.cu
new file mode 100644
index 000000000..9591c2246
--- /dev/null
+++ b/src/nested_loop.cu
@@ -0,0 +1,178 @@
+/*
+ *  This file is part of NESTGPU.
+ *
+ *  Copyright (C) 2021 The NEST Initiative
+ *
+ *  NESTGPU is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  NESTGPU is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with NESTGPU.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+				    //#include "cuda_error_nl.h"
+#include "cuda_error.h"
+#include "nested_loop.h"
+
+//TMP
+#include "getRealTime.h"
+//
+
+//////////////////////////////////////////////////////////////////////
+// declare here the functions called by the nested loop 
+__device__ void NestedLoopFunction0(int ix, int iy);
+__device__ void NestedLoopFunction1(int ix, int iy);
+//////////////////////////////////////////////////////////////////////
+
+namespace NestedLoop
+{
+  PrefixScan prefix_scan_; // scan object used by CumulSumNestedLoop
+  int *d_Ny_cumul_sum_; // device buffer allocated by Init(), filled by the scan
+}
+
+// Returns the largest i with data[i] <= val in the non-decreasing array
+// data[0..n-1], assuming data[0] <= val < data[n-1]. Equal entries are
+// stepped over so that rows with zero iterations are never selected.
+__device__ int locate(int val, int *data, int n)
+{
+  int i_left = 0;     // invariant: data[i_left] <= val
+  int i_right = n-1;  // invariant: data[i_right] > val
+  while (i_right - i_left > 1) {
+    int i = i_left + (i_right - i_left)/2;
+    if (data[i] > val) i_right = i;
+    else i_left = i; // on equality keep moving right, past duplicates
+  }
+  return i_left;
+}
+
+// One thread per (ix, iy) pair: the flat thread index of the 2D grid of 1D
+// blocks is mapped to a row ix via locate() and to an offset iy in that row.
+__global__ void CumulSumNestedLoopKernel0(int Nx, int *Ny_cumul_sum, int Ny_sum)
+{
+  const int flat_block = blockIdx.x + gridDim.x * blockIdx.y;
+  const int flat_idx = threadIdx.x + blockDim.x * flat_block;
+  if (flat_idx >= Ny_sum) return; // surplus threads of the last block
+  const int ix = locate(flat_idx, Ny_cumul_sum, Nx + 1);
+  const int iy = flat_idx - Ny_cumul_sum[ix];
+  NestedLoopFunction0(ix, iy);
+}
+
+// One thread per (ix, iy) pair: the flat thread index of the 2D grid of 1D
+// blocks is mapped to a row ix via locate() and to an offset iy in that row.
+__global__ void CumulSumNestedLoopKernel1(int Nx, int *Ny_cumul_sum, int Ny_sum)
+{
+  const int flat_block = blockIdx.x + gridDim.x * blockIdx.y;
+  const int flat_idx = threadIdx.x + blockDim.x * flat_block;
+  if (flat_idx >= Ny_sum) return; // surplus threads of the last block
+  const int ix = locate(flat_idx, Ny_cumul_sum, Nx + 1);
+  const int iy = flat_idx - Ny_cumul_sum[ix];
+  NestedLoopFunction1(ix, iy);
+}
+
+//////////////////////////////////////////////////////////////////////
+// Allocates the device buffer (PrefixScan::AllocSize ints) for the prefix sums.
+int NestedLoop::Init()
+{
+  //prefix_scan_.Init();
+  const size_t n_bytes = PrefixScan::AllocSize*sizeof(int);
+  gpuErrchk(cudaMalloc(&d_Ny_cumul_sum_, n_bytes));
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Run(int Nx, int *d_Ny, int i_func) // entry point of the nested loop
+{
+  return CumulSumNestedLoop(Nx, d_Ny, i_func); // the only strategy defined in this file
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// Runs on the GPU the nested loop
+//   for (ix = 0; ix < Nx; ix++) for (iy = 0; iy < Ny[ix]; iy++) f(ix, iy)
+// using one thread per (ix, iy) pair. f is NestedLoopFunction0 when
+// i_func == 0 and NestedLoopFunction1 when i_func == 1.
+//
+//   Nx      number of outer iterations
+//   d_Ny    device array with the number of inner iterations of each ix
+//           (the scan below is asked for Nx+1 elements of it)
+//   i_func  index of the function called by the loop
+//
+// Returns 0. Throws ngpu_exception when the Nx+1 prefix sums do not fit in
+// the buffer allocated by Init(), when the total number of iterations is
+// above the launch limit used here, or when i_func is neither 0 nor 1.
+int NestedLoop::CumulSumNestedLoop(int Nx, int *d_Ny, int i_func)
+{
+  const int block_dim = 1024;       // threads per block
+  const int max_grid_dim_y = 65535; // largest grid size along y
+
+  // d_Ny_cumul_sum_ holds PrefixScan::AllocSize ints (see Init): reject the
+  // sizes for which element Nx, read below, would lie outside of it
+  if (Nx < 0 || (size_t)Nx + 1 > (size_t)PrefixScan::AllocSize) {
+    throw ngpu_exception(std::string("Nested loop size ") + std::to_string(Nx)
+                         + " out of range");
+  }
+
+  prefix_scan_.Scan(d_Ny_cumul_sum_, d_Ny, Nx+1);
+
+  // element Nx of the scan output is used as the total number of pairs
+  int Ny_sum;
+  gpuErrchk(cudaMemcpy(&Ny_sum, &d_Ny_cumul_sum_[Nx],
+                       sizeof(int), cudaMemcpyDeviceToHost));
+  if (Ny_sum <= 0) {
+    return 0; // no iteration to run
+  }
+
+  int grid_dim_x, grid_dim_y;
+  if (Ny_sum < 65536*block_dim) {
+    // small enough for a one-dimensional grid
+    grid_dim_x = (Ny_sum + block_dim - 1)/block_dim;
+    grid_dim_y = 1;
+  }
+  else {
+    grid_dim_x = 32; // I think it's not necessary to increase it
+    // with 32 blocks per row both products below still fit in an int
+    const int threads_per_row = grid_dim_x*block_dim;
+    const int max_Ny_sum = threads_per_row*max_grid_dim_y;
+    if (Ny_sum > max_Ny_sum) {
+      throw ngpu_exception(std::string("Ny sum ") + std::to_string(Ny_sum) +
+                           " larger than threshold "
+                           + std::to_string(max_Ny_sum));
+    }
+    grid_dim_y = (Ny_sum + threads_per_row - 1) / threads_per_row;
+  }
+  dim3 numBlocks(grid_dim_x, grid_dim_y);
+
+  // the two kernels differ only in the device function they call
+  switch (i_func) {
+  case 0:
+    CumulSumNestedLoopKernel0<<<numBlocks, block_dim>>>
+      (Nx, d_Ny_cumul_sum_, Ny_sum);
+    break;
+  case 1:
+    CumulSumNestedLoopKernel1<<<numBlocks, block_dim>>>
+      (Nx, d_Ny_cumul_sum_, Ny_sum);
+    break;
+  default:
+    throw ngpu_exception("unknown nested loop function");
+  }
+  gpuErrchk(cudaPeekAtLastError());   // errors of the launch itself
+  gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran
+  return 0;
+}
+
diff --git a/src/nested_loop.cu.full b/src/nested_loop.cu.full
new file mode 100644
index 000000000..94b8594fe
--- /dev/null
+++ b/src/nested_loop.cu.full
@@ -0,0 +1,591 @@
+/*
+ *  This file is part of NESTGPU.
+ *
+ *  Copyright (C) 2021 The NEST Initiative
+ *
+ *  NESTGPU is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  NESTGPU is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with NESTGPU.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_radix_sort.cuh>
+//#include <cub/cub.cuh>
+
+#include "cuda_error_nl.h"
+#include "nested_loop.h"
+
+//TMP
+#include "getRealTime.h"
+//
+
+//////////////////////////////////////////////////////////////////////
+// declare here the function called by the nested loop 
+__device__ void NestedLoopFunction(int ix, int iy);
+//////////////////////////////////////////////////////////////////////
+
+namespace NestedLoop
+{
+  #include "Ny_th.h" // presumably provides Ny_th_arr_ and Ny_arr_size_ used below - TODO confirm
+  void *d_sort_storage_; // temporary device storage for cub::DeviceRadixSort
+  size_t sort_storage_bytes_; // its size, as reported by cub in Init
+  void *d_reduce_storage_; // temporary device storage for cub::DeviceReduce
+  size_t reduce_storage_bytes_; // its size, as reported by cub in Init
+
+  int Nx_max_; // value passed to Init(int)
+  int *d_max_Ny_; // device int receiving the maximum of Ny
+  int *d_sorted_Ny_; // device array of Nx_max ints: sorted values of Ny
+
+  int *d_idx_; // device array filled with 0, 1, ..., Nx_max-1 in Init
+  int *d_sorted_idx_; // device array of Nx_max ints: indexes reordered by the sort
+
+  int block_dim_x_; // block size along x of the 2D launches (32 in Init)
+  int block_dim_y_; // block size along y of the 2D launches (32 in Init)
+  int frame_area_; // target area of the rectangular frames (65536*64 in Init)
+  float x_lim_; // fraction of the sorted columns used by the Smart loops (0.75 in Init)
+
+#ifdef WITH_CUMUL_SUM
+  PrefixScan prefix_scan_; // scan object used by CumulSumNestedLoop
+  int *d_Ny_cumul_sum_; // device buffer of PrefixScan::AllocSize ints for the scan output
+#endif
+   
+}
+
+//////////////////////////////////////////////////////////////////////
+// Thread (ix, iy) of the 2D grid runs iteration (ix, iy) if it exists.
+__global__ void SimpleNestedLoopKernel(int Nx, int *Ny)
+{
+  const int ix = threadIdx.x + blockDim.x * blockIdx.x;
+  const int iy = threadIdx.y + blockDim.y * blockIdx.y;
+  if (ix >= Nx) return;     // column outside the loop range
+  if (iy < Ny[ix]) NestedLoopFunction(ix, iy);
+}
+
+//////////////////////////////////////////////////////////////////////
+// For a fixed ix, thread iy of the 1D grid runs iteration (ix, iy).
+__global__ void ParallelInnerNestedLoopKernel(int ix, int Ny)
+{
+  const int iy = blockDim.x * blockIdx.x + threadIdx.x;
+  if (iy >= Ny) return;
+  NestedLoopFunction(ix, iy);
+}
+
+//////////////////////////////////////////////////////////////////////
+// Thread ix of the 1D grid runs serially all the iterations (ix, iy).
+__global__ void ParallelOuterNestedLoopKernel(int Nx, int *d_Ny)
+{
+  const int ix = blockDim.x * blockIdx.x + threadIdx.x;
+  if (ix >= Nx) return;
+  int iy = 0;
+  // the bound is read again at every step, as in a plain for loop
+  while (iy < d_Ny[ix]) NestedLoopFunction(ix, iy++);
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// Covers the dim_x * dim_y frame that starts at column ix0 of the sorted
+// arrays with a 1D grid: the flat thread index is split in column and row.
+__global__ void Frame1DNestedLoopKernel(int ix0, int dim_x, int dim_y,
+                                        int *sorted_idx, int *sorted_Ny)
+{
+  const int flat_idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (flat_idx >= dim_x*dim_y) return;
+  const int col = ix0 + flat_idx % dim_x;
+  const int row = flat_idx / dim_x;
+  if (row >= sorted_Ny[col]) return; // this column has fewer iterations
+  // the function of the nested loop gets the index stored in sorted_idx
+  NestedLoopFunction(sorted_idx[col], row);
+}
+
+//////////////////////////////////////////////////////////////////////
+// 2D-grid version of the frame kernel: thread (ix, iy) handles column ix0+ix.
+__global__ void Frame2DNestedLoopKernel(int ix0, int dim_x, int dim_y,
+                                        int *sorted_idx, int *sorted_Ny)
+{
+  const int ix = threadIdx.x + blockDim.x * blockIdx.x;
+  const int iy = threadIdx.y + blockDim.y * blockIdx.y;
+  if (ix >= dim_x) return;
+  const int col = ix + ix0;
+  if (iy < sorted_Ny[col]) NestedLoopFunction(sorted_idx[col], iy);
+}
+
+//////////////////////////////////////////////////////////////////////
+// Same as the 1D frame kernel, with the rows of the frame starting at iy0.
+// The frame is dim_x columns wide (from ix0) and dim_y rows high.
+__global__ void Smart1DNestedLoopKernel(int ix0, int iy0, int dim_x, int dim_y,
+                                        int *sorted_idx, int *sorted_Ny)
+{
+  const int flat_idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (flat_idx >= dim_x*dim_y) return;
+  const int col = ix0 + flat_idx % dim_x;
+  const int row = iy0 + flat_idx / dim_x;
+  if (row >= sorted_Ny[col]) return;
+  // the function of the nested loop gets the index stored in sorted_idx
+  NestedLoopFunction(sorted_idx[col], row);
+}
+
+//////////////////////////////////////////////////////////////////////
+// 2D-grid kernel for the frame that starts at column ix0 and row iy0:
+// thread (ix, iy) handles column ix0+ix and row iy0+iy.
+__global__ void Smart2DNestedLoopKernel(int ix0, int iy0, int dim_x,
+                                        int dim_y, int *sorted_idx,
+                                        int *sorted_Ny)
+{
+  const int ix = threadIdx.x + blockDim.x * blockIdx.x;
+  const int row = iy0 + (blockDim.y * blockIdx.y) + threadIdx.y;
+  if (ix >= dim_x) return;
+  if (row < sorted_Ny[ix0 + ix]) NestedLoopFunction(sorted_idx[ix0 + ix], row);
+}
+
+#ifdef WITH_CUMUL_SUM
+// Returns the largest i with data[i] <= val in the non-decreasing array
+// data[0..n-1], assuming data[0] <= val < data[n-1]. Equal entries are
+// stepped over so that rows with zero iterations are never selected.
+__device__ int locate(int val, int *data, int n)
+{
+  int i_left = 0;     // invariant: data[i_left] <= val
+  int i_right = n-1;  // invariant: data[i_right] > val
+  while (i_right - i_left > 1) {
+    int i = i_left + (i_right - i_left)/2;
+    if (data[i] > val) i_right = i;
+    else i_left = i; // on equality keep moving right, past duplicates
+  }
+  return i_left;
+}
+
+// One thread per (ix, iy) pair: the flat thread index of the 2D grid of 1D
+// blocks is mapped to a row ix via locate() and to an offset iy in that row.
+__global__ void CumulSumNestedLoopKernel(int Nx, int *Ny_cumul_sum, int Ny_sum)
+{
+  const int flat_block = blockIdx.x + gridDim.x * blockIdx.y;
+  const int flat_idx = threadIdx.x + blockDim.x * flat_block;
+  if (flat_idx >= Ny_sum) return; // surplus threads of the last block
+  const int ix = locate(flat_idx, Ny_cumul_sum, Nx + 1);
+  const int iy = flat_idx - Ny_cumul_sum[ix];
+  NestedLoopFunction(ix, iy);
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Init() // initialization with the default capacity
+{
+  return Init(65536*1024); // Nx_max = 67108864 outer iterations
+}
+
+//////////////////////////////////////////////////////////////////////
+// Allocates the device buffers used by the nested-loop strategies of this
+// file, sized for at most Nx_max outer iterations, and sets the launch
+// parameters. Does nothing when Nx_max <= 0. Returns 0.
+int NestedLoop::Init(int Nx_max)
+{
+  if (Nx_max <= 0) return 0;
+  block_dim_x_ = 32;
+  block_dim_y_ = 32;
+  frame_area_ = 65536*64;
+  x_lim_ = 0.75;
+  Nx_max_ = Nx_max;
+
+  CudaSafeCall(cudaMalloc(&d_max_Ny_, sizeof(int)));
+  CudaSafeCall(cudaMalloc(&d_sorted_Ny_, Nx_max*sizeof(int)));
+  CudaSafeCall(cudaMalloc(&d_idx_, Nx_max*sizeof(int)));
+  CudaSafeCall(cudaMalloc(&d_sorted_idx_, Nx_max*sizeof(int)));
+  // d_idx_ is the identity permutation 0, 1, ..., Nx_max-1
+  int *h_idx = new int[Nx_max];
+  for(int i=0; i<Nx_max; i++) {
+    h_idx[i] = i;
+  }
+  CudaSafeCall(cudaMemcpy(d_idx_, h_idx, Nx_max*sizeof(int),
+                          cudaMemcpyHostToDevice));
+  delete[] h_idx;
+  // With a NULL storage pointer the cub calls only report the size of the
+  // temporary storage they need; their status is checked as well
+  d_sort_storage_ = NULL;
+  sort_storage_bytes_ = 0;
+  CudaSafeCall(cub::DeviceRadixSort::SortPairs
+               (d_sort_storage_, sort_storage_bytes_, d_sorted_Ny_,
+                d_sorted_Ny_, d_idx_, d_sorted_idx_, Nx_max));
+  d_reduce_storage_ = NULL;
+  reduce_storage_bytes_ = 0;
+  int *d_Ny = NULL;
+  CudaSafeCall(cub::DeviceReduce::Max
+               (d_reduce_storage_, reduce_storage_bytes_, d_Ny, d_max_Ny_,
+                Nx_max));
+  // Allocate temporary storage
+  CudaSafeCall(cudaMalloc(&d_sort_storage_, sort_storage_bytes_));
+  CudaSafeCall(cudaMalloc(&d_reduce_storage_, reduce_storage_bytes_));
+
+#ifdef WITH_CUMUL_SUM
+  prefix_scan_.Init();
+  CudaSafeCall(cudaMalloc(&d_Ny_cumul_sum_,
+                          PrefixScan::AllocSize*sizeof(int)));
+#endif
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Run(int Nx, int *d_Ny) // runs the strategy selected below
+{
+  //return SimpleNestedLoop(Nx, d_Ny);
+  //return ParallelInnerNestedLoop(Nx, d_Ny);
+  //return ParallelOuterNestedLoop(Nx, d_Ny);
+  //return Frame1DNestedLoop(Nx, d_Ny);
+  //return Frame2DNestedLoop(Nx, d_Ny);
+  return CumulSumNestedLoop(Nx, d_Ny);
+  //return Smart1DNestedLoop(Nx, d_Ny);
+  //return Smart2DNestedLoop(Nx, d_Ny);
+  // NOTE(review): CumulSumNestedLoop is defined only #ifdef WITH_CUMUL_SUM
+}
+
+//////////////////////////////////////////////////////////////////////
+// Runs the simple 2D nested loop sized on the largest Ny, found with cub.
+int NestedLoop::SimpleNestedLoop(int Nx, int *d_Ny)
+{
+  CudaSafeCall(cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_,
+                                      d_Ny, d_max_Ny_, Nx)); // status checked
+  int max_Ny;
+  CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  return SimpleNestedLoop(Nx, d_Ny, max_Ny);
+}
+
+//////////////////////////////////////////////////////////////////////
+// Launches SimpleNestedLoopKernel on a 2D grid that covers Nx columns and
+// max_Ny rows (at least one row), then waits for it. Returns 0.
+int NestedLoop::SimpleNestedLoop(int Nx, int *d_Ny, int max_Ny)
+{
+  const int n_rows = (max_Ny < 1) ? 1 : max_Ny;
+  dim3 block_shape(block_dim_x_, block_dim_y_);
+  dim3 grid_shape((Nx - 1)/block_shape.x + 1, (n_rows - 1)/block_shape.y + 1);
+  SimpleNestedLoopKernel<<<grid_shape, block_shape>>>(Nx, d_Ny);
+  cudaDeviceSynchronize();
+  CudaCheckError();
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+// One launch per ix; empty columns are skipped (a zero-block grid is invalid).
+int NestedLoop::ParallelInnerNestedLoop(int Nx, int *d_Ny)
+{
+  for (int ix=0; ix<Nx; ix++) {
+    int Ny;
+    CudaSafeCall(cudaMemcpy(&Ny, &d_Ny[ix], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    if (Ny <= 0) continue;
+    ParallelInnerNestedLoopKernel<<<(Ny+1023)/1024, 1024>>>(ix, Ny);
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::ParallelOuterNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0; // one thread per ix: zero blocks is an invalid grid
+  ParallelOuterNestedLoopKernel<<<(Nx+1023)/1024, 1024>>>(Nx, d_Ny);
+  cudaDeviceSynchronize();
+  CudaCheckError();
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+// Sorts the columns by Ny and covers them, starting from the tallest, with
+// rectangular frames of area about frame_area_, one 1D-grid launch each.
+int NestedLoop::Frame1DNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0;
+  int dim_x, dim_y;
+  // Run sorting operation; the status returned by cub is checked
+  CudaSafeCall(cub::DeviceRadixSort::SortPairs
+               (d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_,
+                d_idx_, d_sorted_idx_, Nx));
+  int ix0 = Nx; // proceeds from right to left
+  while(ix0>0) {
+    // the height of the frame is the Ny of its rightmost column
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    if (dim_y < 1) dim_y = 1;
+    dim_x = (frame_area_ - 1) / dim_y + 1;
+    ix0 -= dim_x;
+    if (ix0<0) {
+      dim_x += ix0; // clip the frame at column 0
+      ix0 = 0;
+    }
+    Frame1DNestedLoopKernel<<<(dim_x*dim_y+1023)/1024, 1024>>>
+      (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+// Same frame decomposition as Frame1DNestedLoop, with each frame processed
+// by a 2D grid of block_dim_x_ x block_dim_y_ blocks. Returns 0.
+int NestedLoop::Frame2DNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0;
+  // Sort the pairs (ix, Ny) with ix=0,..,Nx-1 in ascending order of Ny.
+  // After the sorting operation, d_sorted_idx_ are the reordered indexes ix
+  // and d_sorted_Ny_ are the sorted values of Ny. The cub status is checked.
+  CudaSafeCall(cub::DeviceRadixSort::SortPairs
+               (d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_,
+                d_idx_, d_sorted_idx_, Nx));
+  int ix0 = Nx; // proceeds from right to left
+  while(ix0>0) {
+    int dim_x, dim_y;  // width and height of the rectangular frame
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    if (dim_y < 1) dim_y = 1;
+    // frame_area_ is the fixed value of the rectangular frame area
+    dim_x = (frame_area_ - 1) / dim_y + 1; // width of the rectangular frame
+    ix0 -= dim_x; // update the index value
+    if (ix0<0) {
+      dim_x += ix0;  // adjust the width if ix0<0
+      ix0 = 0;
+    }
+    dim3 threadsPerBlock(block_dim_x_, block_dim_y_);  // block size
+    dim3 numBlocks((dim_x - 1)/threadsPerBlock.x + 1,
+                   (dim_y - 1)/threadsPerBlock.y + 1);
+    // run a nested loop kernel on the rectangular frame
+    Frame2DNestedLoopKernel <<<numBlocks,threadsPerBlock>>>
+      (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+// Hybrid strategy. If the largest Ny is below a threshold interpolated from
+// Ny_th_arr_ as a function of log(Nx), runs the simple 2D nested loop.
+// Otherwise the columns are sorted by Ny, the rows below Ny1 (the Ny of the
+// column at fraction x_lim_ of the sorted order, rounded up to a whole number
+// of blocks) are run for all columns by SimpleNestedLoopKernel, and the rows
+// above Ny1 are covered from the tallest column with 1D-grid frames.
+// The status returned by the cub calls is checked. Returns 0.
+int NestedLoop::Smart1DNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0; // checked before cub is given a non-positive size
+  // Find max value of Ny
+  CudaSafeCall(cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_,
+                                      d_Ny, d_max_Ny_, Nx));
+  int max_Ny;
+  CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  float f_Nx = 2.0*log((float)Nx)-5;
+  int i_Nx = (int)floor(f_Nx);
+  int Ny_th;
+  if (i_Nx<0) {
+    Ny_th = Ny_th_arr_[0];
+  }
+  else if (i_Nx>=Ny_arr_size_-1) {
+    Ny_th = Ny_th_arr_[Ny_arr_size_-1];
+  }
+  else {
+    float t = f_Nx - (float)i_Nx; // linear interpolation weight
+    Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t;
+  }
+  if (max_Ny<Ny_th) {
+    return SimpleNestedLoop(Nx, d_Ny, max_Ny);
+  }
+
+  int dim_x, dim_y;
+  // Run sorting operation
+  CudaSafeCall(cub::DeviceRadixSort::SortPairs
+               (d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_,
+                d_idx_, d_sorted_idx_, Nx));
+
+  int ix1 = (int)round(x_lim_*Nx);
+  if (ix1==Nx) ix1 = Nx - 1;
+  int Ny1;
+  CudaSafeCall(cudaMemcpy(&Ny1, &d_sorted_Ny_[ix1], sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  if(Ny1 < 1) Ny1 = 1;
+
+  dim3 threadsPerBlock(block_dim_x_, block_dim_y_);  // block size
+  int nbx = (Nx - 1)/threadsPerBlock.x + 1;
+  int nby = (Ny1 - 1)/threadsPerBlock.y + 1;
+  Ny1 = nby*threadsPerBlock.y; // rows actually covered by the grid below
+
+  dim3 numBlocks(nbx, nby);
+  SimpleNestedLoopKernel <<<numBlocks,threadsPerBlock>>>(Nx, d_Ny);
+
+  int ix0 = Nx; // proceeds from right to left, down to ix1
+  while(ix0>ix1) {
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    dim_y -= Ny1; // rows still to be run for the tallest remaining column
+    if (dim_y<=0) break;
+    dim_x = (frame_area_ - 1) / dim_y + 1;
+    ix0 -= dim_x;
+    if (ix0<ix1) {
+      dim_x += ix0 - ix1;
+      ix0 = ix1;
+    }
+    Smart1DNestedLoopKernel<<<(dim_x*dim_y+1023)/1024, 1024>>>
+      (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+// Hybrid strategy. If the largest Ny is below a threshold interpolated from
+// Ny_th_arr_ as a function of log(Nx), runs the simple 2D nested loop.
+// Otherwise the columns are sorted by Ny, the rows below Ny1 (the Ny of the
+// column at fraction x_lim_ of the sorted order, rounded up to a whole number
+// of blocks) are run for all columns by SimpleNestedLoopKernel, and the rows
+// above Ny1 are covered from the tallest column with 2D-grid frames.
+// The status returned by the cub calls is checked. Returns 0.
+int NestedLoop::Smart2DNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0; // checked before cub is given a non-positive size
+  // Find max value of Ny
+  CudaSafeCall(cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_,
+                                      d_Ny, d_max_Ny_, Nx));
+  int max_Ny;
+  CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  float f_Nx = 2.0*log((float)Nx)-5;
+  int i_Nx = (int)floor(f_Nx);
+  int Ny_th;
+  if (i_Nx<0) {
+    Ny_th = Ny_th_arr_[0];
+  }
+  else if (i_Nx>=Ny_arr_size_-1) {
+    Ny_th = Ny_th_arr_[Ny_arr_size_-1];
+  }
+  else {
+    float t = f_Nx - (float)i_Nx; // linear interpolation weight
+    Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t;
+  }
+  if (max_Ny<Ny_th) {
+    return SimpleNestedLoop(Nx, d_Ny, max_Ny);
+  }
+
+  int dim_x, dim_y;
+  // Run sorting operation
+  CudaSafeCall(cub::DeviceRadixSort::SortPairs
+               (d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_,
+                d_idx_, d_sorted_idx_, Nx));
+
+  int ix1 = (int)round(x_lim_*Nx);
+  if (ix1==Nx) ix1 = Nx - 1;
+  int Ny1;
+  CudaSafeCall(cudaMemcpy(&Ny1, &d_sorted_Ny_[ix1], sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  if(Ny1 < 1) Ny1 = 1;
+
+  dim3 threadsPerBlock(block_dim_x_, block_dim_y_);  // block size
+  int nbx = (Nx - 1)/threadsPerBlock.x + 1;
+  int nby = (Ny1 - 1)/threadsPerBlock.y + 1;
+  Ny1 = nby*threadsPerBlock.y; // rows actually covered by the grid below
+
+  dim3 numBlocks(nbx, nby);
+  SimpleNestedLoopKernel <<<numBlocks,threadsPerBlock>>>(Nx, d_Ny);
+
+  int ix0 = Nx; // proceeds from right to left, down to ix1
+  while(ix0>ix1) {
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    dim_y -= Ny1; // rows still to be run for the tallest remaining column
+    if (dim_y<=0) break;
+    dim_x = (frame_area_ - 1) / dim_y + 1;
+    ix0 -= dim_x;
+    if (ix0<ix1) {
+      dim_x += ix0 - ix1;
+      ix0 = ix1;
+    }
+    // grid sized on the frame; these shadow the variables declared above
+    dim3 threadsPerBlock(block_dim_x_, block_dim_y_);  // block size
+    dim3 numBlocks((dim_x - 1)/threadsPerBlock.x + 1,
+                   (dim_y - 1)/threadsPerBlock.y + 1);
+    Smart2DNestedLoopKernel <<<numBlocks,threadsPerBlock>>>
+      (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+#ifdef WITH_CUMUL_SUM
+// Runs the nested loop
+//   for (ix = 0; ix < Nx; ix++) for (iy = 0; iy < Ny[ix]; iy++) f(ix, iy)
+// with one thread per (ix, iy) pair, located through the prefix sums of Ny.
+//
+//   Nx    number of outer iterations
+//   d_Ny  device array with the number of inner iterations of each ix
+//         (the scan is asked for Nx+1 elements of it)
+//
+// Returns 0; exits with an error message if the total number of iterations
+// is larger than what the grid used here can cover.
+int NestedLoop::CumulSumNestedLoop(int Nx, int *d_Ny)
+{
+  //TMP
+  //double time_mark=getRealTime();
+  //
+  prefix_scan_.Scan(d_Ny_cumul_sum_, d_Ny, Nx+1);
+  //TMP
+  //printf("pst: %lf\n", getRealTime()-time_mark);
+  //
+
+  // element Nx of the scan output is used as the total number of pairs
+  int Ny_sum;
+  CudaSafeCall(cudaMemcpy(&Ny_sum, &d_Ny_cumul_sum_[Nx],
+                          sizeof(int), cudaMemcpyDeviceToHost));
+
+  //printf("CSNL: %d %d\n", Nx, Ny_sum);
+  if(Ny_sum>0) {
+    int grid_dim_x, grid_dim_y;
+    if (Ny_sum<65536*1024) { // max grid dim * max block dim
+      grid_dim_x = (Ny_sum+1023)/1024;
+      grid_dim_y = 1;
+    }
+    else {
+      // 32 blocks per row keep grid_dim_x*1024*65535 (2147450880) and the
+      // flat thread index computed by the kernel within the range of int;
+      // with 64 both overflowed and the check below could never trigger
+      grid_dim_x = 32;
+      if (Ny_sum>grid_dim_x*1024*65535) {
+        printf("Ny sum %d larger than threshold %d\n",
+               Ny_sum, grid_dim_x*1024*65535);
+        exit(-1);
+      }
+      grid_dim_y = (Ny_sum + grid_dim_x*1024 -1) / (grid_dim_x*1024);
+    }
+    dim3 numBlocks(grid_dim_x, grid_dim_y);
+    //TMP
+    //double time_mark=getRealTime();
+    //
+    CumulSumNestedLoopKernel<<<numBlocks, 1024>>>(Nx, d_Ny_cumul_sum_, Ny_sum);
+
+    cudaDeviceSynchronize();
+    CudaCheckError();
+    //TMP
+    //printf("cst: %lf\n", getRealTime()-time_mark);
+    //
+  }
+
+  return 0;
+}
+#endif
diff --git a/src/nested_loop.h b/src/nested_loop.h
new file mode 100644
index 000000000..14d398d35
--- /dev/null
+++ b/src/nested_loop.h
@@ -0,0 +1,41 @@
+/*
+ *  This file is part of NESTGPU.
+ *
+ *  Copyright (C) 2021 The NEST Initiative
+ *
+ *  NESTGPU is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  NESTGPU is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with NESTGPU.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+
+
+
+#ifndef NESTEDLOOP_H
+#define  NESTEDLOOP_H
+
+#include "prefix_scan.h"
+
+namespace NestedLoop
+{
+  extern PrefixScan prefix_scan_; // defined in nested_loop.cu
+  // i_func selects the device function called by the loop (0 or 1)
+  int Init(); // allocates the device buffer for the prefix sums
+  int Run(int Nx, int *d_Ny, int i_func); // forwards to CumulSumNestedLoop
+  int CumulSumNestedLoop(int Nx, int *d_Ny, int i_func);
+  // NOTE(review): no definition of Free() appears in nested_loop.cu
+  int Free();
+}
+
+#endif
diff --git a/src/nestgpu.cu b/src/nestgpu.cu
index 7b04661f9..d929787dc 100644
--- a/src/nestgpu.cu
+++ b/src/nestgpu.cu
@@ -42,6 +42,7 @@
 #include "getRealTime.h"
 #include "random.h"
 #include "nestgpu.h"
+#include "nested_loop.h"
 #include "dir_connect.h"
 #include "rev_spike.h"
 #include "spike_mpi.h"
@@ -131,6 +132,8 @@ NESTGPU::NESTGPU()
   connect_mpi_->remote_spike_height_ = false;
 #endif
   
+  NestedLoop::Init();
+
   SpikeBufferUpdate_time_ = 0;
   poisson_generator_time_ = 0;
   neuron_Update_time_ = 0;
@@ -535,9 +538,7 @@ int NESTGPU::SimulationStep()
   ClearGetSpikeArrays();    
   if (n_spikes > 0) {
     time_mark = getRealTime();
-    CollectSpikeKernel<<<n_spikes, 1024>>>(n_spikes, d_SpikeTargetNum);
-    gpuErrchk(cudaPeekAtLastError());
-
+    NestedLoop::Run(n_spikes, d_SpikeTargetNum, 0);
     NestedLoop_time_ += (getRealTime() - time_mark);
   }
   time_mark = getRealTime();
@@ -604,10 +605,7 @@ int NESTGPU::SimulationStep()
     gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int),
 			 cudaMemcpyDeviceToHost));
     if (n_rev_spikes > 0) {
-      SynapseUpdateKernel<<<n_rev_spikes, 1024>>>(n_rev_spikes, d_RevSpikeNConn);
-      gpuErrchk(cudaPeekAtLastError());
-      gpuErrchk(cudaDeviceSynchronize());
-
+      NestedLoop::Run(n_rev_spikes, d_RevSpikeNConn, 1);
     }      
     //RevSpikeBufferUpdate_time_ += (getRealTime() - time_mark);
   }
diff --git a/src/rev_spike.cu b/src/rev_spike.cu
index c459a5685..4078b4ea0 100644
--- a/src/rev_spike.cu
+++ b/src/rev_spike.cu
@@ -48,7 +48,7 @@ __device__ int *RevSpikeNConn;
 //////////////////////////////////////////////////////////////////////
 // This is the function called by the nested loop
 // that makes use of positive post-pre spike time difference
-__device__ void SynapseUpdateFunction(int i_spike, int i_target_rev_conn)
+__device__ void NestedLoopFunction1(int i_spike, int i_target_rev_conn)
 {
   unsigned int target = RevSpikeTarget[i_spike];
   unsigned int i_conn = TargetRevConnection[target][i_target_rev_conn];
@@ -63,18 +63,7 @@ __device__ void SynapseUpdateFunction(int i_spike, int i_target_rev_conn)
     }
   }
 }
-
-__global__ void SynapseUpdateKernel(int n_rev_spikes, int *RevSpikeNConn)
-{
-  const int i_spike = blockIdx.x;
-  if (i_spike<n_rev_spikes) {
-    const int n_spike_targets = RevSpikeNConn[i_spike];
-    for (int i_target_rev = threadIdx.x; i_target_rev < n_spike_targets; i_target_rev += blockDim.x){
-      SynapseUpdateFunction(i_spike, i_target_rev);
-    }
-  }
-}
-
+	    
 
 __global__ void RevSpikeBufferUpdate(unsigned int n_node)
 {
diff --git a/src/rev_spike.h b/src/rev_spike.h
index 016459e3b..67c50cea5 100644
--- a/src/rev_spike.h
+++ b/src/rev_spike.h
@@ -33,8 +33,6 @@ __global__ void RevSpikeReset();
 
 __global__ void RevSpikeBufferUpdate(unsigned int n_node);
 
-__global__ void SynapseUpdateKernel(int n_rev_spikes, int *RevSpikeNConn);
-
 int RevSpikeInit(NetConnection *net_connection);
 
 int RevSpikeFree();

From 4af3a7625252f32d58a98ff9cf420b5d806957f1 Mon Sep 17 00:00:00 2001
From: JoseJVS <villamar_jose@hotmail.com>
Date: Wed, 11 Dec 2024 17:20:56 +0100
Subject: [PATCH 2/2] Updated CMakeLists.txt

---
 src/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b721b22db..72c224a54 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -64,6 +64,7 @@ set ( nestgpu_sources
 	izhikevich_psc_exp.h
 	locate.h
 	multimeter.h
+	nested_loop.h
 	nestgpu_C.h
 	nestgpu.h
 	neuron_models.h
@@ -120,9 +121,9 @@ set ( nestgpu_sources
 	izhikevich_psc_exp_2s.cu
 	izhikevich_psc_exp_5s.cu
 	izhikevich_psc_exp.cu
-	locate.cu
 	multimeter.cu
 	nestgpu.cu
+	nested_loop.cu
 	neuron_models.cu
 	node_group.cu
 	parrot_neuron.cu