From 9f3902c7899afad5054bc41fc2e621a76baa655d Mon Sep 17 00:00:00 2001
From: Wendy Li <wli@lyft.com>
Date: Sun, 26 Aug 2018 17:16:02 -0700
Subject: [PATCH 1/4] increase speed

---
 approx_kernel_faster.cu.cc | 173 +++++++++++++++++++++++++++++++------
 1 file changed, 148 insertions(+), 25 deletions(-)

diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc
index 9e1c01b..f3fb4f5 100644
--- a/approx_kernel_faster.cu.cc
+++ b/approx_kernel_faster.cu.cc
@@ -6,9 +6,9 @@
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
-{
-   if (code != cudaSuccess) 
-   {
+{  
+   if (code != cudaSuccess)
+   {  
       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
       if (abort) exit(code);
    }
@@ -17,7 +17,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 
 void checkGpuMem()
 {
-
+  
   float free_m,total_m,used_m;
   size_t free_t,total_t;
   cudaMemGetInfo(&free_t,&total_t);
@@ -28,6 +28,7 @@ void checkGpuMem()
 
 }
 
+
 __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* output,
         const int filter_x_, const int filter_y_, const int stride, const int is_0, const int is_1, const int is_2, const int is_3,
          const int ys_0, const int ys_1, const int ys_2, const int ys_3) {
@@ -55,7 +56,7 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
   float sum = 0.0;
   float max = 0.0;
 
-  int t1, t2;
+  int t1, t2, t3, t4, t5, t6, t7, t8;
   t1 = clock();
   float val;
 
@@ -102,6 +103,7 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
 
   int scratch = input[100];
 
+
   t2 = clock();
 
   if(blockIdx.x==0){
@@ -118,26 +120,142 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
     printf("%d ",t2-t1);
   }
 
-  t1 = clock();
-   // I have N patches of size C*H*W. I want ONE patch of C*H*W which is the average of the N. 
-   // splitting the work up by C. So you would have an outer loop over the C dimension. Then in each iteration, each thread loads the patch for one channel for one image (still using CHWN for bandwidth) into a shared memory scratch buffer. Then in the second phase of the iteration, 
-   // you remap so that each thread gets one of H*W pixels and does the reduction into the result shared memory buffer
-  for(int c = 0; c< input_channels; c++)
-  {
+//printf("filterx: %d\n",is_3);
+ t1 = clock();
+
+int c;
+for(c = 0; c<input_channels; c++)
+{
+        // t3 = clock();
     // first read into shared memory
     int input_channel_offset = c * is_1 * is_2 * is_3;
-    for(int row = 0; row < filter_x_; row ++ )
-    {
-        int data_row_offset = row * filter_y_ * is_3;
-        int input_offset = input_channel_offset + (row + max_row) * is_2 * is_3 + max_col * is_3;
-        for(int col = 0; col < filter_y_ * is_3; col+=is_3)
-        {
-            // input_col_offset same as data_col_offset
-            data[data_row_offset + col + threadIdx.x] = input[input_offset + col + input_offset + threadIdx.x] * sum;
-        }
-    }
+    t3 = clock();
 
-    //__syncthreads();
+    int data_row_offset = 0;
+    int input_offset = input_channel_offset +  max_row * is_2 * is_3 + max_col * is_3;
+    int data_index = data_row_offset + threadIdx.x;
+    int data_index2 = input_offset + input_offset + threadIdx.x;
+
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 * is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+// Second row
+    data_row_offset +=  filter_y_ * is_3;
+    input_offset += is_2 * is_3;
+    data_index -= filter_y_ * 4 * is_3;
+    data_index2 -= is_3 * 4 * is_2;
+
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+// Third row
+    data_row_offset +=  filter_y_ * is_3;
+    input_offset += is_2 * is_3;
+
+    data_index -= filter_y_ * 4 * is_3;
+    data_index2 -= is_3 * 4 * is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+// Fourth row
+    data_row_offset +=  filter_y_ * is_3;
+    input_offset += is_2 * is_3;
+
+    data_index -= filter_y_ * 4 * is_3;
+    data_index2 -= is_3 * 4 * is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+// Fifth row
+    data_row_offset +=  filter_y_ * is_3;
+    input_offset += is_2 * is_3;
+
+    data_index -= filter_y_ * 4 * is_3;
+    data_index2 -= is_3 * 4 * is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    data_index += is_3 * filter_y_;
+    data_index2 += is_3 *  is_2;
+    data[data_index] = input[data_index2] * sum;
+
+    t4 = clock();
+
+
+
+//__syncthreads();
     // now reduce inside of shared memory, this can be made more parallel
     // currently only using 9 of the 128 threads.
 
@@ -157,14 +275,18 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
     //__syncthreads();
 
   }
-
-
   // now do the parallel reduction, this can definitely be made better
   t2 = clock();
   if(blockIdx.x==0){
-    printf("%d ",t2-t1);
+     printf("Difference: %d ",t2-t1);
+    printf("Difference2: %d ",t4-t3);
+   // printf("Difference3: %d ",t6-t5);
+    //printf("Difference4: %d ",t8-t7);
   }
 
+
+
+
   int block_offset = blockIdx.x * input_channels * filter_x_ * filter_y_;
 
 
@@ -214,3 +336,4 @@ void TonyConvGradKernelLauncher(const float * input, const int input_size_[], co
 }
 
 #endif
+

From 9e7a1507b1790c67491cfd546f38b7202c89a60f Mon Sep 17 00:00:00 2001
From: Wendy Li <wli@lyft.com>
Date: Tue, 28 Aug 2018 20:20:53 -0700
Subject: [PATCH 2/4] update

---
 approx_kernel_faster.cu.cc | 293 ++++++++++++++++++++-----------------
 1 file changed, 157 insertions(+), 136 deletions(-)

diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc
index f3fb4f5..99c657b 100644
--- a/approx_kernel_faster.cu.cc
+++ b/approx_kernel_faster.cu.cc
@@ -103,7 +103,6 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
 
   int scratch = input[100];
 
-
   t2 = clock();
 
   if(blockIdx.x==0){
@@ -120,140 +119,153 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
     printf("%d ",t2-t1);
   }
 
-//printf("filterx: %d\n",is_3);
- t1 = clock();
-
-int c;
+t1 = clock();
+int c, a, b, c1, d, e;
+int incr_data = filter_y_ * is_3;
+int input_incr = is_2 * is_3;
+int input_channel_offset = 0;
+int data_row_offset = 0;
+int input_offset = max_row * is_2 * is_3 + max_col * is_3;
+int data_index = threadIdx.x;
+int data_index2 = input_offset + input_offset + threadIdx.x;
 for(c = 0; c<input_channels; c++)
 {
-        // t3 = clock();
-    // first read into shared memory
-    int input_channel_offset = c * is_1 * is_2 * is_3;
-    t3 = clock();
-
-    int data_row_offset = 0;
-    int input_offset = input_channel_offset +  max_row * is_2 * is_3 + max_col * is_3;
-    int data_index = data_row_offset + threadIdx.x;
-    int data_index2 = input_offset + input_offset + threadIdx.x;
-
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 * is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
+ // t3 = clock();
+   // t5 = clock();
+    a = input[data_index2] * sum;
+        data_index2 += is_3;
+    b = input[data_index2] * sum;
+    data_index2 += is_3;
+    c1 = input[data_index2] * sum;
+    data_index2 += is_3;
+    d = input[data_index2] * sum;
+    data_index2 += is_3;
+    e = input[data_index2] * sum;
+
+    data[data_index] = a;
+    data_index += is_3;
+    data[data_index] = b;
+    data_index += is_3;
+    data[data_index] = c1;
+    data_index += is_3;
+    data[data_index] = d;
+    data_index += is_3;
+    data[data_index] = e;
+
+   // t6 = clock();
 // Second row
-    data_row_offset +=  filter_y_ * is_3;
-    input_offset += is_2 * is_3;
-    data_index -= filter_y_ * 4 * is_3;
-    data_index2 -= is_3 * 4 * is_2;
-
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-// Third row
-    data_row_offset +=  filter_y_ * is_3;
-    input_offset += is_2 * is_3;
-
-    data_index -= filter_y_ * 4 * is_3;
-    data_index2 -= is_3 * 4 * is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
+    data_row_offset +=  incr_data;
+    input_offset += input_incr;
+    data_index = data_row_offset + threadIdx.x;
+    data_index2 = input_offset + input_offset + threadIdx.x;
+
+    a = input[data_index2] * sum;
+    data_index2 += is_3;
+    b = input[data_index2] * sum;
+    data_index2 += is_3;
+    c1 = input[data_index2] * sum;
+    data_index2 += is_3;
+    d = input[data_index2] * sum;
+    data_index2 += is_3;
+    e = input[data_index2] * sum;
+
+    data[data_index] = a;
+    data_index += is_3;
+    data[data_index] = b;
+    data_index += is_3;
+    data[data_index] = c1;
+    data_index += is_3;
+    data[data_index] = d;
+    data_index += is_3;
+    data[data_index] = e;
+
+//T hird row
+    data_row_offset +=  incr_data;
+    input_offset += input_incr;
+    data_index = data_row_offset + threadIdx.x;
+    data_index2 = input_offset + input_offset + threadIdx.x;
+
+    a = input[data_index2] * sum;
+    data_index2 += is_3;
+    b = input[data_index2] * sum;
+    data_index2 += is_3;
+    c1 = input[data_index2] * sum;
+    data_index2 += is_3;
+    d = input[data_index2] * sum;
+    data_index2 += is_3;
+    e = input[data_index2] * sum;
+
+    data[data_index] = a;
+    data_index += is_3;
+    data[data_index] = b;
+    data_index += is_3;
+    data[data_index] = c1;
+    data_index += is_3;
+    data[data_index] = d;
+    data_index += is_3;
+    data[data_index] = e;
 
 // Fourth row
-    data_row_offset +=  filter_y_ * is_3;
-    input_offset += is_2 * is_3;
-
-    data_index -= filter_y_ * 4 * is_3;
-    data_index2 -= is_3 * 4 * is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-// Fifth row
-    data_row_offset +=  filter_y_ * is_3;
-    input_offset += is_2 * is_3;
-
-    data_index -= filter_y_ * 4 * is_3;
-    data_index2 -= is_3 * 4 * is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    data_index += is_3 * filter_y_;
-    data_index2 += is_3 *  is_2;
-    data[data_index] = input[data_index2] * sum;
-
-    t4 = clock();
-
-
+    data_row_offset +=  incr_data;
+    input_offset += input_incr;
+    data_index = data_row_offset + threadIdx.x;
+    data_index2 = input_offset + input_offset + threadIdx.x;
+
+    a = input[data_index2] * sum;
+    data_index2 += is_3;
+    b = input[data_index2] * sum;
+    data_index2 += is_3;
+    c1 = input[data_index2] * sum;
+    data_index2 += is_3;
+    d = input[data_index2] * sum;
+    data_index2 += is_3;
+    e = input[data_index2] * sum;
+
+    data[data_index] = a;
+    data_index += is_3;
+    data[data_index] = b;
+    data_index += is_3;
+    data[data_index] = c1;
+    data_index += is_3;
+    data[data_index] = d;
+    data_index += is_3;
+    data[data_index] = e;
+
+//Fifth row
+    data_row_offset +=  incr_data;
+    input_offset += input_incr;
+    data_index = data_row_offset + threadIdx.x;
+    data_index2 = input_offset + input_offset + threadIdx.x;
+
+       a = input[data_index2] * sum;
+    data_index2 += is_3;
+    b = input[data_index2] * sum;
+    data_index2 += is_3;
+    c1 = input[data_index2] * sum;
+    data_index2 += is_3;
+    d = input[data_index2] * sum;
+    data_index2 += is_3;
+    e = input[data_index2] * sum;
+
+    data[data_index] = a;
+    data_index += is_3;
+    data[data_index] = b;
+    data_index += is_3;
+    data[data_index] = c1;
+    data_index += is_3;
+    data[data_index] = d;
+    data_index += is_3;
+    data[data_index] = e;
+
+  //reset inputs
+  input_channel_offset += is_1 * is_2 * is_3;
+  data_row_offset = 0;
+  input_offset = input_channel_offset +  max_row * is_2 * is_3 + max_col * is_3;
+  data_index = threadIdx.x;
+  data_index2 = input_offset + input_offset + threadIdx.x;
+
+//t4=clock();
+}
 
 //__syncthreads();
     // now reduce inside of shared memory, this can be made more parallel
@@ -274,19 +286,18 @@ for(c = 0; c<input_channels; c++)
 
     //__syncthreads();
 
-  }
+ // }
   // now do the parallel reduction, this can definitely be made better
   t2 = clock();
   if(blockIdx.x==0){
-     printf("Difference: %d ",t2-t1);
-    printf("Difference2: %d ",t4-t3);
-   // printf("Difference3: %d ",t6-t5);
-    //printf("Difference4: %d ",t8-t7);
+    printf("Difference: %d ",t2-t1);
+ //printf("Difference2: %d ",t4-t3);
+   // printf("Difference3 : %d ",t6-t5);
+   // printf("Difference4: %d ",t8-t7);
   }
 
 
 
-
   int block_offset = blockIdx.x * input_channels * filter_x_ * filter_y_;
 
 
@@ -321,6 +332,15 @@ void TonyConvGradKernelLauncher(const float * input, const int input_size_[], co
   //float *d_debug = (float*) malloc(sizeof(float)*filter_x_*filter_y_*input_size[1]*dy_size[1]);
   //std::cout << "transferred to debug host variable" << std::endl;
 
+  //std::cout << 0 << std::endl;
+
+  TonyConvKernelDraft<<<dy_size_[1],input_size_[3], data_size>>>(input,dy,output,filter_x_,filter_y_,stride,
+  input_size_[0],input_size_[1],input_size_[2],input_size_[3], dy_size_[0],dy_size_[1],dy_size_[2],dy_size_[3]);
+
+  //std::cout << "kernel finished successfully" << std::endl;
+  //float *d_debug = (float*) malloc(sizeof(float)*filter_x_*filter_y_*input_size[1]*dy_size[1]);
+  //std::cout << "transferred to debug host variable" << std::endl;
+
   //std::cout << 0 << std::endl;
   //std::cout << cudaFree(data) << std::endl;
 //  for(int i = 0;i<filter_x_*filter_y_*input_size[1]*dy_size[1];i++)
@@ -337,3 +357,4 @@ void TonyConvGradKernelLauncher(const float * input, const int input_size_[], co
 
 #endif
 
+                                                                                                             350,0-1       Bo
\ No newline at end of file

From 3b5283a3fa59f9d7e34052495f98466d46598c21 Mon Sep 17 00:00:00 2001
From: Wendy Li <wli@lyft.com>
Date: Thu, 30 Aug 2018 21:25:58 -0700
Subject: [PATCH 3/4] optimize

---
 approx_kernel_faster.cu.cc | 75 +++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 45 deletions(-)

diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc
index 99c657b..3832ec4 100644
--- a/approx_kernel_faster.cu.cc
+++ b/approx_kernel_faster.cu.cc
@@ -16,7 +16,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 
 
 void checkGpuMem()
-{
+{ 
   
   float free_m,total_m,used_m;
   size_t free_t,total_t;
@@ -121,19 +121,18 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
 
 t1 = clock();
 int c, a, b, c1, d, e;
-int incr_data = filter_y_ * is_3;
-int input_incr = is_2 * is_3;
+//int incr_data = filter_y_ * is_3;
+//int input_incr = is_2 * is_3;
 int input_channel_offset = 0;
-int data_row_offset = 0;
 int input_offset = max_row * is_2 * is_3 + max_col * is_3;
 int data_index = threadIdx.x;
-int data_index2 = input_offset + input_offset + threadIdx.x;
-for(c = 0; c<input_channels; c++)
+int data_index2 = input_offset + input_offset + data_index;
+for(c = input_channels; c>=0; c--)
 {
- // t3 = clock();
+  //t3 = clock();
    // t5 = clock();
     a = input[data_index2] * sum;
-        data_index2 += is_3;
+    data_index2 += is_3;
     b = input[data_index2] * sum;
     data_index2 += is_3;
     c1 = input[data_index2] * sum;
@@ -154,16 +153,16 @@ for(c = 0; c<input_channels; c++)
 
    // t6 = clock();
 // Second row
-    data_row_offset +=  incr_data;
-    input_offset += input_incr;
-    data_index = data_row_offset + threadIdx.x;
-    data_index2 = input_offset + input_offset + threadIdx.x;
+    //input_offset += input_incr;
+    data_index += is_3;
+    data_index2 += 106 * is_3;
+     //data_index2 =  input_offset + input_offset + threadIdx.x;
 
     a = input[data_index2] * sum;
     data_index2 += is_3;
     b = input[data_index2] * sum;
     data_index2 += is_3;
-    c1 = input[data_index2] * sum;
+c1 = input[data_index2] * sum;
     data_index2 += is_3;
     d = input[data_index2] * sum;
     data_index2 += is_3;
@@ -180,10 +179,10 @@ for(c = 0; c<input_channels; c++)
     data[data_index] = e;
 
 //T hird row
-    data_row_offset +=  incr_data;
-    input_offset += input_incr;
-    data_index = data_row_offset + threadIdx.x;
-    data_index2 = input_offset + input_offset + threadIdx.x;
+    //input_offset += input_incr;
+    data_index += is_3;
+    //data_index2 = input_offset + input_offset + threadIdx.x;
+    data_index2 += 106 * is_3;
 
     a = input[data_index2] * sum;
     data_index2 += is_3;
@@ -206,11 +205,10 @@ for(c = 0; c<input_channels; c++)
     data[data_index] = e;
 
 // Fourth row
-    data_row_offset +=  incr_data;
-    input_offset += input_incr;
-    data_index = data_row_offset + threadIdx.x;
-    data_index2 = input_offset + input_offset + threadIdx.x;
-
+    //input_offset += input_incr;
+    data_index += is_3;
+//    data_index2 = input_offset + input_offset + threadIdx.x;
+data_index2 += 106 * is_3;
     a = input[data_index2] * sum;
     data_index2 += is_3;
     b = input[data_index2] * sum;
@@ -232,10 +230,11 @@ for(c = 0; c<input_channels; c++)
     data[data_index] = e;
 
 //Fifth row
-    data_row_offset +=  incr_data;
-    input_offset += input_incr;
-    data_index = data_row_offset + threadIdx.x;
-    data_index2 = input_offset + input_offset + threadIdx.x;
+    //input_offset += input_incr;
+    data_index += is_3;
+    //data_index2 = input_offset + input_offset + threadIdx.x;
+    data_index2 += 106 * is_3;
+
 
        a = input[data_index2] * sum;
     data_index2 += is_3;
@@ -259,11 +258,10 @@ for(c = 0; c<input_channels; c++)
 
   //reset inputs
   input_channel_offset += is_1 * is_2 * is_3;
-  data_row_offset = 0;
-  input_offset = input_channel_offset +  max_row * is_2 * is_3 + max_col * is_3;
-  data_index = threadIdx.x;
-  data_index2 = input_offset + input_offset + threadIdx.x;
-
+  //input_offset = input_channel_offset +  max_row * is_2 * is_3 + max_col * is_3;
+  data_index -= is_3 * 24;
+    data_index2 = input_channel_offset + input_offset + data_index;
+  //data_index2 -= 444 * is_3 - input_channel_offset;
 //t4=clock();
 }
 
@@ -291,13 +289,11 @@ for(c = 0; c<input_channels; c++)
   t2 = clock();
   if(blockIdx.x==0){
     printf("Difference: %d ",t2-t1);
- //printf("Difference2: %d ",t4-t3);
+ printf("Difference2: %d ",t4-t3);
    // printf("Difference3 : %d ",t6-t5);
    // printf("Difference4: %d ",t8-t7);
   }
 
-
-
   int block_offset = blockIdx.x * input_channels * filter_x_ * filter_y_;
 
 
@@ -332,15 +328,6 @@ void TonyConvGradKernelLauncher(const float * input, const int input_size_[], co
   //float *d_debug = (float*) malloc(sizeof(float)*filter_x_*filter_y_*input_size[1]*dy_size[1]);
   //std::cout << "transferred to debug host variable" << std::endl;
 
-  //std::cout << 0 << std::endl;
-
-  TonyConvKernelDraft<<<dy_size_[1],input_size_[3], data_size>>>(input,dy,output,filter_x_,filter_y_,stride,
-  input_size_[0],input_size_[1],input_size_[2],input_size_[3], dy_size_[0],dy_size_[1],dy_size_[2],dy_size_[3]);
-
-  //std::cout << "kernel finished successfully" << std::endl;
-  //float *d_debug = (float*) malloc(sizeof(float)*filter_x_*filter_y_*input_size[1]*dy_size[1]);
-  //std::cout << "transferred to debug host variable" << std::endl;
-
   //std::cout << 0 << std::endl;
   //std::cout << cudaFree(data) << std::endl;
 //  for(int i = 0;i<filter_x_*filter_y_*input_size[1]*dy_size[1];i++)
@@ -356,5 +343,3 @@ void TonyConvGradKernelLauncher(const float * input, const int input_size_[], co
 }
 
 #endif
-
-                                                                                                             350,0-1       Bo
\ No newline at end of file

From f488666a2c2994dd1955f69d118aefd638eb0169 Mon Sep 17 00:00:00 2001
From: Wendy Li <wli@lyft.com>
Date: Fri, 31 Aug 2018 21:45:14 -0700
Subject: [PATCH 4/4] update

---
 approx_kernel_faster.cu.cc | 227 +++++++++++++++++++++++++++----------
 1 file changed, 168 insertions(+), 59 deletions(-)

diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc
index 3832ec4..f8ff2d4 100644
--- a/approx_kernel_faster.cu.cc
+++ b/approx_kernel_faster.cu.cc
@@ -16,8 +16,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 
 
 void checkGpuMem()
-{ 
-  
+{
   float free_m,total_m,used_m;
   size_t free_t,total_t;
   cudaMemGetInfo(&free_t,&total_t);
@@ -25,7 +24,6 @@ void checkGpuMem()
   total_m=(uint)total_t/1048576.0;
   used_m=total_m-free_m;
   printf ( "  mem free %d .... %f MB mem total %d....%f MB mem used %f MB\n",free_t,free_m,total_t,total_m,used_m);
-
 }
 
 
@@ -54,8 +52,6 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
   int i = threadIdx.x * (ys_1 * ys_2 * ys_3) + blockIdx.x * (ys_2 * ys_3);
   int max_idx =0;
   float sum = 0.0;
-  float max = 0.0;
-
   int t1, t2, t3, t4, t5, t6, t7, t8;
   t1 = clock();
   float val;
@@ -73,7 +69,6 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
 //    max_idx = j * (max == val) + max_idx * (max > val);
 
     sum = sum + dy[i+j];
-
   }
 
   t2 = clock();
@@ -119,18 +114,14 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
     printf("%d ",t2-t1);
   }
 
-t1 = clock();
-int c, a, b, c1, d, e;
-//int incr_data = filter_y_ * is_3;
-//int input_incr = is_2 * is_3;
-int input_channel_offset = 0;
-int input_offset = max_row * is_2 * is_3 + max_col * is_3;
-int data_index = threadIdx.x;
-int data_index2 = input_offset + input_offset + data_index;
-for(c = input_channels; c>=0; c--)
-{
-  //t3 = clock();
-   // t5 = clock();
+  t1 = clock();
+  int c, a, b, c1, d, e;
+  int input_channel_offset = 0;
+  int input_offset = max_row * is_2 * is_3 + max_col * is_3;
+  int data_index = threadIdx.x;
+  int data_index2 = input_offset + input_offset + data_index;
+  for(c = 0; c<32; c++)
+  {
     a = input[data_index2] * sum;
     data_index2 += is_3;
     b = input[data_index2] * sum;
@@ -151,18 +142,15 @@ for(c = input_channels; c>=0; c--)
     data_index += is_3;
     data[data_index] = e;
 
-   // t6 = clock();
-// Second row
-    //input_offset += input_incr;
+    // Second row
     data_index += is_3;
     data_index2 += 106 * is_3;
-     //data_index2 =  input_offset + input_offset + threadIdx.x;
 
     a = input[data_index2] * sum;
     data_index2 += is_3;
     b = input[data_index2] * sum;
     data_index2 += is_3;
-c1 = input[data_index2] * sum;
+    c1 = input[data_index2] * sum;
     data_index2 += is_3;
     d = input[data_index2] * sum;
     data_index2 += is_3;
@@ -178,10 +166,8 @@ c1 = input[data_index2] * sum;
     data_index += is_3;
     data[data_index] = e;
 
-//T hird row
-    //input_offset += input_incr;
+  //Third row
     data_index += is_3;
-    //data_index2 = input_offset + input_offset + threadIdx.x;
     data_index2 += 106 * is_3;
 
     a = input[data_index2] * sum;
@@ -204,11 +190,10 @@ c1 = input[data_index2] * sum;
     data_index += is_3;
     data[data_index] = e;
 
-// Fourth row
-    //input_offset += input_incr;
+  // Fourth row
     data_index += is_3;
-//    data_index2 = input_offset + input_offset + threadIdx.x;
-data_index2 += 106 * is_3;
+    data_index2 += 106 * is_3;
+
     a = input[data_index2] * sum;
     data_index2 += is_3;
     b = input[data_index2] * sum;
@@ -229,14 +214,11 @@ data_index2 += 106 * is_3;
     data_index += is_3;
     data[data_index] = e;
 
-//Fifth row
-    //input_offset += input_incr;
+    //Fifth row
     data_index += is_3;
-    //data_index2 = input_offset + input_offset + threadIdx.x;
     data_index2 += 106 * is_3;
 
-
-       a = input[data_index2] * sum;
+    a = input[data_index2] * sum;
     data_index2 += is_3;
     b = input[data_index2] * sum;
     data_index2 += is_3;
@@ -256,44 +238,171 @@ data_index2 += 106 * is_3;
     data_index += is_3;
     data[data_index] = e;
 
-  //reset inputs
-  input_channel_offset += is_1 * is_2 * is_3;
-  //input_offset = input_channel_offset +  max_row * is_2 * is_3 + max_col * is_3;
-  data_index -= is_3 * 24;
+    //reset inputs
+    input_channel_offset += is_1 * is_2 * is_3;
+    data_index -= is_3 * 24;
     data_index2 = input_channel_offset + input_offset + data_index;
-  //data_index2 -= 444 * is_3 - input_channel_offset;
-//t4=clock();
+  }
+
+
+  int temp = is_1 * is_2 * is_3 * 32;
+  int data_index1 = threadIdx.x;
+  int data_index2_1 = temp + data_index1 + input_offset;
+  for(c = 0; c<32; c++)
+  {
+      a = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      b = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      c1 = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      d = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      e = input[data_index2_1] * sum;
+
+      data[data_index1] = a;
+      data_index1 += is_3;
+      data[data_index1] = b;
+      data_index1 += is_3;
+      data[data_index1] = c1;
+      data_index1 += is_3;
+      data[data_index1] = d;
+      data_index1 += is_3;
+      data[data_index1] = e;
+
+      // Second row
+      data_index1 += is_3;
+      data_index2_1 += 106 * is_3;
+
+      a = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      b = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      c1 = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      d = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      e = input[data_index2_1] * sum;
+
+      data[data_index1] = a;
+      data_index1 += is_3;
+      data[data_index1] = b;
+      data_index1 += is_3;
+      data[data_index1] = c1;
+      data_index1 += is_3;
+      data[data_index1] = d;
+      data_index1 += is_3;
+      data[data_index1] = e;
+
+      // Third row
+      data_index1 += is_3;
+      data_index2_1 += 106 * is_3;
+
+      a = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      b = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      c1 = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      d = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      e = input[data_index2_1] * sum;
+
+      data[data_index1] = a;
+      data_index1 += is_3;
+      data[data_index1] = b;
+      data_index1 += is_3;
+      data[data_index1] = c1;
+      data_index1 += is_3;
+      data[data_index1] = d;
+      data_index1 += is_3;
+      data[data_index1] = e;
+
+      //Fourth row
+      data_index1 += is_3;
+      data_index2_1 += 106 * is_3;
+
+      a = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      b = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      c1 = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      d = input[data_index2_1] * sum;
+      data_index2_1 += is_3;
+      e = input[data_index2_1] * sum;
+
+      data[data_index1] = a;
+      data_index1 += is_3;
+      data[data_index1] = b;
+      data_index1 += is_3;
+      data[data_index1] = c1;
+      data_index1 += is_3;
+      data[data_index1] = d;
+      data_index1 += is_3;
+      data[data_index1] = e;
+
+    //Fifth Row
+    data_index1 += is_3;
+    data_index2_1 += 106 * is_3;
+
+    a = input[data_index2_1] * sum;
+    data_index2_1 += is_3;
+    b = input[data_index2_1] * sum;
+    data_index2_1 += is_3;
+    c1 = input[data_index2_1] * sum;
+    data_index2_1 += is_3;
+    d = input[data_index2_1] * sum;
+    data_index2_1 += is_3;
+    e = input[data_index2_1] * sum;
+
+    data[data_index1] = a;
+    data_index1 += is_3;
+    data[data_index1] = b;
+    data_index1 += is_3;
+    data[data_index1] = c1;
+    data_index1 += is_3;
+    data[data_index1] = d;
+    data_index1 += is_3;
+    data[data_index1] = e;
+
+
+    //reset inputs
+    temp += is_1 * is_2 * is_3;
+    data_index1 -= is_3 * 24;
+    data_index2_1 = temp + input_offset + data_index1;
 }
 
-//__syncthreads();
+
+/*__syncthreads();
     // now reduce inside of shared memory, this can be made more parallel
     // currently only using 9 of the 128 threads.
 
-//    int data_channel_offset = data_result_offset + c * filter_x_ * filter_y_ ;
-//    int my_row = threadIdx.x / filter_y_;
-//    int my_col = threadIdx.x % filter_y_;
-//    if (my_row < filter_x_ && my_col < filter_y_)
-//    {
-//        int data_result_idx = data_channel_offset + my_row * filter_y_ + my_col;
-//        int data_scratch_idx = (my_row * filter_y_ + my_col) * is_3;
-//        for(int element = 0; element < is_3; element ++)
-//        {
-//            data[data_result_idx] += data[data_scratch_idx + element];
-//        }
-//    }
-
-    //__syncthreads();
-
- // }
+    int data_channel_offset = data_result_offset + c * filter_x_ * filter_y_ ;
+    int my_row = threadIdx.x / filter_y_;
+    int my_col = threadIdx.x % filter_y_;
+    if (my_row < filter_x_ && my_col < filter_y_)
+    {
+        int data_result_idx = data_channel_offset + my_row * filter_y_ + my_col;
+        int data_scratch_idx = (my_row * filter_y_ + my_col) * is_3;
+        for(int element = 0; element < is_3; element ++)
+        {
+            data[data_result_idx] += data[data_scratch_idx + element];
+        }    }
+
+   __syncthreads();
+*/
+
   // now do the parallel reduction, this can definitely be made better
   t2 = clock();
   if(blockIdx.x==0){
     printf("Difference: %d ",t2-t1);
- printf("Difference2: %d ",t4-t3);
    // printf("Difference3 : %d ",t6-t5);
    // printf("Difference4: %d ",t8-t7);
   }
 
+
+
   int block_offset = blockIdx.x * input_channels * filter_x_ * filter_y_;