From 9f3902c7899afad5054bc41fc2e621a76baa655d Mon Sep 17 00:00:00 2001 From: Wendy Li Date: Sun, 26 Aug 2018 17:16:02 -0700 Subject: [PATCH 1/4] increase speed --- approx_kernel_faster.cu.cc | 173 +++++++++++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 25 deletions(-) diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc index 9e1c01b..f3fb4f5 100644 --- a/approx_kernel_faster.cu.cc +++ b/approx_kernel_faster.cu.cc @@ -6,9 +6,9 @@ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { +{ + if (code != cudaSuccess) + { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } @@ -17,7 +17,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t void checkGpuMem() { - + float free_m,total_m,used_m; size_t free_t,total_t; cudaMemGetInfo(&free_t,&total_t); @@ -28,6 +28,7 @@ void checkGpuMem() } + __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* output, const int filter_x_, const int filter_y_, const int stride, const int is_0, const int is_1, const int is_2, const int is_3, const int ys_0, const int ys_1, const int ys_2, const int ys_3) { @@ -55,7 +56,7 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* float sum = 0.0; float max = 0.0; - int t1, t2; + int t1, t2, t3, t4, t5, t6, t7, t8; t1 = clock(); float val; @@ -102,6 +103,7 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* int scratch = input[100]; + t2 = clock(); if(blockIdx.x==0){ @@ -118,26 +120,142 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* printf("%d ",t2-t1); } - t1 = clock(); - // I have N patches of size C*H*W. I want ONE patch of C*H*W which is the average of the N. - // splitting the work up by C. So you would have an outer loop over the C dimension. Then in each iteration, each thread loads the patch for one channel for one image (still using CHWN for bandwidth) into a shared memory scratch buffer. Then in the second phase of the iteration, - // you remap so that each thread gets one of H*W pixels and does the reduction into the result shared memory buffer - for(int c = 0; c< input_channels; c++) - { +//printf("filterx: %d\n",is_3); + t1 = clock(); + +int c; +for(c = 0; c Date: Tue, 28 Aug 2018 20:20:53 -0700 Subject: [PATCH 2/4] update --- approx_kernel_faster.cu.cc | 293 ++++++++++++++++++++----------------- 1 file changed, 157 insertions(+), 136 deletions(-) diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc index f3fb4f5..99c657b 100644 --- a/approx_kernel_faster.cu.cc +++ b/approx_kernel_faster.cu.cc @@ -103,7 +103,6 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* int scratch = input[100]; - t2 = clock(); if(blockIdx.x==0){ @@ -120,140 +119,153 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* printf("%d ",t2-t1); } -//printf("filterx: %d\n",is_3); - t1 = clock(); - -int c; +t1 = clock(); +int c, a, b, c1, d, e; +int incr_data = filter_y_ * is_3; +int input_incr = is_2 * is_3; +int input_channel_offset = 0; +int data_row_offset = 0; +int input_offset = max_row * is_2 * is_3 + max_col * is_3; +int data_index = threadIdx.x; +int data_index2 = input_offset + input_offset + threadIdx.x; for(c = 0; c>>(input,dy,output,filter_x_,filter_y_,stride, + input_size_[0],input_size_[1],input_size_[2],input_size_[3], dy_size_[0],dy_size_[1],dy_size_[2],dy_size_[3]); + + //std::cout << "kernel finished successfully" << std::endl; + //float *d_debug = (float*) malloc(sizeof(float)*filter_x_*filter_y_*input_size[1]*dy_size[1]); + //std::cout << "transferred to debug host variable" << std::endl; + //std::cout << 0 << std::endl; //std::cout << cudaFree(data) << std::endl; // for(int i = 0;i Date: Thu, 30 Aug 2018 21:25:58 -0700 Subject: [PATCH 3/4] optimize --- approx_kernel_faster.cu.cc | 75 +++++++++++++++----------------------- 1 file changed, 30 insertions(+), 45 deletions(-) diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc index 99c657b..3832ec4 100644 --- a/approx_kernel_faster.cu.cc +++ b/approx_kernel_faster.cu.cc @@ -16,7 +16,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t void checkGpuMem() -{ +{ float free_m,total_m,used_m; size_t free_t,total_t; @@ -121,19 +121,18 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* t1 = clock(); int c, a, b, c1, d, e; -int incr_data = filter_y_ * is_3; -int input_incr = is_2 * is_3; +//int incr_data = filter_y_ * is_3; +//int input_incr = is_2 * is_3; int input_channel_offset = 0; -int data_row_offset = 0; int input_offset = max_row * is_2 * is_3 + max_col * is_3; int data_index = threadIdx.x; -int data_index2 = input_offset + input_offset + threadIdx.x; -for(c = 0; c=0; c--) { - // t3 = clock(); + //t3 = clock(); // t5 = clock(); a = input[data_index2] * sum; - data_index2 += is_3; + data_index2 += is_3; b = input[data_index2] * sum; data_index2 += is_3; c1 = input[data_index2] * sum; @@ -154,16 +153,16 @@ for(c = 0; c>>(input,dy,output,filter_x_,filter_y_,stride, - input_size_[0],input_size_[1],input_size_[2],input_size_[3], dy_size_[0],dy_size_[1],dy_size_[2],dy_size_[3]); - - //std::cout << "kernel finished successfully" << std::endl; - //float *d_debug = (float*) malloc(sizeof(float)*filter_x_*filter_y_*input_size[1]*dy_size[1]); - //std::cout << "transferred to debug host variable" << std::endl; - //std::cout << 0 << std::endl; //std::cout << cudaFree(data) << std::endl; // for(int i = 0;i Date: Fri, 31 Aug 2018 21:45:14 -0700 Subject: [PATCH 4/4] update --- approx_kernel_faster.cu.cc | 227 +++++++++++++++++++++++++++---------- 1 file changed, 168 insertions(+), 59 deletions(-) diff --git a/approx_kernel_faster.cu.cc b/approx_kernel_faster.cu.cc index 3832ec4..f8ff2d4 100644 --- a/approx_kernel_faster.cu.cc +++ b/approx_kernel_faster.cu.cc @@ -16,8 +16,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t void checkGpuMem() -{ - +{ float free_m,total_m,used_m; size_t free_t,total_t; cudaMemGetInfo(&free_t,&total_t); @@ -25,7 +24,6 @@ void checkGpuMem() total_m=(uint)total_t/1048576.0; used_m=total_m-free_m; printf ( " mem free %d .... %f MB mem total %d....%f MB mem used %f MB\n",free_t,free_m,total_t,total_m,used_m); - } @@ -54,8 +52,6 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* int i = threadIdx.x * (ys_1 * ys_2 * ys_3) + blockIdx.x * (ys_2 * ys_3); int max_idx =0; float sum = 0.0; - float max = 0.0; - int t1, t2, t3, t4, t5, t6, t7, t8; t1 = clock(); float val; @@ -73,7 +69,6 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* // max_idx = j * (max == val) + max_idx * (max > val); sum = sum + dy[i+j]; - } t2 = clock(); @@ -119,18 +114,14 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float* printf("%d ",t2-t1); } -t1 = clock(); -int c, a, b, c1, d, e; -//int incr_data = filter_y_ * is_3; -//int input_incr = is_2 * is_3; -int input_channel_offset = 0; -int input_offset = max_row * is_2 * is_3 + max_col * is_3; -int data_index = threadIdx.x; -int data_index2 = input_offset + input_offset + data_index; -for(c = input_channels; c>=0; c--) -{ - //t3 = clock(); - // t5 = clock(); + t1 = clock(); + int c, a, b, c1, d, e; + int input_channel_offset = 0; + int input_offset = max_row * is_2 * is_3 + max_col * is_3; + int data_index = threadIdx.x; + int data_index2 = input_offset + input_offset + data_index; + for(c = 0; c<32; c++) + { a = input[data_index2] * sum; data_index2 += is_3; b = input[data_index2] * sum; @@ -151,18 +142,15 @@ for(c = input_channels; c>=0; c--) data_index += is_3; data[data_index] = e; - // t6 = clock(); -// Second row - //input_offset += input_incr; + // Second row data_index += is_3; data_index2 += 106 * is_3; - //data_index2 = input_offset + input_offset + threadIdx.x; a = input[data_index2] * sum; data_index2 += is_3; b = input[data_index2] * sum; data_index2 += is_3; -c1 = input[data_index2] * sum; + c1 = input[data_index2] * sum; data_index2 += is_3; d = input[data_index2] * sum; data_index2 += is_3; @@ -178,10 +166,8 @@ c1 = input[data_index2] * sum; data_index += is_3; data[data_index] = e; -//T hird row - //input_offset += input_incr; + //Third row data_index += is_3; - //data_index2 = input_offset + input_offset + threadIdx.x; data_index2 += 106 * is_3; a = input[data_index2] * sum; @@ -204,11 +190,10 @@ c1 = input[data_index2] * sum; data_index += is_3; data[data_index] = e; -// Fourth row - //input_offset += input_incr; + // Fourth row data_index += is_3; -// data_index2 = input_offset + input_offset + threadIdx.x; -data_index2 += 106 * is_3; + data_index2 += 106 * is_3; + a = input[data_index2] * sum; data_index2 += is_3; b = input[data_index2] * sum; @@ -229,14 +214,11 @@ data_index2 += 106 * is_3; data_index += is_3; data[data_index] = e; -//Fifth row - //input_offset += input_incr; + //Fifth row data_index += is_3; - //data_index2 = input_offset + input_offset + threadIdx.x; data_index2 += 106 * is_3; - - a = input[data_index2] * sum; + a = input[data_index2] * sum; data_index2 += is_3; b = input[data_index2] * sum; data_index2 += is_3; @@ -256,44 +238,171 @@ data_index2 += 106 * is_3; data_index += is_3; data[data_index] = e; - //reset inputs - input_channel_offset += is_1 * is_2 * is_3; - //input_offset = input_channel_offset + max_row * is_2 * is_3 + max_col * is_3; - data_index -= is_3 * 24; + //reset inputs + input_channel_offset += is_1 * is_2 * is_3; + data_index -= is_3 * 24; data_index2 = input_channel_offset + input_offset + data_index; - //data_index2 -= 444 * is_3 - input_channel_offset; -//t4=clock(); + } + + + int temp = is_1 * is_2 * is_3 * 32; + int data_index1 = threadIdx.x; + int data_index2_1 = temp + data_index1 + input_offset; + for(c = 0; c<32; c++) + { + a = input[data_index2_1] * sum; + data_index2_1 += is_3; + b = input[data_index2_1] * sum; + data_index2_1 += is_3; + c1 = input[data_index2_1] * sum; + data_index2_1 += is_3; + d = input[data_index2_1] * sum; + data_index2_1 += is_3; + e = input[data_index2_1] * sum; + + data[data_index1] = a; + data_index1 += is_3; + data[data_index1] = b; + data_index1 += is_3; + data[data_index1] = c1; + data_index1 += is_3; + data[data_index1] = d; + data_index1 += is_3; + data[data_index1] = e; + + // Second row + data_index1 += is_3; + data_index2_1 += 106 * is_3; + + a = input[data_index2_1] * sum; + data_index2_1 += is_3; + b = input[data_index2_1] * sum; + data_index2_1 += is_3; + c1 = input[data_index2_1] * sum; + data_index2_1 += is_3; + d = input[data_index2_1] * sum; + data_index2_1 += is_3; + e = input[data_index2_1] * sum; + + data[data_index1] = a; + data_index1 += is_3; + data[data_index1] = b; + data_index1 += is_3; + data[data_index1] = c1; + data_index1 += is_3; + data[data_index1] = d; + data_index1 += is_3; + data[data_index1] = e; + + // Third row + data_index1 += is_3; + data_index2_1 += 106 * is_3; + + a = input[data_index2_1] * sum; + data_index2_1 += is_3; + b = input[data_index2_1] * sum; + data_index2_1 += is_3; + c1 = input[data_index2_1] * sum; + data_index2_1 += is_3; + d = input[data_index2_1] * sum; + data_index2_1 += is_3; + e = input[data_index2_1] * sum; + + data[data_index1] = a; + data_index1 += is_3; + data[data_index1] = b; + data_index1 += is_3; + data[data_index1] = c1; + data_index1 += is_3; + data[data_index1] = d; + data_index1 += is_3; + data[data_index1] = e; + + //Fourth row + data_index1 += is_3; + data_index2_1 += 106 * is_3; + + a = input[data_index2_1] * sum; + data_index2_1 += is_3; + b = input[data_index2_1] * sum; + data_index2_1 += is_3; + c1 = input[data_index2_1] * sum; + data_index2_1 += is_3; + d = input[data_index2_1] * sum; + data_index2_1 += is_3; + e = input[data_index2_1] * sum; + + data[data_index1] = a; + data_index1 += is_3; + data[data_index1] = b; + data_index1 += is_3; + data[data_index1] = c1; + data_index1 += is_3; + data[data_index1] = d; + data_index1 += is_3; + data[data_index1] = e; + + //Fifth Row + data_index1 += is_3; + data_index2_1 += 106 * is_3; + + a = input[data_index2_1] * sum; + data_index2_1 += is_3; + b = input[data_index2_1] * sum; + data_index2_1 += is_3; + c1 = input[data_index2_1] * sum; + data_index2_1 += is_3; + d = input[data_index2_1] * sum; + data_index2_1 += is_3; + e = input[data_index2_1] * sum; + + data[data_index1] = a; + data_index1 += is_3; + data[data_index1] = b; + data_index1 += is_3; + data[data_index1] = c1; + data_index1 += is_3; + data[data_index1] = d; + data_index1 += is_3; + data[data_index1] = e; + + + //reset inputs + temp += is_1 * is_2 * is_3; + data_index1 -= is_3 * 24; + data_index2_1 = temp + input_offset + data_index1; } -//__syncthreads(); + +/*__syncthreads(); // now reduce inside of shared memory, this can be made more parallel // currently only using 9 of the 128 threads. -// int data_channel_offset = data_result_offset + c * filter_x_ * filter_y_ ; -// int my_row = threadIdx.x / filter_y_; -// int my_col = threadIdx.x % filter_y_; -// if (my_row < filter_x_ && my_col < filter_y_) -// { -// int data_result_idx = data_channel_offset + my_row * filter_y_ + my_col; -// int data_scratch_idx = (my_row * filter_y_ + my_col) * is_3; -// for(int element = 0; element < is_3; element ++) -// { -// data[data_result_idx] += data[data_scratch_idx + element]; -// } -// } - - //__syncthreads(); - - // } + int data_channel_offset = data_result_offset + c * filter_x_ * filter_y_ ; + int my_row = threadIdx.x / filter_y_; + int my_col = threadIdx.x % filter_y_; + if (my_row < filter_x_ && my_col < filter_y_) + { + int data_result_idx = data_channel_offset + my_row * filter_y_ + my_col; + int data_scratch_idx = (my_row * filter_y_ + my_col) * is_3; + for(int element = 0; element < is_3; element ++) + { + data[data_result_idx] += data[data_scratch_idx + element]; + } } + + __syncthreads(); +*/ + // now do the parallel reduction, this can definitely be made better t2 = clock(); if(blockIdx.x==0){ printf("Difference: %d ",t2-t1); - printf("Difference2: %d ",t4-t3); // printf("Difference3 : %d ",t6-t5); // printf("Difference4: %d ",t8-t7); } + + int block_offset = blockIdx.x * input_channels * filter_x_ * filter_y_;