Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
324 changes: 281 additions & 43 deletions approx_kernel_faster.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
Expand All @@ -17,17 +17,16 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t

void checkGpuMem()
{

float free_m,total_m,used_m;
size_t free_t,total_t;
cudaMemGetInfo(&free_t,&total_t);
free_m =(uint)free_t/1048576.0 ;
total_m=(uint)total_t/1048576.0;
used_m=total_m-free_m;
printf ( " mem free %d .... %f MB mem total %d....%f MB mem used %f MB\n",free_t,free_m,total_t,total_m,used_m);

}


__global__ void TonyConvKernelDraft(const float* input, const float* dy, float* output,
const int filter_x_, const int filter_y_, const int stride, const int is_0, const int is_1, const int is_2, const int is_3,
const int ys_0, const int ys_1, const int ys_2, const int ys_3) {
Expand All @@ -53,9 +52,7 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
int i = threadIdx.x * (ys_1 * ys_2 * ys_3) + blockIdx.x * (ys_2 * ys_3);
int max_idx =0;
float sum = 0.0;
float max = 0.0;

int t1, t2;
int t1, t2, t3, t4, t5, t6, t7, t8;
t1 = clock();
float val;

Expand All @@ -72,7 +69,6 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
// max_idx = j * (max == val) + max_idx * (max > val);

sum = sum + dy[i+j];

}

t2 = clock();
Expand Down Expand Up @@ -119,52 +115,294 @@ __global__ void TonyConvKernelDraft(const float* input, const float* dy, float*
}

t1 = clock();
// I have N patches of size C*H*W. I want ONE patch of C*H*W which is the average of the N.
// splitting the work up by C. So you would have an outer loop over the C dimension. Then in each iteration, each thread loads the patch for one channel for one image (still using CHWN for bandwidth) into a shared memory scratch buffer. Then in the second phase of the iteration,
// you remap so that each thread gets one of H*W pixels and does the reduction into the result shared memory buffer
for(int c = 0; c< input_channels; c++)
int c, a, b, c1, d, e;
int input_channel_offset = 0;
int input_offset = max_row * is_2 * is_3 + max_col * is_3;
int data_index = threadIdx.x;
int data_index2 = input_offset + input_offset + data_index;
for(c = 0; c<32; c++)
{
// first read into shared memory
int input_channel_offset = c * is_1 * is_2 * is_3;
for(int row = 0; row < filter_x_; row ++ )
{
int data_row_offset = row * filter_y_ * is_3;
int input_offset = input_channel_offset + (row + max_row) * is_2 * is_3 + max_col * is_3;
for(int col = 0; col < filter_y_ * is_3; col+=is_3)
{
// input_col_offset same as data_col_offset
data[data_row_offset + col + threadIdx.x] = input[input_offset + col + input_offset + threadIdx.x] * sum;
}
}
a = input[data_index2] * sum;
data_index2 += is_3;
b = input[data_index2] * sum;
data_index2 += is_3;
c1 = input[data_index2] * sum;
data_index2 += is_3;
d = input[data_index2] * sum;
data_index2 += is_3;
e = input[data_index2] * sum;

data[data_index] = a;
data_index += is_3;
data[data_index] = b;
data_index += is_3;
data[data_index] = c1;
data_index += is_3;
data[data_index] = d;
data_index += is_3;
data[data_index] = e;

// Second row
data_index += is_3;
data_index2 += 106 * is_3;

a = input[data_index2] * sum;
data_index2 += is_3;
b = input[data_index2] * sum;
data_index2 += is_3;
c1 = input[data_index2] * sum;
data_index2 += is_3;
d = input[data_index2] * sum;
data_index2 += is_3;
e = input[data_index2] * sum;

data[data_index] = a;
data_index += is_3;
data[data_index] = b;
data_index += is_3;
data[data_index] = c1;
data_index += is_3;
data[data_index] = d;
data_index += is_3;
data[data_index] = e;

//Third row
data_index += is_3;
data_index2 += 106 * is_3;

a = input[data_index2] * sum;
data_index2 += is_3;
b = input[data_index2] * sum;
data_index2 += is_3;
c1 = input[data_index2] * sum;
data_index2 += is_3;
d = input[data_index2] * sum;
data_index2 += is_3;
e = input[data_index2] * sum;

data[data_index] = a;
data_index += is_3;
data[data_index] = b;
data_index += is_3;
data[data_index] = c1;
data_index += is_3;
data[data_index] = d;
data_index += is_3;
data[data_index] = e;

// Fourth row
data_index += is_3;
data_index2 += 106 * is_3;

a = input[data_index2] * sum;
data_index2 += is_3;
b = input[data_index2] * sum;
data_index2 += is_3;
c1 = input[data_index2] * sum;
data_index2 += is_3;
d = input[data_index2] * sum;
data_index2 += is_3;
e = input[data_index2] * sum;

data[data_index] = a;
data_index += is_3;
data[data_index] = b;
data_index += is_3;
data[data_index] = c1;
data_index += is_3;
data[data_index] = d;
data_index += is_3;
data[data_index] = e;

//Fifth row
data_index += is_3;
data_index2 += 106 * is_3;

a = input[data_index2] * sum;
data_index2 += is_3;
b = input[data_index2] * sum;
data_index2 += is_3;
c1 = input[data_index2] * sum;
data_index2 += is_3;
d = input[data_index2] * sum;
data_index2 += is_3;
e = input[data_index2] * sum;

data[data_index] = a;
data_index += is_3;
data[data_index] = b;
data_index += is_3;
data[data_index] = c1;
data_index += is_3;
data[data_index] = d;
data_index += is_3;
data[data_index] = e;

//reset inputs
input_channel_offset += is_1 * is_2 * is_3;
data_index -= is_3 * 24;
data_index2 = input_channel_offset + input_offset + data_index;
}


//__syncthreads();
int temp = is_1 * is_2 * is_3 * 32;
int data_index1 = threadIdx.x;
int data_index2_1 = temp + data_index1 + input_offset;
for(c = 0; c<32; c++)
{
a = input[data_index2_1] * sum;
data_index2_1 += is_3;
b = input[data_index2_1] * sum;
data_index2_1 += is_3;
c1 = input[data_index2_1] * sum;
data_index2_1 += is_3;
d = input[data_index2_1] * sum;
data_index2_1 += is_3;
e = input[data_index2_1] * sum;

data[data_index1] = a;
data_index1 += is_3;
data[data_index1] = b;
data_index1 += is_3;
data[data_index1] = c1;
data_index1 += is_3;
data[data_index1] = d;
data_index1 += is_3;
data[data_index1] = e;

// Second row
data_index1 += is_3;
data_index2_1 += 106 * is_3;

a = input[data_index2_1] * sum;
data_index2_1 += is_3;
b = input[data_index2_1] * sum;
data_index2_1 += is_3;
c1 = input[data_index2_1] * sum;
data_index2_1 += is_3;
d = input[data_index2_1] * sum;
data_index2_1 += is_3;
e = input[data_index2_1] * sum;

data[data_index1] = a;
data_index1 += is_3;
data[data_index1] = b;
data_index1 += is_3;
data[data_index1] = c1;
data_index1 += is_3;
data[data_index1] = d;
data_index1 += is_3;
data[data_index1] = e;

// Third row
data_index1 += is_3;
data_index2_1 += 106 * is_3;

a = input[data_index2_1] * sum;
data_index2_1 += is_3;
b = input[data_index2_1] * sum;
data_index2_1 += is_3;
c1 = input[data_index2_1] * sum;
data_index2_1 += is_3;
d = input[data_index2_1] * sum;
data_index2_1 += is_3;
e = input[data_index2_1] * sum;

data[data_index1] = a;
data_index1 += is_3;
data[data_index1] = b;
data_index1 += is_3;
data[data_index1] = c1;
data_index1 += is_3;
data[data_index1] = d;
data_index1 += is_3;
data[data_index1] = e;

//Fourth row
data_index1 += is_3;
data_index2_1 += 106 * is_3;

a = input[data_index2_1] * sum;
data_index2_1 += is_3;
b = input[data_index2_1] * sum;
data_index2_1 += is_3;
c1 = input[data_index2_1] * sum;
data_index2_1 += is_3;
d = input[data_index2_1] * sum;
data_index2_1 += is_3;
e = input[data_index2_1] * sum;

data[data_index1] = a;
data_index1 += is_3;
data[data_index1] = b;
data_index1 += is_3;
data[data_index1] = c1;
data_index1 += is_3;
data[data_index1] = d;
data_index1 += is_3;
data[data_index1] = e;

//Fifth Row
data_index1 += is_3;
data_index2_1 += 106 * is_3;

a = input[data_index2_1] * sum;
data_index2_1 += is_3;
b = input[data_index2_1] * sum;
data_index2_1 += is_3;
c1 = input[data_index2_1] * sum;
data_index2_1 += is_3;
d = input[data_index2_1] * sum;
data_index2_1 += is_3;
e = input[data_index2_1] * sum;

data[data_index1] = a;
data_index1 += is_3;
data[data_index1] = b;
data_index1 += is_3;
data[data_index1] = c1;
data_index1 += is_3;
data[data_index1] = d;
data_index1 += is_3;
data[data_index1] = e;


//reset inputs
temp += is_1 * is_2 * is_3;
data_index1 -= is_3 * 24;
data_index2_1 = temp + input_offset + data_index1;
}


/*__syncthreads();
// now reduce inside of shared memory, this can be made more parallel
// currently only using 9 of the 128 threads.

// int data_channel_offset = data_result_offset + c * filter_x_ * filter_y_ ;
// int my_row = threadIdx.x / filter_y_;
// int my_col = threadIdx.x % filter_y_;
// if (my_row < filter_x_ && my_col < filter_y_)
// {
// int data_result_idx = data_channel_offset + my_row * filter_y_ + my_col;
// int data_scratch_idx = (my_row * filter_y_ + my_col) * is_3;
// for(int element = 0; element < is_3; element ++)
// {
// data[data_result_idx] += data[data_scratch_idx + element];
// }
// }

//__syncthreads();

}
int data_channel_offset = data_result_offset + c * filter_x_ * filter_y_ ;
int my_row = threadIdx.x / filter_y_;
int my_col = threadIdx.x % filter_y_;
if (my_row < filter_x_ && my_col < filter_y_)
{
int data_result_idx = data_channel_offset + my_row * filter_y_ + my_col;
int data_scratch_idx = (my_row * filter_y_ + my_col) * is_3;
for(int element = 0; element < is_3; element ++)
{
data[data_result_idx] += data[data_scratch_idx + element];
} }

__syncthreads();
*/

// now do the parallel reduction, this can definitely be made better
t2 = clock();
if(blockIdx.x==0){
printf("%d ",t2-t1);
printf("Difference: %d ",t2-t1);
// printf("Difference3 : %d ",t6-t5);
// printf("Difference4: %d ",t8-t7);
}



int block_offset = blockIdx.x * input_channels * filter_x_ * filter_y_;


Expand Down