-
Notifications
You must be signed in to change notification settings - Fork 3
/
mirror_ops.cu
81 lines (58 loc) · 2.14 KB
/
mirror_ops.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
// Corresponding header file: /include/mirror_ops.h
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
/* Mirror operations */
/**
 * Mirror (flip) an image stored as a row-major array of uchar4 pixels.
 *
 * Launch layout: 2D grid of 2D blocks. The thread with global coordinates
 * (x, y) handles the pixel at column x, row y. Threads that fall outside the
 * image return immediately, so the grid may overshoot the image size.
 *
 * @param inputChannel   source pixels, numRows * numCols elements (read only)
 * @param outputChannel  destination pixels, numRows * numCols elements; every
 *                       thread reads one pixel and writes a different one, so
 *                       this must not be the same buffer as inputChannel
 * @param numRows        image height in pixels
 * @param numCols        image width in pixels
 * @param vertical       false: reverse the row order (row r -> numRows-1-r);
 *                       true:  reverse the column order (col c -> numCols-1-c)
 */
__global__
void mirror(const uchar4* const inputChannel, uchar4* outputChannel, int numRows, int numCols, bool vertical)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid tail: threads past the image edge have nothing to do.
    if (col >= numCols || row >= numRows)
    {
        return;
    }

    // The mirror image of index i in [0, n) is n - 1 - i. Using n - i would
    // send index 0 to n, which is one past the last valid row/column.
    const int dstCol = vertical ? (numCols - 1 - col) : col;
    const int dstRow = vertical ? row : (numRows - 1 - row);

    // Compute flat offsets in size_t so row * numCols cannot overflow int.
    const size_t srcIdx = static_cast<size_t>(row) * numCols + col;
    const size_t dstIdx = static_cast<size_t>(dstRow) * numCols + dstCol;

    outputChannel[dstIdx] = inputChannel[srcIdx];
}
/**
 * Mirror an image that already resides in device memory and return the
 * result in a newly allocated host buffer.
 *
 * @param d_inputImageRGBA  device pointer to numRows * numCols uchar4 pixels.
 *                          This function takes ownership: the buffer is
 *                          released with cudaFree before returning, on both
 *                          the success and the failure path.
 * @param numRows           image height in pixels (must fit in int, because
 *                          the mirror kernel takes int dimensions)
 * @param numCols           image width in pixels (must fit in int)
 * @param vertical          forwarded to the mirror kernel: false reverses the
 *                          row order, true reverses the column order
 * @return host buffer obtained from malloc holding the mirrored pixels; the
 *         caller releases it with free(). Returns NULL if any CUDA call or
 *         the host allocation fails (a message is written to stderr).
 */
uchar4* mirror_ops(uchar4 *d_inputImageRGBA, size_t numRows, size_t numCols, bool vertical)
{
    const size_t numPixels = numRows * numCols;
    const size_t numBytes = sizeof(uchar4) * numPixels;

    uchar4 *d_outputImageRGBA = NULL;
    uchar4 *h_out = NULL;

    cudaError_t err = cudaMalloc(&d_outputImageRGBA, numBytes);

    // A grid dimension of 0 is an invalid launch configuration, so the
    // kernel is only launched when there is at least one pixel.
    if (err == cudaSuccess && numPixels > 0)
    {
        // 32 threads along x = one full warp per block row; 256 threads per block.
        const dim3 blockSize(32, 8, 1);
        // Ceil-division: just enough blocks to cover every pixel.
        const dim3 gridSize((unsigned int)((numCols + blockSize.x - 1) / blockSize.x),
                            (unsigned int)((numRows + blockSize.y - 1) / blockSize.y),
                            1);

        mirror<<<gridSize, blockSize>>>(d_inputImageRGBA, d_outputImageRGBA,
                                        (int)numRows, (int)numCols, vertical);

        // Launch-configuration errors are only visible through cudaGetLastError;
        // faults during execution surface at the synchronizing call.
        err = cudaGetLastError();
        if (err == cudaSuccess)
        {
            err = cudaDeviceSynchronize();
        }
    }

    if (err == cudaSuccess)
    {
        h_out = (uchar4*)malloc(numBytes);
        if (h_out == NULL && numBytes > 0)
        {
            fprintf(stderr, "mirror_ops: host allocation of %lu bytes failed\n",
                    (unsigned long)numBytes);
        }
        else if (numBytes > 0)
        {
            // Copy the mirrored image from device to host.
            err = cudaMemcpy(h_out, d_outputImageRGBA, numBytes, cudaMemcpyDeviceToHost);
            if (err != cudaSuccess)
            {
                // Do not hand back a buffer with undefined contents.
                free(h_out);
                h_out = NULL;
            }
        }
    }

    if (err != cudaSuccess)
    {
        fprintf(stderr, "mirror_ops: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Device cleanup happens on every path; cudaFree(NULL) is a no-op.
    cudaFree(d_inputImageRGBA);
    cudaFree(d_outputImageRGBA);

    return h_out;
}