1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
// Pixel-wise copy kernel: each thread copies one uchar3 element from
// inputMat to outputMat at the same (row, column) position.
//
// Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
// Threads whose coordinates fall outside inputMat's cols/rows do nothing.
// Only inputMat's dimensions are checked, so outputMat is expected to be at
// least as large as inputMat.
__global__ void CudaCore1(PtrStepSz<uchar3> inputMat, PtrStepSz<uchar3> outputMat)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against the partial blocks at the right/bottom edge of the grid.
    if (col >= inputMat.cols || row >= inputMat.rows)
    {
        return;
    }

    outputMat(row, col) = inputMat(row, col);
}
// Copies an 8-bit, 3-channel image through the GPU using CudaCore1 and
// returns the result as a new host Mat.
//
// Steps: upload img into a GpuMat, allocate a CV_8UC3 GpuMat of the same
// size, launch CudaCore1 with 32x32 thread blocks covering the image, wait
// for the kernel to finish, then download the output.
//
// img must be a 2D CV_8UC3 image, because the kernel reads every pixel as a
// uchar3. An empty img yields an empty CV_8UC3 Mat without touching the GPU.
// Kernel launch/execution failures are reported through CV_Error.
extern "C" Mat TestCuda1(Mat img)
{
    // A zero-sized image would produce a zero-sized grid, which is an invalid
    // launch configuration; there is nothing to copy anyway.
    if (img.empty())
    {
        return Mat(img.rows, img.cols, CV_8UC3);
    }

    // The kernel interprets each element as uchar3, so any other type would
    // be read with the wrong element size.
    CV_Assert(img.dims == 2 && img.type() == CV_8UC3);

    GpuMat inputMat(img);
    auto outputMat = GpuMat(img.rows, img.cols, CV_8UC3);

    int width = img.cols;
    int height = img.rows;

    dim3 block(32, 32);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    CudaCore1<<<grid, block>>>(inputMat, outputMat);

    // Launch-configuration errors are only visible through cudaGetLastError();
    // faults during execution surface at the synchronisation point.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        // cudaThreadSynchronize() is deprecated; this is its replacement.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(err));
    }

    Mat dstImg;
    outputMat.download(dstImg);
    return dstImg;
}
// Pixel-wise copy kernel over tightly packed buffers: each thread copies one
// uchar3 element from inputImg to outputImg.
//
// Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
// Both buffers are addressed as row-major arrays with exactly `width`
// elements per row (no row padding), so each must hold at least
// width * height elements.
//
// inputImg  - source buffer (read only)
// outputImg - destination buffer
// width     - number of elements per row
// height    - number of rows
__global__ void CudaCore2(const uchar3* inputImg, uchar3* outputImg, int width, int height)
{
    const int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    const int tidy = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads in the partial blocks past the right/bottom edge do nothing.
    if (tidx < width && tidy < height)
    {
        // Compute the flat index in size_t: tidy * width can exceed INT_MAX
        // for very large images, which would overflow a 32-bit int.
        const size_t idx = static_cast<size_t>(tidy) * static_cast<size_t>(width)
                         + static_cast<size_t>(tidx);
        outputImg[idx] = inputImg[idx];
    }
}
// Copies an 8-bit, 3-channel image through the GPU using raw device buffers
// and CudaCore2, and returns the result as a new host Mat.
//
// Steps: allocate two tightly packed device buffers, upload img, launch
// CudaCore2 with 32x32 thread blocks covering the image, wait for the kernel,
// copy the result into a freshly allocated CV_8UC3 Mat, and free both device
// buffers.
//
// img must be a 2D CV_8UC3 image, because the kernel reads every pixel as a
// uchar3. It does not have to be continuous: the upload honours img.step, so
// ROIs and padded rows are handled. An empty img yields an empty CV_8UC3 Mat
// without touching the GPU. Any CUDA failure is reported through CV_Error
// after the device buffers have been released.
extern "C" Mat TestCuda2(Mat img)
{
    const int height = img.rows;
    const int width = img.cols;

    // A zero-sized image would produce a zero-sized grid, which is an invalid
    // launch configuration; there is nothing to copy anyway.
    if (img.empty())
    {
        return Mat(height, width, CV_8UC3);
    }

    // The buffers are sized and indexed in units of uchar3.
    CV_Assert(img.dims == 2 && img.type() == CV_8UC3);

    const size_t row_bytes = sizeof(uchar3) * static_cast<size_t>(width);
    const size_t img_size = row_bytes * static_cast<size_t>(height);

    uchar3* inputImg = NULL;
    uchar3* outputImg = NULL;

    // A newly allocated Mat is continuous, so it can receive the packed
    // device buffer with a single linear copy.
    Mat dstImg(height, width, CV_8UC3);

    // Each step runs only if every previous one succeeded; cleanup below is
    // shared by the success and failure paths.
    cudaError_t err = cudaMalloc((void**)&inputImg, img_size);
    if (err == cudaSuccess)
    {
        err = cudaMalloc((void**)&outputImg, img_size);
    }
    if (err == cudaSuccess)
    {
        // 2D copy: source rows are img.step bytes apart (may include padding),
        // destination rows are tightly packed.
        err = cudaMemcpy2D(inputImg, row_bytes, img.data, img.step[0],
                           row_bytes, static_cast<size_t>(height),
                           cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess)
    {
        dim3 block(32, 32);
        dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
        CudaCore2<<<grid, block>>>(inputImg, outputImg, width, height);
        // Launch-configuration errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
    {
        // cudaThreadSynchronize() is deprecated; this is its replacement.
        err = cudaDeviceSynchronize();
    }
    if (err == cudaSuccess)
    {
        err = cudaMemcpy(dstImg.data, outputImg, img_size, cudaMemcpyDeviceToHost);
    }

    // Release both buffers only after all device work is finished.
    // cudaFree(NULL) is a no-op, so this is safe if an allocation failed.
    cudaFree(inputImg);
    cudaFree(outputImg);

    if (err != cudaSuccess)
    {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(err));
    }
    return dstImg;
}
|