Cuda 优化图像金字塔

基于 Cuda 并行加速处理图像金字塔

简介

拉普拉斯金字塔多波段融合 用于融合图像,OpenCV 提供了一系列方法实现这一功能,其实也提供了 GPU 加速版本,但是其中的 cuda::pyrUp 无法固定图像大小,导致很多问题,所以使用 Cuda 重写这些方法,也方便后期根据实际需求修改。

需要重写的方法介绍

cv.GaussianBlur 使用高斯滤波模糊图像,在下采样时保留重要信息,并且在上采样时还原信息。
cv::pyrDown 使用高斯滤波模糊图像后再删除偶数行偶数列,输出的图像大小为:Size((src.cols+1)/2, (src.rows+1)/2)。
cv::pyrUp 对图像上采样,然后使用四倍高斯模糊图像,默认输出的图像大小为:Size(src.cols2, src.rows2),通常使用下采样前的图像大小作为输出图像大小。
cv.subtract 计算两个数组或数组与标量之间的每元素差,图像大小相等时,方法等同于 img1 - img2。
cv.add 计算两个数组或一个数组和一个标量的每元素和,图像大小相等时,方法等同于 img1 + img2。

加载原始图像

左上角加的两条红线用于测试下采样删除偶数行前的高斯滤波验证。

重写代码

GaussianBlur

对图像使用 OpenCV 官方推荐的卷积核进行卷积操作模糊图像。

GaussianBlurGpu 高斯滤波函数用于下采样时使用。
GaussianBlurFourfoldGpu 四倍高斯滤波函数用于上采样时使用。

效果

代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
__global__ void GaussianBlurCore(PtrStepSz<uchar3> inputMat, PtrStepSz<uchar3> outputMat)
{
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int tidy = threadIdx.y + blockDim.y * blockIdx.y;

float blur[5][5] = {
{0.00390625, 0.015625, 0.0234375, 0.015625, 0.00390625},
{0.015625, 0.0625, 0.09375, 0.0625, 0.015625 },
{0.0234375, 0.09375, 0.140625, 0.09375, 0.0234375 },
{0.015625, 0.0625, 0.09375, 0.0625, 0.015625 },
{0.00390625, 0.015625, 0.0234375, 0.015625, 0.00390625} };

int img_cols_max_index = inputMat.cols - 1;
int img_rows_max_index = inputMat.rows - 1;

if (tidx < outputMat.cols && tidy < outputMat.rows)
{
float b, g, r;
for (int x = -2; x <= 2; x++)
{
for (int y = -2; y <= 2; y++)
{
int image_x = tidx + x;
if (image_x < 0)
{
image_x = abs(image_x);
}
if (image_x >= inputMat.cols)
{
image_x = img_cols_max_index - (x - (img_cols_max_index - tidx));
}
int image_y = tidy + y;
if (image_y < 0)
{
image_y = abs(image_y);
}
if (image_y >= inputMat.rows)
{
image_y = img_rows_max_index - (y - (img_rows_max_index - tidy));
}
b += (float)inputMat(image_y, image_x).x * (float)blur[y + 2][x + 2];
g += (float)inputMat(image_y, image_x).y * (float)blur[y + 2][x + 2];
r += (float)inputMat(image_y, image_x).z * (float)blur[y + 2][x + 2];
}
}
outputMat(tidy, tidx).x = (int)b;
outputMat(tidy, tidx).y = (int)g;
outputMat(tidy, tidx).z = (int)r;
}
}

/// <summary>
/// 高斯滤波
/// </summary>
/// <param name="img"></param>
/// <returns></returns>
extern "C" Mat GaussianBlurGpu(Mat img)
{
GpuMat inputMat(img);

int width = img.cols;
int height = img.rows;
auto outputMat = GpuMat(height, width, CV_8UC3);

dim3 block(32, 32);
dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
GaussianBlurCore << <grid, block >> > (inputMat, outputMat);

cudaThreadSynchronize();

Mat dstImg;
outputMat.download(dstImg);
return dstImg;
}

__global__ void GaussianBlurFourfoldCore(PtrStepSz<uchar3> inputMat, PtrStepSz<uchar3> outputMat)
{
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int tidy = threadIdx.y + blockDim.y * blockIdx.y;

float blur[5][5] = {
{0.00390625, 0.015625, 0.0234375, 0.015625, 0.00390625},
{0.015625, 0.0625, 0.09375, 0.0625, 0.015625 },
{0.0234375, 0.09375, 0.140625, 0.09375, 0.0234375 },
{0.015625, 0.0625, 0.09375, 0.0625, 0.015625 },
{0.00390625, 0.015625, 0.0234375, 0.015625, 0.00390625} };

int img_cols_max_index = inputMat.cols - 1;
int img_rows_max_index = inputMat.rows - 1;

if (tidx < outputMat.cols && tidy < outputMat.rows)
{
float b, g, r;
for (int x = -2; x <= 2; x++)
{
for (int y = -2; y <= 2; y++)
{
int image_x = tidx + x;
if (image_x < 0)
{
image_x = abs(image_x);
}
if (image_x >= inputMat.cols)
{
image_x = img_cols_max_index - (x - (img_cols_max_index - tidx));
}
int image_y = tidy + y;
if (image_y < 0)
{
image_y = abs(image_y);
}
if (image_y >= inputMat.rows)
{
image_y = img_rows_max_index - (y - (img_rows_max_index - tidy));
}
b += (float)inputMat(image_y, image_x).x * ((float)blur[y + 2][x + 2] * 4);
g += (float)inputMat(image_y, image_x).y * ((float)blur[y + 2][x + 2] * 4);
r += (float)inputMat(image_y, image_x).z * ((float)blur[y + 2][x + 2] * 4);
}
}
outputMat(tidy, tidx).x = (int)b;
outputMat(tidy, tidx).y = (int)g;
outputMat(tidy, tidx).z = (int)r;
}
}

/// <summary>
/// 四倍高斯滤波
/// </summary>
/// <param name="img"></param>
/// <returns></returns>
extern "C" Mat GaussianBlurFourfoldGpu(Mat img)
{
GpuMat inputMat(img);

int width = img.cols;
int height = img.rows;
auto outputMat = GpuMat(height, width, CV_8UC3);

dim3 block(32, 32);
dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
GaussianBlurFourfoldCore << <grid, block >> > (inputMat, outputMat);

cudaThreadSynchronize();

Mat dstImg;
outputMat.download(dstImg);
return dstImg;
}
pyrDown

调用高斯滤波函数模糊图像,设定图像大小为 Size((src.cols+1)/2, (src.rows+1)/2),删除偶数行和偶数列。

效果

代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
__global__ void PyrDownCore(PtrStepSz<uchar3> inputMat, PtrStepSz<uchar3> outputMat)
{
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int tidy = threadIdx.y + blockDim.y * blockIdx.y;

if (tidx < outputMat.cols && tidy < outputMat.rows)
{
outputMat(tidy, tidx) = inputMat(tidy * 2, tidx * 2);
}
}

/// <summary>
/// 金字塔下采样
/// </summary>
/// <param name="img"></param>
/// <returns></returns>
extern "C" Mat PyrDownGpu(Mat img)
{
// 1. 高斯滤波
GpuMat inputMat(img);

int gb_width = img.cols;
int gb_height = img.rows;
auto output_gb_mat = GpuMat(gb_height, gb_width, CV_8UC3);

dim3 block_gb(32, 32);
dim3 grid_gb((gb_width + block_gb.x - 1) / block_gb.x, (gb_height + block_gb.y - 1) / block_gb.y);
GaussianBlurCore << <grid_gb, block_gb >> > (inputMat, output_gb_mat);

// 2. 对高斯滤波后的图像删除偶数行列
int down_width = (img.cols + 1) / 2;
int down_height = (img.rows + 1) / 2;
auto output_down_mat = GpuMat(down_height, down_width, CV_8UC3);

dim3 block_down(32, 32);
dim3 grid_down((down_width + block_down.x - 1) / block_down.x, (down_height + block_down.y - 1) / block_down.y);
PyrDownCore << <grid_down, block_down >> > (output_gb_mat, output_down_mat);

Mat dstImg;
output_down_mat.download(dstImg);
return dstImg;
}
pyrUp

设定图像大小为下采样前大小,偶数行偶数列填充为0,在使用四倍高斯滤波模糊图像。

效果

代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
__global__ void PyrUpCore(PtrStepSz<uchar3> inputMat, PtrStepSz<uchar3> outputMat)
{
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int tidy = threadIdx.y + blockDim.y * blockIdx.y;

if (tidx < outputMat.cols && tidy < outputMat.rows)
{
if (tidy % 2 == 0 && tidx % 2 == 0)
{
outputMat(tidy, tidx) = inputMat(tidy / 2, tidx / 2);
}
else
{
outputMat(tidy, tidx) = uchar3();
}
}
}

/// <summary>
/// 金字塔上采样
/// </summary>
/// <param name="img"></param>
/// <param name="img_size"></param>
/// <returns></returns>
extern "C" Mat PyrUpGpu(Mat img, Size img_size)
{
// 1. 扩大范围填充0
GpuMat inputMat(img);

int up_width = img_size.width;
int up_height = img_size.height;
auto output_up_mat = GpuMat(up_height, up_width, CV_8UC3);

dim3 block_up(32, 32);
dim3 grid_up((up_width + block_up.x - 1) / block_up.x, (up_height + block_up.y - 1) / block_up.y);
PyrUpCore << <grid_up, block_up >> > (inputMat, output_up_mat);

// 2. 四倍高斯滤波
int gb4_width = img_size.width;
int gb4_height = img_size.height;
auto output_gb4_mat = GpuMat(up_height, up_width, CV_8UC3);

dim3 block_gb4(32, 32);
dim3 grid_gb4((gb4_width + block_gb4.x - 1) / block_gb4.x, (gb4_height + block_gb4.y - 1) / block_gb4.y);
GaussianBlurFourfoldCore << <grid_gb4, block_gb4 >> > (output_up_mat, output_gb4_mat);

Mat dstImg;
output_gb4_mat.download(dstImg);
return dstImg;
}
subtract

计算两个数组或数组与标量之间的每元素差,生成拉普拉斯金字塔所需图像。

效果

代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
__global__ void SubtractCore(PtrStepSz<uchar3> inputMat1, PtrStepSz<uchar3> inputMat2, PtrStepSz<uchar3> outputMat)
{
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int tidy = threadIdx.y + blockDim.y * blockIdx.y;

if (tidx < outputMat.cols && tidy < outputMat.rows)
{
outputMat(tidy, tidx).x = inputMat1(tidy, tidx).x - inputMat2(tidy, tidx).x;
outputMat(tidy, tidx).y = inputMat1(tidy, tidx).y - inputMat2(tidy, tidx).y;
outputMat(tidy, tidx).z = inputMat1(tidy, tidx).z - inputMat2(tidy, tidx).z;
}
}

/// <summary>
/// 图像减除
/// </summary>
/// <param name="img1"></param>
/// <param name="img2"></param>
/// <returns></returns>
extern "C" Mat SubtractGpu(Mat img1, Mat img2)
{
GpuMat inputMat1(img1);
GpuMat inputMat2(img2);

int width = img1.cols;
int height = img1.rows;
auto outputMat = GpuMat(height, width, CV_8UC3);

dim3 block(32, 32);
dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
SubtractCore << <grid, block >> > (inputMat1, inputMat2, outputMat);

cudaThreadSynchronize();

Mat dstImg;
outputMat.download(dstImg);
return dstImg;
}
add

计算两个数组或一个数组和一个标量的每元素和,使用上采样图像与拉普拉斯金字塔图像相加可还原图像

效果

代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
__global__ void AddCore(PtrStepSz<uchar3> inputMat1, PtrStepSz<uchar3> inputMat2, PtrStepSz<uchar3> outputMat)
{
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int tidy = threadIdx.y + blockDim.y * blockIdx.y;

if (tidx < outputMat.cols && tidy < outputMat.rows)
{
outputMat(tidy, tidx).x = inputMat1(tidy, tidx).x + inputMat2(tidy, tidx).x;
outputMat(tidy, tidx).y = inputMat1(tidy, tidx).y + inputMat2(tidy, tidx).y;
outputMat(tidy, tidx).z = inputMat1(tidy, tidx).z + inputMat2(tidy, tidx).z;
}
}

/// <summary>
/// 图像增加
/// </summary>
/// <param name="img1"></param>
/// <param name="img2"></param>
/// <returns></returns>
extern "C" Mat AddGpu(Mat img1, Mat img2)
{
GpuMat inputMat1(img1);
GpuMat inputMat2(img2);

int width = img1.cols;
int height = img1.rows;
auto outputMat = GpuMat(height, width, CV_8UC3);

dim3 block(32, 32);
dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
AddCore << <grid, block >> > (inputMat1, inputMat2, outputMat);

cudaThreadSynchronize();

Mat dstImg;
outputMat.download(dstImg);
return dstImg;
}