使用 CuDNN 进行卷积运算【读书笔记】
【摘要】
原文链接 http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/
以下为长截图,CSDN 限定了图片长度,请点击查看原图
#include <cudnn.h> // ht...
-
-
#include <cudnn.h>
-
-
// http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/
-
-
// Checks the status returned by a cuDNN call and aborts with the failing
// line number and a human-readable error string on failure.
//
// The body is wrapped in `do { ... } while (0)` (instead of a bare `{ ... }`
// block as in the original) so the macro behaves like a single statement:
// `if (cond) checkCUDNN(e); else ...` parses correctly and the trailing
// semicolon at the call site is consumed.
#define checkCUDNN(expression)                                     \
  do {                                                             \
    cudnnStatus_t status = (expression);                           \
    if (status != CUDNN_STATUS_SUCCESS) {                          \
      std::cerr << "Error on line " << __LINE__ << ": "            \
                << cudnnGetErrorString(status) << std::endl;       \
      std::exit(EXIT_FAILURE);                                     \
    }                                                              \
  } while (0)
-
-
// Applies a fixed 3x3 edge-detection kernel to an image using cuDNN's
// forward convolution, optionally followed by a sigmoid activation, and
// writes the result to "../cudnn-out.png".
//
// usage: conv <image> [gpu=0] [sigmoid=0]
//
// NOTE(review): load_image / save_image are defined elsewhere in the
// original article; load_image presumably returns a 3-channel float
// (CV_32FC3) cv::Mat — confirm against its definition.
int main(int argc, const char* argv[]) {
  if (argc < 2) {
    std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
    std::exit(EXIT_FAILURE);
  }

  int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
  std::cerr << "GPU: " << gpu_id << std::endl;

  bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
  std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;

  cv::Mat image = load_image(argv[1]);

  // The original ignored this return code; a bad --gpu index would then make
  // every later call fail mysteriously.
  if (cudaSetDevice(gpu_id) != cudaSuccess) {
    std::cerr << "cudaSetDevice(" << gpu_id << ") failed" << std::endl;
    std::exit(EXIT_FAILURE);
  }

  cudnnHandle_t cudnn;
  checkCUDNN(cudnnCreate(&cudnn));  // was unchecked in the original

  // Input tensor descriptor. NHWC because that is how OpenCV stores the
  // interleaved image data (channel varies fastest); other frameworks prefer
  // channels-first (NCHW).
  cudnnTensorDescriptor_t input_descriptor;
  checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                        /*format=*/CUDNN_TENSOR_NHWC,
                                        /*dataType=*/CUDNN_DATA_FLOAT,
                                        /*batch_size=*/1,
                                        /*channels=*/3,
                                        /*image_height=*/image.rows,
                                        /*image_width=*/image.cols));

  // Filter (kernel) descriptor: shape and layout. Note the layout is NCHW,
  // matching how h_kernel below is laid out in host memory.
  cudnnFilterDescriptor_t kernel_descriptor;
  checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
  checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                        /*dataType=*/CUDNN_DATA_FLOAT,
                                        /*format=*/CUDNN_TENSOR_NCHW,
                                        /*out_channels=*/3,
                                        /*in_channels=*/3,
                                        /*kernel_height=*/3,
                                        /*kernel_width=*/3));

  // Convolution operation descriptor: padding, stride, dilation.
  // CUDNN_CROSS_CORRELATION is what deep-learning frameworks call
  // "convolution" (no kernel flip); CUDNN_CONVOLUTION would flip the kernel.
  cudnnConvolutionDescriptor_t convolution_descriptor;
  checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
  checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                             /*pad_height=*/1,
                                             /*pad_width=*/1,
                                             /*vertical_stride=*/1,
                                             /*horizontal_stride=*/1,
                                             /*dilation_height=*/1,
                                             /*dilation_width=*/1,
                                             /*mode=*/CUDNN_CROSS_CORRELATION,
                                             /*computeType=*/CUDNN_DATA_FLOAT));

  // Ask cuDNN for the output dimensions of this convolution.
  int batch_size{0}, channels{0}, height{0}, width{0};
  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
                                                   input_descriptor,
                                                   kernel_descriptor,
                                                   &batch_size,
                                                   &channels,
                                                   &height,
                                                   &width));

  std::cerr << "Output Image: " << height << " x " << width << " x " << channels
            << std::endl;

  // Output tensor descriptor. The original hard-coded image.rows/image.cols
  // here; use the dimensions cuDNN just computed instead (identical for this
  // pad=1/stride=1/3x3 setup, but correct for any other configuration too).
  cudnnTensorDescriptor_t output_descriptor;
  checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                        /*format=*/CUDNN_TENSOR_NHWC,
                                        /*dataType=*/CUDNN_DATA_FLOAT,
                                        /*batch_size=*/batch_size,
                                        /*channels=*/channels,
                                        /*image_height=*/height,
                                        /*image_width=*/width));

  // Pick a forward-convolution algorithm. Alternatives include
  // CUDNN_CONVOLUTION_FWD_ALGO_GEMM (explicit matrix multiply),
  // CUDNN_CONVOLUTION_FWD_ALGO_FFT (FFT-based) and
  // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD.
  // With CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT instead of
  // PREFER_FASTEST, memoryLimitInBytes bounds the workspace size.
  // NOTE(review): cudnnGetConvolutionForwardAlgorithm was removed in
  // cuDNN 8; there, use cudnnGetConvolutionForwardAlgorithm_v7 or
  // cudnnFindConvolutionForwardAlgorithm instead.
  cudnnConvolutionFwdAlgo_t convolution_algorithm;
  checkCUDNN(
      cudnnGetConvolutionForwardAlgorithm(cudnn,
                                          input_descriptor,
                                          kernel_descriptor,
                                          convolution_descriptor,
                                          output_descriptor,
                                          CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                                          /*memoryLimitInBytes=*/0,
                                          &convolution_algorithm));

  // How much scratch memory does the chosen algorithm need?
  size_t workspace_bytes{0};
  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                     input_descriptor,
                                                     kernel_descriptor,
                                                     convolution_descriptor,
                                                     output_descriptor,
                                                     convolution_algorithm,
                                                     &workspace_bytes));
  std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
            << std::endl;

  // Some algorithms legitimately need zero workspace, so (unlike the
  // original, which asserted workspace_bytes > 0) only allocate when needed;
  // cudnnConvolutionForward accepts a null workspace in that case.
  void* d_workspace{nullptr};
  if (workspace_bytes > 0) {
    cudaMalloc(&d_workspace, workspace_bytes);
  }

  // Buffer size derived from the cudnnGetConvolution2dForwardOutputDim
  // results; size_t avoids int overflow for very large images.
  size_t image_bytes =
      static_cast<size_t>(batch_size) * channels * height * width * sizeof(float);

  float* d_input{nullptr};
  cudaMalloc(&d_input, image_bytes);
  cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);

  float* d_output{nullptr};
  cudaMalloc(&d_output, image_bytes);
  cudaMemset(d_output, 0, image_bytes);

  // A Laplacian-style edge-detection kernel.
  // clang-format off
  const float kernel_template[3][3] = {
    { 1,  1, 1 },
    { 1, -8, 1 },
    { 1,  1, 1 }
  };
  // clang-format on

  // Replicate the template across all 3 output and 3 input channels, laid
  // out NCHW to match kernel_descriptor above.
  float h_kernel[3][3][3][3];
  for (int kernel = 0; kernel < 3; ++kernel) {
    for (int channel = 0; channel < 3; ++channel) {
      for (int row = 0; row < 3; ++row) {
        for (int column = 0; column < 3; ++column) {
          h_kernel[kernel][channel][row][column] = kernel_template[row][column];
        }
      }
    }
  }

  float* d_kernel{nullptr};
  cudaMalloc(&d_kernel, sizeof(h_kernel));
  cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);

  const float alpha = 1.0f, beta = 0.0f;

  // The actual forward convolution: d_output = alpha * conv(d_input) + beta.
  checkCUDNN(cudnnConvolutionForward(cudnn,
                                     &alpha,
                                     input_descriptor,
                                     d_input,
                                     kernel_descriptor,
                                     d_kernel,
                                     convolution_descriptor,
                                     convolution_algorithm,
                                     d_workspace,  // may be nullptr (see above)
                                     workspace_bytes,
                                     &beta,
                                     output_descriptor,
                                     d_output));

  if (with_sigmoid) {
    // Describe and apply an in-place sigmoid activation on the output.
    cudnnActivationDescriptor_t activation_descriptor;
    checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
    checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
                                            CUDNN_ACTIVATION_SIGMOID,
                                            CUDNN_PROPAGATE_NAN,
                                            /*relu_coef=*/0));
    checkCUDNN(cudnnActivationForward(cudnn,
                                      activation_descriptor,
                                      &alpha,
                                      output_descriptor,
                                      d_output,
                                      &beta,
                                      output_descriptor,
                                      d_output));
    cudnnDestroyActivationDescriptor(activation_descriptor);
  }

  // image_bytes is a byte count; the original allocated that many *floats*
  // (4x too much memory). Allocate the element count instead. The blocking
  // cudaMemcpy also serves as the synchronization point before the host
  // reads the result.
  float* h_output = new float[image_bytes / sizeof(float)];
  cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);

  save_image("../cudnn-out.png", h_output, height, width);

  delete[] h_output;
  cudaFree(d_kernel);
  cudaFree(d_input);
  cudaFree(d_output);
  cudaFree(d_workspace);  // cudaFree(nullptr) is a no-op

  // Destroy descriptors and the cuDNN handle.
  cudnnDestroyTensorDescriptor(input_descriptor);
  cudnnDestroyTensorDescriptor(output_descriptor);
  cudnnDestroyFilterDescriptor(kernel_descriptor);
  cudnnDestroyConvolutionDescriptor(convolution_descriptor);

  cudnnDestroy(cudnn);
}
文章来源: panda1234lee.blog.csdn.net,作者:panda1234lee,版权归原作者所有,如需转载,请联系作者。
原文链接:panda1234lee.blog.csdn.net/article/details/83154504
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)