使用 CuDNN 进行卷积运算【读书笔记】

举报
ShaderJoy 发表于 2021/12/29 22:54:28 2021/12/29
【摘要】 原文链接 http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/  以下为长截图,CSDN 限定了图片长度,请点击查看原图 #include <cudnn.h> // ht...

原文链接 http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/ 

以下为长截图,CSDN 限定了图片长度,请点击查看原图


  
  1. #include <cudnn.h>
  2. // http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/
  3. #define checkCUDNN(expression) \
  4. { \
  5. cudnnStatus_t status = (expression); \
  6. if (status != CUDNN_STATUS_SUCCESS) { \
  7. std::cerr << "Error on line " << __LINE__ << ": " \
  8. << cudnnGetErrorString(status) << std::endl; \
  9. std::exit(EXIT_FAILURE); \
  10. } \
  11. }
  12. int main(int argc, const char* argv[]) {
  13. if (argc < 2) {
  14. std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
  15. std::exit(EXIT_FAILURE);
  16. }
  17. int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
  18. std::cerr << "GPU: " << gpu_id << std::endl;
  19. bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
  20. std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;
  21. cv::Mat image = load_image(argv[1]);
  22. cudaSetDevice(gpu_id);
  23. cudnnHandle_t cudnn;
  24. cudnnCreate(&cudnn);
  25. // 输入张量的描述
  26. cudnnTensorDescriptor_t input_descriptor;
  27. checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
  28. checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
  29. /*format=*/CUDNN_TENSOR_NHWC, // 注意是 NHWC,TensorFlow更喜欢以 NHWC 格式存储张量(通道是变化最频繁的地方,即 BGR),而其他一些更喜欢将通道放在前面
  30. /*dataType=*/CUDNN_DATA_FLOAT,
  31. /*batch_size=*/1,
  32. /*channels=*/3,
  33. /*image_height=*/image.rows,
  34. /*image_width=*/image.cols));
  35. // 卷积核的描述(形状、格式)
  36. cudnnFilterDescriptor_t kernel_descriptor;
  37. checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
  38. checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
  39. /*dataType=*/CUDNN_DATA_FLOAT,
  40. /*format=*/CUDNN_TENSOR_NCHW, // 注意是 NCHW
  41. /*out_channels=*/3,
  42. /*in_channels=*/3,
  43. /*kernel_height=*/3,
  44. /*kernel_width=*/3));
  45. // 卷积操作的描述(步长、填充等等)
  46. cudnnConvolutionDescriptor_t convolution_descriptor;
  47. checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
  48. checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
  49. /*pad_height=*/1,
  50. /*pad_width=*/1,
  51. /*vertical_stride=*/1,
  52. /*horizontal_stride=*/1,
  53. /*dilation_height=*/1,
  54. /*dilation_width=*/1,
  55. /*mode=*/CUDNN_CROSS_CORRELATION, // CUDNN_CONVOLUTION
  56. /*computeType=*/CUDNN_DATA_FLOAT));
  57. // 计算卷积后图像的维数
  58. int batch_size{ 0 }, channels{ 0 }, height{ 0 }, width{ 0 };
  59. checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
  60. input_descriptor,
  61. kernel_descriptor,
  62. &batch_size,
  63. &channels,
  64. &height,
  65. &width));
  66. std::cerr << "Output Image: " << height << " x " << width << " x " << channels
  67. << std::endl;
  68. // 卷积输出张量的描述
  69. cudnnTensorDescriptor_t output_descriptor;
  70. checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
  71. checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
  72. /*format=*/CUDNN_TENSOR_NHWC,
  73. /*dataType=*/CUDNN_DATA_FLOAT,
  74. /*batch_size=*/1,
  75. /*channels=*/3,
  76. /*image_height=*/image.rows,
  77. /*image_width=*/image.cols));
  78. // 卷积算法的描述
  79. // cudnn_tion_fwd_algo_gemm——将卷积建模为显式矩阵乘法,
  80. // cudnn_tion_fwd_algo_fft——它使用快速傅立叶变换(FFT)进行卷积或
  81. // cudnn_tion_fwd_algo_winograd——它使用Winograd算法执行卷积。
  82. cudnnConvolutionFwdAlgo_t convolution_algorithm;
  83. checkCUDNN(
  84. cudnnGetConvolutionForwardAlgorithm(cudnn,
  85. input_descriptor,
  86. kernel_descriptor,
  87. convolution_descriptor,
  88. output_descriptor,
  89. CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, // CUDNN_CONVOLUTION_FWD_SPECIFY_​WORKSPACE_LIMIT(在内存受限的情况下,memoryLimitInBytes 设置非 0 值)
  90. /*memoryLimitInBytes=*/0,
  91. &convolution_algorithm));
  92. // 计算 cuDNN 它的操作需要多少内存
  93. size_t workspace_bytes{ 0 };
  94. checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
  95. input_descriptor,
  96. kernel_descriptor,
  97. convolution_descriptor,
  98. output_descriptor,
  99. convolution_algorithm,
  100. &workspace_bytes));
  101. std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
  102. << std::endl;
  103. assert(workspace_bytes > 0);
  104. // *************************************************************************
  105. // 分配内存, 从 cudnnGetConvolutionForwardWorkspaceSize 计算而得
  106. void* d_workspace{ nullptr };
  107. cudaMalloc(&d_workspace, workspace_bytes);
  108. // 从 cudnnGetConvolution2dForwardOutputDim 计算而得
  109. int image_bytes = batch_size * channels * height * width * sizeof(float);
  110. float* d_input{ nullptr };
  111. cudaMalloc(&d_input, image_bytes);
  112. cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);
  113. float* d_output{ nullptr };
  114. cudaMalloc(&d_output, image_bytes);
  115. cudaMemset(d_output, 0, image_bytes);
  116. // *************************************************************************
  117. // clang-format off
  118. const float kernel_template[3][3] = {
  119. { 1, 1, 1 },
  120. { 1, -8, 1 },
  121. { 1, 1, 1 }
  122. };
  123. // clang-format on
  124. float h_kernel[3][3][3][3]; // NCHW
  125. for (int kernel = 0; kernel < 3; ++kernel) {
  126. for (int channel = 0; channel < 3; ++channel) {
  127. for (int row = 0; row < 3; ++row) {
  128. for (int column = 0; column < 3; ++column) {
  129. h_kernel[kernel][channel][row][column] = kernel_template[row][column];
  130. }
  131. }
  132. }
  133. }
  134. float* d_kernel{ nullptr };
  135. cudaMalloc(&d_kernel, sizeof(h_kernel));
  136. cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
  137. // *************************************************************************
  138. const float alpha = 1.0f, beta = 0.0f;
  139. // 真正的卷积操作 !!!前向卷积
  140. checkCUDNN(cudnnConvolutionForward(cudnn,
  141. &alpha,
  142. input_descriptor,
  143. d_input,
  144. kernel_descriptor,
  145. d_kernel,
  146. convolution_descriptor,
  147. convolution_algorithm,
  148. d_workspace, // 注意,如果我们选择不需要额外内存的卷积算法,d_workspace可以为nullptr。
  149. workspace_bytes,
  150. &beta,
  151. output_descriptor,
  152. d_output));
  153. if (with_sigmoid) {
  154. // 描述激活
  155. cudnnActivationDescriptor_t activation_descriptor;
  156. checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
  157. checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
  158. CUDNN_ACTIVATION_SIGMOID,
  159. CUDNN_PROPAGATE_NAN,
  160. /*relu_coef=*/0));
  161. // 前向 sigmoid 激活函数
  162. checkCUDNN(cudnnActivationForward(cudnn,
  163. activation_descriptor,
  164. &alpha,
  165. output_descriptor,
  166. d_output,
  167. &beta,
  168. output_descriptor,
  169. d_output));
  170. cudnnDestroyActivationDescriptor(activation_descriptor);
  171. }
  172. float* h_output = new float[image_bytes];
  173. cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);
  174. save_image("../cudnn-out.png", h_output, height, width);
  175. delete[] h_output;
  176. cudaFree(d_kernel);
  177. cudaFree(d_input);
  178. cudaFree(d_output);
  179. cudaFree(d_workspace);
  180. // 销毁
  181. cudnnDestroyTensorDescriptor(input_descriptor);
  182. cudnnDestroyTensorDescriptor(output_descriptor);
  183. cudnnDestroyFilterDescriptor(kernel_descriptor);
  184. cudnnDestroyConvolutionDescriptor(convolution_descriptor);
  185. cudnnDestroy(cudnn);
  186. }

 

文章来源: panda1234lee.blog.csdn.net,作者:panda1234lee,版权归原作者所有,如需转载,请联系作者。

原文链接:panda1234lee.blog.csdn.net/article/details/83154504

【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0)

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。