使用 CuDNN 进行卷积运算【读书笔记】

举报
ShaderJoy 发表于 2021/12/29 22:54:28 2021/12/29
1.6k+ 0 0
【摘要】 原文链接 http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/  以下为长截图,CSDN 限定了图片长度,请点击查看原图 #include <cudnn.h> // ht...

原文链接 http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/ 

以下为长截图,CSDN 限定了图片长度,请点击查看原图


      #include <cudnn.h>
      // http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/
      #define checkCUDNN(expression) \
     
       { \
       cudnnStatus_t status = (expression); \
       if (status != CUDNN_STATUS_SUCCESS) { \
       std::cerr << "Error on line " << __LINE__ << ": " \
       << cudnnGetErrorString(status) << std::endl; \
       std::exit(EXIT_FAILURE); \
       } \
       }
      int main(int argc, const char* argv[]) {
     	if (argc < 2) {
      		std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
      		std::exit(EXIT_FAILURE);
      	}
     	int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
      	std::cerr << "GPU: " << gpu_id << std::endl;
     	bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
      	std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;
      	cv::Mat image = load_image(argv[1]);
     	cudaSetDevice(gpu_id);
      	cudnnHandle_t cudnn;
     	cudnnCreate(&cudnn);
     	// 输入张量的描述
      	cudnnTensorDescriptor_t input_descriptor;
     	checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
     	checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
     		/*format=*/CUDNN_TENSOR_NHWC,	// 注意是 NHWC,TensorFlow更喜欢以 NHWC 格式存储张量(通道是变化最频繁的地方,即 BGR),而其他一些更喜欢将通道放在前面
     		/*dataType=*/CUDNN_DATA_FLOAT,
     		/*batch_size=*/1,
     		/*channels=*/3,
     		/*image_height=*/image.rows,
     		/*image_width=*/image.cols));
     	// 卷积核的描述(形状、格式)
      	cudnnFilterDescriptor_t kernel_descriptor;
     	checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
     	checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
     		/*dataType=*/CUDNN_DATA_FLOAT,
     		/*format=*/CUDNN_TENSOR_NCHW,	// 注意是 NCHW
     		/*out_channels=*/3,
     		/*in_channels=*/3,
     		/*kernel_height=*/3,
     		/*kernel_width=*/3));
     	// 卷积操作的描述(步长、填充等等)
      	cudnnConvolutionDescriptor_t convolution_descriptor;
     	checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
     	checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
     		/*pad_height=*/1,
     		/*pad_width=*/1,
     		/*vertical_stride=*/1,
     		/*horizontal_stride=*/1,
     		/*dilation_height=*/1,
     		/*dilation_width=*/1,
     		/*mode=*/CUDNN_CROSS_CORRELATION, // CUDNN_CONVOLUTION
     		/*computeType=*/CUDNN_DATA_FLOAT));
     	// 计算卷积后图像的维数
     	int batch_size{ 0 }, channels{ 0 }, height{ 0 }, width{ 0 };
     	checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
      		input_descriptor,
      		kernel_descriptor,
      		&batch_size,
      		&channels,
      		&height,
      		&width));
      	std::cerr << "Output Image: " << height << " x " << width << " x " << channels
      		<< std::endl;
     	// 卷积输出张量的描述
      	cudnnTensorDescriptor_t output_descriptor;
     	checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
     	checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
     		/*format=*/CUDNN_TENSOR_NHWC,
     		/*dataType=*/CUDNN_DATA_FLOAT,
     		/*batch_size=*/1,
     		/*channels=*/3,
     		/*image_height=*/image.rows,
     		/*image_width=*/image.cols));
     	// 卷积算法的描述
     	// cudnn_tion_fwd_algo_gemm——将卷积建模为显式矩阵乘法,
     	// cudnn_tion_fwd_algo_fft——它使用快速傅立叶变换(FFT)进行卷积或
     	// cudnn_tion_fwd_algo_winograd——它使用Winograd算法执行卷积。
      	cudnnConvolutionFwdAlgo_t convolution_algorithm;
     	checkCUDNN(
     		cudnnGetConvolutionForwardAlgorithm(cudnn,
      		input_descriptor,
      		kernel_descriptor,
      		convolution_descriptor,
      		output_descriptor,
      		CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, // CUDNN_CONVOLUTION_FWD_SPECIFY_​WORKSPACE_LIMIT(在内存受限的情况下,memoryLimitInBytes 设置非 0 值)
     		/*memoryLimitInBytes=*/0,
      		&convolution_algorithm));
     	// 计算 cuDNN 它的操作需要多少内存
     	size_t workspace_bytes{ 0 };
     	checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
      		input_descriptor,
      		kernel_descriptor,
      		convolution_descriptor,
      		output_descriptor,
      		convolution_algorithm,
      		&workspace_bytes));
      	std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
      		<< std::endl;
     	assert(workspace_bytes > 0);
     	// *************************************************************************
     	// 分配内存, 从 cudnnGetConvolutionForwardWorkspaceSize 计算而得
     	void* d_workspace{ nullptr };
     	cudaMalloc(&d_workspace, workspace_bytes);
     	// 从 cudnnGetConvolution2dForwardOutputDim 计算而得
     	int image_bytes = batch_size * channels * height * width * sizeof(float);
     	float* d_input{ nullptr };
     	cudaMalloc(&d_input, image_bytes);
     	cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);
     	float* d_output{ nullptr };
     	cudaMalloc(&d_output, image_bytes);
     	cudaMemset(d_output, 0, image_bytes);
     	// *************************************************************************
     	// clang-format off
     	const float kernel_template[3][3] = {
      		{ 1, 1, 1 },
      		{ 1, -8, 1 },
      		{ 1, 1, 1 }
      	};
     	// clang-format on
     	float h_kernel[3][3][3][3]; // NCHW
     	for (int kernel = 0; kernel < 3; ++kernel) {
     		for (int channel = 0; channel < 3; ++channel) {
     			for (int row = 0; row < 3; ++row) {
     				for (int column = 0; column < 3; ++column) {
      					h_kernel[kernel][channel][row][column] = kernel_template[row][column];
      				}
      			}
      		}
      	}
     	float* d_kernel{ nullptr };
     	cudaMalloc(&d_kernel, sizeof(h_kernel));
     	cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
     	// *************************************************************************
     	const float alpha = 1.0f, beta = 0.0f;
     	// 真正的卷积操作 !!!前向卷积
     	checkCUDNN(cudnnConvolutionForward(cudnn,
      		&alpha,
      		input_descriptor,
      		d_input,
      		kernel_descriptor,
      		d_kernel,
      		convolution_descriptor,
      		convolution_algorithm,
      		d_workspace, // 注意,如果我们选择不需要额外内存的卷积算法,d_workspace可以为nullptr。
      		workspace_bytes,
      		&beta,
      		output_descriptor,
      		d_output));
     	if (with_sigmoid) {
     		// 描述激活
      		cudnnActivationDescriptor_t activation_descriptor;
     		checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
     		checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
      			CUDNN_ACTIVATION_SIGMOID,
      			CUDNN_PROPAGATE_NAN,
     			/*relu_coef=*/0));
     		// 前向 sigmoid 激活函数
     		checkCUDNN(cudnnActivationForward(cudnn,
      			activation_descriptor,
      			&alpha,
      			output_descriptor,
      			d_output,
      			&beta,
      			output_descriptor,
      			d_output));
     		cudnnDestroyActivationDescriptor(activation_descriptor);
      	}
     	float* h_output = new float[image_bytes];
     	cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);
     	save_image("../cudnn-out.png", h_output, height, width);
     	delete[] h_output;
     	cudaFree(d_kernel);
     	cudaFree(d_input);
     	cudaFree(d_output);
     	cudaFree(d_workspace);
     	// 销毁
     	cudnnDestroyTensorDescriptor(input_descriptor);
     	cudnnDestroyTensorDescriptor(output_descriptor);
     	cudnnDestroyFilterDescriptor(kernel_descriptor);
     	cudnnDestroyConvolutionDescriptor(convolution_descriptor);
     	cudnnDestroy(cudnn);
      }
  
 

文章来源: panda1234lee.blog.csdn.net,作者:panda1234lee,版权归原作者所有,如需转载,请联系作者。

原文链接:panda1234lee.blog.csdn.net/article/details/83154504

【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

作者其他文章

评论(0)

抱歉,系统识别当前为高风险访问,暂不支持该操作

    全部回复

    上滑加载中

    设置昵称

    在此一键设置昵称,即可参与社区互动!

    *长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

    *长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。