- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

SLAM各种并行加速方法

Hermit_Rabbit 发表于 2022/09/21 20:23:26 2022/09/21

【摘要】前言 CPU并行加速CPU并行加速的本质就是通过硬件并发（hardware concurrency）的形式来实现。这种的操作方式是通过单个进程里多线程，从而实现共享地址空间，全局变量，指针，引用。但是这种方式相对而言更加传统，但是同时更加具有普适性。其中操作是使用以pthread为代表的多线程并行加速 pthread这是一个pthread的简单示例代码。class helloFromObj...

前言

CPU并行加速

CPU并行加速的本质就是通过硬件并发（hardware concurrency）的形式来实现。这种的操作方式是通过单个进程里多线程，从而实现共享地址空间，全局变量，指针，引用。但是这种方式相对而言更加传统，但是同时更加具有普适性。

其中操作是使用以pthread为代表的多线程并行加速

pthread

这是一个pthread的简单示例代码。

class helloFromObject{
public:
    void operator()() const{
        std::cout << "Hello, My Second thread!" << std::endl;
    }
};

int main() {
    std::cout << "Hello, Coconut Cat!" << std::endl;
    std::thread bthread((helloFromObject()));
    bthread.join();
    return 0;
}

我们可以发现pthread这种多线程加速v-slam场景下都有着充分的应用

OpenMP

openmp作为另一种CPU提速方式，在SLAM的特征提取中拥有比较良好的加速代码。当然thread对于openmp还是有一定的影响的，每个thread分配给omp的线程可能减小或者是在thread里面继续调用omp再开线程会带来更大的成本，导致omp单独执行时变慢。

#include <cv.h>
#include <highgui.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#include "bench_source/harris.h"

// CHECK_FINAL_RESULT toggles the display of the input and output images
//#define CHECK_FINAL_RESULT
// Experiment : try to run multiples pipelines in parallel
//#define RUN_PARALLEL = true
// Check if image to matrix translation produces the correst output
//#define CHECK_LOADING_DATA
using namespace std;

static int minruns = 1;

int main(int argc, char ** argv)
{
  int i, j, run;
  int R, C, nruns;
  double begin, end;
  double init, finish;
  double stime, avgt;
  cv::Mat image, loaded_data;
  cv::Scalar sc;
  cv::Size size;

  float *t_res;
  float *t_data;

  /* Might be unused depending on preprocessor macro definitions */
  (void)t_res;
  (void)t_data;
  (void)loaded_data;

#ifdef VERSION_ALIGNED
    float ** data;
    float ** res;
    printf("VERSION_ALIGNED\n");
    int cache_line_size = get_cache_line_size();
#else
    float * data;
    float * res;
#endif

  if ( argc != 3 )
  {
      printf("usage: harris <Image_Path> <Nruns>\n");
     return -1;
  }

  printf("Loading image ....\n");
  image = cv::imread( argv[1], 1 );
  printf("%s successfully loaded !\n\n", argv[1]);

  if ( !image.data )
  {
      printf("No image data ! Are you sure %s is an image ?\n", argv[1]);
      return -1;
  }

  // Convert image input to grayscale floating point
  cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
  size = image.size();
  C = size.width;
  R = size.height;
  nruns = max(minruns, atoi(argv[2]));

  printf("_________________________________________\n");
  printf("Values settings :\n");
  printf("Nruns : %i || %s [%i, %i]\n", nruns, argv[1], R, C);
  printf("_________________________________________\n");

#ifdef VERSION_ALIGNED
    res = alloc_array_lines(R, C, cache_line_size);
#else
    res = (float *) calloc(R * C, sizeof(*res));
#endif

  if(res == NULL)
  {
    printf("Error while allocating result table of size %ld B\n",
      (sizeof(*res) * C * R ));
    return -1;
  }

#ifdef VERSION_ALIGNED
    data = (float **) alloc_array_lines(R, C, cache_line_size);
    for(i= 0; i < R;i++){
      for(j = 0; j < C;j++){
        sc = image.at<uchar>(i, j) ;
        data[i][j] = (float) sc.val[0]/255;
      }
    }
#else
    data = (float *) malloc(R*C*sizeof(float));
    for(i= 0; i < R;i++){
      for(j = 0; j < C;j++){
        sc = image.at<uchar>(i, j) ;
        data[i*C+j] = (float) sc.val[0]/255;
      }
    }
#endif

  // Running tests
  avgt = 0.0f;
  /*
  Do not use clock here we need elapsed "wall clock time", not total CPU time.
  */
  init =  omp_get_wtime();

#ifdef RUN_PARALLEL
    #pragma omp parallel for shared(avgt)
#endif
  for(run = 0; run <= nruns; run++)
  {
    begin = omp_get_wtime();
    pipeline_harris(C, R, data, res);
    end = omp_get_wtime();
    stime = end - begin;
    if(run !=0){
      printf("Run %i : \t\t %f ms\n", run, (double) stime * 1000.0 );
#ifdef RUN_PARALLEL
        #pragma omp atomic
#endif
      avgt += stime;
    }
  }

  finish =  omp_get_wtime();

  if(avgt == 0)
  {
    printf("Error : running didn't take time !");
    return -1;
  }
  printf("Average time : %f ms\n", (double) (1000.0 * avgt / (nruns)));
  printf("Total time : %f ms\n", (double) (finish - init) * 1000.0);

#ifdef RUN_PARALLEL
    printf("Gain total times to run %i instances in parallel / serial time :\n ", nruns);
    printf("\t %f\n",(double) (finish-init)/(avgt));
#endif

  // Checking images using OpenCV

#ifdef VERSION_ALIGNED
    t_res = (float *) malloc(sizeof(float)*R*C);
    for(int i = 0; i < R; i++){
      for(int j = 0; j < C; j++){
        t_res[i * C + j] = res[i][j];
      }
    }
#else
   t_res = res;
#endif

#ifdef CHECK_LOADING_DATA
#ifdef VERSION_ALIGNED
    t_data = (float *) malloc(sizeof(float)*R*C);
    for(int i = 0; i < R; i++){
      for(int j = 0; j < C; j++){
        t_data[i * C + j] = data[i][j];
      }
    }
    loaded_data = cv::Mat(R,C,CV_32F, t_data);
#else
    loaded_data = cv::Mat(R,C,CV_32F, data);
#endif

    cv::namedWindow( "Check data", cv::WINDOW_NORMAL);
    cv::imshow( "Check data", loaded_data);
    cv::waitKey(0);
    cv::destroyAllWindows();
    loaded_data.release();

#ifdef VERSION_ALIGNED
      free(t_data);
#endif

#endif /* CHECK_LOADING_DATA */

#ifdef CHECK_FINAL_RESULT
    cv::Mat imres = cv::Mat(R, C, CV_32F, t_res);
    cv::namedWindow( "Input", cv::WINDOW_NORMAL );
    cv::imshow( "Input", image );
    cv::namedWindow( "Output", cv::WINDOW_NORMAL );
    cv::imshow( "Output", imres * 65535.0 );
    cv::waitKey(0);
    cv::destroyAllWindows();
    imres.release();
    free(t_res);
#endif /* CHECK_FINAL_RESULT */

#ifdef VERSION_ALIGNED
    free(res);
#endif

  image.release();
  free(data);
  return 0;

}

GPU并行加速

Nvidia 的CUDA工具箱中提高了免费的GPU加速的快速傅里叶变换（FFT）、基本线性代数子程序（BLAST）、图像与视频处理库（NPP）。用户只要把源代码中CPU版本的快速傅里叶变换、快速傅里叶变换和图像与视频处理库替换成相应的GPU版，即可得到性能加速。除了Nvidia提供的函数库以外，第三方的GPU函数库有：

CUDA数据并行基元库（cuDPP）
CULA工具：由EM Photonics公司推出， CUDA GPU中的LAPACK
MAGMA：由Dongarra’s Group推出，CUDA GPU和多核CPU中的LAPACK
雅可比预处理共轭梯度（JCG）
GPULib：针对接口描述语言（IDL）以及矩阵实验室（MATLAB）的数学函数库
GPU VSIPL信号处理库
计算机视觉（CV）以及成像库
OpenCurrent:规则网格系统中CUDA加速PDE(partial
differential equation，偏微分方程)开源数据库解决方案
CUDA / GPU中的libSVM
Multisvm：利用CUDA的多等级SVM
cuSVM：支持矢量分类与衰减的CUDA使用方法

CUDA

目前v-slam算法的主要是依赖CUDA开发加速的，为此，我们在GPU加速部分仅仅对CUDA进行举例介绍，等后面作者涉及到其他的加速后，再来扩充。我们在使用CPU并行计算的同时可以使用GPU加速，从而来耗费一定的GPU计算资源来提升视觉前端处理的能力。

…详情请参照古月居

<!--

【声明】本内容来自华为云开发者社区博主，不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源（华为云社区）、文章链接、文章作者等基本信息，否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容，欢迎发送邮件进行举报，并提供相关证据，一经查实，本社区将立刻删除涉嫌侵权内容，举报邮箱： cloudbbs@huaweicloud.com

点赞
收藏
关注作者

0/1000

抱歉，系统识别当前为高风险访问，暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称，即可参与社区互动！

*长度不超过10个汉字或20个英文字符，设置后3个月内不可修改。

确认取消

加入云驻计划，成为创作者

华为云周边好礼
免费体验产品
特殊身份标识
线下官方门票
内部专家零距离
与10000+优质创作者共同成长

立即加入

SLAM各种并行加速方法

前言