- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

双线性插值算法ARM NEON优化

风吹稻花香发表于 2021/06/05 01:59:40 2021/06/05

【摘要】 C语言版本双线性插值算法 inline double bilinear_interp(double x, double y, double v11, double v12, double v21, double v22) { return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y)...

C语言版本双线性插值算法


       /*
        * Scalar bilinear interpolation between four samples.
        *
        * x, y : interpolation weights; y blends v?1 -> v?2, x blends v1? -> v2?.
        * v11, v12, v21, v22 : the four corner samples.
        *
        * Returns (v11*(1-y) + v12*y)*(1-x) + (v21*(1-y) + v22*y)*x.
        * No range checking is performed on x or y.
        *
        * Declared `static inline`: a plain `inline` definition in C99/C11 does not
        * emit an external definition, so a call that is not inlined (e.g. at -O0)
        * would fail to link.
        */
       static inline double bilinear_interp(double x, double y, double v11, double v12,
                                            double v21, double v22)
       {
           /* Blend along y on each of the two edges first ... */
           double edge1 = v11 * (1 - y) + v12 * y;
           double edge2 = v21 * (1 - y) + v22 * y;

           /* ... then blend the two edge values along x. */
           return edge1 * (1 - x) + edge2 * x;
       }
   
  
   1
   2
   3
   4
   5

使用ARM NEON优化后的双线性插值版本


       /* Widen eight u8 lanes to float: val[0] holds lanes 0-3, val[1] lanes 4-7. */
       static inline float32x4x2_t bilerp_u8x8_to_f32(uint8x8_t v)
       {
           uint16x8_t wide = vmovl_u8(v);
           float32x4x2_t out;

           out.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide)));
           out.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide)));
           return out;
       }

       /*
        * Bilinear blend of four float lanes, narrowed to u16.
        * The float -> u32 conversion (vcvtq_u32_f32) rounds toward zero, so the
        * fractional part of each result is dropped, not rounded to nearest.
        */
       static inline uint16x4_t bilerp_blend_f32(float32x4_t v11, float32x4_t v12,
                                                 float32x4_t v21, float32x4_t v22,
                                                 float32_t fx, float32_t fy)
       {
           float32_t one_fx = 1 - fx;
           float32_t one_fy = 1 - fy;

           /* (v11*(1-y) + v12*y) * (1-x) */
           float32x4_t edge1 = vaddq_f32(vmulq_n_f32(v11, one_fy), vmulq_n_f32(v12, fy));
           float32x4_t part1 = vmulq_n_f32(edge1, one_fx);

           /* (v21*(1-y) + v22*y) * x */
           float32x4_t edge2 = vaddq_f32(vmulq_n_f32(v21, one_fy), vmulq_n_f32(v22, fy));
           float32x4_t part2 = vmulq_n_f32(edge2, fx);

           return vqmovn_u32(vcvtq_u32_f32(vaddq_f32(part1, part2)));
       }

       /*
        * Bilinear interpolation of eight 8-bit samples at once using NEON
        * single-precision arithmetic.
        *
        * x, y : interpolation weights, shared by all eight lanes; they are
        *        converted to float32 before use. No range checking is performed.
        * v11, v12, v21, v22 : eight corner samples each; lane i of the result is
        *        computed from lane i of the four inputs with the same formula as
        *        the scalar version:
        *        (v11*(1-y) + v12*y)*(1-x) + (v21*(1-y) + v22*y)*x
        *
        * Returns the eight results narrowed back to u8 with saturation; the
        * fractional part is truncated.
        *
        * Declared `static inline`: a plain `inline` definition in C99/C11 does not
        * emit an external definition, so a non-inlined call would fail to link.
        */
       static inline uint8x8_t bilinear_interp_NEON(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                                    uint8x8_t v21, uint8x8_t v22)
       {
           float32x4x2_t f11 = bilerp_u8x8_to_f32(v11);
           float32x4x2_t f12 = bilerp_u8x8_to_f32(v12);
           float32x4x2_t f21 = bilerp_u8x8_to_f32(v21);
           float32x4x2_t f22 = bilerp_u8x8_to_f32(v22);

           float32_t fx = (float32_t)x;
           float32_t fy = (float32_t)y;

           /* Process lanes 0-3 and 4-7 separately: a q-register holds four floats. */
           uint16x4_t lo = bilerp_blend_f32(f11.val[0], f12.val[0], f21.val[0], f22.val[0], fx, fy);
           uint16x4_t hi = bilerp_blend_f32(f11.val[1], f12.val[1], f21.val[1], f22.val[1], fx, fy);

           return vqmovn_u16(vcombine_u16(lo, hi));
       }
   
  
   1
   2
   3
   4
   5
   6
   7
   8
   9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84

使用ARM NEON后，一次可以处理8个像素，成倍提高了运行的速度。实践中需要特别注意对边界的处理（行的开始和结尾处）。
到这里我们还不能满足。要追求更快！！！
注意上面的代码中虽然用了ARM NEON，但是在ARM 指令集的操作中用到了浮点操作。所以，还可以继续使用浮点数定点化的优化方式，优化后的代码如下：


       /* Widen eight u8 lanes to u32: val[0] holds lanes 0-3, val[1] lanes 4-7. */
       static inline uint32x4x2_t bilerp_u8x8_to_u32(uint8x8_t v)
       {
           uint16x8_t wide = vmovl_u8(v);
           uint32x4x2_t out;

           out.val[0] = vmovl_u16(vget_low_u16(wide));
           out.val[1] = vmovl_u16(vget_high_u16(wide));
           return out;
       }

       /*
        * Bilinear blend of four u32 lanes using weights scaled by 4096.
        * wx + inv_wx and wy + inv_wy are each expected to equal 4096, which keeps
        * every intermediate within 255 * 4096 * 4096 < 2^32.
        */
       static inline uint16x4_t bilerp_blend_q12(uint32x4_t v11, uint32x4_t v12,
                                                 uint32x4_t v21, uint32x4_t v22,
                                                 uint32_t wx, uint32_t inv_wx,
                                                 uint32_t wy, uint32_t inv_wy)
       {
           /* (v11*(1-y) + v12*y) * (1-x), scaled by 4096 * 4096 */
           uint32x4_t edge1 = vaddq_u32(vmulq_n_u32(v11, inv_wy), vmulq_n_u32(v12, wy));
           uint32x4_t part1 = vmulq_n_u32(edge1, inv_wx);

           /* (v21*(1-y) + v22*y) * x, scaled by 4096 * 4096 */
           uint32x4_t edge2 = vaddq_u32(vmulq_n_u32(v21, inv_wy), vmulq_n_u32(v22, wy));
           uint32x4_t part2 = vmulq_n_u32(edge2, wx);

           uint32x4_t sum = vaddq_u32(part1, part2);

           /*
            * Remove the 4096 * 4096 = 2^24 scale: a truncating narrowing shift by
            * 16 bits, then a rounding shift by 8 bits (24 bits in total).
            */
           uint16x4_t narrowed = vshrn_n_u32(sum, 16);
           return vrshr_n_u16(narrowed, 8);
       }

       /*
        * Bilinear interpolation of eight 8-bit samples at once using NEON integer
        * (fixed-point) arithmetic instead of floating point.
        *
        * x, y : interpolation weights, shared by all eight lanes. Each is converted
        *        to an unsigned integer weight as (unsigned)(w * 4096), i.e. 12
        *        fractional bits, truncated. The complementary weight is
        *        4096 - weight, so x and y must lie in [0, 1]: larger values make
        *        the unsigned subtraction wrap, and negative values make the
        *        double -> unsigned conversion undefined.
        * v11, v12, v21, v22 : eight corner samples each; lane i of the result is
        *        computed from lane i of the four inputs as
        *        (v11*(1-y) + v12*y)*(1-x) + (v21*(1-y) + v22*y)*x
        *
        * Returns the eight results narrowed to u8 with saturation.
        *
        * Declared `static inline`: a plain `inline` definition in C99/C11 does not
        * emit an external definition, so a non-inlined call would fail to link.
        */
       static inline uint8x8_t bilinear_interp_NEON_FixedPoint(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                                               uint8x8_t v21, uint8x8_t v22)
       {
           uint32x4x2_t w11 = bilerp_u8x8_to_u32(v11);
           uint32x4x2_t w12 = bilerp_u8x8_to_u32(v12);
           uint32x4x2_t w21 = bilerp_u8x8_to_u32(v21);
           uint32x4x2_t w22 = bilerp_u8x8_to_u32(v22);

           /* Weights with 12 fractional bits: 1.0 is represented as 4096. */
           uint32_t wx = (uint32_t)(x * 4096);
           uint32_t wy = (uint32_t)(y * 4096);
           uint32_t inv_wx = 4096 - wx;
           uint32_t inv_wy = 4096 - wy;

           /* Process lanes 0-3 and 4-7 separately: a q-register holds four u32. */
           uint16x4_t lo = bilerp_blend_q12(w11.val[0], w12.val[0], w21.val[0], w22.val[0],
                                            wx, inv_wx, wy, inv_wy);
           uint16x4_t hi = bilerp_blend_q12(w11.val[1], w12.val[1], w21.val[1], w22.val[1],
                                            wx, inv_wx, wy, inv_wy);

           return vqmovn_u16(vcombine_u16(lo, hi));
       }
   
  
   1
   2
   3
   4
   5
   6
   7
   8
   9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79

加入浮点定点化的优化之后，运行速度能进一步提升一倍左右。

文章来源: blog.csdn.net，作者：网奇，版权归原作者所有，如需转载，请联系作者。

原文链接：blog.csdn.net/jacke121/article/details/55253542

点赞
收藏
关注作者

0/1000

抱歉，系统识别当前为高风险访问，暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称，即可参与社区互动！

*长度不超过10个汉字或20个英文字符，设置后3个月内不可修改。

确认取消

加入云驻计划，成为创作者

华为云周边好礼
免费体验产品
特殊身份标识
线下官方门票
内部专家零距离
与10000+优质创作者共同成长

立即加入

双线性插值算法ARM NEON优化

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

双线性插值算法ARM NEON优化

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

推荐阅读

相关产品