双线性插值算法ARM NEON优化

举报
风吹稻花香 发表于 2021/06/05 01:59:40 2021/06/05
【摘要】 C语言版本双线性插值算法 inline double bilinear_interp(double x, double y, double v11, double v12, double v21, double v22) { return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y)...

C语言版本双线性插值算法


   
  /**
   * Bilinear interpolation between four corner samples (scalar reference).
   *
   * Evaluates
   *     (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y) * x
   * so the result is v11 at (x, y) = (0, 0), v12 at (0, 1), v21 at (1, 0)
   * and v22 at (1, 1).
   *
   * Declared `static inline`: when compiled as C, a plain `inline` definition
   * does not emit an external definition, so a call the compiler chooses not
   * to inline would fail to link.
   *
   * @param x    interpolation weight between the v1* pair and the v2* pair
   * @param y    interpolation weight between the v*1 pair and the v*2 pair
   * @param v11  sample returned at (x, y) = (0, 0)
   * @param v12  sample returned at (x, y) = (0, 1)
   * @param v21  sample returned at (x, y) = (1, 0)
   * @param v22  sample returned at (x, y) = (1, 1)
   * @return the interpolated value; x and y are not range-checked, so values
   *         outside [0, 1] extrapolate
   */
  static inline double bilinear_interp(double x, double y, double v11, double v12,
                                       double v21, double v22)
  {
      /* Blend along y first, once for each x side. */
      const double at_x0 = v11 * (1 - y) + v12 * y;
      const double at_x1 = v21 * (1 - y) + v22 * y;

      /* Then blend the two intermediate values along x. */
      return at_x0 * (1 - x) + at_x1 * x;
  }
  • 1
  • 2
  • 3
  • 4
  • 5

使用ARM NEON优化后的双线性插值版本


   
  /** Widen four u16 lanes to u32 and convert them to float32. */
  static inline float32x4_t bilinear_u16x4_to_f32(uint16x4_t v)
  {
      return vcvtq_f32_u32(vmovl_u16(v));
  }

  /**
   * Blend four lanes of the four corner vectors in float32 and narrow to u16.
   *
   * Per lane: (v11*(1-fy) + v12*fy)*(1-fx) + (v21*(1-fy) + v22*fy)*fx.
   * The float -> u32 conversion uses vcvtq_u32_f32 (no rounding step is
   * applied before it), and the u32 -> u16 narrowing saturates.
   */
  static inline uint16x4_t bilinear_blend_f32x4(float32x4_t v11, float32x4_t v12,
                                                float32x4_t v21, float32x4_t v22,
                                                float32_t fx, float32_t fy)
  {
      const float32_t one_fx = 1 - fx;
      const float32_t one_fy = 1 - fy;

      /* Blend along y for each x side. */
      const float32x4_t at_x0 = vaddq_f32(vmulq_n_f32(v11, one_fy),
                                          vmulq_n_f32(v12, fy));
      const float32x4_t at_x1 = vaddq_f32(vmulq_n_f32(v21, one_fy),
                                          vmulq_n_f32(v22, fy));

      /* Blend the two intermediate vectors along x. */
      const float32x4_t blended = vaddq_f32(vmulq_n_f32(at_x0, one_fx),
                                            vmulq_n_f32(at_x1, fx));

      return vqmovn_u32(vcvtq_u32_f32(blended));
  }

  /**
   * Bilinear interpolation of eight 8-bit samples at once (float32 NEON).
   *
   * Every lane is interpolated with the same weights x and y, using the same
   * formula as the scalar version:
   *     (v11*(1-y) + v12*y)*(1-x) + (v21*(1-y) + v22*y)*x
   * The weights are converted to float32 before use, and each lane is
   * processed as u8 -> u16 -> u32 -> f32, blended, then narrowed back
   * f32 -> u32 -> u16 -> u8 with saturating narrowing.
   *
   * Declared `static inline`: when compiled as C, a plain `inline` definition
   * does not emit an external definition, so a non-inlined call would fail to
   * link.
   *
   * @param x    weight between the v1* and v2* vectors (not range-checked)
   * @param y    weight between the v*1 and v*2 vectors (not range-checked)
   * @param v11  samples selected when (x, y) = (0, 0)
   * @param v12  samples selected when (x, y) = (0, 1)
   * @param v21  samples selected when (x, y) = (1, 0)
   * @param v22  samples selected when (x, y) = (1, 1)
   * @return eight interpolated 8-bit samples
   */
  static inline uint8x8_t bilinear_interp_NEON(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                               uint8x8_t v21, uint8x8_t v22)
  {
      /* Widen all corners to 16 bit so they can be split into 4-lane halves. */
      const uint16x8_t p11 = vmovl_u8(v11);
      const uint16x8_t p12 = vmovl_u8(v12);
      const uint16x8_t p21 = vmovl_u8(v21);
      const uint16x8_t p22 = vmovl_u8(v22);

      const float32_t fx = (float32_t)x;
      const float32_t fy = (float32_t)y;

      /* Lanes 0..3. */
      const uint16x4_t low = bilinear_blend_f32x4(
          bilinear_u16x4_to_f32(vget_low_u16(p11)),
          bilinear_u16x4_to_f32(vget_low_u16(p12)),
          bilinear_u16x4_to_f32(vget_low_u16(p21)),
          bilinear_u16x4_to_f32(vget_low_u16(p22)),
          fx, fy);

      /* Lanes 4..7. */
      const uint16x4_t high = bilinear_blend_f32x4(
          bilinear_u16x4_to_f32(vget_high_u16(p11)),
          bilinear_u16x4_to_f32(vget_high_u16(p12)),
          bilinear_u16x4_to_f32(vget_high_u16(p21)),
          bilinear_u16x4_to_f32(vget_high_u16(p22)),
          fx, fy);

      /* Reassemble the eight lanes and saturate down to 8 bit. */
      return vqmovn_u16(vcombine_u16(low, high));
  }
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84

使用ARM NEON后,一次可以处理8个像素,成倍提高了运行的速度。实践中需要特别注意对边界的处理(行的开始和结尾处)。
到这里我们还不能满足。要追求更快!!!
注意上面的代码中虽然用了ARM NEON,但是在ARM 指令集的操作中用到了浮点操作。所以,还可以继续使用浮点数定点化的优化方式,优化后的代码如下:


   
  /** Fixed-point representation of 1.0 for the interpolation weights (12 fractional bits). */
  enum { BILINEAR_Q12_ONE = 4096 };

  /**
   * Convert an interpolation weight to 12-bit fixed point, clamped to [0, 1].
   *
   * Clamping keeps the double -> unsigned conversion defined for negative
   * input and keeps (BILINEAR_Q12_ONE - weight) from wrapping around when the
   * input exceeds 1. NaN maps to 0. In-range values are truncated, exactly as
   * a plain `x * 4096` conversion would do.
   */
  static inline uint32_t bilinear_weight_q12(double w)
  {
      if (!(w > 0.0)) {
          return 0;
      }
      if (w >= 1.0) {
          return BILINEAR_Q12_ONE;
      }
      return (uint32_t)(w * BILINEAR_Q12_ONE);
  }

  /**
   * Blend four lanes of the four corner vectors in fixed point.
   *
   * The accumulator carries 24 fractional bits (12 from each weight). With
   * samples <= 255 and weights <= 4096 it never exceeds 255 * 2^24, which
   * fits in 32 bits. The returned lanes are the accumulator shifted right by
   * 16 bits (truncating), i.e. they still carry 8 fractional bits.
   */
  static inline uint16x4_t bilinear_blend_q12(uint16x4_t v11, uint16x4_t v12,
                                              uint16x4_t v21, uint16x4_t v22,
                                              uint32_t wx, uint32_t wy)
  {
      const uint32_t one_wx = BILINEAR_Q12_ONE - wx;
      const uint32_t one_wy = BILINEAR_Q12_ONE - wy;

      /* Blend along y for each x side (12 fractional bits). */
      const uint32x4_t at_x0 = vaddq_u32(vmulq_n_u32(vmovl_u16(v11), one_wy),
                                         vmulq_n_u32(vmovl_u16(v12), wy));
      const uint32x4_t at_x1 = vaddq_u32(vmulq_n_u32(vmovl_u16(v21), one_wy),
                                         vmulq_n_u32(vmovl_u16(v22), wy));

      /* Blend along x (24 fractional bits). */
      const uint32x4_t acc = vaddq_u32(vmulq_n_u32(at_x0, one_wx),
                                       vmulq_n_u32(at_x1, wx));

      /* Drop the low 16 bits while narrowing to 16-bit lanes. */
      return vshrn_n_u32(acc, 16);
  }

  /**
   * Bilinear interpolation of eight 8-bit samples at once (fixed-point NEON).
   *
   * Every lane is interpolated with the same weights, using integer
   * arithmetic only:
   *     (v11*(1-y) + v12*y)*(1-x) + (v21*(1-y) + v22*y)*x
   * with x and y converted to 12-bit fixed point. The 24 fractional bits of
   * the product are removed in two steps: a truncating shift by 16 bits,
   * then a rounding shift by 8 bits with saturation to 8 bit.
   *
   * Declared `static inline`: when compiled as C, a plain `inline` definition
   * does not emit an external definition, so a non-inlined call would fail to
   * link.
   *
   * @param x    weight between the v1* and v2* vectors; clamped to [0, 1]
   * @param y    weight between the v*1 and v*2 vectors; clamped to [0, 1]
   * @param v11  samples selected when (x, y) = (0, 0)
   * @param v12  samples selected when (x, y) = (0, 1)
   * @param v21  samples selected when (x, y) = (1, 0)
   * @param v22  samples selected when (x, y) = (1, 1)
   * @return eight interpolated 8-bit samples
   */
  static inline uint8x8_t bilinear_interp_NEON_FixedPoint(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                                          uint8x8_t v21, uint8x8_t v22)
  {
      /* Widen all corners to 16 bit so they can be split into 4-lane halves. */
      const uint16x8_t p11 = vmovl_u8(v11);
      const uint16x8_t p12 = vmovl_u8(v12);
      const uint16x8_t p21 = vmovl_u8(v21);
      const uint16x8_t p22 = vmovl_u8(v22);

      const uint32_t wx = bilinear_weight_q12(x);
      const uint32_t wy = bilinear_weight_q12(y);

      /* Lanes 0..3. */
      const uint16x4_t low = bilinear_blend_q12(vget_low_u16(p11), vget_low_u16(p12),
                                                vget_low_u16(p21), vget_low_u16(p22),
                                                wx, wy);
      /* Lanes 4..7. */
      const uint16x4_t high = bilinear_blend_q12(vget_high_u16(p11), vget_high_u16(p12),
                                                 vget_high_u16(p21), vget_high_u16(p22),
                                                 wx, wy);

      /*
       * Remove the remaining 8 fractional bits (24 bits in total) with a
       * rounding shift and saturate to 8 bit in a single instruction.
       */
      return vqrshrn_n_u16(vcombine_u16(low, high), 8);
  }
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79

加入浮点数定点化的优化之后,运行速度能进一步提升一倍左右。

文章来源: blog.csdn.net,作者:网奇,版权归原作者所有,如需转载,请联系作者。

原文链接:blog.csdn.net/jacke121/article/details/55253542

【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。