ARM NEON Optimization of the Bilinear Interpolation Algorithm
The plain C version of bilinear interpolation is shown below. Here x and y are the fractional offsets of the sample point within its pixel cell (both in [0, 1]), v11, v12, v21 and v22 are the four neighboring pixel values, and the result is their weighted average.
inline double bilinear_interp(double x, double y, double v11, double v12,
                              double v21, double v22) {
    return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y) * x;
}
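For context, here is a minimal sketch of how this scalar routine might be driven when resizing an 8-bit grayscale image. Everything in it (the name resize_bilinear_scalar, the row-major layout, the border clamping) is an assumption for illustration, not part of the original post.

#include <math.h>
#include <stdint.h>

/* Hypothetical resize loop: maps each destination pixel back into the
 * source image and interpolates among its four neighbors. */
void resize_bilinear_scalar(const uint8_t *src, int src_w, int src_h,
                            uint8_t *dst, int dst_w, int dst_h) {
    double scale_x = (double)src_w / dst_w;
    double scale_y = (double)src_h / dst_h;
    for (int dy = 0; dy < dst_h; ++dy) {
        for (int dx = 0; dx < dst_w; ++dx) {
            double sx = dx * scale_x;                 /* source coordinate */
            double sy = dy * scale_y;
            int ix = (int)floor(sx), iy = (int)floor(sy);
            double fx = sx - ix, fy = sy - iy;        /* fractional offsets */
            int ix1 = ix + 1 < src_w ? ix + 1 : ix;   /* clamp at the right/bottom border */
            int iy1 = iy + 1 < src_h ? iy + 1 : iy;
            double v11 = src[iy  * src_w + ix ];
            double v12 = src[iy1 * src_w + ix ];
            double v21 = src[iy  * src_w + ix1];
            double v22 = src[iy1 * src_w + ix1];
            dst[dy * dst_w + dx] =
                (uint8_t)(bilinear_interp(fx, fy, v11, v12, v21, v22) + 0.5);
        }
    }
}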
The bilinear interpolation version optimized with ARM NEON is shown below.
#include <arm_neon.h>

inline uint8x8_t bilinear_interp_NEON(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                      uint8x8_t v21, uint8x8_t v22)
{
    // widen the 8-bit pixels to 16 bits
    uint16x8_t v11_16 = vmovl_u8(v11);
    uint16x8_t v12_16 = vmovl_u8(v12);
    uint16x8_t v21_16 = vmovl_u8(v21);
    uint16x8_t v22_16 = vmovl_u8(v22);

    // convert v11 to two float32x4 vectors
    uint16x4_t v_16_low  = vget_low_u16(v11_16);
    uint16x4_t v_16_high = vget_high_u16(v11_16);
    uint32x4_t v_32_low  = vmovl_u16(v_16_low);
    uint32x4_t v_32_high = vmovl_u16(v_16_high);
    float32x4_t v11_32f_low  = vcvtq_f32_u32(v_32_low);
    float32x4_t v11_32f_high = vcvtq_f32_u32(v_32_high);

    // v12
    v_16_low  = vget_low_u16(v12_16);
    v_16_high = vget_high_u16(v12_16);
    v_32_low  = vmovl_u16(v_16_low);
    v_32_high = vmovl_u16(v_16_high);
    float32x4_t v12_32f_low  = vcvtq_f32_u32(v_32_low);
    float32x4_t v12_32f_high = vcvtq_f32_u32(v_32_high);

    // v21
    v_16_low  = vget_low_u16(v21_16);
    v_16_high = vget_high_u16(v21_16);
    v_32_low  = vmovl_u16(v_16_low);
    v_32_high = vmovl_u16(v_16_high);
    float32x4_t v21_32f_low  = vcvtq_f32_u32(v_32_low);
    float32x4_t v21_32f_high = vcvtq_f32_u32(v_32_high);

    // v22
    v_16_low  = vget_low_u16(v22_16);
    v_16_high = vget_high_u16(v22_16);
    v_32_low  = vmovl_u16(v_16_low);
    v_32_high = vmovl_u16(v_16_high);
    float32x4_t v22_32f_low  = vcvtq_f32_u32(v_32_low);
    float32x4_t v22_32f_high = vcvtq_f32_u32(v_32_high);

    // interpolation weights
    float32_t fx = (float32_t)x;
    float32_t fy = (float32_t)y;
    float32_t one_fx = 1 - fx;
    float32_t one_fy = 1 - fy;

    float32x4_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp;
    uint32x4_t result_32_low, result_32_high;
    uint16x4_t result_16_low, result_16_high;

    // low float32x4
    tmp1 = vmulq_n_f32(v11_32f_low, one_fy);
    tmp2 = vmulq_n_f32(v12_32f_low, fy);
    tmp3 = vaddq_f32(tmp1, tmp2);
    tmp4 = vmulq_n_f32(tmp3, one_fx);

    tmp1 = vmulq_n_f32(v21_32f_low, one_fy);
    tmp2 = vmulq_n_f32(v22_32f_low, fy);
    tmp3 = vaddq_f32(tmp1, tmp2);
    tmp5 = vmulq_n_f32(tmp3, fx);

    tmp = vaddq_f32(tmp4, tmp5);
    result_32_low = vcvtq_u32_f32(tmp);
    result_16_low = vqmovn_u32(result_32_low);

    // high float32x4
    tmp1 = vmulq_n_f32(v11_32f_high, one_fy);
    tmp2 = vmulq_n_f32(v12_32f_high, fy);
    tmp3 = vaddq_f32(tmp1, tmp2);
    tmp4 = vmulq_n_f32(tmp3, one_fx);

    tmp1 = vmulq_n_f32(v21_32f_high, one_fy);
    tmp2 = vmulq_n_f32(v22_32f_high, fy);
    tmp3 = vaddq_f32(tmp1, tmp2);
    tmp5 = vmulq_n_f32(tmp3, fx);

    tmp = vaddq_f32(tmp4, tmp5);
    result_32_high = vcvtq_u32_f32(tmp);
    result_16_high = vqmovn_u32(result_32_high);

    // narrow back to 8 bits
    uint16x8_t result_16 = vcombine_u16(result_16_low, result_16_high);
    uint8x8_t result_8 = vqmovn_u16(result_16);
    return result_8;
}
With ARM NEON, eight pixels are processed per call, which speeds things up several-fold. In practice, pay special attention to boundary handling (the start and end of each row), as in the driver sketch below.
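As an illustration only (not from the original post), the sketch below shows one way this kernel might be driven: shifting an image by a constant sub-pixel offset (fx, fy), so the same fractional weights apply to all eight lanes. The NEON loop handles eight pixels at a time, the scalar routine handles the tail of each row, and the last column and row are simply copied; all names and the border policy are assumptions.

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical driver: resamples an 8-bit image at a constant sub-pixel
 * offset (fx, fy), eight output pixels per iteration. */
void shift_bilinear_neon(const uint8_t *src, uint8_t *dst, int w, int h,
                         double fx, double fy) {
    for (int row = 0; row + 1 < h; ++row) {
        const uint8_t *r0 = src + row * w;        /* current row */
        const uint8_t *r1 = src + (row + 1) * w;  /* row below   */
        uint8_t *out = dst + row * w;
        int col = 0;
        for (; col + 8 + 1 <= w; col += 8) {      /* needs columns col .. col+8 */
            uint8x8_t v11 = vld1_u8(r0 + col);      /* (x,   y)   */
            uint8x8_t v12 = vld1_u8(r1 + col);      /* (x,   y+1) */
            uint8x8_t v21 = vld1_u8(r0 + col + 1);  /* (x+1, y)   */
            uint8x8_t v22 = vld1_u8(r1 + col + 1);  /* (x+1, y+1) */
            vst1_u8(out + col, bilinear_interp_NEON(fx, fy, v11, v12, v21, v22));
        }
        for (; col + 1 < w; ++col)                /* scalar tail of the row */
            out[col] = (uint8_t)(bilinear_interp(fx, fy, r0[col], r1[col],
                                                 r0[col + 1], r1[col + 1]) + 0.5);
        out[w - 1] = r0[w - 1];                   /* last column: copy (one common choice) */
    }
    memcpy(dst + (h - 1) * w, src + (h - 1) * w, w); /* last row: copy */
}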
But we are not satisfied yet. Let's chase even more speed!
Note that although the code above uses ARM NEON, the arithmetic inside the NEON operations is still floating point. So one more optimization is available: convert the floating-point weights to fixed point. The optimized code follows:
inline uint8x8_t bilinear_interp_NEON_FixedPoint(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                                 uint8x8_t v21, uint8x8_t v22)
{
    // widen the 8-bit pixels to 16 bits
    uint16x8_t v11_16 = vmovl_u8(v11);
    uint16x8_t v12_16 = vmovl_u8(v12);
    uint16x8_t v21_16 = vmovl_u8(v21);
    uint16x8_t v22_16 = vmovl_u8(v22);

    // widen each operand to two uint32x4 vectors
    uint16x4_t v_16_low  = vget_low_u16(v11_16);
    uint16x4_t v_16_high = vget_high_u16(v11_16);
    uint32x4_t v11_32_low  = vmovl_u16(v_16_low);
    uint32x4_t v11_32_high = vmovl_u16(v_16_high);

    v_16_low  = vget_low_u16(v12_16);
    v_16_high = vget_high_u16(v12_16);
    uint32x4_t v12_32_low  = vmovl_u16(v_16_low);
    uint32x4_t v12_32_high = vmovl_u16(v_16_high);

    v_16_low  = vget_low_u16(v21_16);
    v_16_high = vget_high_u16(v21_16);
    uint32x4_t v21_32_low  = vmovl_u16(v_16_low);
    uint32x4_t v21_32_high = vmovl_u16(v_16_high);

    v_16_low  = vget_low_u16(v22_16);
    v_16_high = vget_high_u16(v22_16);
    uint32x4_t v22_32_low  = vmovl_u16(v_16_low);
    uint32x4_t v22_32_high = vmovl_u16(v_16_high);

    // fixed-point weights, scaled by 4096 (Q12)
    unsigned int intX = x * 4096;
    unsigned int intY = y * 4096;
    unsigned int one_x = 4096 - intX;
    unsigned int one_y = 4096 - intY;

    uint32_t intX_32 = (uint32_t)intX;
    uint32_t intY_32 = (uint32_t)intY;
    uint32_t oneX_32 = (uint32_t)one_x;
    uint32_t oneY_32 = (uint32_t)one_y;

    uint32x4_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp;
    uint16x4_t result_16_low, result_16_high;

    // low 4 lanes
    tmp1 = vmulq_n_u32(v11_32_low, oneY_32);
    tmp2 = vmulq_n_u32(v12_32_low, intY_32);
    tmp3 = vaddq_u32(tmp1, tmp2);
    tmp4 = vmulq_n_u32(tmp3, oneX_32);

    tmp1 = vmulq_n_u32(v21_32_low, oneY_32);
    tmp2 = vmulq_n_u32(v22_32_low, intY_32);
    tmp3 = vaddq_u32(tmp1, tmp2);
    tmp5 = vmulq_n_u32(tmp3, intX_32);

    tmp = vaddq_u32(tmp4, tmp5);
    result_16_low = vshrn_n_u32(tmp, 16);            // shift right 16 bits while narrowing
    result_16_low = vrshr_n_u16(result_16_low, 8);   // rounding shift right 8 more bits (24 in total)

    // high 4 lanes
    tmp1 = vmulq_n_u32(v11_32_high, oneY_32);
    tmp2 = vmulq_n_u32(v12_32_high, intY_32);
    tmp3 = vaddq_u32(tmp1, tmp2);
    tmp4 = vmulq_n_u32(tmp3, oneX_32);

    tmp1 = vmulq_n_u32(v21_32_high, oneY_32);
    tmp2 = vmulq_n_u32(v22_32_high, intY_32);
    tmp3 = vaddq_u32(tmp1, tmp2);
    tmp5 = vmulq_n_u32(tmp3, intX_32);

    tmp = vaddq_u32(tmp4, tmp5);
    result_16_high = vshrn_n_u32(tmp, 16);           // shift right 16 bits while narrowing
    result_16_high = vrshr_n_u16(result_16_high, 8); // rounding shift right 8 more bits (24 in total)

    // narrow back to 8 bits
    uint16x8_t result_16 = vcombine_u16(result_16_low, result_16_high);
    uint8x8_t result_8 = vqmovn_u16(result_16);
    return result_8;
}
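For reference, here is a scalar sketch (my own illustration, not from the original post) of the fixed-point arithmetic used above. Each weight is scaled by 4096 = 2^12, so the product of an x-weight and a y-weight carries a scale of 2^24; the 16-bit narrowing shift followed by the 8-bit rounding shift in the NEON code removes exactly that scale.

#include <stdint.h>

/* Scalar equivalent of the fixed-point kernel (illustrative only).
 * fx, fy are Q12 weights in [0, 4096] representing x, y in [0, 1]. */
static inline uint8_t bilinear_interp_fixed(uint32_t fx, uint32_t fy,
                                            uint8_t v11, uint8_t v12,
                                            uint8_t v21, uint8_t v22) {
    uint32_t one_fx = 4096 - fx;
    uint32_t one_fy = 4096 - fy;
    /* Each term is pixel * weight * weight, i.e. the value scaled by 2^24.
     * The maximum, 255 * 2^24, still fits in an unsigned 32-bit integer. */
    uint32_t acc = (v11 * one_fy + v12 * fy) * one_fx
                 + (v21 * one_fy + v22 * fy) * fx;
    /* Mirror the NEON narrowing: a plain >>16 followed by a rounding >>8. */
    return (uint8_t)(((acc >> 16) + 128) >> 8);
}

/* Typical call, given double offsets x and y in [0, 1]:
 *   uint8_t p = bilinear_interp_fixed((uint32_t)(x * 4096), (uint32_t)(y * 4096),
 *                                     v11, v12, v21, v22);
 */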
With the fixed-point optimization added, the running time improves by roughly another factor of two.
Source: blog.csdn.net. Author: 网奇. Copyright belongs to the original author; please contact the author before reprinting.
Original link: blog.csdn.net/jacke121/article/details/55253542