sse优化一例
【摘要】 前两天为了加速一段求梯度的代码,用了SSE指令,在实验室PMH大侠的指导下,最终实现了3倍速度提升(极限是4倍,因为4个浮点数一起计算)。在这里写一下心得,欢迎拍砖。
SSE加速的几个关键是
(1) 用于并行计算的数据结构要16字节对齐
(2) 直接写汇编,不要用SSE的Load Store指令
(3) 对于SSE本身不提供的三角函数等指令,可以用查表法,但要用S...
1
// 计算梯度的代码
2
for ( int s = 1 ; s < (GetCount() - 1 ) ; ++ s) {
3
for (int y = 1 ; y < (imgScaled[s]->Height() - 1) ; ++y) {
4
for (int x = 1 ; x < (imgScaled[s]->Width() - 1) ; ++x) {
5
float gy= imgScaled[s]->At(x, y + 1) - imgScaled[s]->At(x, y - 1);
6
float gx = imgScaled[s]->At(x + 1, y) - imgScaled[s]->At(x - 1, y);
7
8
magnitudes[s]->At(x, y) = sqrt(gx*gx + gy*gy);
9
directions[s]->At(x, y) = AtanLookupF32::Value(gy, gx);
10
}
11
}
12
}
13
14
// arctan 查表函数
15
static inline float AtanLookupF32::Value( float y, float x) {
16
float N_DOUBLE = 4 * 4096;
17
if( x > 0.0 ){
18
if( y > 0.0 )
19
return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))];
20
else
21
return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))];
22
}
23
24
if( x == 0.0 ){
25
if( y > 0 )
26
return LU_PI/2;
27
else
28
return -LU_PI/2;
29
}
30
31
if( y < 0.0 )
32
return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))] - LU_PI;
33
else
34
return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))] + LU_PI;
35
}
36
// 计算梯度的代码 2
for ( int s = 1 ; s < (GetCount() - 1 ) ; ++ s) {3
for (int y = 1 ; y < (imgScaled[s]->Height() - 1) ; ++y) {4
for (int x = 1 ; x < (imgScaled[s]->Width() - 1) ; ++x) {5
float gy= imgScaled[s]->At(x, y + 1) - imgScaled[s]->At(x, y - 1);6
float gx = imgScaled[s]->At(x + 1, y) - imgScaled[s]->At(x - 1, y);7

8
magnitudes[s]->At(x, y) = sqrt(gx*gx + gy*gy);9
directions[s]->At(x, y) = AtanLookupF32::Value(gy, gx);10
}11
}12
} 13

14
// arctan 查表函数 15
static inline float AtanLookupF32::Value( float y, float x) {16
float N_DOUBLE = 4 * 4096;17
if( x > 0.0 ){18
if( y > 0.0 )19
return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))];20
else21
return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))];22
}23

24
if( x == 0.0 ){25
if( y > 0 )26
return LU_PI/2;27
else28
return -LU_PI/2;29
}30

31
if( y < 0.0 )32
return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))] - LU_PI;33
else34
return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))] + LU_PI;35
} 36
1
template < typename T >
2
class ImageArray
3
{
4
protected:
5
int m_nWidth;
6
int m_nHeight;
7
8
// 16字节补齐后的实际宽度,单位为 sizeof(float)
9
int m_nWidthActual;
10
11
// 积分图像,用来加速图像的区域求和用
12
ImageArray* m_pImageIntegral;
13
14
// 计算补足后的长度
15
static __forceinline int expandAlign(int w){
16
return w + 3 - (w - 1) % 4;
17
}
18
19
// 数据
20
T* m_afData;
21
T** m_aafEntry;
22
23
typedef T* PointerType;
24
typedef T** EntryType;
25
26
void SetSize(int height, int width){
27
m_nWidth = width;
28
m_nHeight = height;
29
m_nWidthActual = expandAlign(width);
30
31
// 16字节对齐的分配
32
m_afData = (T*)_aligned_malloc(sizeof(T) * m_nWidthActual * m_nHeight, 16);
33
34
// 这一部分是加速索引,参考Wild Magic Lib里的GMatrix类
35
m_aafEntry = new PointerType[m_nHeight];
36
T* ptr = m_afData;
37
for(int i=0;i<m_nHeight;i++){
38
m_aafEntry[i] = ptr;
39
ptr += m_nWidthActual;
40
}
41
42
if(m_pImageIntegral)
43
delete m_pImageIntegral;
44
m_pImageIntegral = NULL;
45
}
46
47
public:
48
49
ImageArray():m_pImageIntegral(NULL){SetSize(0, 0);}
50
51
ImageArray(int width, int height):m_pImageIntegral(NULL){
52
SetSize(height, width);
53
}
54
55
ImageArray(const ImageArray& that):m_pImageIntegral(NULL){
56
SetSize(that.Height(), that.Width());
57
memcpy(m_afData, that.m_afData, sizeof(T) * that.m_nWidthActual * that.m_nHeight);
58
}
59
60
~ImageArray(){
61
if(m_pImageIntegral)
62
delete m_pImageIntegral;
63
if(m_aafEntry)
64
delete []m_aafEntry;
65
66
// 对应的释放
67
if(m_afData)
68
_aligned_free(m_afData);
69
}
70
71
void CreateDataArray(int width, int height){
72
m_nWidthActual = expandAlign(width);
73
SetSize(height, m_nWidthActual);
74
m_nWidth = width;
75
m_nHeight = height;
76
}
77
78
__forceinline T& At(int x, int y){
79
_ASSERT(m_afData);
80
_ASSERT(x >= 0 && x < m_nWidth && y >= 0 && y < m_nHeight);
81
return m_aafEntry[y][x];
82
}
83
84
__forceinline const int Width() const {return m_nWidth;}
85
__forceinline const int Height() const {return m_nHeight;}
86
87
// 建立移位的图像
88
void fillShiftedImage(int shift, ImageArray<T>& dst)
89
{
90
for(int i=0;i<m_nHeight;i++)
91
{
92
memcpy(dst[i], m_aafEntry[i] + shift, sizeof(T) * (m_nWidthActual - shift));
93
}
94
}
95
96
//
以下省略
97
} ;
template < typename T > 2
class ImageArray3
{4
protected:5
int m_nWidth;6
int m_nHeight;7

8
// 16字节补齐后的实际宽度,单位为 sizeof(float)9
int m_nWidthActual;10

11
// 积分图像,用来加速图像的区域求和用12
ImageArray* m_pImageIntegral;13

14
// 计算补足后的长度15
static __forceinline int expandAlign(int w){16
return w + 3 - (w - 1) % 4;17
}18

19
// 数据20
T* m_afData;21
T** m_aafEntry;22

23
typedef T* PointerType;24
typedef T** EntryType;25

26
void SetSize(int height, int width){27
m_nWidth = width;28
m_nHeight = height;29
m_nWidthActual = expandAlign(width);30

31
// 16字节对齐的分配32
m_afData = (T*)_aligned_malloc(sizeof(T) * m_nWidthActual * m_nHeight, 16);33

34
// 这一部分是加速索引,参考Wild Magic Lib里的GMatrix类35
m_aafEntry = new PointerType[m_nHeight];36
T* ptr = m_afData;37
for(int i=0;i<m_nHeight;i++){38
m_aafEntry[i] = ptr;39
ptr += m_nWidthActual;40
}41

42
if(m_pImageIntegral)43
delete m_pImageIntegral;44
m_pImageIntegral = NULL;45
}46

47
public:48

49
ImageArray():m_pImageIntegral(NULL){SetSize(0, 0);}50

51
ImageArray(int width, int height):m_pImageIntegral(NULL){52
SetSize(height, width);53
}54

55
ImageArray(const ImageArray& that):m_pImageIntegral(NULL){56
SetSize(that.Height(), that.Width());57
memcpy(m_afData, that.m_afData, sizeof(T) * that.m_nWidthActual * that.m_nHeight);58
}59

60
~ImageArray(){61
if(m_pImageIntegral)62
delete m_pImageIntegral;63
if(m_aafEntry)64
delete []m_aafEntry;65

66
// 对应的释放67
if(m_afData)68
_aligned_free(m_afData);69
}70

71
void CreateDataArray(int width, int height){72
m_nWidthActual = expandAlign(width);73
SetSize(height, m_nWidthActual);74
m_nWidth = width;75
m_nHeight = height;76
}77

78
__forceinline T& At(int x, int y){79
_ASSERT(m_afData);80
_ASSERT(x >= 0 && x < m_nWidth && y >= 0 && y < m_nHeight);81
return m_aafEntry[y][x];82
}83

84
__forceinline const int Width() const {return m_nWidth;}85
__forceinline const int Height() const {return m_nHeight;}86

87
// 建立移位的图像88
void fillShiftedImage(int shift, ImageArray<T>& dst)89
{90
for(int i=0;i<m_nHeight;i++)91
{92
memcpy(dst[i], m_aafEntry[i] + shift, sizeof(T) * (m_nWidthActual - shift));93
}94
}95

96
//
以下省略97
} ;
// ----------------------------------------------------------------------------
template < class Real >
Real Math < Real > ::FastInvSqrt (Real fValue)
{
// TO DO. This routine was designed for 'float'. Come up with an
// equivalent one for 'double' and specialize the templates for 'float'
// and 'double'.
float fFValue = (float)fValue;
float fHalf = 0.5f*fFValue;
int i = *(int*)&fFValue;
i = 0x5f3759df - (i >> 1);
fFValue = *(float*)&i;
fFValue = fFValue*(1.5f - fHalf*fFValue*fFValue);
return (Real)fFValue;
}
__m128 _mm_sqrt_ps(__m128 a );
SQRTPS
static __forceinline float ValueDirect( int y, int x, int idx)
{
x = x * 2 + y + 3;
return m_dATAN_LU[x][idx];
}
1
magnitudes.resize(GetCount()
-
1
, NULL);2
directions.resize(GetCount()
-
1
, NULL);3

4
ImageArrayf imggm;5

6
int
w
=
imgScaled[
0
]
->
Width();7
int
h
=
imgScaled[
0
]
->
Height();8

9
int
scnt
=
GetCount()
-
1
;10

11
ImageArrayf imgsa(w, h), imgsb(w, h);12
ImageArray
<
int
>
imgsi(w, h), imggx(w, h), imggy(w, h);13
imggm.CreateDataArray(w, h);14

15
for
(
int
s
=
1
; s
<
(GetCount()
-
1
) ;
++
s)
{16
magnitudes[s] = new ImageArrayf(imgScaled[s]->Width(), imgScaled[s]->Height());17
directions[s] = new ImageArrayf(imgScaled[s]->Width(), imgScaled[s]->Height());18
}
19

20
__m128 ma, mb, mr;21
__m128 na, nb, nr;22
__m128 gl, gr, gtt, gb;23
__m128 gx, gy, sgx, sgy, sg, sqsg;24
__m128 gn, gi;25
__m128i gii;26
__m128 gzero;27

28
memset(gzero.m128_f32,
0
,
sizeof
(
float
)
*
4
);29

30
for
(
int
i
=
0
;i
<
4
;i
++
)31
{32
gn.m128_f32[i] = AtanLookupF32::NDOUBLE();33
}
34

35
for
(
int
s
=
1
; s
<
scnt ;
++
s)
{36

37
ImageArrayf& imgt = *imgScaled[s];38

39
imgt.fillShiftedImage(1, imgsa);40
imgt.fillShiftedImage(2, imgsb);41

42
for (int y = 1 ; y < (h - 1) ; ++y) {43
int x;44
for (x = 0 ; x < (w - 2) ; x += 4) {45

46
gl = _mm_load_ps(imgt[y] + x);47
gr = _mm_load_ps(imgsb[y] + x);48
gtt = _mm_load_ps(imgsa[y+1] + x);49
gb = _mm_load_ps(imgsa[y-1] + x);50

51
_asm52
{53
// x0 = right;54
movaps xmm0, gr;55

56
// x1 = left;57
movaps xmm1, gl;58

59
// x2 = top;60
movaps xmm2, gtt;61

62
// x3 = bottom63
movaps xmm3, gb;64

65
// x0 = right - left = gx66
subps xmm0, xmm1;67

68
// x2 = top - bottom = gy69
subps xmm2, xmm3;70

71
// x4 = right72
movaps xmm4, gr;73

74
// x6 = top75
movaps xmm6, gtt;76

77
// x1 = left - right = -gx;78
subps xmm1, xmm4;79

80
// x3 = bottom - top = -gy;81
subps xmm3, xmm6;82

83
// x1 = |gx|84
maxps xmm1, xmm0;85

86
// x3 = |gy|87
maxps xmm3, xmm2;88

89
// gx = x090
movaps gx, xmm0;91

92
// gy = x293
movaps gy, xmm2;94

95
// x1 = |gx| + |gy|96
addps xmm1, xmm3;97

98
// x4 = gx;99
movaps xmm4, xmm0;100

101
// x6 = gy;102
movaps xmm6, xmm2;103

104
// x4 = gx^2;105
mulps xmm4, xmm4;106

107
// x6 = gy^2;108
mulps xmm6, xmm6;109

110
// x4 = gx^2 + gy^2;111
addps xmm4, xmm6;112

113
// x4 = sqrt(
)114
sqrtps xmm4, xmm4;115

116
// sqsg = x4;117
movaps sqsg, xmm4;118

119
// x3 = |gy| / (|gx| + |gy|) = dy;120
divps xmm3, xmm1;121

122
// x1 = n;123
movaps xmm1, gn;124

125
// x3 = |dy| * n;126
mulps xmm3, xmm1;127

128
// gi = |dy| * n;129
movaps gi, xmm3;130
}131

132
_mm_store_ps(imggm[y] + x, sqsg);133

134
gx = _mm_cmpgt_ps(gx, gzero);135
gy = _mm_cmpgt_ps(gy, gzero);136

137
_mm_store_si128((__m128i*)(imggx[y] + x), *((__m128i*)&gx));138
_mm_store_si128((__m128i*)(imggy[y] + x), *((__m128i*)&gy));139

140
gii = _mm_cvtps_epi32(gi);141
_mm_store_si128((__m128i*)(imgsi[y] + x), gii);142
}143
}144

145
for (int y = 1 ; y < (h - 1) ; ++y) {146
for (int x = 1 ; x < (w - 1) ; x ++) {147
magnitudes[s]->At(x, y) = imggm[y][x-1];148
directions[s]->At(x, y) = AtanLookupF32::ValueDirect(imggy[y][x-1], imggx[y][x-1], imgsi[y][x-1]);149
}150
}151
}
原文:http://www.cppblog.com/fbitw/archive/2007/05/04/23287.html
文章来源: blog.csdn.net,作者:网奇,版权归原作者所有,如需转载,请联系作者。
原文链接:blog.csdn.net/jacke121/article/details/54706128
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)