sse优化一例
【摘要】 前两天为了加速一段求梯度的代码,用了SSE指令,在实验室PMH大侠的指导下,最终实现了3倍速度提升(极限是4倍,因为4个浮点数一起计算)。在这里写一下心得,欢迎拍砖。
SSE加速的几个关键是
(1) 用于并行计算的数据结构要16字节对齐
(2) 直接写汇编,不要用SSE的Load Store指令
(3) 对于SSE本身不提供的三角函数等指令,可以用查表法,但要用S...
1
// 计算梯度的代码
2 for ( int s = 1 ; s < (GetCount() - 1 ) ; ++ s) {
3 for (int y = 1 ; y < (imgScaled[s]->Height() - 1) ; ++y) {
4 for (int x = 1 ; x < (imgScaled[s]->Width() - 1) ; ++x) {
5 float gy= imgScaled[s]->At(x, y + 1) - imgScaled[s]->At(x, y - 1);
6 float gx = imgScaled[s]->At(x + 1, y) - imgScaled[s]->At(x - 1, y);
7
8 magnitudes[s]->At(x, y) = sqrt(gx*gx + gy*gy);
9 directions[s]->At(x, y) = AtanLookupF32::Value(gy, gx);
10 }
11 }
12}
13
14 // arctan 查表函数
15 static inline float AtanLookupF32::Value( float y, float x) {
16 float N_DOUBLE = 4 * 4096;
17 if( x > 0.0 ){
18 if( y > 0.0 )
19 return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))];
20 else
21 return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))];
22 }
23
24 if( x == 0.0 ){
25 if( y > 0 )
26 return LU_PI/2;
27 else
28 return -LU_PI/2;
29 }
30
31 if( y < 0.0 )
32 return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))] - LU_PI;
33 else
34 return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))] + LU_PI;
35}
36
2 for ( int s = 1 ; s < (GetCount() - 1 ) ; ++ s) {
3 for (int y = 1 ; y < (imgScaled[s]->Height() - 1) ; ++y) {
4 for (int x = 1 ; x < (imgScaled[s]->Width() - 1) ; ++x) {
5 float gy= imgScaled[s]->At(x, y + 1) - imgScaled[s]->At(x, y - 1);
6 float gx = imgScaled[s]->At(x + 1, y) - imgScaled[s]->At(x - 1, y);
7
8 magnitudes[s]->At(x, y) = sqrt(gx*gx + gy*gy);
9 directions[s]->At(x, y) = AtanLookupF32::Value(gy, gx);
10 }
11 }
12}
13
14 // arctan 查表函数
15 static inline float AtanLookupF32::Value( float y, float x) {
16 float N_DOUBLE = 4 * 4096;
17 if( x > 0.0 ){
18 if( y > 0.0 )
19 return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))];
20 else
21 return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))];
22 }
23
24 if( x == 0.0 ){
25 if( y > 0 )
26 return LU_PI/2;
27 else
28 return -LU_PI/2;
29 }
30
31 if( y < 0.0 )
32 return m_dATAN_LU[(int)(N_DOUBLE * y / ( x + y ))] - LU_PI;
33 else
34 return -m_dATAN_LU[(int)(N_DOUBLE * (-y) / ( x - y ))] + LU_PI;
35}
36
1
template < typename T >
2 class ImageArray
3 {
4protected:
5 int m_nWidth;
6 int m_nHeight;
7
8 // 16字节补齐后的实际宽度,单位为 sizeof(float)
9 int m_nWidthActual;
10
11 // 积分图像,用来加速图像的区域求和用
12 ImageArray* m_pImageIntegral;
13
14 // 计算补足后的长度
15 static __forceinline int expandAlign(int w){
16 return w + 3 - (w - 1) % 4;
17 }
18
19 // 数据
20 T* m_afData;
21 T** m_aafEntry;
22
23 typedef T* PointerType;
24 typedef T** EntryType;
25
26 void SetSize(int height, int width){
27 m_nWidth = width;
28 m_nHeight = height;
29 m_nWidthActual = expandAlign(width);
30
31 // 16字节对齐的分配
32 m_afData = (T*)_aligned_malloc(sizeof(T) * m_nWidthActual * m_nHeight, 16);
33
34 // 这一部分是加速索引,参考Wild Magic Lib里的GMatrix类
35 m_aafEntry = new PointerType[m_nHeight];
36 T* ptr = m_afData;
37 for(int i=0;i<m_nHeight;i++){
38 m_aafEntry[i] = ptr;
39 ptr += m_nWidthActual;
40 }
41
42 if(m_pImageIntegral)
43 delete m_pImageIntegral;
44 m_pImageIntegral = NULL;
45 }
46
47public:
48
49 ImageArray():m_pImageIntegral(NULL){SetSize(0, 0);}
50
51 ImageArray(int width, int height):m_pImageIntegral(NULL){
52 SetSize(height, width);
53 }
54
55 ImageArray(const ImageArray& that):m_pImageIntegral(NULL){
56 SetSize(that.Height(), that.Width());
57 memcpy(m_afData, that.m_afData, sizeof(T) * that.m_nWidthActual * that.m_nHeight);
58 }
59
60 ~ImageArray(){
61 if(m_pImageIntegral)
62 delete m_pImageIntegral;
63 if(m_aafEntry)
64 delete []m_aafEntry;
65
66 // 对应的释放
67 if(m_afData)
68 _aligned_free(m_afData);
69 }
70
71 void CreateDataArray(int width, int height){
72 m_nWidthActual = expandAlign(width);
73 SetSize(height, m_nWidthActual);
74 m_nWidth = width;
75 m_nHeight = height;
76 }
77
78 __forceinline T& At(int x, int y){
79 _ASSERT(m_afData);
80 _ASSERT(x >= 0 && x < m_nWidth && y >= 0 && y < m_nHeight);
81 return m_aafEntry[y][x];
82 }
83
84 __forceinline const int Width() const {return m_nWidth;}
85 __forceinline const int Height() const {return m_nHeight;}
86
87 // 建立移位的图像
88 void fillShiftedImage(int shift, ImageArray<T>& dst)
89 {
90 for(int i=0;i<m_nHeight;i++)
91 {
92 memcpy(dst[i], m_aafEntry[i] + shift, sizeof(T) * (m_nWidthActual - shift));
93 }
94 }
95
96 // 以下省略
97} ;
2 class ImageArray
3 {
4protected:
5 int m_nWidth;
6 int m_nHeight;
7
8 // 16字节补齐后的实际宽度,单位为 sizeof(float)
9 int m_nWidthActual;
10
11 // 积分图像,用来加速图像的区域求和用
12 ImageArray* m_pImageIntegral;
13
14 // 计算补足后的长度
15 static __forceinline int expandAlign(int w){
16 return w + 3 - (w - 1) % 4;
17 }
18
19 // 数据
20 T* m_afData;
21 T** m_aafEntry;
22
23 typedef T* PointerType;
24 typedef T** EntryType;
25
26 void SetSize(int height, int width){
27 m_nWidth = width;
28 m_nHeight = height;
29 m_nWidthActual = expandAlign(width);
30
31 // 16字节对齐的分配
32 m_afData = (T*)_aligned_malloc(sizeof(T) * m_nWidthActual * m_nHeight, 16);
33
34 // 这一部分是加速索引,参考Wild Magic Lib里的GMatrix类
35 m_aafEntry = new PointerType[m_nHeight];
36 T* ptr = m_afData;
37 for(int i=0;i<m_nHeight;i++){
38 m_aafEntry[i] = ptr;
39 ptr += m_nWidthActual;
40 }
41
42 if(m_pImageIntegral)
43 delete m_pImageIntegral;
44 m_pImageIntegral = NULL;
45 }
46
47public:
48
49 ImageArray():m_pImageIntegral(NULL){SetSize(0, 0);}
50
51 ImageArray(int width, int height):m_pImageIntegral(NULL){
52 SetSize(height, width);
53 }
54
55 ImageArray(const ImageArray& that):m_pImageIntegral(NULL){
56 SetSize(that.Height(), that.Width());
57 memcpy(m_afData, that.m_afData, sizeof(T) * that.m_nWidthActual * that.m_nHeight);
58 }
59
60 ~ImageArray(){
61 if(m_pImageIntegral)
62 delete m_pImageIntegral;
63 if(m_aafEntry)
64 delete []m_aafEntry;
65
66 // 对应的释放
67 if(m_afData)
68 _aligned_free(m_afData);
69 }
70
71 void CreateDataArray(int width, int height){
72 m_nWidthActual = expandAlign(width);
73 SetSize(height, m_nWidthActual);
74 m_nWidth = width;
75 m_nHeight = height;
76 }
77
78 __forceinline T& At(int x, int y){
79 _ASSERT(m_afData);
80 _ASSERT(x >= 0 && x < m_nWidth && y >= 0 && y < m_nHeight);
81 return m_aafEntry[y][x];
82 }
83
84 __forceinline const int Width() const {return m_nWidth;}
85 __forceinline const int Height() const {return m_nHeight;}
86
87 // 建立移位的图像
88 void fillShiftedImage(int shift, ImageArray<T>& dst)
89 {
90 for(int i=0;i<m_nHeight;i++)
91 {
92 memcpy(dst[i], m_aafEntry[i] + shift, sizeof(T) * (m_nWidthActual - shift));
93 }
94 }
95
96 // 以下省略
97} ;
// ----------------------------------------------------------------------------
template < class Real >
Real Math < Real > ::FastInvSqrt (Real fValue)
{
// TO DO. This routine was designed for 'float'. Come up with an
// equivalent one for 'double' and specialize the templates for 'float'
// and 'double'.
float fFValue = (float)fValue;
float fHalf = 0.5f*fFValue;
int i = *(int*)&fFValue;
i = 0x5f3759df - (i >> 1);
fFValue = *(float*)&i;
fFValue = fFValue*(1.5f - fHalf*fFValue*fFValue);
return (Real)fFValue;
}
template < class Real >
Real Math < Real > ::FastInvSqrt (Real fValue)
{
// TO DO. This routine was designed for 'float'. Come up with an
// equivalent one for 'double' and specialize the templates for 'float'
// and 'double'.
float fFValue = (float)fValue;
float fHalf = 0.5f*fFValue;
int i = *(int*)&fFValue;
i = 0x5f3759df - (i >> 1);
fFValue = *(float*)&i;
fFValue = fFValue*(1.5f - fHalf*fFValue*fFValue);
return (Real)fFValue;
}
__m128 _mm_sqrt_ps(__m128 a );
SQRTPS
SQRTPS
static __forceinline float ValueDirect( int y, int x, int idx)
{
x = x * 2 + y + 3;
return m_dATAN_LU[x][idx];
}
{
x = x * 2 + y + 3;
return m_dATAN_LU[x][idx];
}
1 magnitudes.resize(GetCount() - 1 , NULL);
2 directions.resize(GetCount() - 1 , NULL);
3
4 ImageArrayf imggm;
5
6 int w = imgScaled[ 0 ] -> Width();
7 int h = imgScaled[ 0 ] -> Height();
8
9 int scnt = GetCount() - 1 ;
10
11 ImageArrayf imgsa(w, h), imgsb(w, h);
12 ImageArray < int > imgsi(w, h), imggx(w, h), imggy(w, h);
13 imggm.CreateDataArray(w, h);
14
15 for ( int s = 1 ; s < (GetCount() - 1 ) ; ++ s) {
16 magnitudes[s] = new ImageArrayf(imgScaled[s]->Width(), imgScaled[s]->Height());
17 directions[s] = new ImageArrayf(imgScaled[s]->Width(), imgScaled[s]->Height());
18}
19
20 __m128 ma, mb, mr;
21 __m128 na, nb, nr;
22 __m128 gl, gr, gtt, gb;
23 __m128 gx, gy, sgx, sgy, sg, sqsg;
24 __m128 gn, gi;
25 __m128i gii;
26 __m128 gzero;
27
28 memset(gzero.m128_f32, 0 , sizeof ( float ) * 4 );
29
30 for ( int i = 0 ;i < 4 ;i ++ )
31 {
32 gn.m128_f32[i] = AtanLookupF32::NDOUBLE();
33}
34
35 for ( int s = 1 ; s < scnt ; ++ s) {
36
37 ImageArrayf& imgt = *imgScaled[s];
38
39 imgt.fillShiftedImage(1, imgsa);
40 imgt.fillShiftedImage(2, imgsb);
41
42 for (int y = 1 ; y < (h - 1) ; ++y) {
43 int x;
44 for (x = 0 ; x < (w - 2) ; x += 4) {
45
46 gl = _mm_load_ps(imgt[y] + x);
47 gr = _mm_load_ps(imgsb[y] + x);
48 gtt = _mm_load_ps(imgsa[y+1] + x);
49 gb = _mm_load_ps(imgsa[y-1] + x);
50
51 _asm
52 {
53 // x0 = right;
54 movaps xmm0, gr;
55
56 // x1 = left;
57 movaps xmm1, gl;
58
59 // x2 = top;
60 movaps xmm2, gtt;
61
62 // x3 = bottom
63 movaps xmm3, gb;
64
65 // x0 = right - left = gx
66 subps xmm0, xmm1;
67
68 // x2 = top - bottom = gy
69 subps xmm2, xmm3;
70
71 // x4 = right
72 movaps xmm4, gr;
73
74 // x6 = top
75 movaps xmm6, gtt;
76
77 // x1 = left - right = -gx;
78 subps xmm1, xmm4;
79
80 // x3 = bottom - top = -gy;
81 subps xmm3, xmm6;
82
83 // x1 = |gx|
84 maxps xmm1, xmm0;
85
86 // x3 = |gy|
87 maxps xmm3, xmm2;
88
89 // gx = x0
90 movaps gx, xmm0;
91
92 // gy = x2
93 movaps gy, xmm2;
94
95 // x1 = |gx| + |gy|
96 addps xmm1, xmm3;
97
98 // x4 = gx;
99 movaps xmm4, xmm0;
100
101 // x6 = gy;
102 movaps xmm6, xmm2;
103
104 // x4 = gx^2;
105 mulps xmm4, xmm4;
106
107 // x6 = gy^2;
108 mulps xmm6, xmm6;
109
110 // x4 = gx^2 + gy^2;
111 addps xmm4, xmm6;
112
113 // x4 = sqrt()
114 sqrtps xmm4, xmm4;
115
116 // sqsg = x4;
117 movaps sqsg, xmm4;
118
119 // x3 = |gy| / (|gx| + |gy|) = dy;
120 divps xmm3, xmm1;
121
122 // x1 = n;
123 movaps xmm1, gn;
124
125 // x3 = |dy| * n;
126 mulps xmm3, xmm1;
127
128 // gi = |dy| * n;
129 movaps gi, xmm3;
130 }
131
132 _mm_store_ps(imggm[y] + x, sqsg);
133
134 gx = _mm_cmpgt_ps(gx, gzero);
135 gy = _mm_cmpgt_ps(gy, gzero);
136
137 _mm_store_si128((__m128i*)(imggx[y] + x), *((__m128i*)&gx));
138 _mm_store_si128((__m128i*)(imggy[y] + x), *((__m128i*)&gy));
139
140 gii = _mm_cvtps_epi32(gi);
141 _mm_store_si128((__m128i*)(imgsi[y] + x), gii);
142 }
143 }
144
145 for (int y = 1 ; y < (h - 1) ; ++y) {
146 for (int x = 1 ; x < (w - 1) ; x ++) {
147 magnitudes[s]->At(x, y) = imggm[y][x-1];
148 directions[s]->At(x, y) = AtanLookupF32::ValueDirect(imggy[y][x-1], imggx[y][x-1], imgsi[y][x-1]);
149 }
150 }
151}
原文:http://www.cppblog.com/fbitw/archive/2007/05/04/23287.html
文章来源: blog.csdn.net,作者:网奇,版权归原作者所有,如需转载,请联系作者。
原文链接:blog.csdn.net/jacke121/article/details/54706128
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)