/* -----------------------------------------------------------------------------

Copyright (c) 2006 Simon Brown si@sjbrown.co.uk

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

   -------------------------------------------------------------------------- */

#ifndef SQUISH_SIMD_SSE_H
#define SQUISH_SIMD_SSE_H

#include <xmmintrin.h>
#if ( SQUISH_USE_SSE > 1 )
#include <emmintrin.h>
#endif

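// Builds an _mm_shuffle_ps immediate that broadcasts source element a into all four lanes.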
#define SQUISH_SSE_SPLAT( a ) \
    ( ( a ) | ( ( a ) << 2 ) | ( ( a ) << 4 ) | ( ( a ) << 6 ) )

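// Builds an _mm_shuffle_ps immediate from source lanes x, y, z and w, listed
// low lane first (the reverse of the _MM_SHUFFLE argument order).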
#define SQUISH_SSE_SHUF( x, y, z, w ) \
    ( ( x ) | ( ( y ) << 2 ) | ( ( z ) << 4 ) | ( ( w ) << 6 ) )

namespace squish {

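// In the SSE implementation a constant vector is simply a broadcast Vec4.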
#define VEC4_CONST( X ) Vec4( X )

class Vec4
{
public:
    typedef Vec4 const& Arg;

    Vec4() {}

    explicit Vec4( __m128 v ) : m_v( v ) {}

    Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {}

    Vec4& operator=( Vec4 const& arg )
    {
        m_v = arg.m_v;
        return *this;
    }

    explicit Vec4( float s ) : m_v( _mm_set1_ps( s ) ) {}

    Vec4( float x, float y, float z, float w ) : m_v( _mm_setr_ps( x, y, z, w ) ) {}

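    //! Stores the vector to a 16-byte aligned temporary and returns its x, y and z components as a Vec3.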
    Vec3 GetVec3() const
    {
#ifdef __GNUC__
        __attribute__ ((__aligned__ (16))) float c[4];
#else
        __declspec(align(16)) float c[4];
#endif
        _mm_store_ps( c, m_v );
        return Vec3( c[0], c[1], c[2] );
    }

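    // Broadcast a single component across all four lanes.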
    Vec4 SplatX() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) ) ); }
    Vec4 SplatY() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) ) ); }
    Vec4 SplatZ() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 2 ) ) ); }
    Vec4 SplatW() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 3 ) ) ); }

    Vec4& operator+=( Arg v )
    {
        m_v = _mm_add_ps( m_v, v.m_v );
        return *this;
    }

    Vec4& operator-=( Arg v )
    {
        m_v = _mm_sub_ps( m_v, v.m_v );
        return *this;
    }

    Vec4& operator*=( Arg v )
    {
        m_v = _mm_mul_ps( m_v, v.m_v );
        return *this;
    }

    friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right )
    {
        return Vec4( _mm_add_ps( left.m_v, right.m_v ) );
    }

    friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right )
    {
        return Vec4( _mm_sub_ps( left.m_v, right.m_v ) );
    }

    friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right )
    {
        return Vec4( _mm_mul_ps( left.m_v, right.m_v ) );
    }

    //! Returns a*b + c
    friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
    {
        return Vec4( _mm_add_ps( _mm_mul_ps( a.m_v, b.m_v ), c.m_v ) );
    }

    //! Returns -( a*b - c )
    friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
    {
        return Vec4( _mm_sub_ps( c.m_v, _mm_mul_ps( a.m_v, b.m_v ) ) );
    }

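    //! Returns an approximation of 1/v: a hardware reciprocal estimate refined by one Newton-Raphson step.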
    friend Vec4 Reciprocal( Vec4::Arg v )
    {
        // get the reciprocal estimate
        __m128 estimate = _mm_rcp_ps( v.m_v );

        // one round of Newton-Raphson refinement
        __m128 diff = _mm_sub_ps( _mm_set1_ps( 1.0f ), _mm_mul_ps( estimate, v.m_v ) );
        return Vec4( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
    }

    friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )
    {
        return Vec4( _mm_min_ps( left.m_v, right.m_v ) );
    }

    friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )
    {
        return Vec4( _mm_max_ps( left.m_v, right.m_v ) );
    }

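    //! Truncates each component towards zero.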
    friend Vec4 Truncate( Vec4::Arg v )
    {
#if ( SQUISH_USE_SSE == 1 )
        // convert to ints
        __m128 input = v.m_v;
        __m64 lo = _mm_cvttps_pi32( input );
        __m64 hi = _mm_cvttps_pi32( _mm_movehl_ps( input, input ) );

        // convert to floats
        __m128 part = _mm_movelh_ps( input, _mm_cvtpi32_ps( input, hi ) );
        __m128 truncated = _mm_cvtpi32_ps( part, lo );

        // clear out the MMX multimedia state to allow FP calls later
        _mm_empty();
        return Vec4( truncated );
#else
        // use SSE2 instructions
        return Vec4( _mm_cvtepi32_ps( _mm_cvttps_epi32( v.m_v ) ) );
#endif
    }

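    //! Returns true if any component of left is less than the corresponding component of right.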
    friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right )
    {
        __m128 bits = _mm_cmplt_ps( left.m_v, right.m_v );
        int value = _mm_movemask_ps( bits );
        return value != 0;
    }

private:
    __m128 m_v;
};
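
// Illustrative usage only (not part of the library interface); the values shown
// are simply what the wrappers above compute:
//
//   Vec4 a( 1.0f );                                          // ( 1, 1, 1, 1 )
//   Vec4 b( 2.0f, 3.0f, 4.0f, 5.0f );
//   Vec4 r = MultiplyAdd( a, b, VEC4_CONST( 0.5f ) );        // ( 2.5, 3.5, 4.5, 5.5 )
//   bool any = CompareAnyLessThan( r, VEC4_CONST( 3.0f ) );  // true, since 2.5 < 3.0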

} // namespace squish

#endif // ndef SQUISH_SIMD_SSE_H