AVIR
High-quality pro image resizing library
 All Classes Files Functions Variables Typedefs Macros
avir_float4_sse.h
Go to the documentation of this file.
1 //$ nobt
2 //$ nocpp
3 
15 #ifndef AVIR_FLOAT4_SSE_INCLUDED
16 #define AVIR_FLOAT4_SSE_INCLUDED
17 
18 #include <xmmintrin.h>
19 #include <emmintrin.h>
20 
21 namespace avir {
22 
/**
 * @brief SIMD packed 4-float type.
 *
 * Thin wrapper around one SSE `__m128` register. It provides construction
 * from scalars and raw registers, full and partial loads/stores, element-wise
 * arithmetic, and a horizontal sum. Converting a `float` to this type
 * broadcasts the scalar to all four elements; converting this type to `float`
 * yields the lowest element only.
 */

class float4
{
public:
    /**
     * Default constructor. Performs no initialization of `value`.
     */

    float4()
    {
    }

    /**
     * Copy constructor.
     *
     * @param s Object whose register is copied.
     */

    float4( const float4& s )
        : value( s.value )
    {
    }

    /**
     * Wraps a raw SSE register.
     *
     * @param s Register to store as-is.
     */

    float4( const __m128 s )
        : value( s )
    {
    }

    /**
     * Broadcasting constructor.
     *
     * @param s Scalar replicated into all four elements.
     */

    float4( const float s )
        : value( _mm_set1_ps( s ))
    {
    }

    /**
     * Copy assignment.
     *
     * @param s Object whose register is copied.
     * @return Reference to *this.
     */

    float4& operator = ( const float4& s )
    {
        value = s.value;

        return *this;
    }

    /**
     * Assignment from a raw SSE register.
     *
     * @param s Register to store as-is.
     * @return Reference to *this.
     */

    float4& operator = ( const __m128 s )
    {
        value = s;

        return *this;
    }

    /**
     * Broadcasting assignment.
     *
     * @param s Scalar replicated into all four elements.
     * @return Reference to *this.
     */

    float4& operator = ( const float s )
    {
        value = _mm_set1_ps( s );

        return *this;
    }

    /**
     * @return The lowest (first) element of the register.
     */

    operator float () const
    {
        return _mm_cvtss_f32( value );
    }

    /**
     * Loads four elements using the aligned-load intrinsic.
     *
     * @param p Source pointer; must satisfy the 16-byte alignment required
     * by `_mm_load_ps`.
     * @return The four loaded elements.
     */

    static float4 load( const float* const p )
    {
        return float4( _mm_load_ps( p ));
    }

    /**
     * Loads four elements from a possibly unaligned address.
     *
     * @param p Source pointer.
     * @return The four loaded elements.
     */

    static float4 loadu( const float* const p )
    {
        return float4( _mm_loadu_ps( p ));
    }

    /**
     * Loads up to four elements from a possibly unaligned address; elements
     * that are not loaded are set to zero.
     *
     * @param p Source pointer.
     * @param lim Number of elements to read. Values above 3 read all four
     * elements; any value below 2 still reads p[ 0 ].
     * @return The loaded elements, zero-padded at the top.
     */

    static float4 loadu( const float* const p, int lim )
    {
        if( lim > 3 )
        {
            return float4( _mm_loadu_ps( p ));
        }

        if( lim == 3 )
        {
            return float4( _mm_set_ps( 0.0f, p[ 2 ], p[ 1 ], p[ 0 ]));
        }

        if( lim == 2 )
        {
            return float4( _mm_set_ps( 0.0f, 0.0f, p[ 1 ], p[ 0 ]));
        }

        // Single-element case: the scalar load zeroes the upper three lanes.

        return float4( _mm_load_ss( p ));
    }

    /**
     * Stores all four elements using the aligned-store intrinsic.
     *
     * @param[out] p Destination pointer; must satisfy the 16-byte alignment
     * required by `_mm_store_ps`.
     */

    void store( float* const p ) const
    {
        _mm_store_ps( p, value );
    }

    /**
     * Stores all four elements to a possibly unaligned address.
     *
     * @param[out] p Destination pointer.
     */

    void storeu( float* const p ) const
    {
        _mm_storeu_ps( p, value );
    }

    /**
     * Stores up to four leading elements to a possibly unaligned address.
     * Memory beyond the stored elements is left untouched.
     *
     * @param[out] p Destination pointer.
     * @param lim Number of elements to write. Values above 3 write all four
     * elements; any value below 2 still writes p[ 0 ].
     */

    void storeu( float* const p, int lim ) const
    {
        if( lim > 3 )
        {
            _mm_storeu_ps( p, value );
            return;
        }

        if( lim == 3 )
        {
            // Elements 0 and 1 go out as one 64-bit store; element 2 is
            // first moved to the lowest lane and then stored as a scalar.

            _mm_storel_pi( (__m64*) p, value );
            _mm_store_ss( p + 2, _mm_movehl_ps( value, value ));
            return;
        }

        if( lim == 2 )
        {
            _mm_storel_pi( (__m64*) p, value );
            return;
        }

        _mm_store_ss( p, value );
    }

    /**
     * Element-wise in-place addition.
     *
     * @param s Right-hand operand.
     * @return Reference to *this.
     */

    float4& operator += ( const float4& s )
    {
        value = _mm_add_ps( value, s.value );

        return *this;
    }

    /**
     * Element-wise in-place subtraction.
     *
     * @param s Right-hand operand.
     * @return Reference to *this.
     */

    float4& operator -= ( const float4& s )
    {
        value = _mm_sub_ps( value, s.value );

        return *this;
    }

    /**
     * Element-wise in-place multiplication.
     *
     * @param s Right-hand operand.
     * @return Reference to *this.
     */

    float4& operator *= ( const float4& s )
    {
        value = _mm_mul_ps( value, s.value );

        return *this;
    }

    /**
     * Element-wise in-place division.
     *
     * @param s Right-hand operand (divisor).
     * @return Reference to *this.
     */

    float4& operator /= ( const float4& s )
    {
        value = _mm_div_ps( value, s.value );

        return *this;
    }

    /**
     * @param s Right-hand operand.
     * @return Element-wise sum of *this and `s`.
     */

    float4 operator + ( const float4& s ) const
    {
        return float4( _mm_add_ps( value, s.value ));
    }

    /**
     * @param s Right-hand operand.
     * @return Element-wise difference of *this and `s`.
     */

    float4 operator - ( const float4& s ) const
    {
        return float4( _mm_sub_ps( value, s.value ));
    }

    /**
     * @param s Right-hand operand.
     * @return Element-wise product of *this and `s`.
     */

    float4 operator * ( const float4& s ) const
    {
        return float4( _mm_mul_ps( value, s.value ));
    }

    /**
     * @param s Right-hand operand (divisor).
     * @return Element-wise quotient of *this and `s`.
     */

    float4 operator / ( const float4& s ) const
    {
        return float4( _mm_div_ps( value, s.value ));
    }

    /**
     * @return Horizontal sum of the four elements, evaluated as
     * ( e0 + e2 ) + ( e1 + e3 ).
     */

    float hadd() const
    {
        // Fold the upper pair onto the lower pair: lane 0 becomes e0 + e2,
        // lane 1 becomes e1 + e3.

        const __m128 upperPair = _mm_movehl_ps( value, value );
        const __m128 pairSums = _mm_add_ps( value, upperPair );

        // Bring lane 1 down to lane 0 and add the two partial sums.

        const __m128 lane1 = _mm_shuffle_ps( pairSums, pairSums, 1 );

        return _mm_cvtss_f32( _mm_add_ss( pairSums, lane1 ));
    }

    /**
     * Adds `v` to the four elements at a possibly unaligned address,
     * in place.
     *
     * @param[in,out] p Pointer to the elements to update.
     * @param v Values to add.
     */

    static void addu( float* const p, const float4& v )
    {
        const float4 sum = loadu( p ) + v;

        sum.storeu( p );
    }

    /**
     * Adds `v` to up to four elements at a possibly unaligned address,
     * in place. Uses the limited `loadu` and `storeu` overloads with the
     * same `lim`.
     *
     * @param[in,out] p Pointer to the elements to update.
     * @param v Values to add.
     * @param lim Number of elements to update; interpreted as in
     * `loadu( p, lim )` and `storeu( p, lim )`.
     */

    static void addu( float* const p, const float4& v, const int lim )
    {
        const float4 sum = loadu( p, lim ) + v;

        sum.storeu( p, lim );
    }

    __m128 value; ///< Packed storage for the four single-precision elements.
};
275 
283 inline float4 round( const float4& v )
284 {
285  unsigned int prevrm = _MM_GET_ROUNDING_MODE();
286  _MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );
287 
288  const __m128 res = _mm_cvtepi32_ps( _mm_cvtps_epi32( v.value ));
289 
290  _MM_SET_ROUNDING_MODE( prevrm );
291 
292  return( res );
293 }
294 
305 inline float4 clamp( const float4& Value, const float4& minv,
306  const float4& maxv )
307 {
308  return( _mm_min_ps( _mm_max_ps( Value.value, minv.value ), maxv.value ));
309 }
310 
311 typedef fpclass_def< avir :: float4, float > fpclass_float4;
312 
317 } // namespace avir
318 
319 #endif // AVIR_FLOAT4_SSE_INCLUDED
static float4 loadu(const float *const p, int lim)
Definition: avir_float4_sse.h:107
static float4 loadu(const float *const p)
Definition: avir_float4_sse.h:94
SIMD packed 4-float type.
Definition: avir_float4_sse.h:32
void store(float *const p) const
Definition: avir_float4_sse.h:139
float hadd() const
Definition: avir_float4_sse.h:238
static float4 load(const float *const p)
Definition: avir_float4_sse.h:83
void storeu(float *const p) const
Definition: avir_float4_sse.h:150
static void addu(float *const p, const float4 &v, const int lim)
Definition: avir_float4_sse.h:267
static void addu(float *const p, const float4 &v)
Definition: avir_float4_sse.h:253
void storeu(float *const p, int lim) const
Definition: avir_float4_sse.h:163
__m128 value
Definition: avir_float4_sse.h:272