sse_mathfun.h
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2,
   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/
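/* Quick usage sketch (illustrative only, not part of the library; the function
   and variable names below are made up for the example). All four functions
   take and return a full __m128 of packed floats:

     #include "sse_mathfun.h"

     void example(const float *in, float *out_sin, float *out_exp) {
       v4sf x = _mm_loadu_ps(in);          // load 4 floats (unaligned load)
       _mm_storeu_ps(out_sin, sin_ps(x));  // 4 sines in one call
       _mm_storeu_ps(out_exp, exp_ps(x));  // 4 exponentials in one call
     }
*/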

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#ifdef __clang__
  #pragma clang diagnostic ignored "-Wc++11-narrowing"
#elif __GNUC__
  #pragma GCC diagnostic ignored "-Wnarrowing"
#elif _MSC_VER
  #pragma warning(push)
  #pragma warning(disable: 4838) // possible loss of data
#endif
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
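/* For reference, _PS_CONST(1, 1.0f) below expands to a 16-byte aligned array of
   four identical floats:

     static const float _ps_1[4] __attribute__((aligned(16))) = { 1.0f, 1.0f, 1.0f, 1.0f };

   (shown here with the gcc form of ALIGN16_END; MSVC uses __declspec(align(16))
   instead). Each constant is then loaded into a register with a *(v4sf*) cast,
   e.g. *(v4sf*)_ps_1. */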

_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics.
   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5 ...

   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
  }

#endif // USE_SSE2
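/* These helpers exist only for the SSE1+MMX build: an __m128 cannot be fed
   directly to MMX integer intrinsics, so the union above reinterprets the
   128-bit register as two 64-bit halves. COPY_XMM_TO_MM splits an xmm value
   into two __m64 values and COPY_MM_TO_XMM merges them back, as used by the
   #ifndef USE_SSE2 paths in the functions below. */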

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* mm0:mm1 hold the biased exponent; subtract the bias (0x7f) to get the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);
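  /* The four statements above implement that branch without branching: 'mask'
     is all-ones in the lanes where x < SQRTHF, so those lanes end up with
     e - 1 and (x - 1) + x = x + x - 1.0, while the other lanes keep x - 1.0. */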


  v4sf z = _mm_mul_ps(x,x);

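  /* The mul/add sequence below is Horner's scheme for the degree-8 cephes log
     polynomial P(x) = p0*x^8 + ... + p8; the trailing multiplications by x and
     by z = x*x turn it into the x^3 * P(x) correction term that is combined
     with x, -x^2/2 and the e*log(2) contributions further down. */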
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NaN
  return x;
}

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);
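  /* Together, the truncation above and this correction emulate floorf():
     _mm_cvttps_* rounds toward zero, which over-shoots for negative values, so
     in those lanes tmp > fx and one is subtracted. fx now holds
     floor(x*log2(e) + 0.5), i.e. the integer n of the decomposition. */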

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
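  /* 2^n is built directly from the IEEE-754 bit layout: convert n to int, add
     the exponent bias (0x7f = 127) and shift the result into the exponent
     field (bits 30..23); the resulting bit pattern, reinterpreted as a float,
     is exactly 2^n. */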
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it runs also on old Athlon XPs and the Pentium III of your
   grandmother.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for larger values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   MacOS vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However, the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching to an
   SSE2 version would improve the performance by only about 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif
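  /* Recap of the integer work above: j = (int)(x*4/pi), adjusted to the next
     even value by (j+1) & ~1, identifies the octant. Bit 2 of j (j & 4),
     shifted up to the float sign position (<< 29 lands it in bit 31), tells
     whether the sine changes sign; bit 1 of j (j & 2) selects which of the two
     polynomials below approximates the result. */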

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
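  /* DP1 + DP2 + DP3 ~= pi/4 split across three floats (Cody-Waite style), so
     the three multiply/add steps below subtract y*pi/4 from x with extended
     precision, giving a reduced argument small enough for the two polynomials
     that follow. */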
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);


  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them:
   it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
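/* Usage sketch for sincos_ps (illustrative only; the names below are not part
   of the library):

     void rotate4(const float *angles, float *s_out, float *c_out) {
       v4sf s, c;
       sincos_ps(_mm_loadu_ps(angles), &s, &c); // one call, both results
       _mm_storeu_ps(s_out, s);
       _mm_storeu_ps(c_out, c);
     }
*/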

#ifdef _MSC_VER
  #pragma warning(pop)
#endif