/* Silence the narrowing warnings triggered by the 0x80000000 constants below,
   on each supported compiler. */
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wc++11-narrowing"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wnarrowing"
#elif defined(_MSC_VER)
#pragma warning(disable: 4838)
#endif
/* MSVC wants the alignment attribute before the declaration, gcc/clang after it. */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
#include <xmmintrin.h> /* SSE1 */

typedef __m128 v4sf;   /* vector of 4 floats (SSE1) */

#ifdef USE_SSE2
# include <emmintrin.h> /* SSE2 */
typedef __m128i v4si;  /* vector of 4 ints (SSE2) */
#else
typedef __m64 v2si;    /* vector of 2 ints (MMX) */
#endif
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
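/* Illustration (added commentary, not from the original header): a hedged example
   of what these macros expand to. _PS_CONST(1, 1.0f) declares
     static const ALIGN16_BEG float _ps_1[4] ALIGN16_END = { 1.0f, 1.0f, 1.0f, 1.0f };
   i.e. a 16-byte-aligned block of four identical floats that the code below loads
   with a reinterpreting dereference such as *(v4sf*)_ps_1. */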
_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
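/* Note (added commentary, not from the original header): log_ps below follows the
   Cephes logf scheme. The input is decomposed as x = m * 2^e with m roughly in
   [sqrt(0.5), sqrt(2)), log(m) is approximated by the degree-8 polynomial above,
   and e*log(2) is added back in two parts: cephes_log_q2 is a short, exactly
   representable chunk of log(2) and cephes_log_q1 the small remainder, so the
   final sum loses as little precision as possible. */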
#if defined (__MINGW32__)
/* Workarounds for gcc bug 21179: on some mingw gcc versions _mm_movehl_ps and the
   _mm_cmp*_ps intrinsics are miscompiled, so they are reimplemented with inline
   assembly and the intrinsics are redefined to point at these wrappers. */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm ("movhlps %2,%0" : "=x"(a) : "0"(a), "x"(b));
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm ("cmpltps %2,%0" : "=x"(a) : "0"(a), "x"(b));
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm ("cmpnleps %2,%0" : "=x"(a) : "0"(a), "x"(b));
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm ("cmpeqps %2,%0" : "=x"(a) : "0"(a), "x"(b));
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif
#ifndef USE_SSE2
/* Move the two 64-bit halves of an SSE register into a pair of MMX registers and back. */
typedef union xmm_mm_union { __m128 xmm; __m64 mm[2]; } xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_; mm0_ = u.mm[0]; mm1_ = u.mm[1]; \
}
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
#endif /* USE_SSE2 */
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0 */
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  /* evaluate the polynomial approximation of log(1+x) */
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  /* add e*log(2), split in two parts for precision */
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); /* negative arg will be NaN */
  return x;
}
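/* Usage sketch (illustrative, not part of the original header): process four
   floats at once and store the results back to memory.

     ALIGN16_BEG float in[4] ALIGN16_END  = { 0.5f, 1.0f, 2.0f, 8.0f };
     ALIGN16_BEG float out[4] ALIGN16_END;
     _mm_store_ps(out, log_ps(_mm_load_ps(in)));
     // out is now approximately { -0.6931f, 0.0f, 0.6931f, 2.0794f }
*/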
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
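/* Note (added commentary, not from the original header): exp_ps uses the usual
   Cephes range reduction exp(x) = 2^n * exp(g), with n = round(x * log2(e)) and
   g = x - n*log(2); log(2) is again applied as the two-part constant
   cephes_exp_C1 + cephes_exp_C2 to keep g accurate. The clamps exp_hi/exp_lo
   bound x so that 2^n stays representable in single precision. */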
/* exponential computed for 4 simultaneous floats */
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: truncate, then fix up below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
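/* Usage sketch (illustrative, not part of the original header):

     v4sf v = exp_ps(_mm_set_ps(2.0f, 1.0f, 0.0f, -1.0f)); // _mm_set_ps lists lanes high to low
     // lanes 0..3 of v are approximately { 1/e, 1, e, e*e }
*/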
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); /* 4 / M_PI */
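/* Note (added commentary, not from the original header): sin_ps/cos_ps below
   reduce the argument modulo pi/4. cephes_FOPI is 4/pi, so j = (int)(x * 4/pi)
   (rounded up to an even value) selects the octant; x is then reduced by
   j * pi/4 using the three-part constant DP1 + DP2 + DP3 ~= -pi/4 ("extended
   precision modular arithmetic"), and the octant picks between the sine and
   cosine polynomials and fixes the sign of the result. */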
/* sine computed for 4 simultaneous floats; works for any x */
v4sf sin_ps(v4sf x) {
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask:
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2.
     Both branches will be computed. */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
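/* Usage sketch (illustrative, not part of the original header):

     v4sf a = _mm_set_ps(3.14159265f, 1.57079633f, 0.5f, 0.0f);
     v4sf s = sin_ps(a); // lanes 0..3 ~= { 0.0f, 0.4794f, 1.0f, ~0.0f }
*/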
/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) {
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */
  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
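/* Note (added commentary, not from the original header): cos_ps differs from
   sin_ps only in the octant handling: the input's sign bit is simply discarded
   (cosine is even), the octant index is shifted by 2 because cos(x) = sin(x + pi/2),
   and the resulting sign flag is built with an andnot instead of an and. */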
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of
   them; it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the signs */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
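/* Usage sketch (illustrative, not part of the original header): one call gives
   both the sine and the cosine of four angles.

     ALIGN16_BEG float angles[4] ALIGN16_END = { 0.0f, 0.7853982f, 1.5707963f, 3.1415927f };
     v4sf s, c;
     sincos_ps(_mm_load_ps(angles), &s, &c);
     // s ~= { 0, 0.7071, 1, ~0 },  c ~= { 1, 0.7071, ~0, -1 }
*/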