/* Silence narrowing-conversion warnings triggered by the 0x80000000-style
   integer initializers below (clang, GCC and MSVC respectively). */
#pragma clang diagnostic ignored "-Wc++11-narrowing"
#pragma GCC diagnostic ignored "-Wnarrowing"
#pragma warning(disable: 4838)

/* 16-byte alignment markers: the __declspec form is the MSVC spelling,
   the __attribute__ form the GCC/clang spelling. */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END __attribute__((aligned(16)))

# include <emmintrin.h>
/* Declare a 16-byte-aligned constant broadcast into all four SSE lanes. */
#define _PS_CONST(Name, Val)                                            \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val)                                            \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val)                                 \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
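/* Example (not part of the original header): what one of these macros
   expands to, and how the resulting array is read back as a packed
   register throughout this file. The v4sf typedef for __m128 is assumed
   to be defined earlier in the header.

     _PS_CONST(1, 1.0f);
     // expands to:
     // static const ALIGN16_BEG float _ps_1[4] ALIGN16_END = { 1.0f, 1.0f, 1.0f, 1.0f };

     v4sf one = *(v4sf*)_ps_1;   // reload the aligned constant as a packed register
*/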
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);   /* smallest positive normalized float */
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);      /* IEEE-754 single-precision exponent field */
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); /* sign + mantissa bits */
_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(0x7f, 0x7f);                         /* 127, the single-precision exponent bias */
 
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
 
#if defined (__MINGW32__)

/* MinGW workaround: some gcc/MinGW versions miscompile _mm_movehl_ps and the
   _mm_cmpxx_ps comparison intrinsics (see gcc bug 21179), so the header
   reimplements them; the wrapper bodies are not shown in this excerpt. */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
/* Shuttle a 128-bit SSE register to/from a pair of 64-bit MMX registers. */
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
    xmm_mm_union u; u.xmm = xmm_;                   \
    mm0_ = u.mm[0]; mm1_ = u.mm[1];                 \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
}
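/* For reference (an assumption, since its definition is elided from this
   excerpt): xmm_mm_union is presumably a union overlaying one __m128 with
   two __m64 halves, along the lines of:

     typedef union xmm_mm_union {
       __m128 xmm;
       __m64  mm[2];
     } xmm_mm_union;
*/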
/* natural logarithm computed for four floats at once; returns NaN for x <= 0.
   Both the MMX and the SSE2 variants of the integer manipulations are shown
   below; the full header selects one of them at compile time. */
v4sf log_ps(v4sf x) {
  v4si emm0;
  v2si mm0, mm1;
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized values */

  /* extract the exponent -- MMX variant ... */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);

  /* ... and the equivalent SSE2 variant */
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);

  /* keep only the mantissa and map it into [0.5, 1) */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

  /* remove the exponent bias (127) and convert to float -- MMX variant ... */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);

  /* ... and the equivalent SSE2 variant */
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);

  e = _mm_add_ps(e, one);
 
  /* if the mantissa is below sqrt(1/2), decrement the exponent and
     use 2*x instead; then subtract 1 so that x is centered around 0 */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);
 
  /* evaluate the polynomial in x with Horner's scheme */
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);
 
  /* combine: log(x) = polynomial part + e*log(2), with log(2) split into
     q2 + q1 for extra precision; the 0.5*z term is the -x^2/2 term of the
     log(1+x) expansion */
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); /* force NaN for arguments <= 0 */
  return x;
}
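/* Usage sketch (not part of the original header; assumes this file is
   available as "sse_mathfun.h" and compiled with SSE2 enabled): the natural
   log of four floats computed in one call.

     #include <stdio.h>
     #include "sse_mathfun.h"

     int main(void) {
       ALIGN16_BEG float in[4] ALIGN16_END  = { 0.5f, 1.0f, 2.718281828f, 10.0f };
       ALIGN16_BEG float out[4] ALIGN16_END;
       _mm_store_ps(out, log_ps(_mm_load_ps(in)));
       printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]); // ~ -0.693 0.000 1.000 2.303
       return 0;
     }
*/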
 
_PS_CONST(exp_hi,   88.3762626647949f);
_PS_CONST(exp_lo,   -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
 
/* exp computed for four floats at once */
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
  v4si emm0;
  v2si mm0, mm1;
  v4sf one = *(v4sf*)_ps_1;

  /* clamp the argument to the range where expf() neither overflows nor underflows */
  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)): fx = x/log(2), rounded to nearest */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* truncate fx to an integer -- MMX variant ... */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);

  /* ... and the equivalent SSE2 variant */
  emm0 = _mm_cvttps_epi32(fx);
  tmp  = _mm_cvtepi32_ps(emm0);

  /* if the truncation overshot (negative inputs), subtract 1 to get floor(fx) */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  /* reduce x by fx*log(2), with log(2) split into C1 + C2 for precision */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);
 
  z = _mm_mul_ps(x,x);

  /* polynomial approximation of exp() on the reduced range (Horner's scheme) */
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);
 
  /* build 2^n by placing n+127 in the exponent field -- MMX variant ... */
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);
  COPY_MM_TO_XMM(mm0, mm1, pow2n);

  /* ... and the equivalent SSE2 variant */
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);

  y = _mm_mul_ps(y, pow2n);
  return y;
}
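/* Usage sketch (not part of the original header; assumes this file is
   included as "sse_mathfun.h"): exp of four floats at once.

     v4sf v = _mm_set_ps(2.0f, 1.0f, 0.0f, -1.0f); // lanes: -1, 0, 1, 2
     v4sf e = exp_ps(v);                           // ~ 0.3679, 1.0, 2.7183, 7.3891
*/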
 
/* minus_cephes_DP1..DP3 sum to -pi/4 in extended precision; they are used
   for Cody-Waite style argument reduction in the trig functions below */
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); /* 4 / pi */
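/* Illustration (not from the original header): the scalar idea behind the
   SIMD range reduction used by sin_ps/cos_ps/sincos_ps below. The octave
   index j selects the sign and whether the sine or the cosine polynomial is
   evaluated, and x (nonnegative here, since the SIMD code takes |x| first)
   is reduced to roughly [-pi/4, pi/4] using the three-part pi/4.

     int j = (int)(x * 1.27323954473516f);   // x * 4/pi
     j = (j + 1) & ~1;                       // round j up to an even octave
     float y = (float)j;
     x = ((x - y * 0.78515625f)              // x - y*DP1
            - y * 2.4187564849853515625e-4f) // ... - y*DP2
            - y * 3.77489497744594108e-8f;   // ... - y*DP3
*/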
 
/* computes the sine of the four floats in x */
v4sf sin_ps(v4sf x) {
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
  v4si emm0, emm2;
  v2si mm0, mm1, mm2, mm3;

  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (the upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  /* SSE2 variant of the octave computation:
     store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j = (j+1) & (~1), see the cephes sources */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask: one polynomial covers
     0 <= x <= pi/4 and another pi/4 < x <= pi/2; both are computed */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
 
  /* MMX variant of the same octave computation */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j = (j+1) & (~1) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
 
  /* the magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
 
  /* evaluate the cosine polynomial branch: 1 - z/2 + z^2*P(z), with z = x^2 */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
 
  /* evaluate the sine polynomial branch: x + x*z*P2(z) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two branches and apply the sign */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);

  y = _mm_xor_ps(y, sign_bit);
  return y;
}
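/* Usage sketch (not part of the original header): sine of four angles at
   once, checked lane-by-lane against the libm sinf().

     ALIGN16_BEG float a[4] ALIGN16_END = { 0.0f, 0.5f, 1.5f, 3.0f };
     ALIGN16_BEG float r[4] ALIGN16_END;
     _mm_store_ps(r, sin_ps(_mm_load_ps(a)));
     for (int i = 0; i < 4; ++i)
       assert(fabsf(r[i] - sinf(a[i])) < 1e-6f);   // needs <assert.h> and <math.h>
*/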
 
/* computes the cosine of the four floats in x */
v4sf cos_ps(v4sf x) {
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
  v4si emm0, emm2;
  v2si mm0, mm1, mm2, mm3;

  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  /* SSE2 variant of the octave computation:
     store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j = (j+1) & (~1), see the cephes sources */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
 
  /* MMX variant of the same octave computation */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j = (j+1) & (~1) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the polynomial selection mask in mm2:mm3 */
  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 
  /* the magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* evaluate the cosine polynomial branch: 1 - z/2 + z^2*P(z), with z = x^2 */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);
 
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
 
  /* evaluate the sine polynomial branch: x + x*z*P2(z) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two branches and apply the sign */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);

  y = _mm_xor_ps(y, sign_bit);
  return y;
}
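/* Usage sketch (not part of the original header): cosine of four angles.

     v4sf a = _mm_set_ps(3.14159265f, 1.57079633f, 0.5f, 0.0f); // lanes: 0, 0.5, pi/2, pi
     v4sf c = cos_ps(a);                                        // ~ 1.0, 0.8776, 0.0, -1.0
*/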
 
/* computes the sine and the cosine of the four floats in x in a single pass */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
  v4si emm0, emm2, emm4;
  v2si mm0, mm1, mm2, mm3, mm4, mm5;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (the upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  /* SSE2 variant of the octave computation:
     store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j = (j+1) & (~1), see the cephes sources */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;  /* keep a copy of the octave index for the cosine sign below */

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
 
  /* MMX variant of the same octave computation */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j = (j+1) & (~1) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;  /* keep a copy of the octave index for the cosine sign below */
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 
  /* the magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
 
  /* get the sign flag for the cosine -- SSE2 variant ... */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

  /* ... and the equivalent MMX variant */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
 
  /* evaluate the cosine polynomial branch: 1 - z/2 + z^2*P(z), with z = x^2 */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* evaluate the sine polynomial branch: x + x*z*P2(z) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
 
  /* select the correct result from the two branches: ysin1/ysin2 carry the
     sine contribution, the remainders the cosine contribution */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the signs and write both results */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
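/* Usage sketch (not part of the original header): one call yields both the
   sine and the cosine, cheaper than calling sin_ps and cos_ps separately.

     v4sf angles = _mm_set_ps(0.3f, 0.2f, 0.1f, 0.0f);
     v4sf s, c;
     sincos_ps(angles, &s, &c);
     // lane i of s ~ sinf(angle i), lane i of c ~ cosf(angle i)
*/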
 