Ejemplo n.º 1
0
/* Natural logarithm computed for 4 packed floats at once.

   xPtr is reinterpreted as a pointer to __m128 and dereferenced directly.
   NOTE(review): that dereference presumably needs xPtr to be 16-byte
   aligned -- confirm against the definition of v4sfu.

   Lanes whose input is <= 0 are returned with all bits set (a NaN bit
   pattern), because invalid_mask is OR-ed into the result at the end.

   With USE_SSE2 defined the exponent arithmetic is done in one XMM integer
   register; otherwise it is done in two MMX registers, which are released
   with _mm_empty() before any further float work.
*/
__m128 log_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   /* all-ones in every lane where the input is <= 0 (taken before x is clamped) */
   __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

   x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
   /* part 1: x = frexpf(x, &e); */
   /* shift the raw float bits right by 23 so the biased exponent lands in the low bits */
   COPY_XMM_TO_MM(x, mm0, mm1);
   mm0 = _mm_srli_pi32(mm0, 23);
   mm1 = _mm_srli_pi32(mm1, 23);
#else
   /* same shift as the MMX path, done on all four lanes in one register */
   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
   /* keep only the fractional part */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
   x = _mm_or_ps(x, *(__m128*)_ps_0p5);

#ifndef USE_SSE2
   /* now e=mm0:mm1 contain the really base-2 exponent */
   mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
   __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
   _mm_empty(); /* bye bye mmx */
#else
   /* remove the exponent bias, then convert the integer exponent to float */
   emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
   __m128 e = _mm_cvtepi32_ps(emm0);
#endif

   /* presumably the frexpf convention (mantissa in [0.5, 1)), hence exponent + 1 */
   e = _mm_add_ps(e, one);

   /* part2: 
   if( x < SQRTHF ) {
   e -= 1;
   x = x + x - 1.0;
   } else { x = x - 1.0; }
   */
   /* Branch-free form of the pseudo-code above: tmp is x in the lanes where
      the mask is set and 0 elsewhere, so those lanes end up with x + x - 1
      and have 1 subtracted from e; the other lanes end up with x - 1. */
   __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
   __m128 tmp = _mm_and_ps(x, mask);
   x = _mm_sub_ps(x, one);
   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
   x = _mm_add_ps(x, tmp);


   __m128 z = _mm_mul_ps(x,x);

   /* Horner evaluation in x using the coefficients _ps_cephes_log_p0 .. _p8 */
   __m128 y = *(__m128*)_ps_cephes_log_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
   y = _mm_mul_ps(y, x);

   /* polynomial times x*x */
   y = _mm_mul_ps(y, z);


   /* add the exponent contribution e * q1 */
   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
   y = _mm_add_ps(y, tmp);


   /* subtract 0.5 * x*x (assuming _ps_0p5 holds 0.5 in every lane) */
   tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);

   /* final sum: x + y + e * q2 */
   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
   x = _mm_add_ps(x, y);
   x = _mm_add_ps(x, tmp);
   x = _mm_or_ps(x, invalid_mask); // lanes with input <= 0 become all-ones, i.e. NaN
   return x;
}
Ejemplo n.º 2
0
/* Converts the two packed 32-bit integers held in __A and the two held in
   __B to single-precision floats with _mm_cvtpi32x2_ps and returns the four
   results as one __m128 (the values from __A occupy the two low lanes, the
   values from __B the two high lanes).

   The return type is spelled out explicitly: without it the function would
   default to int, which is not valid since C99 and cannot carry an __m128. */
__m128
test (__m64  __A, __m64  __B)
{
  return _mm_cvtpi32x2_ps (__A, __B);
}
Ejemplo n.º 3
0
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine

   Computes sine and cosine of 4 packed floats at once.
   xptr : input, reinterpreted as a pointer to __m128 and read once.
   sptr : output, receives the four sines   (written through an __m128 pointer).
   cptr : output, receives the four cosines (written through an __m128 pointer).
   NOTE(review): the direct __m128 loads/stores presumably need all three
   pointers to be 16-byte aligned -- confirm against the definition of v4sfu.

   With USE_SSE2 defined the integer work is done in XMM registers; otherwise
   it is done in MMX register pairs, released with _mm_empty(). */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
   __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
   __m128i emm0, emm2, emm4;
#else
   __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   /* keep a copy of j for the cosine sign computed further down */
   emm4 = emm2;

   /* get the swap sign flag for the sine */
   /* (j & 4) shifted left by 29 moves bit 2 up to bit 31, the float sign bit */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

   /* get the polynom selection mask for the sine*/
   /* all-ones in the lanes where (j & 2) == 0 */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);

   /* keep a copy of j for the cosine sign computed further down */
   mm4 = mm2;
   mm5 = mm3;

   /* get the swap sign flag for the sine */
   /* (j & 4) shifted left by 29 moves bit 2 up to bit 31, the float sign bit */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   __m128 swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

   /* get the polynom selection mask for the sine */
   /* all-ones in the lanes where (j & 2) == 0 */

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

   /* The magic pass: range reduction
   x = ((x - y * DP1) - y * DP2) - y * DP3;
   The constants come from the _ps_minus_cephes_DP* tables (presumably
   already negated), so the subtractions are carried out as additions. */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
   /* get the sign flag for the cosine: (~(j - 2) & 4) moved up to bit 31 */
   emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
   /* get the sign flag for the cosine: (~(j - 2) & 4) moved up to bit 31 */
   mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   __m128 sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
#endif

   /* combine the sign of the input with the swap flag */
   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
   /* uses the _ps_coscof_p* coefficients; result is poly*z*z - 0.5*z + 1 */
   __m128 z = _mm_mul_ps(x,x);
   y = *(__m128*)_ps_coscof_p0;

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynom.
      The range was originally written here as "Pi/4 <= x <= 0", which is
      empty as written -- NOTE(review): confirm the intended interval.
      Uses the _ps_sincof_p* coefficients; result is poly*z*x + x. */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynoms */  
   /* Where poly_mask is set the sine takes y2 and the cosine takes y; where
      it is clear the roles are swapped. Subtracting the part handed to the
      sine from each polynomial leaves exactly the part owed to the cosine. */
   xmm3 = poly_mask;
   __m128 ysin2 = _mm_and_ps(xmm3, y2);
   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2,ysin2);
   y = _mm_sub_ps(y, ysin1);

   xmm1 = _mm_add_ps(ysin1,ysin2);
   xmm2 = _mm_add_ps(y,y2);

   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
Ejemplo n.º 4
0
/* almost the same as sin_ps

   Cosine computed for 4 packed floats at once.
   xPtr is reinterpreted as a pointer to __m128 and dereferenced directly.
   NOTE(review): that dereference presumably needs xPtr to be 16-byte
   aligned -- confirm against the definition of v4sfu.
   The sign of the input is discarded up front (absolute value).

   With USE_SSE2 defined the integer work is done in XMM registers; otherwise
   it is done in MMX register pairs, released with _mm_empty(). */
__m128 cos_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
   __m128i emm0, emm2;
#else
   __m64 mm0, mm1, mm2, mm3;
#endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   /* j -= 2; y above still holds the unshifted j as a float */
   emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);

   /* get the swap sign flag */
   /* (~j & 4) shifted left by 29 moves bit 2 up to bit 31, the float sign bit */
   emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynom selection mask */
   /* all-ones in the lanes where (j & 2) == 0 */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   __m128 sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);


   /* j -= 2; y above still holds the unshifted j as a float */
   mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);

   /* get the swap sign flag in mm0:mm1 and the 
   polynom selection mask in mm2:mm3 */

   mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);

   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

   __m128 sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
#endif
   /* The magic pass: range reduction
   x = ((x - y * DP1) - y * DP2) - y * DP3;
   The constants come from the _ps_minus_cephes_DP* tables (presumably
   already negated), so the subtractions are carried out as additions. */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
   /* uses the _ps_coscof_p* coefficients; result is poly*z*z - 0.5*z + 1 */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynom.
      The range was originally written here as "Pi/4 <= x <= 0", which is
      empty as written -- NOTE(review): confirm the intended interval.
      Uses the _ps_sincof_p* coefficients; result is poly*z*x + x. */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynoms */  
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2); // keep the second polynom where the mask is set
   y = _mm_andnot_ps(xmm3, y); // keep the first polynom where the mask is clear
   y = _mm_add_ps(y,y2); // the two are disjoint, so the add merges them
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);

   return y;
}
Ejemplo n.º 5
0
/* Exponential computed for 4 packed floats at once.

   xPtr is reinterpreted as a pointer to __m128 and dereferenced directly.
   NOTE(review): that dereference presumably needs xPtr to be 16-byte
   aligned -- confirm against the definition of v4sfu.

   The input is first clamped to the range [_ps_exp_lo, _ps_exp_hi], so
   arguments outside that range return the value at the nearest bound.

   With USE_SSE2 defined the integer work is done in an XMM register;
   otherwise it is done in two MMX registers, released with _mm_empty(). */
__m128 exp_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
   __m128 tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   /* clamp the argument to the supported range */
   x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
   x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);

   /* express exp(x) as exp(g + n*log(2)) */
   fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
   fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);

   /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
   /* step 1 : cast to int */
   tmp = _mm_movehl_ps(tmp, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(tmp);
   /* step 2 : cast back to float */
   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
   /* truncate toward zero, then convert back to float */
   emm0 = _mm_cvttps_epi32(fx);
   tmp  = _mm_cvtepi32_ps(emm0);
#endif
   /* if greater, subtract 1 */
   /* truncation rounds negative values up, so correct those lanes to get floor */
   __m128 mask = _mm_cmpgt_ps(tmp, fx);    
   mask = _mm_and_ps(mask, one);
   fx = _mm_sub_ps(tmp, mask);

   /* x -= fx * C1; x -= fx * C2  (the reduction constant is applied in two parts) */
   tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
   __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
   x = _mm_sub_ps(x, tmp);
   x = _mm_sub_ps(x, z);

   z = _mm_mul_ps(x,x);

   /* Horner evaluation in x using _ps_cephes_exp_p0 .. _p5,
      then y = poly * x*x + x + 1 */
   __m128 y = *(__m128*)_ps_cephes_exp_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, x);
   y = _mm_add_ps(y, one);

   /* build 2^n */
   /* convert fx to an integer, add _pi32_0x7f (presumably the float exponent
      bias) and shift left by 23 so the sum lands in the exponent field */
#ifndef USE_SSE2
   z = _mm_movehl_ps(z, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(z);
   mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
   mm0 = _mm_slli_pi32(mm0, 23); 
   mm1 = _mm_slli_pi32(mm1, 23);

   __m128 pow2n; 
   COPY_MM_TO_XMM(mm0, mm1, pow2n);
   _mm_empty();
#else
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
   __m128 pow2n = _mm_castsi128_ps(emm0);
#endif
   /* scale the polynomial result by 2^n */
   y = _mm_mul_ps(y, pow2n);
   return y;
}
Ejemplo n.º 6
0
// Test driver for the SSE1 (xmmintrin.h) intrinsics.
//
// Each intrinsic is called on fixed inputs obtained from the get_*() helpers
// and its result is compared against hard-coded expected values through
// aeq / aeqi / aeq64 / Assert. The expected lanes passed to aeq and aeqi are
// listed from the highest lane down to the lowest (see _mm_load_ss below,
// whose only non-zero lane is the last argument).
//
// Sections guarded by TEST_M64 cover the __m64 (MMX register) variants.
// Sections guarded by #ifndef __EMSCRIPTEN__ are compiled out for Emscripten
// builds; the TODO next to each guard states why.
//
// At the end the function prints "Success!" when numFailures is 0 and the
// number of failed tests otherwise.
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr2 must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr2 must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	// Each control-register value is read and written straight back, so these
	// calls only check that the accessors are usable; nothing is asserted.
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); // Two lowest outputs picked from a, two highest from b.
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); // Interleave the two highest elements of a and b.
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Interleave the two lowest elements of a and b.

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
Ejemplo n.º 7
0
// Thin wrapper that forwards both __m64 arguments to _mm_cvtpi32x2_ps and
// returns its __m128 result unchanged.
// NOTE(review): the upper-case check comments inside the body look like LLVM
// FileCheck directives matched against the generated IR (two cvtpi2ps calls
// are expected); keep them verbatim when editing this function.
__m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_cvtpi32x2_ps
  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
  return _mm_cvtpi32x2_ps(a, b);
}