/**
*******************************************************************************
*
* @brief
*  Compute 8x4 SAD
*
* @par   Description
*  Compute 8x4 sum of absolute differences between source and reference block
*
* @param[in] pu1_src
*  Source buffer
*
* @param[in] pu1_ref
*  Reference buffer
*
* @param[in] src_strd
*  Source stride
*
* @param[in] ref_strd
*  Reference stride
*
* @param[in] wd
*  Assumed to be 8
*
* @param[in] ht
*  Assumed to be 4
 *
* @returns
*  SAD
*
* @remarks
*
*******************************************************************************
*/
WORD32 icv_sad_8x4_ssse3(UWORD8 *pu1_src,
                         UWORD8 *pu1_ref,
                         WORD32 src_strd,
                         WORD32 ref_strd,
                         WORD32 wd,
                         WORD32 ht)
{
    WORD32 sad;
    __m128 src_r0, src_r1;
    __m128 ref_r0, ref_r1;
    __m128i res_r0, res_r1;

    UNUSED(wd);
    UNUSED(ht);
    ASSERT(wd == 8);
    ASSERT(ht == 4);

    /* Load source */
    src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
    pu1_src += src_strd;


    /* Load reference */
    ref_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r0 = _mm_loadh_pi (ref_r0, (__m64 *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r1 = _mm_loadh_pi (ref_r1, (__m64 *) (pu1_ref));
    pu1_ref += ref_strd;

    /* Compute SAD for each row */
    res_r0 = _mm_sad_epu8((__m128i)src_r0, (__m128i)ref_r0);
    res_r1 = _mm_sad_epu8((__m128i)src_r1, (__m128i)ref_r1);

    /* Accumulate SAD */
    res_r0 = _mm_add_epi64(res_r0,  res_r1);
    res_r0 = _mm_add_epi64(res_r0, _mm_srli_si128(res_r0, 8));

    sad  = _mm_cvtsi128_si32(res_r0);

    return sad;
}
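For reference, a plain-C sketch of what the routine above computes, assuming wd == 8 / ht == 4 and the same WORD32/UWORD8 typedefs; the helper name below is illustrative, not part of the library. The SSSE3 version simply packs two 8-byte rows per XMM register and lets _mm_sad_epu8 produce the per-row absolute-difference sums.

#include <stdlib.h> /* abs() */

/* Scalar 8x4 SAD sketch */
static WORD32 icv_sad_8x4_scalar(UWORD8 *pu1_src, UWORD8 *pu1_ref,
                                 WORD32 src_strd, WORD32 ref_strd)
{
    WORD32 i, j, sad = 0;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 8; j++)
            sad += abs((WORD32)pu1_src[j] - (WORD32)pu1_ref[j]);

        pu1_src += src_strd;
        pu1_ref += ref_strd;
    }
    return sad;
}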
Example #2
#include <stdio.h>
#include <xmmintrin.h>

/* printf4vector() is assumed to be defined elsewhere in this test program. */
int main() {

	float hi2[2] = {1.f, 2.f};                /* data for the high 64-bit half */
	__v4sf x = {0.f, 0.f, 0.f, 0.f};
	x = _mm_loadh_pi(x, (const __m64 *)hi2);  /* keep the low half of x, load the high half */
	const __v2sf *cx = (const __v2sf *)hi2;
	x = __builtin_ia32_loadhps(x, cx);        /* the GCC builtin behind _mm_loadh_pi */

	{
		__v4sf a = {1,2,3,4};
		__v4sf b = {5,6,7,8};
		__v4sf c = {9,10,11,12};

		printf4vector(&a);
		printf4vector(&b);
		printf4vector(&c);

		//builtin "ia32_mulps" not supported by clang -- use _mm_mul_ps
		__v4sf tmp = _mm_mul_ps(a, b);   // a * b 
		printf4vector(&tmp);

		//builtin "ia32_addps" not supported by clang -- use _mm_add_ps
		__v4sf e =   _mm_add_ps(tmp, c);    // e = (a * b) + c
		printf4vector(&e);
	}
	return 0;
}
Example #3
__m128 test_loadh_pi(__m128 x, void* y) {
  // CHECK: define {{.*}} @test_loadh_pi
  // CHECK: load <2 x float>* {{.*}}, align 1{{$}}
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  return _mm_loadh_pi(x,y);
}
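The CHECK lines above pin down the IR clang is expected to emit for _mm_loadh_pi: a <2 x float> load through the user pointer with align 1 (the pointer need not be 16-byte aligned), followed by shufflevectors that keep lanes 0 and 1 of x and place the two loaded floats into lanes 2 and 3.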
Example #4
#include <stdio.h>
#include <xmmintrin.h>

int main()
{
	float array[4] = {1.f, 2.f, 3.f, 4.f};
	__m128 a = {5.f, 6.f, 7.f, 8.f};

	__m128 value1 = _mm_loadh_pi(a, (__m64*)array);
    float* value = (float*)&value1;

	printf("_mm_loadh_pi: %f %f %f %f\n", value[0], value[1], value[2], value[3]);

	return 0;
}
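For reference, with the inputs above the low half of a (5, 6) is preserved and the high half is replaced by the first two floats of array (1, 2), so the program should print: _mm_loadh_pi: 5.000000 6.000000 1.000000 2.000000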
Example #5
void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
                            int height, const uint8_t *ref, int ref_stride) {
  /* comp_pred and pred must be 16 byte aligned. */
  assert(((intptr_t)comp_pred & 0xf) == 0);
  assert(((intptr_t)pred & 0xf) == 0);
  if (width > 8) {
    int x, y;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; x += 16) {
        const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
        const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
        const __m128i avg = _mm_avg_epu8(p, r);
        _mm_store_si128((__m128i *)(comp_pred + x), avg);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else {  // width must be 4 or 8.
    int i;
    // Process 16 elements at a time. comp_pred and pred have width == stride
    // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
    // all divisible by 16 so just ref needs to be massaged when loading.
    for (i = 0; i < width * height; i += 16) {
      const __m128i p = _mm_load_si128((const __m128i *)pred);
      __m128i r;
      __m128i avg;
      if (width == ref_stride) {
        r = _mm_loadu_si128((const __m128i *)ref);
        ref += 16;
      } else if (width == 4) {
        r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride),
                          loadu_uint32(ref + 2 * ref_stride),
                          loadu_uint32(ref + ref_stride), loadu_uint32(ref));

        ref += 4 * ref_stride;
      } else {
        const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
        assert(width == 8);
        r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
                                          (const __m64 *)(ref + ref_stride)));

        ref += 2 * ref_stride;
      }
      avg = _mm_avg_epu8(p, r);
      _mm_store_si128((__m128i *)comp_pred, avg);

      pred += 16;
      comp_pred += 16;
    }
  }
}
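For clarity, a scalar sketch of what this routine computes; the function name is illustrative, and the rounding matches _mm_avg_epu8, i.e. (p + r + 1) >> 1 per byte.

static void comp_avg_pred_scalar(uint8_t *comp_pred, const uint8_t *pred,
                                 int width, int height, const uint8_t *ref,
                                 int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x)
      comp_pred[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1); /* rounding average */
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}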
Example #6
void decomp_gamma2_plus( spinor_array src, halfspinor_array dst) 
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;


  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero 
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);

  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);

  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);

#endif
 
  /* Swap the lower components */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0xb1);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0xb1);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0xb1);

  xmm9 = _mm_xor_ps(xmm6, signs14.vector);
  xmm10 = _mm_xor_ps(xmm7, signs14.vector);
  xmm11 = _mm_xor_ps(xmm8, signs14.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);


}
Example #7
M_Matrix44
M_MatrixInvert44_SSE(M_Matrix44 A)
{
	M_Matrix44 Ainv;
	float *src = &A.m[0][0];
	float *dst = &Ainv.m[0][0];
	__m128 minor0, minor1, minor2, minor3;
	__m128 row0, row1, row2, row3;
	__m128 det, tmp1;

	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src)),
	                                          (__m64 *)(src+4));
	row1	= _mm_loadh_pi(_mm_loadl_pi(row1, (__m64 *)(src+8)),
	                                          (__m64 *)(src+12));
	row0	= _mm_shuffle_ps(tmp1, row1, 0x88);
	row1	= _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src+2)),
	                                          (__m64 *)(src+6));
	row3	= _mm_loadh_pi(_mm_loadl_pi(row3, (__m64 *)(src+10)),
	                                          (__m64 *)(src+14));
	row2	= _mm_shuffle_ps(tmp1, row3, 0x88);
	row3	= _mm_shuffle_ps(row3, tmp1, 0xDD);

	tmp1	= _mm_mul_ps(row2, row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0	= _mm_mul_ps(row1, tmp1);
	minor1	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1	= _mm_shuffle_ps(minor1, minor1, 0x4E);

	tmp1	= _mm_mul_ps(row1, row2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3	= _mm_shuffle_ps(minor3, minor3, 0x4E);

	tmp1	= _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2	= _mm_shuffle_ps(row2, row2, 0x4E);
	minor0	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2	= _mm_shuffle_ps(minor2, minor2, 0x4E);

	tmp1	= _mm_mul_ps(row0, row1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3	= _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2	= _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));

	tmp1	= _mm_mul_ps(row0, row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2	= _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));

	tmp1	= _mm_mul_ps(row0, row2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

	det	= _mm_mul_ps(row0, minor0);
	det	= _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det	= _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1	= _mm_rcp_ss(det);
	det	= _mm_sub_ss(_mm_add_ss(tmp1, tmp1),
	                     _mm_mul_ss(det, _mm_mul_ss(tmp1,tmp1)));
	det	= _mm_shuffle_ps(det, det, 0x00);

	minor0	= _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64 *)(dst), minor0);
	_mm_storeh_pi((__m64 *)(dst+2), minor0);

	minor1	= _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64 *)(dst+4), minor1);
	_mm_storeh_pi((__m64 *)(dst+6), minor1);
	
	minor2	= _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64 *)(dst+8), minor2);
	_mm_storeh_pi((__m64 *)(dst+10), minor2);
	
	minor3	= _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64 *)(dst+12), minor3);
	_mm_storeh_pi((__m64 *)(dst+14), minor3);

	return (Ainv);
}
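One detail worth noting in the code above (and in the other Cramer's-rule inverses later on this page): _mm_rcp_ss only gives an approximation of 1/det accurate to roughly 12 bits, so the sub/add/mul line that follows it performs one Newton-Raphson step to refine the reciprocal. A scalar sketch of that step:

/* Given x0 ~ 1/d from rcpss, x1 = x0*(2 - d*x0) = 2*x0 - d*x0*x0
 * roughly doubles the number of accurate bits. */
static float refine_rcp(float d, float x0)
{
	return (x0 + x0) - d * (x0 * x0);
}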
Example #8
__m128
test (__m128 a, __m64 *p)
{
  return _mm_loadh_pi (a, p); 
}
Example #9
static inline int    sacIsSampleDegenerate(PROSAC_HEST* p){
	unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3];
	
	/**
	 * Pack the matches selected by the SAC algorithm.
	 * Must be packed  points[0:7]  = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3}
	 *                 points[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3}
	 * Gather 4 points into the vector
	 */
	
	__m128 src10 = _mm_loadl_pi(src10, (__m64*)&p->src[i0]);
	src10        = _mm_loadh_pi(src10, (__m64*)&p->src[i1]);
	__m128 src32 = _mm_loadl_pi(src32, (__m64*)&p->src[i2]);
	src32        = _mm_loadh_pi(src32, (__m64*)&p->src[i3]);
	__m128 dst10 = _mm_loadl_pi(dst10, (__m64*)&p->dst[i0]);
	dst10        = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]);
	__m128 dst32 = _mm_loadl_pi(dst32, (__m64*)&p->dst[i2]);
	dst32        = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]);
	
	
	/**
	 * If the matches' source points have common x and y coordinates, abort.
	 */
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[2].x
	 * packedPoints[0].y == packedPoints[2].y
	 * packedPoints[1].x == packedPoints[3].x
	 * packedPoints[1].y == packedPoints[3].y
	 */
	
	__m128 chkEq0 = _mm_cmpeq_ps(src10, src32);
	
	/**
	 * Check:
	 * packedPoints[1].x == packedPoints[2].x
	 * packedPoints[1].y == packedPoints[2].y
	 * packedPoints[0].x == packedPoints[3].x
	 * packedPoints[0].y == packedPoints[3].y
	 */
	
	__m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32);
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[1].x
	 * packedPoints[0].y == packedPoints[1].y
	 * packedPoints[2].x == packedPoints[3].x
	 * packedPoints[2].y == packedPoints[3].y
	 */
	
	__m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)),
	                             _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2)));
	
	/* Verify */
	if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){
		return 1;
	}
	
	/* If the matches do not satisfy the strong geometric constraint, abort. */
	
	/**
	 * p6420x   = (p6.x, p4.x, p2.x, p0.x)
	 * p6420y   = (p6.y, p4.y, p2.y, p0.y)
	 * p7531x   = (p7.x, p5.x, p3.x, p1.x)
	 * p7531y   = (p7.y, p5.y, p3.y, p1.y)
	 * crosssd0 = p6420y - p7531y                     = (cross2d0, cross0d0, cross2s0, cross0s0)
	 * crosssd1 = p7531x - p6420x                     = (cross2d1, cross0d1, cross2s1, cross0s1)
	 * crosssd2 = p6420x * p7531y  -  p6420y * p7531x = (cross2d2, cross0d2, cross2s2, cross0s2)
	 * 
	 * shufcrosssd0 = (cross0d0, cross2d0, cross0s0, cross2s0)
	 * shufcrosssd1 = (cross0d1, cross2d1, cross0s1, cross2s1)
	 * shufcrosssd2 = (cross0d2, cross2d2, cross0s2, cross2s2)
	 * 
	 * dotsd0   = shufcrosssd0 * p6420x +
	 *            shufcrosssd1 * p6420y + 
	 *            shufcrosssd2
	 *          = (dotd0, dotd2, dots0, dots2)
	 * dotsd1   = shufcrosssd0 * p7531x +
	 *            shufcrosssd1 * p7531y + 
	 *            shufcrosssd2
	 *          = (dotd1, dotd3, dots1, dots3)
	 * 
	 * dots     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(1, 0, 1, 0))
	 * dotd     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(3, 2, 3, 2))
	 *            movmaskps(dots ^ dotd)
	 */
	
	__m128 p3210x       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p3210y       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7654x       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7654y       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p6420x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p6420y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7531x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7531y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1));
	
	__m128 crosssd0     = _mm_sub_ps(p6420y, p7531y);
	__m128 crosssd1     = _mm_sub_ps(p7531x, p6420x);
	__m128 crosssd2     = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x));
	
	__m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1));
	
	__m128 dotsd0       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x),
	                                            _mm_mul_ps(shufcrosssd1, p6420y)),
	                                 shufcrosssd2);
	__m128 dotsd1       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x),
	                                            _mm_mul_ps(shufcrosssd1, p7531y)),
	                                 shufcrosssd2);
	
	__m128 dots         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1));
	__m128 dotd         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3));
	
	//if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){
	if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots), _mm_cvtps_epi32(dotd)), _mm_setzero_si128()))){
		return 1;
	}
	
	
	/* Otherwise, proceed with evaluation */
	_mm_store_ps((float*)&p->pkdPts[0], src10);
	_mm_store_ps((float*)&p->pkdPts[2], src32);
	_mm_store_ps((float*)&p->pkdPts[4], dst10);
	_mm_store_ps((float*)&p->pkdPts[6], dst32);
	
	return 0;
}
Example #10
// Computes the inverse according to Cramer's Rule
// See ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
void Mat44::Cramers_Inverse_SSE(const Mat44 *out, f32 &detv) const
{
	f32 *src = (f32*)&mat;

	__m128 minor0=_mm_setzero_ps(), minor1=_mm_setzero_ps(), minor2=_mm_setzero_ps(), minor3=_mm_setzero_ps();
	__m128 row0=_mm_setzero_ps(),   row1=_mm_setzero_ps(),   row2=_mm_setzero_ps(),   row3=_mm_setzero_ps();
	__m128 det=_mm_setzero_ps(),    tmp1=_mm_setzero_ps();

	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
	
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	
	_mm_store_ss(&detv, det);

	Mat44 t;
	if(out)
	{
		src = (f32*)out->mat;
	}
	else
	{
		src = t.mat;
	}

	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);
};
Example #11
void decomp_gamma3_plus( spinor_array src, halfspinor_array dst) 
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  __m128 xmm6;
  __m128 xmm7;


  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero 
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);

  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);

  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);

#endif
 
  /* Add upper and lower components */
  xmm0 = _mm_add_ps(xmm0, xmm3);
  xmm1 = _mm_add_ps(xmm1, xmm4);
  xmm2 = _mm_add_ps(xmm2, xmm5);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);


}
Example #12
// function that implements the kernel of the seismic modeling algorithm
void seismic_exec(float **VEL, float **PPF, float **APF, float **NPF, float* seismicPulseVector, int spPosX, int spPosY, int xDim, int yDim, int timeSteps)
{
  int i,j;  // spatial loops counters
  int t;    // time loop counter
  #ifdef _VERBOSE
    int progressTimer = -1;
  #endif

  // make sure packing _all_ the data into sets of 4 elements is ok
  assert( xDim % 4 == 0 );

  #ifdef _VERBOSE
    printf("processing...\n");
    printf("point of explosion = %d, %d\n", spPosX, spPosY);
  #endif

  // there are 16 XMM registers in 64 bit mode, so there is no need to spill to stack
  __m128 s_ppf, s_vel, s_actual, s_above1, s_left1, s_under1, s_right1, s_two, s_sixteen, s_sixty;
  __m128 s_above2, s_under2, s_left2, s_right2;

  float two[4] = {2.0f, 2.0f, 2.0f, 2.0f };
  float sixteen[4] = {16.0f,16.0f,16.0f,16.0f};
  float sixty[4] = {60.f,60.f,60.f,60.f};

  // preload XMM registers with constant values.
  s_two = _mm_load_ps( two );
  s_sixteen = _mm_load_ps( sixteen );
  s_sixty = _mm_load_ps( sixty );

  // time loop
  for (t = 0; t < timeSteps; t++)
  {
    #ifdef _VVERBOSE
      printf("----------------------------------------------\ntimestep: %d\n\n", t );
    #endif

    // add pulse
    APF[spPosY][spPosX] += seismicPulseVector[t];

    for(i=2; i<(yDim-2); i++)
    {
      for(j=2 + ALIGNMENT_OFFSET; j<(xDim-2); j+=4)
      {
        s_ppf = _mm_load_ps( &(PPF[i][j]) );
        s_vel = _mm_load_ps( &(VEL[i][j]) );
        s_actual = _mm_load_ps( &(APF[i][j]) );

        s_left1 = _mm_load_ps( &(APF[i-1][j]) );
        s_left2 = _mm_load_ps( &(APF[i-2][j]) );
        s_right2 = _mm_load_ps( &(APF[i+2][j]) );
        s_right1 = _mm_load_ps( &(APF[i+1][j]) );
        s_above1 = _mm_loadu_ps( &(APF[i][j-1]) );
        s_under1 = _mm_loadu_ps( &(APF[i][j+1]) );
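        // Build the j-2 and j+4 neighborhoods from the aligned s_actual plus one
        // 64-bit loadl_pi/loadh_pi each, instead of two more unaligned loads:
        // s_above2 ends up holding APF[i][j-2..j+1] and s_under2 holds APF[i][j+2..j+5].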
        s_above2 = _mm_loadl_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(1, 0, 0, 0)),
                                (__m64*)&(APF[i][j-2]));

        s_under2 = _mm_loadh_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(0, 0, 3, 2)),
                                (__m64*)&(APF[i][j+4]));

        // sum elements with an offset of one
        s_under1 = _mm_add_ps( s_under1, _mm_add_ps( s_above1, _mm_add_ps( s_left1, s_right1)));
        // sum elements with an offset of two
        s_above2 = _mm_add_ps( s_left2, _mm_add_ps( s_right2, _mm_add_ps( s_under2, s_above2)));
        // multiply with 16
        s_under1 = _mm_mul_ps( s_sixteen, s_under1 );
        //
        s_under1 = _mm_sub_ps( _mm_sub_ps( s_under1,  s_above2), _mm_mul_ps( s_sixty, s_actual ) );
        s_under1 = _mm_add_ps( _mm_mul_ps( s_vel, s_under1), _mm_sub_ps(_mm_mul_ps( s_two, s_actual ), s_ppf) );

        // save the result
        _mm_store_ps( &(NPF[i][j]), s_under1);

        #ifdef _VVERBOSE
          printf("[%d][%d]\n", i, j);
        #endif
      }
      #ifdef _VVERBOSE
        printf("\n");
      #endif
    }

    #ifdef _VERBOSE
//     shows one # at each 10% of the total processing time
      if (t/(timeSteps/10) > progressTimer )
      {
        printf("#");
        progressTimer++;
        fflush(stdout);
      }
    #endif

    // switch pointers instead of copying data
    PPF = APF;
    APF = NPF;
    NPF = PPF;
  }

  #ifdef _VERBOSE
    printf("\nend process!\n");
  #endif
}
Example #13
static inline double
calc_output_single (SINC_FILTER *filter, const increment_t increment, const increment_t start_filter_index)
{
#ifdef RESAMPLER_SSE_OPT
	__m128i increment4;
	__m128 left128,right128;
	float left,right;
#else
	double left,right;
#endif
	const coeff_t * const __restrict coeffs = filter->coeffs;
	const float * const __restrict buffer = filter->buffer;
	increment_t	filter_index, max_filter_index ;
	int			data_index, coeff_count;

	/* Convert input parameters into fixed point. */
	max_filter_index = int_to_fp (filter->coeff_half_len) ;

	/* First apply the left half of the filter. */
	filter_index = start_filter_index ;
	coeff_count = (max_filter_index - filter_index) / increment ;
	filter_index = filter_index + coeff_count * increment ;
	data_index = filter->b_current - coeff_count ;

#ifdef RESAMPLER_SSE_OPT
	increment4 = _mm_set_epi32(increment * 3, increment * 2, increment, 0);

	left128 = _mm_setzero_ps();
	while(filter_index >= increment * 3)
	{
#ifdef USE_WINDOWS_CODE
		__m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#else
		Windows__m128i indx;
		indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx.m128i,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#endif
		__m128 icoeff0, icoeff2; // the uninitialized-variable warning here is okay and intended, as both the high and low 64-bit halves are set below
		__m128 icoeff,icoeffp1,icoeffd,fraction;
#ifdef _DEBUG
		icoeff0 = icoeff2 = _mm_setzero_ps();
#endif
#ifdef USE_WINDOWS_CODE
		indx = _mm_srai_epi32(indx, SHIFT_BITS);
#else
		indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS);
#endif

		icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1]));
		icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3]));
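		// Each loadl_pi/loadh_pi pair above gathers the (coeffs[i], coeffs[i+1]) pairs for
		// two of the four filter indices; the shuffles below de-interleave them into
		// icoeff and icoeffp1 so the linear interpolation can run four-wide.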

		icoeff   = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0));
		icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1));

		icoeffd = _mm_sub_ps(icoeffp1, icoeff);
		fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE));
		icoeff = _mm_add_ps(icoeff,_mm_mul_ps(icoeffd, fraction));

		left128 = _mm_add_ps(left128,_mm_mul_ps(icoeff, _mm_loadu_ps(buffer + data_index)));

		data_index += 4;
		filter_index -= increment * 4;
	}
#endif
	left = 0.;

	while (filter_index >= MAKE_INCREMENT_T(0))
	{
		coeff_t fraction = fp_to_float(filter_index);
		int indx = fp_to_int(filter_index);

		coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]);

		left += icoeff * buffer[data_index];

		filter_index -= increment;
		data_index++;
	}

	/* Now apply the right half of the filter. */
	filter_index = increment - start_filter_index ;
	coeff_count = (max_filter_index - filter_index) / increment ;
	filter_index = filter_index + coeff_count * increment ;
	data_index = filter->b_current + 1 + coeff_count ;

#ifdef RESAMPLER_SSE_OPT
	right128 = _mm_setzero_ps();
	while (filter_index > increment * 3)
	{
#ifdef USE_WINDOWS_CODE
		__m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#else
		Windows__m128i indx;
		indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx.m128i, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#endif
		__m128 icoeff0, icoeff2; // the uninitialized-variable warning here is okay and intended, as both the high and low 64-bit halves are set below
		__m128 icoeff,icoeffp1,icoeffd,fraction,data;
#ifdef _DEBUG
		icoeff0 = icoeff2 = _mm_setzero_ps();
#endif
#ifdef USE_WINDOWS_CODE
		indx = _mm_srai_epi32(indx, SHIFT_BITS);
#else
		indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS);
#endif

		icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1]));
		icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3]));

		icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0));
		icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1));

		icoeffd = _mm_sub_ps(icoeffp1, icoeff);
		fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE));
		icoeff = _mm_add_ps(icoeff, _mm_mul_ps(icoeffd, fraction));

		data = _mm_loadu_ps(buffer + (data_index - 3));
		right128 = _mm_add_ps(right128,_mm_mul_ps(icoeff, _mm_shuffle_ps(data,data,_MM_SHUFFLE(0,1,2,3))));

		data_index -= 4;
		filter_index -= increment * 4;
	}
#endif
	right = 0.;

	while (filter_index > MAKE_INCREMENT_T(0))
	{
		coeff_t fraction = fp_to_float(filter_index);
		int indx = fp_to_int(filter_index);

		coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]);

		right += icoeff * buffer[data_index];

		filter_index -= increment;
		data_index--;
	}

	return (
#ifdef RESAMPLER_SSE_OPT
		_mm_cvtss_f32(horizontal_add(left128)) + _mm_cvtss_f32(horizontal_add(right128)) +
#endif
		left + right) ;
} /* calc_output_single */
Example #14
// from intel 
Matrix4x4SSE &Matrix4x4SSE::Invert(void)
{
	float *src = &m_Vec0[0];

	__m128 minor0, minor1, minor2, minor3;
	__m128 det;

	// Initialized only to silence uninitialized-variable warnings; the loads below overwrite these.
	__m128 tmp1 = m_Vec0.m_Vec;
	__m128 row0 = m_Vec0.m_Vec;
	__m128 row1 = m_Vec1.m_Vec;
	__m128 row2 = m_Vec2.m_Vec;
	__m128 row3 = m_Vec3.m_Vec;

	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);

	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	// -----------------------------------------------	
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row1);	
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// -----------------------------------------------
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);

	return *this;
}
Example #15
// Inverts a 4x4 matrix in place. Note: as the code stands, the value returned is
// the Newton-Raphson-refined reciprocal of the determinant (1/det), not the determinant itself.
inline float invert_44_matrix(float* src)
{
	// Code pulled from "Streaming SIMD Extensions - Inverse of 4x4 Matrix"
	// by Intel.
	// ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
	__m128 minor0;
	__m128 minor1;
	__m128 minor2;
	__m128 minor3;
	__m128 row0;
	__m128 row1;
	__m128 row2;
	__m128 row3;
	__m128 det;
	__m128 tmp1;
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// -----------------------------------------------
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);

	return det[0];
}
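A hypothetical usage sketch; no 16-byte alignment of the array is required, since the routine only uses 64-bit movlps/movhps accesses, and, as noted above, the returned value is approximately 1/det as the code stands.

	float m[16] = { 2.f, 0.f, 0.f, 0.f,
	                0.f, 4.f, 0.f, 0.f,
	                0.f, 0.f, 8.f, 0.f,
	                0.f, 0.f, 0.f, 1.f };
	float rdet = invert_44_matrix(m);  /* ~1/64; m now holds diag(0.5, 0.25, 0.125, 1) */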
Example #16
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channels unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of signed int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of signed int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); // Shuffle: the two lowest output channels are picked from a and the two highest from b, using the indices in the shuffle mask.
	aeq(_mm_unpackhi_ps(a, b), 1.f, 8.f, 2.f, 6.f); // Interleave the two highest channels of a and b.
	aeq(_mm_unpacklo_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Interleave the two lowest channels of a and b.

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
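	// Note: _MM_TRANSPOSE4_PS treats each __m128 as one row of a 4x4 matrix in memory
	// order and transposes the set in place, so element (i, j) moves to (j, i). As in the
	// rest of this test, the aeq() expectations below list channels from highest to
	// lowest, i.e. right-to-left with respect to memory order.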
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
Example #17
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }
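  // At this point 'mask' is 0xFF for the pixel columns where the edge passes the filter
  // condition spelled out in the scalar comments above, i.e. where
  //   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit
  // and abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), abs(p3 - p2),
  // abs(q3 - q2) are all <= limit. 'hev' marks high-edge-variance columns where
  // abs(p1 - p0) > thresh or abs(q1 - q0) > thresh.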

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done
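    // In scalar terms (all values in the signed, 0x80-biased domain), the block above is
    // the usual narrow loop filter: filt = clamp(ps1 - qs1) & hev, then
    // filt = clamp(filt + 3 * (qs0 - ps0)) & mask; p0/q0 are adjusted by
    // +(clamp(filt + 3) >> 3) and -(clamp(filt + 4) >> 3), and p1/q1 by
    // +/-(((clamp(filt + 4) >> 3) + 1) >> 1) only where hev is not set.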

    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }
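    // 'flat' is now 0xFF where p3..p1 and q1..q3 all lie within 1 of p0/q0 (and the
    // column passed 'mask'); 'flat2' additionally requires p7..p4 and q4..q7 to lie
    // within 1 of p0/q0 and selects the 15-tap wide filter computed below.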

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
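      // flat2_q0p0 now packs the 15-tap wide-filter outputs, e.g. for p0:
      //   (p7 + p6 + p5 + p4 + p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4
      // and flat_q0p0 the 7-tap outputs, e.g. for p0:
      //   (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3.
      // The remaining flat2_q*/flat_q* results below slide these windows outward one
      // pixel at a time via the sum_p7/sum_q7/sum_p3/sum_q3 updates.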

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);
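    // flat/flat2 are broadcast from their low halves so they cover both the p and q rows.
    // Final per-column blend: the wide (15-tap) result is kept where flat2 & flat & mask
    // is set, the 7-tap result where only flat & mask is set, the narrow-filtered
    // qs*/ps* values where only mask is set, and the original pixels elsewhere.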

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
Example #18
ibMtx4& ibMtx4::Invert()
{
	f32* src = &data.a[0][0];
	__m128 minor0, minor1, minor2, minor3;
	__m128 row0, row1, row2, row3;
	__m128 det, tmp1;
#if !defined NDEBUG || defined STATIC
	// Suppress RTC error for uninit vars
	f32 init = 0.f;
	row3 = row1 = tmp1 = _mm_load_ps1( &init );
#endif // NDEBUG
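	// What follows appears to be the classic Cramer's-rule SSE sequence for a 4x4 inverse
	// (as in Intel's "Streaming SIMD Extensions - Inverse of 4x4 Matrix" note): the source
	// is loaded in transposed order, the cofactor (minor) rows are accumulated from
	// products of 2x2 sub-determinants, the determinant is reduced from row0 * minor0, and
	// the minors are finally scaled by 1/det and stored back.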
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// -----------------------------------------------
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
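	// _mm_rcp_ss only gives a ~12-bit approximation of 1/det; the next two lines apply one
	// Newton-Raphson refinement step, x1 = 2*x0 - det*x0*x0, before broadcasting it.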
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);

	return *this;
}
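// A minimal sanity-check sketch (not part of the original source): it assumes ibMtx4 is
// copyable and exposes its elements publicly through data.a[row][col] as f32, as the
// Invert() implementation above suggests. It multiplies a matrix by its freshly computed
// inverse and verifies that the product is close to the identity.
static bool ibMtx4_InverseSanityCheck( const ibMtx4& m )
{
	ibMtx4 inv = m;   // copy, then invert the copy in place
	inv.Invert();
	for (int r = 0; r < 4; ++r)
	{
		for (int c = 0; c < 4; ++c)
		{
			f32 acc = 0.f;
			for (int k = 0; k < 4; ++k)
				acc += m.data.a[r][k] * inv.data.a[k][c];
			const f32 diff = acc - ((r == c) ? 1.f : 0.f);
			if (diff > 1e-3f || diff < -1e-3f)
				return false;   // product is not (approximately) the identity
		}
	}
	return true;
}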
void
intrin_sse_mult_su3_na(su3_matrixf* aa, su3_matrixf* bb, su3_matrixf* cc)
{

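	 /* Going by the MILC-style naming ("na" = normal x adjoint), this routine computes
	    cc = aa * bb^dagger for 3x3 complex (su3) matrices, processing two rows of bb at a
	    time. _sse_sgn24 and _sse_sgn3 are assumed to be externally defined sign-flip masks
	    (sign-bit xor in selected lanes) used to realise the complex-conjugate multiply. */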
	 /* XMM variables, zero-initialized so the partial _mm_loadl_pi/_mm_loadh_pi loads
	    below never read indeterminate register halves. */
	 __m128 xmm0 = _mm_setzero_ps(), xmm1 = _mm_setzero_ps(), xmm2 = _mm_setzero_ps(), xmm3 = _mm_setzero_ps();
	 __m128 xmm4 = _mm_setzero_ps(), xmm5 = _mm_setzero_ps(), xmm6 = _mm_setzero_ps(), xmm7 = _mm_setzero_ps();

	xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->e[0][0]) );
	xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->e[0][1]) );
	xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->e[0][2]) );
	xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&((bb)->e[1][0]) );
	xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&((bb)->e[1][1]) );
	xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&((bb)->e[1][2]) );
	xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
	xmm4 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
	xmm5 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
	xmm3 = _mm_shuffle_ps( xmm3, xmm3, 0x00 );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm4 = _mm_shuffle_ps( xmm4, xmm4, 0x00 );
	xmm3 = _mm_mul_ps( xmm3, xmm0 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm5 = _mm_shuffle_ps( xmm5, xmm5, 0x00 );
	xmm4 = _mm_mul_ps( xmm4, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm6 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm5 = _mm_mul_ps( xmm5, xmm0 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm5 = _mm_add_ps( xmm5, xmm6 );
	xmm3 = _mm_add_ps( xmm3, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][2].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm4 = _mm_add_ps( xmm4, xmm6 );
	xmm5 = _mm_add_ps( xmm5, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xb1 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0xb1 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xb1 );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm0 = _mm_xor_ps( xmm0, _sse_sgn24.xmm );
	xmm1 = _mm_xor_ps( xmm1, _sse_sgn24.xmm );
	xmm2 = _mm_xor_ps( xmm2, _sse_sgn24.xmm );
	xmm6 = _mm_mul_ps( xmm6, xmm0 );
	xmm7 = _mm_mul_ps( xmm7, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm6 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][2].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm2 );
	xmm7 = _mm_mul_ps( xmm7, xmm0 );
	xmm5 = _mm_add_ps( xmm5, xmm6 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm6 );
	xmm5 = _mm_add_ps( xmm5, xmm7 );
	xmm0 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x00 );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm0 = _mm_mul_ps( xmm0, xmm2 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm0 );
	xmm5 = _mm_add_ps( xmm5, xmm6 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	xmm3 = _mm_xor_ps( xmm3, _sse_sgn24.xmm );
	xmm4 = _mm_xor_ps( xmm4, _sse_sgn24.xmm );
	xmm5 = _mm_xor_ps( xmm5, _sse_sgn24.xmm );
	_mm_storel_pi((__m64 *)&((cc)->e[0][0]), xmm3 );
	_mm_storel_pi((__m64 *)&((cc)->e[1][0]), xmm4 );
	_mm_storel_pi((__m64 *)&((cc)->e[2][0]), xmm5 );
	_mm_storeh_pi((__m64 *)&((cc)->e[0][1]), xmm3 );
	_mm_storeh_pi((__m64 *)&((cc)->e[1][1]), xmm4 );
	_mm_storeh_pi((__m64 *)&((cc)->e[2][1]), xmm5 );
	xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->e[2][0]) );
	xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->e[2][1]) );
	xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->e[2][2]) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 );
	xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
	xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm3 = _mm_mul_ps( xmm3, xmm0 );
	xmm4 = _mm_mul_ps( xmm4, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 );
	xmm0 = _mm_xor_ps( xmm0, _sse_sgn24.xmm );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 );
	xmm1 = _mm_xor_ps( xmm1, _sse_sgn24.xmm );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 );
	xmm2 = _mm_xor_ps( xmm2, _sse_sgn24.xmm );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm4 = _mm_mul_ps( xmm4, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm3 = _mm_xor_ps( xmm3, _sse_sgn24.xmm );
	_mm_storel_pi((__m64 *)&((cc)->e[0][2]), xmm3 );
	_mm_storeh_pi((__m64 *)&((cc)->e[1][2]), xmm3 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
	xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm6 = _mm_add_ps( xmm6, xmm5 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 );
	xmm2 = _mm_xor_ps( xmm2, _sse_sgn3.xmm );
	xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	xmm7 = xmm6;
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	xmm6 = _mm_xor_ps( xmm6, _sse_sgn24.xmm );
	_mm_storel_pi((__m64 *)&((cc)->e[2][2]), xmm6 );
}