Example #1
File: sse3.cpp Project: BGmot/Qt
#include <pmmintrin.h>

int main(int, char**)
{
    volatile __m128d a = _mm_set1_pd(6.28);
    volatile __m128d b = _mm_set1_pd(3.14);
    volatile __m128d result = _mm_addsub_pd(a, b);
    result = _mm_movedup_pd(result);
    return 0;
}
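This is Qt's configure-time feature test: it only has to compile and link, which is why the results are never used and the volatile qualifiers keep the intrinsics from being optimized away. As a quick illustration of what the two intrinsics actually compute, here is a hypothetical standalone variant (assuming an SSE3-capable compiler, e.g. built with -msse3) that prints the results:

#include <pmmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d a = _mm_set1_pd(6.28);
    __m128d b = _mm_set1_pd(3.14);
    /* addsub: lane 0 = a0 - b0, lane 1 = a1 + b1 */
    __m128d r = _mm_addsub_pd(a, b);
    double out[2];
    _mm_storeu_pd(out, r);
    printf("%.2f %.2f\n", out[0], out[1]);   /* 3.14 9.42 */
    /* movedup: duplicate lane 0 into both lanes */
    r = _mm_movedup_pd(r);
    _mm_storeu_pd(out, r);
    printf("%.2f %.2f\n", out[0], out[1]);   /* 3.14 3.14 */
    return 0;
}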
Example #2
#include <pmmintrin.h>

static __inline __m128d ZMUL(__m128d a, __m128d b)
{
    __m128d ar, ai;

    ar = _mm_movedup_pd(a);       /* ar = [a.r a.r] */
    ar = _mm_mul_pd(ar, b);       /* ar = [a.r*b.r a.r*b.i] */
    ai = _mm_unpackhi_pd(a, a);   /* ai = [a.i a.i] */
    b = _mm_shuffle_pd(b, b, 1);  /* b = [b.i b.r] */
    ai = _mm_mul_pd(ai, b);       /* ai = [a.i*b.i a.i*b.r] */

    return _mm_addsub_pd(ar, ai); /* [a.r*b.r-a.i*b.i a.r*b.i+a.i*b.r] */
}
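ZMUL multiplies two complex doubles packed as [real imag] with the real part in the low lane: a single addsub combines the real-part subtraction and the imaginary-part addition in one instruction. A hypothetical driver (assuming ZMUL above is in scope) checks it against the textbook product (1 + 2i)(3 + 4i) = -5 + 10i:

#include <pmmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d a = _mm_set_pd(2.0, 1.0);   /* 1 + 2i: real in the low lane */
    __m128d b = _mm_set_pd(4.0, 3.0);   /* 3 + 4i */
    double out[2];
    _mm_storeu_pd(out, ZMUL(a, b));
    printf("%g + %gi\n", out[0], out[1]);   /* -5 + 10i */
    return 0;
}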
Example #3
#include <caml/mlvalues.h>
#include <caml/alloc.h>
#include <caml/memory.h>
#include <pmmintrin.h>

value complex_mul(value vab, value vcd)
{
    CAMLparam2(vab, vcd);
    CAMLlocal1(vz);
    vz = caml_alloc(Double_array_tag, 2);      /* result float array [re; im] */
    __m128d ab, cd, aa, bb, ac_ad, bd_bc;
    ab = _mm_loadu_pd((double const*) vab);    /* [a b] */
    cd = _mm_loadu_pd((double const*) vcd);    /* [c d] */
    aa    = _mm_movedup_pd(ab);                /* [a a] */
    ac_ad = _mm_mul_pd(aa, cd);                /* [a*c a*d] */
    bb    = _mm_unpackhi_pd(ab, ab);           /* [b b] */
    cd    = _mm_shuffle_pd(cd, cd, 1);         /* [d c] */
    bd_bc = _mm_mul_pd(bb, cd);                /* [b*d b*c] */
    /* (a+bi)(c+di) = (a*c - b*d) + (a*d + b*c)i */
    _mm_storeu_pd((double*) vz, _mm_addsub_pd(ac_ad, bd_bc));
    CAMLreturn(vz);
}
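On the OCaml side, a stub like this would typically be bound with a declaration along the lines of external complex_mul : float array -> float array -> float array = "complex_mul" (hypothetical; the actual module is not shown). Blocks tagged Double_array_tag are OCaml float arrays whose payload is raw doubles, which is why the value pointers can be cast directly to double* here.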
Example #4
#include <pmmintrin.h>

/* presumably defined in the surrounding source: */
typedef struct { double real; double imag; } dcomplex;
typedef union  { __m128d v; double d[2]; } v2df_t;

dcomplex zdotc_( int*      n,
                 dcomplex* x, int* inc_x,
                 dcomplex* z, int* inc_z )
{
	dcomplex* restrict x1;
	dcomplex* restrict z1;
	int                i;
	v2df_t rho1v;
	v2df_t z11v, z12v;
	v2df_t x1v, x1rv;
	dcomplex rho;
	int    n1 = *n;
	int    incx = *inc_x;
	int    incz = *inc_z;

	x1 = x;
	z1 = z;

	rho1v.v = _mm_setzero_pd();

	{
		v2df_t bcac, adbd;

		for ( i = 0; i < n1; ++i )
		{
			z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );  /* [z.r z.r] */
			z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );  /* [z.i z.i] */

			x1v.v  = _mm_load_pd( ( double* )x1 );              /* [x.r x.i] */
			x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) ); /* [x.i x.r] */
			bcac.v = x1rv.v * z11v.v;                           /* [x.i*z.r x.r*z.r] */
			adbd.v = x1v.v  * z12v.v;                           /* [x.r*z.i x.i*z.i] */
			rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );

			x1 += incx;
			z1 += incz;
		}

		/* swap lanes so that sum(x.r*z.r + x.i*z.i) lands in d[0] */
		rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );

		/* negate the cross term: rho = sum_i conj(x[i]) * z[i] */
		rho1v.d[1] = -rho1v.d[1];
	}

	rho.real = rho1v.d[0];
	rho.imag = rho1v.d[1];

	return rho;
}
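For reference, the reduction this kernel performs is the conjugated dot product rho = sum_i conj(x[i]) * z[i]. A hypothetical scalar equivalent (reusing the dcomplex type above, with strides in complex elements):

dcomplex zdotc_ref(int n, const dcomplex* x, int incx,
                   const dcomplex* z, int incz)
{
    dcomplex rho = { 0.0, 0.0 };
    int i;
    for (i = 0; i < n; ++i) {
        rho.real += x->real * z->real + x->imag * z->imag;
        rho.imag += x->real * z->imag - x->imag * z->real;
        x += incx;
        z += incz;
    }
    return rho;
}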
Example #5
// from Intel's sample intrin_double_sample.c
#include <pmmintrin.h>

// presumably defined earlier in the sample:
typedef struct { double real; double img; } complex_num;

void multiply_SSE3(double xr, double xi, double yr, double yi,
    complex_num *z)
{
    __m128d num1, num2, num3;

    // Duplicates lower vector element into upper vector element.
    //   num1: [x.real, x.real]

    num1 = _mm_loaddup_pd(&xr);

    // Move y elements into a vector
    //   num2: [y.img, y.real]

    num2 = _mm_set_pd(yi, yr);

    // Multiplies vector elements
    //   num3: [(x.real*y.img), (x.real*y.real)]

    num3 = _mm_mul_pd(num2, num1);

    //   num1: [x.img, x.img]

    num1 = _mm_loaddup_pd(&xi);

    // Swaps the vector elements
    //   num2: [y.real, y.img]

    num2 = _mm_shuffle_pd(num2, num2, 1);

    //   num2: [(x.img*y.real), (x.img*y.img)]

    num2 = _mm_mul_pd(num2, num1);

    // Adds upper vector element while subtracting lower vector element
    //   num3: [((x.real *y.img)+(x.img*y.real)),
    //          ((x.real*y.real)-(x.img*y.img))]

    num3 = _mm_addsub_pd(num3, num2);

    // Stores the elements of num3 into z

    _mm_storeu_pd((double *)z, num3);

}
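A hypothetical check of the routine above (assuming the complex_num typedef and multiply_SSE3 are in scope), again using (1 + 2i)(3 + 4i) = -5 + 10i:

#include <stdio.h>

int main(void)
{
    complex_num z;
    multiply_SSE3(1.0, 2.0, 3.0, 4.0, &z);
    printf("%g + %gi\n", z.real, z.img);   /* -5 + 10i */
    return 0;
}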
Example #6
__m128d test_mm_addsub_pd(__m128d A, __m128d B) {
  // CHECK-LABEL: test_mm_addsub_pd
  // CHECK: call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  // CHECK-ASM: addsubpd %xmm{{.*}}, %xmm{{.*}}
  return _mm_addsub_pd(A, B);
}
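This snippet is a clang CodeGen regression test driven by FileCheck: the CHECK lines are patterns that must match the LLVM IR the compiler emits for the intrinsic, and the CHECK-ASM line additionally matches the generated assembly.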
Example #8
/*
 * Subtract off x0 & x1 contribution to all remaining equations using a
 * rank-2 update with mu=2, nu=3, ku=2.  This version is for 16 SSE regs.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable vectorization & software pipelining of load/use.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
ATL_SINLINE void ATL_rk2(ATL_CINT M, const TYPE *pA0, const TYPE *pA1,
                         const TYPE *pB0, const TYPE *pB1,
                         TYPE *C, ATL_CINT ldc0)
{
   ATL_CINT ldc=ldc0+ldc0;   /* leading dimension in doubles: 2 per complex elt */
   TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1);
   ATL_INT i;
   ATL_CINT MM = (M&1) ? M-1 : M-2;
   register __m128d B00, B10, B01, B11, B02, B12;
   register __m128d C00, C01, C02, C10, C11, C12;
   register __m128d A, a;

   B00 = _mm_load_pd(pB0);
   B10 = _mm_load_pd(pB1);
   B01 = _mm_load_pd(pB0+2);
   B11 = _mm_load_pd(pB1+2);
   B02 = _mm_load_pd(pB0+4);
   B12 = _mm_load_pd(pB1+4);    	/* iB12, rB12 */


   C00 = _mm_load_pd(pC0);
   C01 = _mm_load_pd(pC1);
   C02 = _mm_load_pd(pC2);
   A = _mm_load_pd(pA0);                		/* iA00, rA00 */
   for (i=0; i < MM; i += 2, pA0 += 4, pA1 += 4, pC0 += 4, pC1 += 4, pC2 += 4)
   {
      register __m128d b;
/*
 *    K=0, M=0, apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
         C10 = _mm_load_pd(pC0+2);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA1);                		/* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
         C11 = _mm_load_pd(pC1+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         C12 = _mm_load_pd(pC2+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA0+2);                /* iA10, rA10 */
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
         _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
         _mm_store_pd(pC2, C02);
/*
 *    K=0, M=1, apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA10*rB00, rA10*rB00 */
      C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA10, iA10 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
         C00 = _mm_load_pd(pC0+4);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
         A = _mm_load_pd(pA1+2);               		/* iA11, rA11 */
/*
 *    K=0, M=1, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA10*iB00, iA10*iB00 */
      C10 = _mm_addsub_pd(C10, b);
         C01 = _mm_load_pd(pC1+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
         C02 = _mm_load_pd(pC2+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
/*
 *    K=1, M=1, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA11*rB10, rA11*rB10 */
      C10 = _mm_add_pd(C10, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA11, iA11 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
         A = _mm_load_pd(pA0+4);               		/* iA20, rA20 */
/*
 *    K=1, M=1, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA11*iB10, iA11*iB10 */
      C10 = _mm_addsub_pd(C10, b);
         _mm_store_pd(pC0+2, C10);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
         _mm_store_pd(pC1+2, C11);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
         _mm_store_pd(pC2+2, C12);
   }
/*
 * Drain pipes
 */
   {
      register __m128d b;
/*
 *    K=0, M=0, apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA1);                		/* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
         _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
         _mm_store_pd(pC2, C02);
      if (!(M&1))
      {
         C10 = _mm_load_pd(pC0+2);
         C11 = _mm_load_pd(pC1+2);
         C12 = _mm_load_pd(pC2+2);
         A = _mm_load_pd(pA0+2);                /* iA10, rA10 */
/*
 *       K=0, M=1, apply real components of B0x
 */
         b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
         b = _mm_mul_pd(b, A);                     /* iA10*rB00, rA10*rB00 */
         C10 = _mm_add_pd(C10, b);
            a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA10, iA10 */
         b = _mm_movedup_pd(B01);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B02);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
            A = _mm_load_pd(pA1+2);               		/* iA11, rA11 */
/*
 *       K=0, M=1, apply imaginary components of B0x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
         b = _mm_mul_pd(b, a);                     /* rA10*iB00, iA10*iB00 */
         C10 = _mm_addsub_pd(C10, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
/*
 *       K=1, M=1, apply real components of B1x
 */
         b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
         b = _mm_mul_pd(b, A);                     /* iA11*rB10, rA11*rB10 */
         C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA11, iA11 */
         b = _mm_movedup_pd(B11);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B12);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
/*
 *       K=1, M=1, apply imaginary components of B1x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
         b = _mm_mul_pd(b, a);                     /* rA11*iB10, iA11*iB10 */
         C10 = _mm_addsub_pd(C10, b);
            _mm_store_pd(pC0+2, C10);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
            _mm_store_pd(pC1+2, C11);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
            _mm_store_pd(pC2+2, C12);
      }
   }
}
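For orientation, here is a hypothetical scalar reference of the update the kernel performs: C[i,j] += A0[i]*B0[j] + A1[i]*B1[j] over the nu = 3 right-hand sides, assuming all operands are stored as interleaved real/imag pairs, C is column-major with ldc0 complex elements per column, and double stands in for TYPE:

static void ATL_rk2_ref(int M, const double *A0, const double *A1,
                        const double *B0, const double *B1,
                        double *C, int ldc0)
{
   int i, j;
   for (j = 0; j < 3; ++j)
   {
      double *Cj = C + 2*j*ldc0;            /* column j of C */
      for (i = 0; i < M; ++i)
      {
         double ar0 = A0[2*i], ai0 = A0[2*i+1];
         double ar1 = A1[2*i], ai1 = A1[2*i+1];
         double br0 = B0[2*j], bi0 = B0[2*j+1];
         double br1 = B1[2*j], bi1 = B1[2*j+1];
         /* C[i,j] += A0[i]*B0[j] + A1[i]*B1[j], complex multiply-accumulate */
         Cj[2*i]   += ar0*br0 - ai0*bi0 + ar1*br1 - ai1*bi1;
         Cj[2*i+1] += ar0*bi0 + ai0*br0 + ar1*bi1 + ai1*br1;
      }
   }
}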