Example #1
File: sse.cpp Project: nyue/embree
namespace embree 
{
  const __m128 _mm_lookupmask_ps[16] = {
    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
  };

  const __m128d _mm_lookupmask_pd[4] = {
    _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)),
    _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)),
    _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)),
    _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1))
  };

}
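
The table maps a 4-bit lane index to a per-lane float mask: bit i of the index set means lane i is all ones. A minimal usage sketch, assuming the table above is visible at the call site (the helper name is hypothetical): select the low n lanes from `a` and the remaining lanes from `b`.

static inline __m128 select_low_lanes(__m128 a, __m128 b, int n) {
  // (1 << n) - 1 has the low n bits set, so it indexes the mask whose low
  // n lanes are all ones (n in [0, 4]).
  const __m128 m = _mm_lookupmask_ps[(1 << n) - 1];
  return _mm_or_ps(_mm_and_ps(m, a), _mm_andnot_ps(m, b));
}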
Example #2
File: arch.hpp Project: huoyao/Hydro
inline void rotate_left_wm1(F64vec2 *v0, const F64vec2 v1)
{
    // Before: v0 = {1.0, 2.0}, v1 = {3.0, 4.0}
    // After:  v0 = {2.0, 3.0}  (the window slides left by one double)
    *v0 = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(v1), _mm_castpd_si128(*v0), 8));
}
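
A usage sketch, assuming F64vec2 wraps __m128d as in Intel's dvec.h (element 0 in the low lane). _mm_alignr_epi8 (SSSE3) concatenates v1:v0 into a 32-byte value and extracts the 16 bytes starting at byte 8, which is what slides the two-element window left by one double:

F64vec2 v0(_mm_set_pd(2.0, 1.0));   // lanes {1.0, 2.0}
F64vec2 v1(_mm_set_pd(4.0, 3.0));   // lanes {3.0, 4.0}
rotate_left_wm1(&v0, v1);           // v0 is now {2.0, 3.0}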
Example #3
static SIMD_INLINE __m128d sin_vml_pd(__m128d x) {
    SIMD_CONST_SD(1_PI , 0.318309886183790671538);

    SIMD_CONST_SD(PI4_A, 0.78539816290140151978    * -4.0);
    SIMD_CONST_SD(PI4_B, 4.9604678871439933374e-10 * -4.0);
    SIMD_CONST_SD(PI4_C, 1.1258708853173288931e-18 * -4.0);
    SIMD_CONST_SD(PI4_D, 1.7607799325916000908e-27 * -4.0);

    SIMD_CONST_SD(sin_0,-7.97255955009037868891952e-18);
    SIMD_CONST_SD(sin_1, 2.81009972710863200091251e-15);
    SIMD_CONST_SD(sin_2,-7.64712219118158833288484e-13);
    SIMD_CONST_SD(sin_3, 1.60590430605664501629054e-10);
    SIMD_CONST_SD(sin_4,-2.50521083763502045810755e-08);
    SIMD_CONST_SD(sin_5, 2.75573192239198747630416e-06);
    SIMD_CONST_SD(sin_6,-1.98412698412696162806809e-04);
    SIMD_CONST_SD(sin_7, 8.33333333333332974823815e-03);
    SIMD_CONST_SD(sin_8,-1.66666666666666657414808e-01);

    SIMD_CONST_SD(magic, 6755399441055744.0);

    // q = round(x / PI) via the magic-number trick: adding 1.5 * 2^52 pushes
    // round(x / PI) into the low mantissa bits of `q`. Shifting the lowest
    // bit into the sign position yields `i`, the parity of q as a sign mask.
    __m128d y = _mm_mul_pd(x, SIMD_GET_PD(1_PI));
    __m128d q = _mm_add_pd(y, SIMD_GET_PD(magic));
    __m128d i = _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(q), 63));

    // Extended-precision argument reduction: x -= q * PI, with PI split into
    // four parts (PI4_A..PI4_D) so the reduction loses almost no precision.
    q = _mm_sub_pd(q, SIMD_GET_PD(magic));
    x = Simd128::mad(q, SIMD_GET_PD(PI4_A), x);
    x = Simd128::mad(q, SIMD_GET_PD(PI4_B), x);
    x = Simd128::mad(q, SIMD_GET_PD(PI4_C), x);
    x = Simd128::mad(q, SIMD_GET_PD(PI4_D), x);

    // sin(x) = (-1)^q * sin(x - q * PI): flip the sign for odd q.
    __m128d xx = _mm_mul_pd(x, x);
    x = _mm_xor_pd(x, i);

    // Horner evaluation of the minimax polynomial: sin(x) ~= x + x^3 * P(x^2).
    __m128d u = SIMD_GET_PD(sin_0);
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_1));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_2));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_3));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_4));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_5));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_6));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_7));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_8));
    u = Simd128::mad(xx, _mm_mul_pd(u, x), x);

    return u;
}
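
A scalar sketch of the magic-number rounding used above (illustrative only, valid while the FPU rounds to nearest and |y| stays well below 2^51): adding magic = 1.5 * 2^52 forces round(y) into the low mantissa bits, where both the integer and its parity can be read back with plain bit operations.

#include <stdint.h>
#include <string.h>

static double magic_round(double y, uint64_t* parity) {
    double q = y + 6755399441055744.0;   // magic = 1.5 * 2^52
    uint64_t bits;
    memcpy(&bits, &q, sizeof bits);      // low mantissa bits hold round(y)
    *parity = bits & 1;                  // parity of round(y)
    return q - 6755399441055744.0;       // round(y) as a double
}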
Example #4
File: vmath.c Project: OpenEdition/bilbo
/* vms_expma:
 *   Compute the component-wise exponential minus <a>:
 *       r[i] <-- e^x[i] - a
 *
 *   The following comments apply to the SSE2 version of this code:
 *
 *   Computation is done four doubles at a time by working in parallel on two
 *   vectors of two doubles using SSE2 intrinsics. If the size is not a
 *   multiple of 4, the remaining elements are computed using the stdlib exp().
 *
 *   The computation is done by first performing a range reduction of the form
 *   e^x = 2^k * e^f, choosing k and f so that f is in [-0.5, 0.5]. Then 2^k
 *   can be computed exactly using bit operations to build the double result,
 *   and e^f can be computed efficiently with enough precision using a
 *   polynomial approximation.
 *
 *   The polynomial approximation uses an 11th order polynomial computed by the
 *   Remez algorithm with the Sollya suite, instead of the more classical Pade
 *   form, because it is better suited to parallel execution. To achieve the
 *   same precision, a Pade form seems to require three fewer multiplications,
 *   but it needs a very costly division, so it would be less efficient.
 *
 *   The maximum error is less than 1 lsb and special cases are correctly
 *   handled:
 *     +inf or +oor  -->   return +inf
 *     -inf or -oor  -->   return  0.0
 *     qNaN or sNaN  -->   return qNaN
 *
 *   This code is copyright 2004-2012 Thomas Lavergne and licensed under the
 *   BSD license like the rest of Wapiti.
 */
void xvm_expma(double r[], const double x[], double a, uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
  #define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v))))
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	const __m128i vl  = _mm_set1_epi64x(0x3ff0000000000000ULL);
	const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL);
	const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL);
	const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL);
	const __m128d hal = xvm_vconst(0x3fe0000000000000ULL);
	const __m128d nan = xvm_vconst(0xfff8000000000000ULL);
	const __m128d inf = xvm_vconst(0x7ff0000000000000ULL);
	const __m128d c1  = xvm_vconst(0x3fe62e4000000000ULL);
	const __m128d c2  = xvm_vconst(0x3eb7f7d1cf79abcaULL);
	const __m128d p0  = xvm_vconst(0x3feffffffffffffeULL);
	const __m128d p1  = xvm_vconst(0x3ff000000000000bULL);
	const __m128d p2  = xvm_vconst(0x3fe0000000000256ULL);
	const __m128d p3  = xvm_vconst(0x3fc5555555553a2aULL);
	const __m128d p4  = xvm_vconst(0x3fa55555554e57d3ULL);
	const __m128d p5  = xvm_vconst(0x3f81111111362f4fULL);
	const __m128d p6  = xvm_vconst(0x3f56c16c25f3bae1ULL);
	const __m128d p7  = xvm_vconst(0x3f2a019fc9310c33ULL);
	const __m128d p8  = xvm_vconst(0x3efa01825f3cb28bULL);
	const __m128d p9  = xvm_vconst(0x3ec71e2bd880fdd8ULL);
	const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL);
	const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL);
	const __m128d va  = _mm_set1_pd(a);
	for (uint64_t n = 0; n < N; n += 4) {
		__m128d mn1, mn2, mi1, mi2;
		__m128d t1,  t2,  d1,  d2;
		__m128d v1,  v2,  w1,  w2;
		__m128i k1,  k2;
		__m128d f1,  f2;
		// Load the next four values
		__m128d x1 = _mm_load_pd(x + n    );
		__m128d x2 = _mm_load_pd(x + n + 2);
		// Check for out-of-range values, infinities and NaNs
		mn1 = _mm_cmpneq_pd(x1, x1);	mn2 = _mm_cmpneq_pd(x2, x2);
		mi1 = _mm_cmpgt_pd(x1, ehi);	mi2 = _mm_cmpgt_pd(x2, ehi);
		x1  = _mm_max_pd(x1, elo);	x2  = _mm_max_pd(x2, elo);
		// Range reduction: we search k and f such that e^x = 2^k * e^f
		// with f in [-0.5, 0.5]
		t1  = _mm_mul_pd(x1, l2e);	t2  = _mm_mul_pd(x2, l2e);
		t1  = _mm_add_pd(t1, hal);	t2  = _mm_add_pd(t2, hal);
		k1  = _mm_cvttpd_epi32(t1);	k2  = _mm_cvttpd_epi32(t2);
		d1  = _mm_cvtepi32_pd(k1);	d2  = _mm_cvtepi32_pd(k2);
		t1  = _mm_mul_pd(d1, c1);	t2  = _mm_mul_pd(d2, c1);
		f1  = _mm_sub_pd(x1, t1);	f2  = _mm_sub_pd(x2, t2);
		t1  = _mm_mul_pd(d1, c2);	t2  = _mm_mul_pd(d2, c2);
		f1  = _mm_sub_pd(f1, t1);	f2  = _mm_sub_pd(f2, t2);
		// Evaluation of e^f using an 11th order polynomial in Horner form
		v1  = _mm_mul_pd(f1, p11);	v2  = _mm_mul_pd(f2, p11);
		v1  = _mm_add_pd(v1, p10);	v2  = _mm_add_pd(v2, p10);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p9);	v2  = _mm_add_pd(v2, p9);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p8);	v2  = _mm_add_pd(v2, p8);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p7);	v2  = _mm_add_pd(v2, p7);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p6);	v2  = _mm_add_pd(v2, p6);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p5);	v2  = _mm_add_pd(v2, p5);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p4);	v2  = _mm_add_pd(v2, p4);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p3);	v2  = _mm_add_pd(v2, p3);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p2);	v2  = _mm_add_pd(v2, p2);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p1);	v2  = _mm_add_pd(v2, p1);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p0);	v2  = _mm_add_pd(v2, p0);
		// Evaluation of 2^k using bitops to achieve exact computation
		k1  = _mm_slli_epi32(k1, 20);	k2  = _mm_slli_epi32(k2, 20);
		k1  = _mm_shuffle_epi32(k1, 0x72);
		k2  = _mm_shuffle_epi32(k2, 0x72);
		k1  = _mm_add_epi32(k1, vl);	k2  = _mm_add_epi32(k2, vl);
		w1  = _mm_castsi128_pd(k1);	w2  = _mm_castsi128_pd(k2);
		// Return to full range to subtract <a>
		v1  = _mm_mul_pd(v1, w1);	v2  = _mm_mul_pd(v2, w2);
		v1  = _mm_sub_pd(v1, va);	v2  = _mm_sub_pd(v2, va);
		// Finally apply infinite and NaN where needed
		v1  = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2));
		v1  = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2));
		// Store the results
		_mm_store_pd(r + n,     v1);
		_mm_store_pd(r + n + 2, v2);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = exp(x[n]) - a;
#endif
}
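
A scalar sketch of the "2^k computed exactly using bit operations" step (illustrative; the helper name is hypothetical): the biased exponent k + 1023 is placed directly into the exponent field of an IEEE-754 double, with no rounding involved. The SSE2 loop above does the same thing by shifting k into the high 32-bit words with _mm_slli_epi32/_mm_shuffle_epi32 and adding vl, the bit pattern of 1.0.

#include <stdint.h>
#include <string.h>

static double exact_pow2(int k) {                 /* assumes -1022 <= k <= 1023 */
	uint64_t bits = (uint64_t)(k + 1023) << 52;   /* biased exponent, zero mantissa */
	double d;
	memcpy(&d, &bits, sizeof d);
	return d;
}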
Example #5
  c = _mm_madd_epi16(*data2, fir);
  d = _mm_madd_epi16(*data3, fir);
  c = _mm_hadd_epi32(c, d);

  a = _mm_hadd_epi32(a, c);

  return a;
}

void kvz_eight_tap_filter_and_flip_avx2(int8_t filter[4][8], kvz_pixel *src, int16_t src_stride, int16_t* __restrict dst)
{

  //Load 2 rows per xmm register
  __m128i rows01 = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride));
  rows01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows01), (double*)(src + 1 * src_stride)));

  __m128i rows23 = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride));
  rows23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows23), (double*)(src + 3 * src_stride)));

  __m128i rows45 = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride));
  rows45 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows45), (double*)(src + 5 * src_stride)));

  __m128i rows67 = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride));
  rows67 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows67), (double*)(src + 7 * src_stride)));

  //Filter rows
  const int dst_stride = MAX_WIDTH;
  kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[0]), (__m128i*)(dst + 0));
  kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[1]), (__m128i*)(dst + 1 * dst_stride));
  kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[2]), (__m128i*)(dst + 2 * dst_stride));
  kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[3]), (__m128i*)(dst + 3 * dst_stride));
}
Example #6
// The input must be in domain [-1686629712, 1686629712].
//
// I tried to optimize the double-to-int conversion by using `magic`, but it
// was actually slower than `_mm_cvttpd_epi32()` and it didn't offer a greater
// domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
    SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
    SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
    SIMD_CONST_SI(int32_one, 1);
    SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
    SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
    SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
    SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

// Build the four interleaved sin/cos coefficient tables, one variant for each
// per-lane combination of the polynomial selection mask (see indexing below).
#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

    DEFINE_DATA(sincos_coeff,
                1.58962301576546568060e-10,-2.50507477628578072866e-8,
                2.75573136213857245213e-6 ,-1.98412698295895385996e-4,
                8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0,

                -1.13585365213876817300e-11, 2.08757008419747316778e-9,
                -2.75573141792967388112e-7 , 2.48015872888517045348e-5,
                -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0);

    __m128d y;
    __m128d sign = x;                                        // Sign bit.

    x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));                // Take the absolute value.
    y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));                // Integer part of `x * 4 / PI`.

    __m128i ival = _mm_cvttpd_epi32(y);                      // Extract the integer part of y.
    __m128i ione = SIMD_GET_PI(int32_one);

    ival = _mm_add_epi32(ival, ione);                        // j += 1.
    ival = _mm_andnot_si128(ione, ival);                     // j &=~1.

    y = _mm_cvtepi32_pd(ival);
    ival = _mm_unpacklo_epi32(ival, ival);

    sign = _mm_xor_pd(sign,                                  // Swap the sign bit if `j & 4`.
                      _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
    sign = _mm_and_pd(sign, SIMD_GET_PD(sign));              // Keep only the sign bit.

    // Get the polynomial selection mask (j & 2):
    //   1. `0x0000000000000000` => `0    <= x <= PI/4`
    //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 <  x <= PI/2`
    ival = _mm_slli_epi32(ival, 30);
    ival = _mm_srai_epi32(ival, 31);

    // Extended precision modular arithmetic:
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
    x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
    x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

    // Get the polynomial coefficients for each lane (sin/cos). Each of the
    // four table variants is 16 doubles, i.e. 8 __m128d, hence the `* 8`.
    __m128d poly_mask = _mm_castsi128_pd(ival);
    const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
                           static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

    __m128d xx = _mm_mul_pd(x, x);
    y = coeff[0];
    y = Simd128::mad(y, xx, coeff[1]);
    y = Simd128::mad(y, xx, coeff[2]);
    y = Simd128::mad(y, xx, coeff[3]);
    y = Simd128::mad(y, xx, coeff[4]);
    y = Simd128::mad(y, xx, coeff[5]);
    y = _mm_mul_pd(y, xx);

    __m128d x_or_xx = _mm_or_pd(
                          _mm_and_pd(xx, poly_mask),
                          _mm_andnot_pd(poly_mask, x));

    y = _mm_mul_pd(y, x_or_xx);
    y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
    y = _mm_add_pd(y, coeff[7]);

    return _mm_xor_pd(y, sign);
}
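
A usage sketch (hypothetical harness) evaluating both lanes at once and reading the results back:

__m128d x = _mm_set_pd(2.0, 0.5);   // lanes {0.5, 2.0}
__m128d s = sin_cephes_pd(x);
double out[2];
_mm_storeu_pd(out, s);              // out[0] ~= sin(0.5), out[1] ~= sin(2.0)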