コード例 #1
0
ファイル: main.cpp プロジェクト: minh0722/HPC2015
static void bar(float (& inout)[8]) {
    static __m128 first;
    static __m128 second;
    static __m128 cmp1;
    static __m128 cmp2;
    static __m128 res1;
    static __m128 res2;
    static __m128 temp;
    static __m128 res3;
    static __m128 res4;
    static float result1[4];
    static float result2[4];

	const size_t idx[][2] = {
		{0, 1}, {3, 2}, {4, 5}, {7, 6},
		{0, 2}, {1, 3}, {6, 4}, {7, 5},
		{0, 1}, {2, 3}, {5, 4}, {7, 6},
		{0, 4}, {1, 5}, {2, 6}, {3, 7},
		{0, 2}, {1, 3}, {4, 6}, {5, 7},
		{0, 1}, {2, 3}, {4, 5}, {6, 7}
	};
    // 24 = sizeof(idx)/sizeof(idx[0])
    for(int i = 0 ; i < 24 ; i+=4)
    {   // the first and second are packed vectors of the i-th element to the i-th +3
        // reversed because the _mm_set_ps() reverses the data for some reasons
        first = _mm_set_ps(inout[idx[i+3][0]], inout[idx[i+2][0]], inout[idx[i+1][0]], inout[idx[i][0]]);
        second = _mm_set_ps(inout[idx[i+3][1]], inout[idx[i+2][1]], inout[idx[i+1][1]], inout[idx[i][1]]);

        // cmpge because if cmpgt(greater then) it will be bugged for array with equal data insside ex [1,1,1,1,1] -> [0,0,0,0]
        cmp1 = _mm_cmpge_ps(first, second);
        cmp2 = _mm_cmpge_ps(second, first);

        // the formula
        // x = (c & y) | (!c & x)
        // y = (c & x) | (!c & y)
        // where x and y are elements
        res1 = _mm_and_ps(second, cmp1);
        res2 = _mm_and_ps(first, cmp2);

        res3 = _mm_and_ps(first, cmp1);
        res4 = _mm_and_ps(second, cmp2);

        first = _mm_or_ps(res1, res2);
        second = _mm_or_ps(res3, res4);

        // put them on the positions
        _mm_storeu_ps(result1, first);
        _mm_storeu_ps(result2, second);

        for(int j = 0 ; j < 4 ; ++j)
        {
            inout[idx[i+j][0]] = result1[j];
            inout[idx[i+j][1]] = result2[j];
        }
    }
}
コード例 #2
0
ファイル: softwareOcclusion.cpp プロジェクト: hyp/Arpheg
static inline void boxVerticesToScreenVertices(vec4f vertices[8],const vec4f& screenCenterMul,vec2i screenCenter){
#ifdef ARPHEG_ARCH_X86
	__m128 screenSpaceMul = _mm_load_ps((float*)&screenCenterMul.x);
	__m128 screenCenterOffset = _mm_setr_ps(float(screenCenter.x),float(screenCenter.y),0,0);
	__m128 nearClip = _mm_setzero_ps();
	for(uint32 i = 0;i<8;++i){
		__m128 hv = _mm_load_ps((float*)(vertices + i));
		
		__m128 w  = _mm_shuffle_ps(hv,hv,_MM_SHUFFLE(3,3,3,3)); //get the w component
		__m128 z  = _mm_shuffle_ps(hv,hv,_MM_SHUFFLE(2,2,2,2));
		hv = _mm_div_ps(hv,w); //Project XYZW to clip space (divide by w)
		
		hv = _mm_mul_ps(hv,screenSpaceMul); //XY to screen space    [-width/2,-height/2 -> width/2,height/2]
		hv = _mm_add_ps(hv,screenCenterOffset);//XY to screen space [0,0 -> width,height]
		__m128 mNoNearClip = _mm_cmpge_ps(z, nearClip );

		//Set to all-0 if near-clipped
		hv = _mm_and_ps(hv, mNoNearClip);

		_mm_store_ps((float*)(vertices + i),hv);
	}
#else
	//TODO
	ScreenSpaceVertex* screenVerts= (ScreenSpaceVertex*)vertices;
	for(uint32 i =0;i<8;++i){
		vertices[i] = vertices[i] * (1.0f/vertices[i].w) ;
		auto v = vertices[i] * screenCenterMul;
		screenVerts[i].pos = vec2i(int32(v.x),int32(v.y))+screenCenter;
	}
#endif
}
コード例 #3
0
    SIMDValue SIMDFloat32x4Operation::OpGreaterThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128_value = _mm_cmpge_ps(tmpaValue.m128_value, tmpbValue.m128_value); // a >= b?

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
コード例 #4
0
F32 Aabb::testPlane(const Plane& p) const
{
	const Aabb& aabb = *this;

#if ANKI_SIMD == ANKI_SIMD_SSE
	__m128 gezero = _mm_cmpge_ps(p.getNormal().getSimd(), _mm_setzero_ps());

	Vec4 diagMin;
	diagMin.getSimd() =
		_mm_or_ps(_mm_and_ps(gezero, aabb.getMin().getSimd()), _mm_andnot_ps(gezero, aabb.getMax().getSimd()));
#else
	Vec4 diagMin(0.0), diagMax(0.0);
	// set min/max values for x,y,z direction
	for(U i = 0; i < 3; i++)
	{
		if(p.getNormal()[i] >= 0.0)
		{
			diagMin[i] = aabb.getMin()[i];
			diagMax[i] = aabb.getMax()[i];
		}
		else
		{
			diagMin[i] = aabb.getMax()[i];
			diagMax[i] = aabb.getMin()[i];
		}
	}
#endif

	// minimum on positive side of plane, box on positive side
	ANKI_ASSERT(diagMin.w() == 0.0);
	F32 test = p.test(diagMin);
	if(test > 0.0)
	{
		return test;
	}

#if ANKI_SIMD == ANKI_SIMD_SSE
	Vec4 diagMax;
	diagMax.getSimd() =
		_mm_or_ps(_mm_and_ps(gezero, aabb.getMax().getSimd()), _mm_andnot_ps(gezero, aabb.getMin().getSimd()));
#endif

	ANKI_ASSERT(diagMax.w() == 0.0);
	test = p.test(diagMax);
	if(test >= 0.0)
	{
		// min on non-positive side, max on non-negative side, intersection
		return 0.0;
	}
	else
	{
		// max on negative side, box on negative side
		return test;
	}
}
コード例 #5
0
ファイル: overexposed.c プロジェクト: Coshibu/darktable
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
    {
      const __m128 pixel = _mm_load_ps(in);

      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
コード例 #6
0
ファイル: rosesimd.c プロジェクト: peihunglin/TSVC_benchmark
// a >= b
void _SIMD_cmpge_ps(__SIMD a, __SIMD b, void** resultPtr)
{
  __SIMD* result = (__SIMD*)malloc(sizeof(__SIMD));
  *resultPtr = result;
#ifdef  USE_SSE
  *result = _mm_cmpge_ps(a,b);
#elif defined USE_AVX
  *result = _mm256_cmp_ps(a,b,29);
#elif defined USE_IBM
  *result = vec_cmpge(a,b);
#endif
}
コード例 #7
0
    SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws)
    {
        X86SIMDValue x86Result = { 0 };
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
        X86SIMDValue temp, temp2;
        X86SIMDValue two_31_f4, two_31_i4;
        int mask = 0;

        // any lanes < 0 ?
        temp.m128_value = _mm_cmplt_ps(v.m128_value, X86_ALL_ZEROS.m128_value);
        mask = _mm_movemask_ps(temp.m128_value);
        // negative value are out of range, caller should throw Range Error
        if (mask)
        {
            throws = true;
            return X86SIMDValue::ToSIMDValue(x86Result);
        }
        // CVTTPS2DQ does a range check over signed range [-2^31, 2^31-1], so will fail to convert values >= 2^31.
        // To fix this, subtract 2^31 from values >= 2^31, do CVTTPS2DQ, then add 2^31 back.
        _mm_store_ps(two_31_f4.simdValue.f32, X86_TWO_31_F4.m128_value);
        // any lanes >= 2^31 ?
        temp.m128_value = _mm_cmpge_ps(v.m128_value, two_31_f4.m128_value);
        // two_31_f4 has f32(2^31) for lanes >= 2^31, 0 otherwise
        two_31_f4.m128_value = _mm_and_ps(two_31_f4.m128_value, temp.m128_value);
        // subtract 2^31 from lanes >= 2^31, unchanged otherwise.
        v.m128_value = _mm_sub_ps(v.m128_value, two_31_f4.m128_value);

        // CVTTPS2DQ
        x86Result.m128i_value = _mm_cvttps_epi32(v.m128_value);

        // check if any value is out of range (i.e. >= 2^31, meaning originally >= 2^32 before value adjustment)
        temp2.m128i_value = _mm_cmpeq_epi32(x86Result.m128i_value, X86_NEG_MASK_F4.m128i_value); // any value == 0x80000000 ?
        mask = _mm_movemask_ps(temp2.m128_value);
        if (mask)
        {
            throws = true;
            return X86SIMDValue::ToSIMDValue(x86Result);
        }
        // we pass range check

        // add 2^31 values back to adjusted values.
        // Use first bit from the 2^31 float mask (0x4f000...0 << 1)
        // and result with 2^31 int mask (0x8000..0) setting first bit to zero if lane hasn't been adjusted
        _mm_store_ps(two_31_i4.simdValue.f32, X86_TWO_31_I4.m128_value);
        two_31_f4.m128i_value = _mm_slli_epi32(two_31_f4.m128i_value, 1);
        two_31_i4.m128i_value = _mm_and_si128(two_31_i4.m128i_value, two_31_f4.m128i_value);
        // add 2^31 back to adjusted values
        // Note at this point all values are in [0, 2^31-1]. Adding 2^31 is guaranteed not to overflow.
        x86Result.m128i_value = _mm_add_epi32(x86Result.m128i_value, two_31_i4.m128i_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
コード例 #8
0
static void GF_FUNC_ALIGN VS_CC
float_to_dst_16bit(const float *srcp, uint8_t *d, int width, int height,
                   int src_stride, int dst_stride, float th, int bits)
{
    uint16_t *dstp = (uint16_t *)d;
    dst_stride /= 2;
    __m128 tmax = _mm_set1_ps(th);
    int rshift = 32 - bits;

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 8) {
            __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax);
            __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax);

            __m128i xmi0 = _mm_srli_epi32(_mm_castps_si128(xmf0), rshift);
            __m128i xmi1 = _mm_srli_epi32(_mm_castps_si128(xmf1), rshift);
            xmi0 = mm_cast_epi32(xmi0, xmi1);
            _mm_store_si128((__m128i *)(dstp + x), xmi0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
コード例 #9
0
static void GF_FUNC_ALIGN VS_CC
float_to_dst_8bit(const float *srcp, uint8_t *dstp, int width, int height,
                  int src_stride, int dst_stride, float th, int bits)
{
    __m128 tmax = _mm_set1_ps(th);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax);
            __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax);
            __m128 xmf2 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 8), tmax);
            __m128 xmf3 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 12), tmax);
            __m128i xmi0 = _mm_packs_epi32(_mm_castps_si128(xmf0),
                                           _mm_castps_si128(xmf1));
            __m128i xmi1 = _mm_packs_epi32(_mm_castps_si128(xmf2),
                                           _mm_castps_si128(xmf3));
            xmi0 = _mm_packs_epi16(xmi0, xmi1);
            _mm_store_si128((__m128i *)(dstp + x), xmi0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
コード例 #10
0
inline void GDALCopy4WordsSSE(const float* pValueIn, Tout* const &pValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
    __m128 xmm = _mm_loadu_ps(pValueIn);

    __m128 xmm_min = _mm_set1_ps(fMinVal);
    __m128 xmm_max = _mm_set1_ps(fMaxVal);
    xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);

#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128 p0d5 = _mm_set1_ps(0.5f);
     if (std::numeric_limits<Tout>::is_signed)
     {
        __m128 m0d5 = _mm_set1_ps(-0.5f);
        //__m128 mask = _mm_cmpge_ps(xmm, _mm_set1_ps(0.f));
        __m128 mask = _mm_cmpge_ps(xmm, p0d5);
        xmm = _mm_add_ps(xmm, _mm_or_ps(_mm_and_ps(mask, p0d5), _mm_andnot_ps(mask, m0d5))); /* f >= 0.5f ? f + 0.5f : f - 0.5f */
     }
     else
     {
         xmm = _mm_add_ps(xmm, p0d5);
     }
#endif

#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128i xmm_i = _mm_cvttps_epi32 (xmm);
#else
    __m128i xmm_i = _mm_cvtps_epi32(xmm);
#endif
#if 0
    int aTemp[4];
    _mm_storeu_si128 ( (__m128i *)aTemp, xmm_i);
    pValueOut[0] = (Tout)aTemp[0];
    pValueOut[1] = (Tout)aTemp[1];
    pValueOut[2] = (Tout)aTemp[2];
    pValueOut[3] = (Tout)aTemp[3];
#else
    pValueOut[0] = (Tout)_mm_extract_epi16(xmm_i, 0);
    pValueOut[1] = (Tout)_mm_extract_epi16(xmm_i, 2);
    pValueOut[2] = (Tout)_mm_extract_epi16(xmm_i, 4);
    pValueOut[3] = (Tout)_mm_extract_epi16(xmm_i, 6);
#endif
}
コード例 #11
0
ファイル: bump.cpp プロジェクト: bustercopley/polymorph
// Returns { f, g, f, g }, where f = bump0 (t), g = bump1 (t).
v4f bumps_t::operator () (float t) const
{
  // Compute all four polynomials by Estrin's method, and mask and combine the
  // values according to the region of the graph to which t belongs.
  v4f s = _mm_set1_ps (t);
  v4f S = load4f (S0);
  v4f T = load4f (T0);
  v4f U = load4f (U0);
  v4f V = load4f (V0);
  v4f f01 = load4f (c [0]) + load4f (c [1]) * s;
  v4f f12 = load4f (c [2]) + load4f (c [3]) * s;
  v4f f = f01 + f12 * s * s;
  v4f ltS = _mm_cmplt_ps (s, S);
  v4f geT = _mm_cmpge_ps (s, T);
  v4f x1 = _mm_andnot_ps (_mm_or_ps (ltS, geT), f);
  v4f x2 = _mm_and_ps (ltS, U);
  v4f x3 = _mm_and_ps (geT, V);
  v4f val = _mm_or_ps (_mm_or_ps (x1, x2), x3);
  return _mm_hadd_ps (val, val);
}
コード例 #12
0
ファイル: bump.cpp プロジェクト: bustercopley/polymorph
v4f step_t::operator () (float t) const
{
  // Evaluate the polynomial f by Estrin's method. Return
  //   (0 0 0 0)  if t < t0,
  //   (f f f f)  if t0 <= t < t1,
  //   (1 1 1 1)  if t > t1.
  v4f c4 = load4f (c);
  v4f one = { 1.0f, 1.0f, 1.0f, 1.0f };
  v4f tttt = _mm_set1_ps (t);           // t t t t
  v4f tt = _mm_unpacklo_ps (one, tttt); // 1 t 1 t
  v4f f0 = c4 * tt;                     // c0 c1*t c2 c3*t
  v4f ha = _mm_hadd_ps (f0, f0) * tt * tt;
  v4f f = _mm_hadd_ps (ha, ha);         // f f f f
  v4f f1 = _mm_unpacklo_ps (f, one);    // f 1 f 1
  v4f tx = load4f (T);                  // t0  t1 t1 inf
  v4f lo = _mm_movelh_ps (tx, tx);      // t0  t1 t0  t1
  v4f hi = _mm_movehl_ps (tx, tx);      // t1 inf t1 inf
  v4f sel = _mm_and_ps (_mm_cmpge_ps (tttt, lo), _mm_cmplt_ps (tttt, hi));
  v4f val = _mm_and_ps (sel, f1);       // f? 1? f? 1?
  return _mm_hadd_ps (val, val);
}
コード例 #13
0
ファイル: vector.hpp プロジェクト: WoLpH/raytracer
inline vec4 operator>=(vec4 a, vec4 b) { return _mm_cmpge_ps(a, b); }
コード例 #14
0
ファイル: sse.c プロジェクト: CharoL/bioinfo-libs
void sse_matrix(int num_seqs, 
		char **q, int *q_len, int max_q_len,
		char **r, int *r_len, int max_r_len,
		float profile[128][128], float gap_open, float gap_extend,
		float *H, float *F, int *C, float *max_score) {
  
  const int depth = 4;

  __m128 h_simd, e_simd, f_simd, diagonal_simd;
  __m128 temp_simd, subst_simd;

  __m128i zeroi   = _mm_set_epi32(0, 0, 0, 0);

  __m128 score_simd = _mm_setzero_ps();
  __m128 zero_simd = _mm_setzero_ps();
  __m128 one_simd = _mm_set1_ps(1);
  __m128 gap_open_simd = _mm_set1_ps(gap_open);
  __m128 gap_extend_simd = _mm_set1_ps(gap_extend);

  __m128 max_de, max_fz;
  __m128 cmp_de, cmp_fz, cmp_de_fz;
  __m128i c;

  int offset, index, idx, j_depth;
  int q_len_depth = depth * max_q_len;
  /*
      for (int i = 0; i < 4; i++) {
        printf("query %i:%s\nref.  %i:%s\n\n", i, q[i], i, r[i]);
      }
  */
  h_simd = zero_simd;
  e_simd = zero_simd;
  
  for (int j = 0; j < max_q_len; j++) {

    j_depth = depth * j;
    
    // left value: gap in reference
    e_simd = _mm_max_ps(_mm_sub_ps(e_simd, gap_extend_simd), 
			_mm_sub_ps(h_simd, gap_open_simd));

    //    printf("from left: %0.2f\n", ((float *)&e_simd)[0]);
    
    // diagonal value: match or mismatch

    subst_simd = _mm_set_ps((q_len[3] > j) ? profile[q[3][j]][r[3][0]] : -1000.0f,
                            (q_len[2] > j) ? profile[q[2][j]][r[2][0]] : -1000.0f,
                            (q_len[1] > j) ? profile[q[1][j]][r[1][0]] : -1000.0f,
                            (q_len[0] > j) ? profile[q[0][j]][r[0][0]] : -1000.0f);
    /*
    subst_simd = _mm_set_ps(profile[q[3][j]][r[3][0]], 
			    profile[q[2][j]][r[2][0]], 
			    profile[q[1][j]][r[1][0]], 
			    profile[q[0][j]][r[0][0]]);
    */

    diagonal_simd = _mm_add_ps(zero_simd, subst_simd);
    //    printf("from diagonal: temp = %0.2f %0.2f (%c, %c) -> %0.2f\n", ((float *)&temp_simd)[0], profile[q[0][j]][r[0][0]], q[0][j], r[0][0], ((float *)&diagonal_simd)[0]);

    cmp_de = _mm_min_ps(_mm_cmpge_ps(diagonal_simd, e_simd), one_simd);
    max_de = _mm_max_ps(diagonal_simd, e_simd);

    // up value: gap in query
    f_simd = _mm_max_ps(_mm_sub_ps(zero_simd, gap_extend_simd), 
			_mm_sub_ps(zero_simd, gap_open_simd));

    cmp_fz = _mm_min_ps(_mm_cmpge_ps(f_simd, zero_simd), one_simd);
    max_fz = _mm_max_ps(f_simd, zero_simd);
    
    //    printf("from up: %0.2f\n", ((float *)&f_simd)[0]);
    
    // get max. value and save it
    cmp_de_fz = _mm_min_ps(_mm_cmpge_ps(max_de, max_fz), one_simd);
    h_simd = _mm_max_ps(max_de, max_fz);

    score_simd = _mm_max_ps(score_simd, h_simd);
    //    printf("\t\t\t\t\tmax. score: %0.2f\n", ((float *)&h_simd)[0]);
    
    // compass (save left, diagonal, up or zero?)
    c = _mm_slli_epi32(_mm_or_si128(zeroi, _mm_cvtps_epi32(cmp_de)), 1);
    c = _mm_slli_epi32(_mm_or_si128(c, _mm_cvtps_epi32(cmp_fz)), 1);
    c = _mm_or_si128(c, _mm_cvtps_epi32(cmp_de_fz));

    //    printf("\t\t\t\t\tcompass: %i\n", ((int *)&c)[0]);

    // update matrices
    _mm_store_ps(&H[j_depth], h_simd);
    _mm_store_ps(&F[j_depth], f_simd);
    _mm_store_si128((__m128i *)&C[j_depth], c);

 
    //_mm_store_ps(&D[j_depth], diagonal_simd);

    /*
    offset = j_depth;
    printf("(row, col) = (%i, %i):\t \t%c-%c=%0.2f %c-%c=%0.2f %c-%c=%0.2f %c-%c=%0.2f\n", 0, j, q[0][j], r[0][0], profile[q[0][j]][r[0][0]], q[1][j], r[1][0], profile[q[1][j]][r[1][0]], q[2][j], r[2][0], profile[q[2][j]][r[2][0]], q[3][j], r[3][0], profile[q[3][j]][r[3][0]]);
    printf("(row, col) = (%i, %i):\tH\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, H[offset], H[offset+1], H[offset+2], H[offset+3]);
    printf("(row, col) = (%i, %i):\tD\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, D[offset], D[offset+1], D[offset+2], D[offset+3]);
    printf("(row, col) = (%i, %i):\td\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, ((float *)&diagonal_simd)[0], ((float *)&diagonal_simd)[1], ((float *)&diagonal_simd)[2], ((float *)&diagonal_simd)[3]);

    printf("(row, col) = (%i, %i):\ts\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, ((float *)&subst_simd)[0], ((float *)&subst_simd)[1], ((float *)&subst_simd)[2], ((float *)&subst_simd)[3]);
    */
  }
  //  printf("\n");

  //  exit(-1);
  int target = 0;
  for (int i = 1; i < max_r_len; i++) {
    
    h_simd = zero_simd;
    e_simd = zero_simd;
    temp_simd = zero_simd;

    idx = i * q_len_depth;

    for (int j = 0; j < max_q_len; j++) {
      j_depth = depth * j;
      offset = idx + j_depth;

      // left value: gap in reference
      e_simd = _mm_max_ps(_mm_sub_ps(e_simd, gap_extend_simd), 
			  _mm_sub_ps(h_simd, gap_open_simd));
 
      //      if (i == 3 && j == 3) printf("from left: %0.2f\n", ((float *)&e_simd)[target]);

      // diagonal value: match or mismatch
      diagonal_simd = _mm_add_ps(temp_simd,
				 _mm_set_ps((q_len[3] > j && r_len[3] > i) ? profile[q[3][j]][r[3][i]] : -1000.0f, 
					    (q_len[2] > j && r_len[2] > i) ? profile[q[2][j]][r[2][i]] : -1000.0f, 
					    (q_len[1] > j && r_len[1] > i) ? profile[q[1][j]][r[1][i]] : -1000.0f, 
					    (q_len[0] > j && r_len[0] > i) ? profile[q[0][j]][r[0][i]] : -1000.0f)
				 );

      cmp_de = _mm_min_ps(_mm_cmpge_ps(diagonal_simd, e_simd), one_simd);
      max_de = _mm_max_ps(diagonal_simd, e_simd);

      //      if (i == 3 && j == 3)	printf("from diagonal: temp = %0.2f %0.2f (%c, %c) -> %0.2f\n", ((float *)&temp_simd)[target], profile[q[target][j]][r[target][i]], q[target][j], r[target][i], ((float *)&diagonal_simd)[target]);
      
      // up value: gap in query
      temp_simd = _mm_load_ps(&H[offset - q_len_depth]);

      f_simd = _mm_load_ps(&F[j_depth]);
      f_simd = _mm_max_ps(_mm_sub_ps(f_simd, gap_extend_simd), 
			  _mm_sub_ps(temp_simd, gap_open_simd));

      cmp_fz = _mm_min_ps(_mm_cmpge_ps(f_simd, zero_simd), one_simd);
      max_fz = _mm_max_ps(f_simd, zero_simd);

      //      if (i == 3 && j == 3) printf("from up: %0.2f\n", ((float *)&f_simd)[target]);

      // get max. value
      cmp_de_fz = _mm_min_ps(_mm_cmpge_ps(max_de, max_fz), one_simd);
      h_simd = _mm_max_ps(max_de, max_fz);

      score_simd = _mm_max_ps(score_simd, h_simd);

      //      if (i == 3 && j == 3) printf("\t\t\t\t\tmax. score: %0.2f\n", ((float *)&h_simd)[target]);

      // compass (save left, diagonal, up or zero?)
      c = _mm_slli_epi32(_mm_or_si128(zeroi, _mm_cvtps_epi32(cmp_de)), 1);
      c = _mm_slli_epi32(_mm_or_si128(c, _mm_cvtps_epi32(cmp_fz)), 1);
      c = _mm_or_si128(c, _mm_cvtps_epi32(cmp_de_fz));

      // update matrices
      _mm_store_ps(&H[offset], h_simd);
      _mm_store_ps(&F[j_depth], f_simd); 
      _mm_store_si128((__m128i *)&C[offset], c);

      /*
      if (j==0) {
	printf("(row, col) = (%i, %i):\tD\t%0.2f %0.2f %0.2f %0.2f\n", i, j, D[offset], D[offset+1], D[offset+2], D[offset+3]);
	printf("(row, col) = (%i, %i):\tH\t%0.2f %0.2f %0.2f %0.2f\n", i, j, H[offset], H[offset+1], H[offset+2], H[offset+3]);
      }
      */
      //      printf("(row, col) = (%i, %i):\t%0.2f %0.2f %0.2f %0.2f\n", i, j, H[offset], H[offset+1], H[offset+2], H[offset+3]);
    }
    //    printf("\n");
  }
  _mm_store_ps(max_score, score_simd);
  
  /*

  int rr_len = r_len[0];
  int qq_len = q_len[0];
  printf("r_len[0] = %i, q_len[0] = %i\n", rr_len, qq_len);

  printf("sse\n");
  for (int i = 0; i < rr_len; i++) {
    printf("\t");
    for (int j = 0; j < qq_len; j++) {
      printf("%0.2f\t", H[(i * max_q_len * 4) + (j * 4)]);
    }
    printf("\n");
  }
  */
  /*
  char filename[200];
  for (int i = 0; i < 4; i++) {
    sprintf(filename, "/tmp/sse1-%i.score", i);
    save_float_matrix(H, max_q_len, max_r_len, q[i], q_len[i], r[i], r_len[i], i, 4, filename);
  }
  */
  /*
      for (int i = 0; i < 4; i++) {
        printf("score %i:%0.2f\n\n", i, max_score[i]);
      }
  */
}
コード例 #15
0
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. 
	// so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_set_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_set_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)pRenderTargetPixels; 
	
	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        vFxPt4 xFormedFxPtPos[3];
		for(int m = 0; m < 3; m++)
		{
			xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X);
			xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y);
			xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z);
			xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y);
		__m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y);
		__m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X);
		__m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X);
		__m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X);
		triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y));
		triArea = _mm_add_epi32(triArea, C0);

		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW));

		__m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH));

		for(int vv = 0; vv < 3; vv++) 
		{
            // If W (holding 1/w in our case) is not between 0 and 1,
            // then vertex is behind near clip plane (1.0 in our case.
            // If W < 1, then verify 1/W > 1 (for W>0), and 1/W < 0 (for W < 0).
		    __m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f));
		    __m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f));
            __m128 nearClipMask  = _mm_or_ps(nearClipMask0, nearClipMask1);

			if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask))
			{
                // All four vertices are behind the near plane (we're processing four triangles at a time w/ SSE)
                *mVisible = true;
                return;
			}
		}

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < SSE; lane++)
        {
			// Skip triangle if area is zero 
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3], oneOverW[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]);
				oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]);
			}

			__m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]);
			zz[0] *= oneOverTotalArea;
			zz[1] *= oneOverTotalArea;
			zz[2] *= oneOverTotalArea;
			
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]);
			__m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]);
			__m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			int rowIdx;
			// To avoid this branching, choose one method to traverse and store the pixel depth
			if(gVisualizeDepthBuffer)
			{
				// Sequentially traverse and store pixel depths contiguously
				rowIdx = (startYy * SCREENW + startXx);
			}
			else
			{
				// Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
				// This method provides better perfromance
				rowIdx = (startYy * SCREENW + 2 * startXx);
			}

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0);
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1);
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											row  = _mm_add_epi32(row, _mm_set1_epi32(2)),
											rowIdx = rowIdx + 2 * SCREENW,
											bb0Row = _mm_add_epi32(bb0Row, bb0Inc),
											bb1Row = _mm_add_epi32(bb1Row, bb1Inc),
											bb2Row = _mm_add_epi32(bb2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int idx = rowIdx;
				__m128i alpha = _mm_add_epi32(aa0Col, bb0Row);
				__m128i beta = _mm_add_epi32(aa1Col, bb1Row);
				__m128i gama = _mm_add_epi32(aa2Col, bb2Row);

				int idxIncr;
				if(gVisualizeDepthBuffer)
				{ 
					idxIncr = 2;
				}
				else
				{
					idxIncr = 4;
				}

				for(int c = startXx; c < endXx; c += 2,
												idx = idx + idxIncr,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc))
				{
					//Test Pixel inside triangle
					__m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama));
					
					// Early out if all of this quad's pixels are outside the triangle.
					if(_mm_test_all_zeros(mask, mask))
					{
						continue;
					}

					// Compute barycentric-interpolated depth
			        __m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]);
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

					__m128 previousDepthValue;
					if(gVisualizeDepthBuffer)
					{
						previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]);
					}
					else
					{
						previousDepthValue = *(__m128*)&pDepthBuffer[idx];
					}

					__m128 depthMask  = _mm_cmpge_ps( depth, previousDepthValue);
					__m128i finalMask = _mm_and_si128( mask, _mm_castps_si128(depthMask));
					if(!_mm_test_all_zeros(finalMask, finalMask))
					{
						*mVisible = true;
						return; //early exit
					}
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
コード例 #16
0
ファイル: test_sse1.cpp プロジェクト: AVert/emscripten
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way average uint16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 4-way average uint16s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way average uint16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 4-way average uint16s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
コード例 #17
0
ファイル: main.cpp プロジェクト: minh0722/HPC2015
inline static void bar(float(&inout)[8])
{
	__m128 leftSideElements[6],
		rightSideElements[6],
		leftGERight[6],
		leftLTRight[6],
		leftElementsGE[6],  // swaped elements on the left part of comparison
		leftElementsLT[6],  // not-swaped elements on the left part of comparison
		rightElementsGE[6], // swaped elements on the right part of comparison
		rightElementsLT[6]; // not-swaped elements on the right part of comparison
	float resultLeftElements[6][4], resultRightElements[6][4];

	const size_t idx[][2] = {
			{ 0, 1 }, { 3, 2 }, { 4, 5 }, { 7, 6 },
			{ 0, 2 }, { 1, 3 }, { 6, 4 }, { 7, 5 },
			{ 0, 1 }, { 2, 3 }, { 5, 4 }, { 7, 6 },
			{ 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 },
			{ 0, 2 }, { 1, 3 }, { 4, 6 }, { 5, 7 },
			{ 0, 1 }, { 2, 3 }, { 4, 5 }, { 6, 7 }
	};

	// First row
	leftSideElements[0] = _mm_set_ps(inout[idx[3][0]], inout[idx[2][0]], inout[idx[1][0]], inout[idx[0][0]]);
	rightSideElements[0] = _mm_set_ps(inout[idx[3][1]], inout[idx[2][1]], inout[idx[1][1]], inout[idx[0][1]]);

	leftGERight[0] = _mm_cmpge_ps(leftSideElements[0], rightSideElements[0]); // Something like 0 0 -1 -1.
	leftLTRight[0] = _mm_cmplt_ps(leftSideElements[0], rightSideElements[0]); // Something like -1 -1 0 0.

	// Calculates the values of the elements on the left.
	leftElementsGE[0] = _mm_and_ps(rightSideElements[0], leftGERight[0]); // If the element on left side is bigger or equal to the element on the right side - swaps, so writes the element on the left side to be the element on the right.
	leftElementsLT[0] = _mm_and_ps(leftSideElements[0], leftLTRight[0]);  // If the element on the left side is less than element on the right side - don`t swap and writes the element on left side on it`s place.

	// Calculates the values of the elements on the right
	rightElementsGE[0] = _mm_and_ps(leftSideElements[0], leftGERight[0]);  // If the element on the left side is bigger or equal to the element on the right side - swaps, so writes on the element on the right side to be the element on the left.
	rightElementsLT[0] = _mm_and_ps(rightSideElements[0], leftLTRight[0]); // If the element on the left side is less than element on the right side - don`t swap and writes the element on the right side on it`s place.

	// Now let`s combine the elements, because we have two vectors @leftGERight and @leftLTRight, which are basically inverted, so one OR operation will do it.
	// (in the @leftElemetnsGE will have something like [0, 0, element, element] and in the @leftElemetnsLT will be [element, element, 0, 0]) 
	leftSideElements[0] = _mm_or_ps(leftElementsGE[0], leftElementsLT[0]);
	rightSideElements[0] = _mm_or_ps(rightElementsGE[0], rightElementsLT[0]);

	// Now let`s write them in our array so we can put them in their original places on the given @inout.
	_mm_storeu_ps(resultLeftElements[0], leftSideElements[0]);
	_mm_storeu_ps(resultRightElements[0], rightSideElements[0]);

	// Puts the swaped(if needed) elements on their places.
	inout[idx[0][0]] = resultLeftElements[0][0];
	inout[idx[0][1]] = resultRightElements[0][0];
	inout[idx[1][0]] = resultLeftElements[0][1];
	inout[idx[1][1]] = resultRightElements[0][1];
	inout[idx[2][0]] = resultLeftElements[0][2];
	inout[idx[2][1]] = resultRightElements[0][2];
	inout[idx[3][0]] = resultLeftElements[0][3];
	inout[idx[3][1]] = resultRightElements[0][3];

	// Second row
	leftSideElements[1] = _mm_set_ps(inout[idx[7][0]], inout[idx[6][0]], inout[idx[5][0]], inout[idx[4][0]]);
	rightSideElements[1] = _mm_set_ps(inout[idx[7][1]], inout[idx[6][1]], inout[idx[5][1]], inout[idx[4][1]]);

	leftGERight[1] = _mm_cmpge_ps(leftSideElements[1], rightSideElements[1]);
	leftLTRight[1] = _mm_cmplt_ps(leftSideElements[1], rightSideElements[1]);

	leftElementsGE[1] = _mm_and_ps(rightSideElements[1], leftGERight[1]);
	leftElementsLT[1] = _mm_and_ps(leftSideElements[1], leftLTRight[1]);

	rightElementsGE[1] = _mm_and_ps(leftSideElements[1], leftGERight[1]);
	rightElementsLT[1] = _mm_and_ps(rightSideElements[1], leftLTRight[1]);

	leftSideElements[1] = _mm_or_ps(leftElementsGE[1], leftElementsLT[1]);
	rightSideElements[1] = _mm_or_ps(rightElementsGE[1], rightElementsLT[1]);

	_mm_storeu_ps(resultLeftElements[1], leftSideElements[1]);
	_mm_storeu_ps(resultRightElements[1], rightSideElements[1]);

	inout[idx[4][0]] = resultLeftElements[1][0];
	inout[idx[4][1]] = resultRightElements[1][0];
	inout[idx[5][0]] = resultLeftElements[1][1];
	inout[idx[5][1]] = resultRightElements[1][1];
	inout[idx[6][0]] = resultLeftElements[1][2];
	inout[idx[6][1]] = resultRightElements[1][2];
	inout[idx[7][0]] = resultLeftElements[1][3];
	inout[idx[7][1]] = resultRightElements[1][3];

	// Third row
	leftSideElements[2] = _mm_set_ps(inout[idx[11][0]], inout[idx[10][0]], inout[idx[9][0]], inout[idx[8][0]]);
	rightSideElements[2] = _mm_set_ps(inout[idx[11][1]], inout[idx[10][1]], inout[idx[9][1]], inout[idx[8][1]]);

	leftGERight[2] = _mm_cmpge_ps(leftSideElements[2], rightSideElements[2]);
	leftLTRight[2] = _mm_cmplt_ps(leftSideElements[2], rightSideElements[2]);

	leftElementsGE[2] = _mm_and_ps(rightSideElements[2], leftGERight[2]);
	leftElementsLT[2] = _mm_and_ps(leftSideElements[2], leftLTRight[2]);

	rightElementsGE[2] = _mm_and_ps(leftSideElements[2], leftGERight[2]);
	rightElementsLT[2] = _mm_and_ps(rightSideElements[2], leftLTRight[2]);

	leftSideElements[2] = _mm_or_ps(leftElementsGE[2], leftElementsLT[2]);
	rightSideElements[2] = _mm_or_ps(rightElementsGE[2], rightElementsLT[2]);

	_mm_storeu_ps(resultLeftElements[2], leftSideElements[2]);
	_mm_storeu_ps(resultRightElements[2], rightSideElements[2]);

	inout[idx[8][0]] = resultLeftElements[2][0];
	inout[idx[8][1]] = resultRightElements[2][0];
	inout[idx[9][0]] = resultLeftElements[2][1];
	inout[idx[9][1]] = resultRightElements[2][1];
	inout[idx[10][0]] = resultLeftElements[2][2];
	inout[idx[10][1]] = resultRightElements[2][2];
	inout[idx[11][0]] = resultLeftElements[2][3];
	inout[idx[11][1]] = resultRightElements[2][3];

	// Fourth row
	leftSideElements[3] = _mm_set_ps(inout[idx[15][0]], inout[idx[14][0]], inout[idx[13][0]], inout[idx[12][0]]);
	rightSideElements[3] = _mm_set_ps(inout[idx[15][1]], inout[idx[14][1]], inout[idx[13][1]], inout[idx[12][1]]);

	leftGERight[3] = _mm_cmpge_ps(leftSideElements[3], rightSideElements[3]);
	leftLTRight[3] = _mm_cmplt_ps(leftSideElements[3], rightSideElements[3]);

	leftElementsGE[3] = _mm_and_ps(rightSideElements[3], leftGERight[3]);
	leftElementsLT[3] = _mm_and_ps(leftSideElements[3], leftLTRight[3]);

	rightElementsGE[3] = _mm_and_ps(leftSideElements[3], leftGERight[3]);
	rightElementsLT[3] = _mm_and_ps(rightSideElements[3], leftLTRight[3]);

	leftSideElements[3] = _mm_or_ps(leftElementsGE[3], leftElementsLT[3]);
	rightSideElements[3] = _mm_or_ps(rightElementsGE[3], rightElementsLT[3]);

	_mm_storeu_ps(resultLeftElements[3], leftSideElements[3]);
	_mm_storeu_ps(resultRightElements[3], rightSideElements[3]);

	inout[idx[12][0]] = resultLeftElements[3][0];
	inout[idx[12][1]] = resultRightElements[3][0];
	inout[idx[13][0]] = resultLeftElements[3][1];
	inout[idx[13][1]] = resultRightElements[3][1];
	inout[idx[14][0]] = resultLeftElements[3][2];
	inout[idx[14][1]] = resultRightElements[3][2];
	inout[idx[15][0]] = resultLeftElements[3][3];
	inout[idx[15][1]] = resultRightElements[3][3];

	// Fifth row
	leftSideElements[4] = _mm_set_ps(inout[idx[19][0]], inout[idx[18][0]], inout[idx[17][0]], inout[idx[16][0]]);
	rightSideElements[4] = _mm_set_ps(inout[idx[19][1]], inout[idx[18][1]], inout[idx[17][1]], inout[idx[16][1]]);

	leftGERight[4] = _mm_cmpge_ps(leftSideElements[4], rightSideElements[4]);
	leftLTRight[4] = _mm_cmplt_ps(leftSideElements[4], rightSideElements[4]);

	leftElementsGE[4] = _mm_and_ps(rightSideElements[4], leftGERight[4]);
	leftElementsLT[4] = _mm_and_ps(leftSideElements[4], leftLTRight[4]);

	rightElementsGE[4] = _mm_and_ps(leftSideElements[4], leftGERight[4]);
	rightElementsLT[4] = _mm_and_ps(rightSideElements[4], leftLTRight[4]);

	leftSideElements[4] = _mm_or_ps(leftElementsGE[4], leftElementsLT[4]);
	rightSideElements[4] = _mm_or_ps(rightElementsGE[4], rightElementsLT[4]);

	_mm_storeu_ps(resultLeftElements[4], leftSideElements[4]);
	_mm_storeu_ps(resultRightElements[4], rightSideElements[4]);

	inout[idx[16][0]] = resultLeftElements[4][0];
	inout[idx[16][1]] = resultRightElements[4][0];
	inout[idx[17][0]] = resultLeftElements[4][1];
	inout[idx[17][1]] = resultRightElements[4][1];
	inout[idx[18][0]] = resultLeftElements[4][2];
	inout[idx[18][1]] = resultRightElements[4][2];
	inout[idx[19][0]] = resultLeftElements[4][3];
	inout[idx[19][1]] = resultRightElements[4][3];

	// Sixth row
	leftSideElements[5] = _mm_set_ps(inout[idx[23][0]], inout[idx[22][0]], inout[idx[21][0]], inout[idx[20][0]]);
	rightSideElements[5] = _mm_set_ps(inout[idx[23][1]], inout[idx[22][1]], inout[idx[21][1]], inout[idx[20][1]]);

	leftGERight[5] = _mm_cmpge_ps(leftSideElements[5], rightSideElements[5]);
	leftLTRight[5] = _mm_cmplt_ps(leftSideElements[5], rightSideElements[5]);

	leftElementsGE[5] = _mm_and_ps(rightSideElements[5], leftGERight[5]);
	leftElementsLT[5] = _mm_and_ps(leftSideElements[5], leftLTRight[5]);

	rightElementsGE[5] = _mm_and_ps(leftSideElements[5], leftGERight[5]);
	rightElementsLT[5] = _mm_and_ps(rightSideElements[5], leftLTRight[5]);

	leftSideElements[5] = _mm_or_ps(leftElementsGE[5], leftElementsLT[5]);
	rightSideElements[5] = _mm_or_ps(rightElementsGE[5], rightElementsLT[5]);

	_mm_storeu_ps(resultLeftElements[5], leftSideElements[5]);
	_mm_storeu_ps(resultRightElements[5], rightSideElements[5]);

	inout[idx[20][0]] = resultLeftElements[5][0];
	inout[idx[20][1]] = resultRightElements[5][0];
	inout[idx[21][0]] = resultLeftElements[5][1];
	inout[idx[21][1]] = resultRightElements[5][1];
	inout[idx[22][0]] = resultLeftElements[5][2];
	inout[idx[22][1]] = resultRightElements[5][2];
	inout[idx[23][0]] = resultLeftElements[5][3];
	inout[idx[23][1]] = resultRightElements[5][3];
}
コード例 #18
0
ファイル: motempl.cpp プロジェクト: 93sam/opencv
void cv::updateMotionHistory( InputArray _silhouette, InputOutputArray _mhi,
                              double timestamp, double duration )
{
    CV_Assert( _silhouette.type() == CV_8UC1 && _mhi.type() == CV_32FC1 );
    CV_Assert( _silhouette.sameSize(_mhi) );

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - duration);

    CV_OCL_RUN(_mhi.isUMat() && _mhi.dims() <= 2,
               ocl_updateMotionHistory(_silhouette, _mhi, ts, delbound))

    Mat silh = _silhouette.getMat(), mhi = _mhi.getMat();
    Size size = silh.size();

    if( silh.isContinuous() && mhi.isContinuous() )
    {
        size.width *= size.height;
        size.height = 1;
    }

#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif

    for(int y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh.ptr<uchar>(y);
        float* mhiData = mhi.ptr<float>(y);
        int x = 0;

#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)), s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));

                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));

                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);

                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif

        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
コード例 #19
0
	IntersectionData intersectRaySpheres(const Ray& ray, const vector<int>& spheresIndices,
			const Spheres& spheres)
	{
		const int maxSpheresToCheck = 4;
		IntersectionData result;
		result.intersection = false;
		result.tIntersection = numeric_limits<float>::max();

		int remainder = spheresIndices.size() % maxSpheresToCheck;
		bool canUseSIMD = (remainder < spheresIndices.size());

		int nonSIMDStartPos = 0;

		if(canUseSIMD)
		{
			const int spheresToSIMDCheck = spheresIndices.size() - remainder;
			nonSIMDStartPos = spheresToSIMDCheck;
			//Vec4Float a = _mm_set1_ps(1.f); when rayDir is normalized a is 1
			Vec4Float b = _mm_set1_ps(0.f);
			Vec4Float c = b;
			Vec4Float D = c;

			Vec4Float centerCoords[3], radiuses;

			for(int i = 0; i < spheresToSIMDCheck; i += 4)
			{
				for(int j = 0; j < 3; ++j)
				{
					centerCoords[j] = _mm_set_ps(
							spheres.centerCoords[j][spheresIndices[i]], spheres.centerCoords[j][spheresIndices[i + 1]],
							spheres.centerCoords[j][spheresIndices[i + 2]], spheres.centerCoords[j][spheresIndices[i + 3]]
					);

					radiuses = _mm_set_ps(
							spheres.radiuses[spheresIndices[i]], spheres.radiuses[spheresIndices[i + 1]],
							spheres.radiuses[spheresIndices[i + 2]], spheres.radiuses[spheresIndices[i + 2]]
					);

					b += 2.f * ray.direction.coords[j] * (ray.origin.coords[j] - centerCoords[j]);
					c += (ray.origin.coords[j] - centerCoords[j]) * (ray.origin.coords[j] - centerCoords[j]);
				}
				D = b * b - 4.f * c;

				Vec4Float mask = _mm_cmpge_ps(D, _mm_set_ps1(0.f));
				Vec4Float squareRootD = _mm_sqrt_ps(D);
				D = _mm_and_ps(squareRootD, mask);

				Vec4Float t1, t2;
				t1 = _mm_or_ps((-b - squareRootD) * 0.5f, _mm_andnot_ps(mask, D));
				t2 = _mm_or_ps((-b + squareRootD) * 0.5f, _mm_andnot_ps(mask, D));

				float tRes = result.tIntersection;
				for(int j = 0; j < 4; ++j)
				{
					if(t1[j] >= 0 && t1[j] < tRes)
					{
						tRes = t1[j];
					}
					if(t2[j] >= 0 && t2[j] < tRes)
					{
						tRes = t2[j];
					}
				}

				if(tRes	< result.tIntersection)

					result.intersection = true;
					result.tIntersection = tRes;
				}
			}

			for(int i = nonSIMDStartPos; i < spheresIndices.size(); ++i)
			{
				IntersectionData data;
				int idx = spheresIndices[i];
				Sphere sphere;
				sphere.center.x = spheres.centerCoords[0][idx];
				sphere.center.y = spheres.centerCoords[1][idx];
				sphere.center.z = spheres.centerCoords[2][idx];
				sphere.radius = spheres.radiuses[idx];
				data = intersectSingleSphere(ray, sphere);

				if(data.intersection && data.tIntersection < result.tIntersection)
				{
					result = data;
				}
			}

			return result;
	}
コード例 #20
0
ファイル: simd.hpp プロジェクト: JerryCao1985/psychopath
inline float4 gte(const float4& a, const float4& b)
{
	return float4(_mm_cmpge_ps(a.data, b.data));
}
コード例 #21
0
ファイル: sse.hpp プロジェクト: bobbyluig/Eclipse
RETf CMPGE(const __m128 x, const __m128 y) { return _mm_cmpge_ps(x, y); }
コード例 #22
0
ファイル: motempl.cpp プロジェクト: SCS-B3C/OpenCV2-2
/* motion templates */
CV_IMPL void
cvUpdateMotionHistory( const void* silhouette, void* mhimg,
                       double timestamp, double mhi_duration )
{
    CvMat  silhstub, *silh = cvGetMat(silhouette, &silhstub);
    CvMat  mhistub, *mhi = cvGetMat(mhimg, &mhistub);

    if( !CV_IS_MASK_ARR( silh ))
        CV_Error( CV_StsBadMask, "" );

    if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 )
        CV_Error( CV_StsUnsupportedFormat, "" );

    if( !CV_ARE_SIZES_EQ( mhi, silh ))
        CV_Error( CV_StsUnmatchedSizes, "" );

    CvSize size = cvGetMatSize( mhi );

    int mhi_step = mhi->step;
    int silh_step = silh->step;

    if( CV_IS_MAT_CONT( mhi->type & silh->type ))
    {
        size.width *= size.height;
        mhi_step = silh_step = CV_STUB_STEP;
        size.height = 1;
    }

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - mhi_duration);
    int x, y;
#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif
    
    for( y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh->data.ptr + silh->step*y;
        float* mhiData = (float*)(mhi->data.ptr + mhi->step*y);
        x = 0;
        
#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)), s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();
                
                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));

                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));
                
                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);
                
                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif
        
        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
コード例 #23
0
ファイル: sse-builtins.c プロジェクト: CODECOMMUNITY/clang
__m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: @test_mm_cmpge_ps
  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
  return _mm_cmpge_ps(__a, __b);
}
コード例 #24
0
ファイル: permutohedral.cpp プロジェクト: zjeagleeye/eagleeye
void Permutohedral::init ( const float* feature, int feature_size, int N )
{
	// Compute the lattice coordinates for each feature [there is going to be a lot of magic here
	N_ = N;
	d_ = feature_size;
	HashTable hash_table( d_, N_/**(d_+1)*/ );

	const int blocksize = sizeof(__m128) / sizeof(float);
	const __m128 invdplus1   = _mm_set1_ps( 1.0f / (d_+1) );
	const __m128 dplus1      = _mm_set1_ps( d_+1 );
	const __m128 Zero        = _mm_set1_ps( 0 );
	const __m128 One         = _mm_set1_ps( 1 );

	// Allocate the class memory
	if (offset_) delete [] offset_;
	offset_ = new int[ (d_+1)*(N_+16) ];
	memset( offset_, 0, (d_+1)*(N_+16)*sizeof(int) );

	if (barycentric_) delete [] barycentric_;
	barycentric_ = new float[ (d_+1)*(N_+16) ];
	memset( barycentric_, 0, (d_+1)*(N_+16)*sizeof(float) );

	// Allocate the local memory
	__m128 * scale_factor = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
	__m128 * f            = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
	__m128 * elevated     = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
	__m128 * rem0         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
	__m128 * rank         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
	float * barycentric = new float[(d_+2)*blocksize];
	short * canonical = new short[(d_+1)*(d_+1)];
	short * key = new short[d_+1];

	// Compute the canonical simplex
	for( int i=0; i<=d_; i++ ){
		for( int j=0; j<=d_-i; j++ )
			canonical[i*(d_+1)+j] = i;
		for( int j=d_-i+1; j<=d_; j++ )
			canonical[i*(d_+1)+j] = i - (d_+1);
	}

	// Expected standard deviation of our filter (p.6 in [Adams etal 2010])
	float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1);
	// Compute the diagonal part of E (p.5 in [Adams etal 2010])
	for( int i=0; i<d_; i++ )
		scale_factor[i] = _mm_set1_ps( 1.0 / sqrt( float((i+2)*(i+1) ) * inv_std_dev) );

	// Setup the SSE rounding
#ifndef __SSE4_1__
	const unsigned int old_rounding = _mm_getcsr();
	_mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );
#endif

	// Compute the simplex each feature lies in
	for( int k=0; k<N_; k+=blocksize ){
		// Load the feature from memory
		float * ff = (float*)f;
		for( int j=0; j<d_; j++ )
			for( int i=0; i<blocksize; i++ )
				ff[ j*blocksize + i ] = k+i < N_ ? feature[ (k+i)*d_+j ] : 0.0;

		// Elevate the feature ( y = Ep, see p.5 in [Adams etal 2010])

		// sm contains the sum of 1..n of our faeture vector
		__m128 sm = Zero;
		for( int j=d_; j>0; j-- ){
			__m128 cf = f[j-1]*scale_factor[j-1];
			elevated[j] = sm - _mm_set1_ps(j)*cf;
			sm += cf;
		}
		elevated[0] = sm;

		// Find the closest 0-colored simplex through rounding
		__m128 sum = Zero;
		for( int i=0; i<=d_; i++ ){
			__m128 v = invdplus1 * elevated[i];
#ifdef __SSE4_1__
			v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT );
#else
			v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
#endif
			rem0[i] = v*dplus1;
			sum += v;
		}

		// Find the simplex we are in and store it in rank (where rank describes what position coorinate i has in the sorted order of the features values)
		for( int i=0; i<=d_; i++ )
			rank[i] = Zero;
		for( int i=0; i<d_; i++ ){
			__m128 di = elevated[i] - rem0[i];
			for( int j=i+1; j<=d_; j++ ){
				__m128 dj = elevated[j] - rem0[j];
				__m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) );
				rank[i] += c;
				rank[j] += One-c;
			}
		}

		// If the point doesn't lie on the plane (sum != 0) bring it back
		for( int i=0; i<=d_; i++ ){
			rank[i] += sum;
			__m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) );
			__m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) );
			rank[i] += add-sub;
			rem0[i] += add-sub;
		}

		// Compute the barycentric coordinates (p.10 in [Adams etal 2010])
		for( int i=0; i<(d_+2)*blocksize; i++ )
			barycentric[ i ] = 0;
		for( int i=0; i<=d_; i++ ){
			__m128 v = (elevated[i] - rem0[i])*invdplus1;

			// Didn't figure out how to SSE this
			float * fv = (float*)&v;
			float * frank = (float*)&rank[i];
			for( int j=0; j<blocksize; j++ ){
				int p = d_-frank[j];
				barycentric[j*(d_+2)+p  ] += fv[j];
				barycentric[j*(d_+2)+p+1] -= fv[j];
			}
		}

		// The rest is not SSE'd
		for( int j=0; j<blocksize; j++ ){
			// Wrap around
			barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1];

			float * frank = (float*)rank;
			float * frem0 = (float*)rem0;
			// Compute all vertices and their offset
			for( int remainder=0; remainder<=d_; remainder++ ){
				for( int i=0; i<d_; i++ ){
					key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ];
				}
				offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true );
				barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ];
			}
		}
	}
	_mm_free( scale_factor );
	_mm_free( f );
	_mm_free( elevated );
	_mm_free( rem0 );
	_mm_free( rank );
	delete [] barycentric;
	delete [] canonical;
	delete [] key;

	// Reset the SSE rounding
#ifndef __SSE4_1__
	_mm_setcsr( old_rounding );
#endif

	// This is normally fast enough so no SSE needed here
	// Find the Neighbors of each lattice point

	// Get the number of vertices in the lattice
	M_ = hash_table.size();

	// Create the neighborhood structure
	if(blur_neighbors_) delete[] blur_neighbors_;
	blur_neighbors_ = new Neighbors[ (d_+1)*M_ ];

	short * n1 = new short[d_+1];
	short * n2 = new short[d_+1];

	// For each of d+1 axes,
	for( int j = 0; j <= d_; j++ ){
		for( int i=0; i<M_; i++ ){
			const short * key = hash_table.getKey( i );
			for( int k=0; k<d_; k++ ){
				n1[k] = key[k] - 1;
				n2[k] = key[k] + 1;
			}
			n1[j] = key[j] + d_;
			n2[j] = key[j] - d_;

			blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 );
			blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 );
		}
	}
	delete[] n1;
	delete[] n2;
}
コード例 #25
0
ファイル: colorout.c プロジェクト: cherrot/darktable
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
    //fprintf(stderr,"Using cmatrix codepath\n");
    // convert to rgb using matrix
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
      float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;
      const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));

        _mm_stream_ps(out,t);
      }
    }
    _mm_sfence();
    // apply profile
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
      float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        for(int i=0; i<3; i++)
          if (d->lut[i][0] >= 0.0f)
          {
            out[i] = (out[i] < 1.0f) ? lerp_lut(d->lut[i], out[i]) : dt_iop_eval_exp(d->unbounded_coeffs[i], out[i]);
          }
      }
    }
  }
  else
  {
    //fprintf(stderr,"Using xform codepath\n");
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
#endif
    for (int k=0; k<roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

      if(!gamutcheck)
      {
        cmsDoTransform(d->xform, in, out, roi_out->width);
      } else {
        void *rgb = dt_alloc_align(16, 4*sizeof(float)*roi_out->width);
        cmsDoTransform(d->xform, in, rgb, roi_out->width);
        float *rgbptr = (float *)rgb;
        for (int j=0; j<roi_out->width; j++,rgbptr+=4,out+=4)
        {
          const __m128 pixel = _mm_load_ps(rgbptr);
          const __m128 ingamut = _mm_cmpge_ps(pixel, _mm_setzero_ps());
          const __m128 result = _mm_or_ps(_mm_andnot_ps(ingamut, outofgamutpixel),
                                          _mm_and_ps(ingamut, pixel));
          _mm_stream_ps(out, result);
        }
        dt_free_align(rgb);
      }
    }
    _mm_sfence();
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}