C++ (Cpp) _mm_blendv_ps 예제들

예제 #1

0

파일 보기

파일: nx_sse_float.hpp 프로젝트: mywoodstock/nxsimd

    inline vector4f select(const vector4fb& cond, const vector4f& a, const vector4f& b)
    {
#if SSE_INSTR_SET >= 5  // SSE 4.1
        return _mm_blendv_ps(b, a, cond);
#else
        return _mm_or_ps(_mm_and_ps(cond, a), _mm_andnot_ps(cond, b));
#endif
    }

예제 #2

0

파일 보기

파일: dct_simd.cpp 프로젝트: ArtisticCoding/OpenCP

void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	__m128 ms0 = _mm_load_ps(src);
	__m128 ms1 = _mm_load_ps(src + 4);
	const __m128 mm = _mm_set1_ps(0.5f);
	__m128 a = _mm_add_ps(ms0, ms1);
	__m128 b = _mm_sub_ps(ms0, ms1);

	__m128 t1 = _mm_unpacklo_ps(a, b);
	__m128 t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);

	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
	ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
	ms0 = _mm_blend_ps(ms0, a, 1);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
	ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

	a = _mm_add_ps(ms0, ms1);
	b = _mm_sub_ps(ms0, ms1);

	t1 = _mm_unpacklo_ps(a, b);
	t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	_mm_store_ps(dest, a);
	_mm_store_ps(dest + 4, b);
}

예제 #3

0

파일 보기

파일: dct_simd.cpp 프로젝트: ArtisticCoding/OpenCP

void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196);//cos(CV_PI*6/16.0)*sqrt(2);

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 p03 = _mm_add_ps(s0, s3);
	__m128 p12 = _mm_add_ps(s1, s2);
	__m128 m03 = _mm_sub_ps(s0, s3);
	__m128 m12 = _mm_sub_ps(s1, s2);

	__m128 v = _mm_add_ps(p03, p12);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(d, v2);

	v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 4, v);

	v = _mm_sub_ps(p03, p12);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 8, v);

	v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 12, v);
}

예제 #4

0

파일 보기

파일: minmaxfilter.cpp 프로젝트: ArtisticCoding/OpenCP

	void blurRemoveMinMax_(const Mat& src, Mat& dest, const int r)
	{
		const Size ksize = Size(2 * r + 1, 2 * r + 1);
		if (src.data != dest.data)src.copyTo(dest);

		Mat xv;
		Mat nv;
		Mat element = Mat::ones(2 * r + 1, 2 * r + 1, CV_8U);
		dilate(src, xv, element);
		erode(src, nv, element);

		Mat mind;
		Mat maxd;
		Mat mask;
		absdiff(src, nv, mind);//can move to loop
		absdiff(src, xv, maxd);//
		min(mind, maxd, mask);//

		T* n = nv.ptr<T>(0);
		T* x = xv.ptr<T>(0);
		T* d = dest.ptr<T>(0);
		T* nd = mind.ptr<T>(0);
		T* mk = mask.ptr<T>(0);

		int remsize = src.size().area();

#if CV_SSE4_1
		if (src.depth() == CV_8U)
		{

			const int ssesize = src.size().area() / 16;
			remsize = src.size().area() - ssesize * 16;
			for (int i = 0; i < ssesize; i++)
			{
				__m128i mmk = _mm_load_si128((__m128i*)mk);
				__m128i mnd = _mm_load_si128((__m128i*)nd);

				__m128i mmn = _mm_load_si128((__m128i*)n);
				__m128i mmx = _mm_load_si128((__m128i*)x);
				__m128i msk = _mm_cmpeq_epi8(mnd, mmk);
				_mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
				nd += 16;
				mk += 16;
				d += 16;
				n += 16;
				x += 16;
			}
		}
		else if (src.depth() == CV_16S || src.depth() == CV_16U)
		{

			const int ssesize = src.size().area() / 8;
			remsize = src.size().area() - ssesize * 8;
			for (int i = 0; i < ssesize; i++)
			{
				__m128i mmk = _mm_load_si128((__m128i*)mk);
				__m128i mnd = _mm_load_si128((__m128i*)nd);

				__m128i mmn = _mm_load_si128((__m128i*)n);
				__m128i mmx = _mm_load_si128((__m128i*)x);
				__m128i msk = _mm_cmpeq_epi16(mnd, mmk);
				_mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
				nd += 8;
				mk += 8;
				d += 8;
				n += 8;
				x += 8;
			}
		}
		else if (src.depth() == CV_32F)
		{

			const int ssesize = src.size().area() / 4;
			remsize = src.size().area() - ssesize * 4;
			for (int i = 0; i < ssesize; i++)
			{
				__m128 mmk = _mm_load_ps((float*)mk);
				__m128 mnd = _mm_load_ps((float*)nd);

				__m128 mmn = _mm_load_ps((float*)n);
				__m128 mmx = _mm_load_ps((float*)x);
				__m128 msk = _mm_cmpeq_ps(mnd, mmk);
				_mm_stream_ps((float*)d, _mm_blendv_ps(mmx, mmn, msk));
				nd += 4;
				mk += 4;
				d += 4;
				n += 4;
				x += 4;
			}
		}
		else if (src.depth() == CV_64F)
		{
			const int ssesize = src.size().area() / 2;
			remsize = src.size().area() - ssesize * 2;
			for (int i = 0; i < ssesize; i++)
			{
				__m128d mmk = _mm_load_pd((double*)mk);
				__m128d mnd = _mm_load_pd((double*)nd);

				__m128d mmn = _mm_load_pd((double*)n);
				__m128d mmx = _mm_load_pd((double*)x);
				__m128d msk = _mm_cmpeq_pd(mnd, mmk);
				_mm_stream_pd((double*)d, _mm_blendv_pd(mmx, mmn, msk));
				nd += 2;
				mk += 2;
				d += 2;
				n += 2;
				x += 2;
			}
		}
#endif
		for (int i = 0; i < remsize; i++)
		{
			{
				if (nd[i] == mk[i])
				{
					d[i] = n[i];
				}
				else
				{
					d[i] = x[i];
				}
			}
		}
	}

예제 #5

0

파일 보기

파일: mandelbrot.c 프로젝트: NassimAMGHAR/chp

// --------------------------------------------------------------
vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter)
// --------------------------------------------------------------
{
    // version avec test de sortie en float
    
    vuint32   iter  = _mm_set1_epi32(0);
    vfloat32  fiter = _mm_set_ps(0,0,0,0);

    vfloat32 x,y,t,t2,zero,un,deux,quatre; 
    // COMPLETER ICI
    int test,i = 0;
    // initialisation des variables
    x      = _mm_set_ps(0,0,0,0);    
    y      = _mm_set_ps(0,0,0,0);
    deux   = _mm_set_ps(2,2,2,2);
    quatre = _mm_set_ps(4,4,4,4);
    un     = _mm_set_ps(1,1,1,1);
    zero   = _mm_set_ps(0,0,0,0);
    
    // iteration zero
	t  = _mm_mul_ps(x, x);
	t2 = _mm_mul_ps(y, y);
	         
	y  = _mm_mul_ps(x,y);
	y  = _mm_mul_ps(y,deux);
	y  = _mm_add_ps(y,b);

	x = _mm_sub_ps(t,t2);
	x = _mm_add_ps(x,a);
    
    // calcul
    while(i<max_iter && _mm_movemask_ps(t) != 15){   
	    

	    t  = _mm_mul_ps(x, x);
        t2 = _mm_mul_ps(y, y);
              
	    y  = _mm_mul_ps(_mm_mul_ps(x,y),deux);        
	    y  = _mm_add_ps(y,b);

	    x = _mm_sub_ps(t,t2);
	    x = _mm_add_ps(x,a);	

	    t2 = _mm_add_ps(t,t2);
	    
	    t2 = _mm_cmple_ps(t2,quatre); 
 			
	    t = _mm_blendv_ps(zero,un,t2);
			
	    fiter = _mm_add_ps(fiter,t);		
			
	    t = _mm_cmpeq_ps(t, zero);
	    //display_vfloat32(t,"%f\t","T :: ");
	    //printf(" MASK::%d \n",_mm_movemask_ps(t));
	    
	    i+=1;
  	}
 	
	
	iter = _mm_cvtps_epi32(fiter);

    return iter;
}

예제 #6

0

파일 보기

파일: minmax_intrn.c 프로젝트: Lingcc/Clang-LLVM-tutorial

void minmax_vec(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
	// We suppose that pointers are aligned on an 16-byte boundary
	
	// Initialise SSE registers
	__m128i sse_idx_min = _mm_setzero_si128();
	__m128i sse_idx_max = _mm_setzero_si128();
	__m128 sse_min = _mm_set1_ps(FLT_MAX);
	__m128 sse_max = _mm_set1_ps(FLT_MIN);

	// We will unroll the for-loop by for, thus doing
	// (n/4) iterations.
	const uint32_t n_sse = n & ~3ULL;

	__m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
	const __m128i sse_4 = _mm_set1_epi32(4);

	for (uint32_t i = 0; i < n_sse; i += 4) {
		const __m128 sse_v = _mm_load_ps(&buf[i]);
		const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
		const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);

		sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
		sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);

		sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx, (__m128) sse_cmp_min); 
		sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx, (__m128) sse_cmp_max); 

		sse_idx = _mm_add_epi32(sse_idx, sse_4);
	}

	// SSE reduction
	float __attribute__((aligned(16))) mins[4];
	float __attribute__((aligned(16))) maxs[4];
	_mm_store_ps(mins, sse_min);
	_mm_store_ps(maxs, sse_max);

	float min = mins[0];
	float max = maxs[0];
	uint32_t idx_min = _mm_extract_epi32(sse_idx_min, 0);
	uint32_t idx_max = _mm_extract_epi32(sse_idx_max, 0);
	// Unrolled by GCC
	for (int i = 1; i < 4; i++) {
		float v = mins[i];
		if (v < min) {
			min = v;
			idx_min = _mm_extract_epi32(sse_idx_min, i);
		}
		v = maxs[i];
		if (v > max) {
			max = v;
			idx_max = _mm_extract_epi32(sse_idx_max, i);
		}
	}


	// Epilogue
	for (uint32_t i = n_sse; i < n; i++) {
		const float v = buf[i];
		if (v < min) {
			min = v;
			idx_min = i;
		}
		if (v > max) {
			max = v;
			idx_max = i;
		}
	}

	*idx_min_ = idx_min;
	*min_ = min;
	*idx_max_ = idx_max;
	*max_ = max;
}

예제 #7

0

파일 보기

파일: minmax_intrn.c 프로젝트: Lingcc/Clang-LLVM-tutorial

void minmax_vec2(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
	// We suppose that pointers are aligned on an 16-byte boundary
	
	// Initialise SSE registers
	__m128i sse_idx_min = _mm_setzero_si128();
	__m128i sse_idx_max = _mm_setzero_si128();
	__m128 sse_min = _mm_set1_ps(FLT_MAX);
	__m128 sse_max = _mm_set1_ps(FLT_MIN);

	// We will unroll the for-loop by for, thus doing
	// (n/4) iterations.
	const uint32_t n_sse = n & ~3ULL;

	__m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
	const __m128i sse_4 = _mm_set1_epi32(4);

	for (uint32_t i = 0; i < n_sse; i += 4) {
		const __m128 sse_v = _mm_load_ps(&buf[i]);
		const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
		const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);

		sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
		sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);

		sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx, (__m128) sse_cmp_min); 
		sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx, (__m128) sse_cmp_max); 

		sse_idx = _mm_add_epi32(sse_idx, sse_4);
	}

	// SSE reduction
	__m128 sse_min_permute = _mm_shuffle_epi32(sse_min, 2 | (3<<2));
	__m128 sse_max_permute = _mm_shuffle_epi32(sse_max, 2 | (3<<2));
	__m128i sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 2 | (3<<2));
	__m128i sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 2 | (3<<2));

	__m128 sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
	__m128 sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
	sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
	sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
	sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx_min_permute, (__m128) sse_cmp_min); 
	sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx_max_permute, (__m128) sse_cmp_max); 

	sse_min_permute = _mm_shuffle_epi32(sse_min, 1);
	sse_max_permute = _mm_shuffle_epi32(sse_max, 1);
	sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 1);
	sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 1);

	sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
	sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
	sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
	sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
	sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx_min_permute, (__m128) sse_cmp_min); 
	sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx_max_permute, (__m128) sse_cmp_max); 

	// Epilogue
	float min, max;
	uint32_t idx_min, idx_max;
	_mm_store_ss(&min, sse_min);
	_mm_store_ss(&max, sse_max);
	idx_min = _mm_extract_epi32(sse_idx_min, 0);
	idx_max = _mm_extract_epi32(sse_idx_max, 0);

	for (uint32_t i = n_sse; i < n; i++) {
		const float v = buf[i];
		if (v < min) {
			min = v;
			idx_min = i;
		}
		if (v > max) {
			max = v;
			idx_max = i;
		}
	}

	*idx_min_ = idx_min;
	*min_ = min;
	*idx_max_ = idx_max;
	*max_ = max;
}

예제 #8

0

파일 보기

파일: sse41-builtins.c 프로젝트: ashwinma/clang_trunk

__m128 test_blendv_ps(__m128 V1, __m128 V2, __m128 V3) {
  // CHECK-LABEL: test_blendv_ps
  // CHECK: call <4 x float> @llvm.x86.sse41.blendvps
  // CHECK-ASM: blendvps %xmm{{.*}}, %xmm{{.*}}
  return _mm_blendv_ps(V1, V2, V3);
}

예제 #9

0

파일 보기

파일: dct_simd.cpp 프로젝트: ArtisticCoding/OpenCP

void fDCT2D8x4_and_threshold_keep00_32f(const float* x, float* y, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();

	__m128 c0 = _mm_load_ps(x);
	__m128 c1 = _mm_load_ps(x + 56);
	__m128 t0 = _mm_add_ps(c0, c1);
	__m128 t7 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 48);
	c0 = _mm_load_ps(x + 8);
	__m128 t1 = _mm_add_ps(c0, c1);
	__m128 t6 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 40);
	c0 = _mm_load_ps(x + 16);
	__m128 t2 = _mm_add_ps(c0, c1);
	__m128 t5 = _mm_sub_ps(c0, c1);

	c0 = _mm_load_ps(x + 24);
	c1 = _mm_load_ps(x + 32);
	__m128 t3 = _mm_add_ps(c0, c1);
	__m128 t4 = _mm_sub_ps(c0, c1);

	/*
	c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
	c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
	c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
	c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
	*/

	c0 = _mm_add_ps(t0, t3);
	__m128 c3 = _mm_sub_ps(t0, t3);
	c1 = _mm_add_ps(t1, t2);
	__m128 c2 = _mm_sub_ps(t1, t2);

	/*
	c0 = t0 + t3; c3 = t0 - t3;
	c1 = t1 + t2; c2 = t1 - t2;
	*/

	const __m128 invsqrt2h = _mm_set_ps1(0.353554f);

	__m128 v = _mm_mul_ps(_mm_add_ps(c0, c1), invsqrt2h);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(y, v2);

	v = _mm_mul_ps(_mm_sub_ps(c0, c1), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 32, v);

	/*y[0] = c0 + c1;
	y[4] = c0 - c1;*/

	__m128 w0 = _mm_set_ps1(0.541196f);
	__m128 w1 = _mm_set_ps1(1.306563f);
	v = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(w0, c2), _mm_mul_ps(w1, c3)), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 16, v);

	v = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(w0, c3), _mm_mul_ps(w1, c2)), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 48, v);
	/*
	y[2] = c2 * r[6] + c3 * r[2];
	y[6] = c3 * r[6] - c2 * r[2];
	*/

	w0 = _mm_set_ps1(1.175876f);
	w1 = _mm_set_ps1(0.785695f);
	c3 = _mm_add_ps(_mm_mul_ps(w0, t4), _mm_mul_ps(w1, t7));
	c0 = _mm_sub_ps(_mm_mul_ps(w0, t7), _mm_mul_ps(w1, t4));
	/*
	c3 = t4 * r[3] + t7 * r[5];
	c0 = t7 * r[3] - t4 * r[5];
	*/

	w0 = _mm_set_ps1(1.387040f);
	w1 = _mm_set_ps1(0.275899f);
	c2 = _mm_add_ps(_mm_mul_ps(w0, t5), _mm_mul_ps(w1, t6));
	c1 = _mm_sub_ps(_mm_mul_ps(w0, t6), _mm_mul_ps(w1, t5));
	/*
	c2 = t5 * r[1] + t6 * r[7];
	c1 = t6 * r[1] - t5 * r[7];
	*/

	v = _mm_mul_ps(_mm_sub_ps(c0, c2), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);

	_mm_store_ps(y + 24, v);

	v = _mm_mul_ps(_mm_sub_ps(c3, c1), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 40, v);
	//y[5] = c3 - c1; y[3] = c0 - c2;

	const __m128 invsqrt2 = _mm_set_ps1(0.707107f);
	c0 = _mm_mul_ps(_mm_add_ps(c0, c2), invsqrt2);
	c3 = _mm_mul_ps(_mm_add_ps(c3, c1), invsqrt2);
	//c0 = (c0 + c2) * invsqrt2;
	//c3 = (c3 + c1) * invsqrt2;

	v = _mm_mul_ps(_mm_add_ps(c0, c3), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 8, v);

	v = _mm_mul_ps(_mm_sub_ps(c0, c3), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);

	_mm_store_ps(y + 56, v);
	//y[1] = c0 + c3; y[7] = c0 - c3;

	/*for(i = 0;i < 8;i++)
	{
	y[i] *= invsqrt2h;
	}*/
}

예제 #10

0

파일 보기

파일: kernels.cpp 프로젝트: caomw/r4r

double CChiSquaredKernel<float>::Evaluate(float* x, float* y) {

#ifndef __SSE4_1__

    /* only cast at the end to guarantee that we get the same
     * result as when using SSE4 registers
     */
    float result, num;
    result = 0;

    for(size_t i=0; i<m_n; i++) {

        num = x[i]*y[i];

        if(num>0)               // this implies that x+y!=0 if x,y>0
            result += num/(x[i]+y[i]);

    }

    return static_cast<double>(result);

#else

    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;


    __m128 sum = _mm_set1_ps(0.0f);
    __m128 mzero = _mm_set1_ps(0.0f);

    for(size_t i=0; i<m_offset/4; i++) {

        __m128 num = _mm_mul_ps(px[i],py[i]);
        __m128 denom = _mm_add_ps(px[i],py[i]);
        __m128 invdenom = _mm_rcp_ps(denom);

        // find nonzeros in numerator
        __m128 nans = _mm_cmpeq_ps(num,mzero);
        __m128 factors = _mm_blendv_ps(invdenom,mzero,nans);

        // compute product
        __m128 temp = _mm_mul_ps(num,factors);

        // add
        sum = _mm_add_ps(sum,temp);

    }

    float result[4] = {0,0,0,0};
    _mm_storeu_ps(result,sum);

    float fresult  = result[0] + result[1] + result[2] + result[3];

    // add offset
    float num;

    for(size_t i=m_offset; i<m_n; i++)  {

        num = x[i]*y[i];

        if(num>0)                      // this implies that x+y!=0 if x,y>0
            fresult += num/(x[i]+y[i]);

    }

    return static_cast<double>(fresult);

#endif

}

예제 #11

0

파일 보기

파일: sse41-builtins.c 프로젝트: JaredCJR/clang

__m128 test_mm_blendv_ps(__m128 V1, __m128 V2, __m128 V3) {
  // CHECK-LABEL: test_mm_blendv_ps
  // CHECK: call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_blendv_ps(V1, V2, V3);
}

예제 #12

0

파일 보기

파일: DepthBufferRasterizerSSEST.cpp 프로젝트: Habaut/intel_occlusion_cull

//-------------------------------------------------------------------------------
// For each tile go through all the bins and process all the triangles in it.
// Rasterize each triangle to the CPU depth buffer. 
//-------------------------------------------------------------------------------
void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_setr_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; 

	// Based on TaskId determine which tile to process
	UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS;
    UINT tileX = tileId % screenWidthInTiles;
    UINT tileY = tileId / screenWidthInTiles;

    int tileStartX = tileX * TILE_WIDTH_IN_PIXELS;
	int tileEndX   = tileStartX + TILE_WIDTH_IN_PIXELS - 1;
	
	int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS;
	int tileEndY   = tileStartY + TILE_HEIGHT_IN_PIXELS - 1;

	ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx);

	UINT bin = 0;
	UINT binIndex = 0;
	UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX;
	UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX;
	UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];

	__m128 gatherBuf[4][3];
	bool done = false;
	bool allBinsEmpty = true;
	mNumRasterizedTris[idx][tileId] = numTrisInBin;
	while(!done)
	{
		// Loop through all the bins and process 4 binned traingles at a time
		UINT ii;
		int numSimdTris = 0;
		for(ii = 0; ii < SSE; ii++)
		{
			while(numTrisInBin <= 0)
			{
				 // This bin is empty.  Move to next bin.
				if(++bin >= 1)
				{
					break;
				}
				numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];
				mNumRasterizedTris[idx][tileId] += numTrisInBin;
				binIndex = 0;
			}
			if(!numTrisInBin)
			{
				break; // No more tris in the bins
			}
			USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx);
			allBinsEmpty = false;
			numSimdTris++; 

			++binIndex;
			--numTrisInBin;
		}
		done = bin >= NUM_XFORMVERTS_TASKS;
		
		if(allBinsEmpty)
		{
			return;
		}

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		__m128 Z[3];
		for(int i = 0; i < 3; i++)
		{
			__m128 v0 = gatherBuf[0][i];
			__m128 v1 = gatherBuf[1][i];
			__m128 v2 = gatherBuf[2][i];
			__m128 v3 = gatherBuf[3][i];

  			// transpose into SoA layout
			_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
			fxPtX[i] = _mm_cvtps_epi32(v0);
			fxPtY[i] = _mm_cvtps_epi32(v1);
			Z[i] = v2;
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea);

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX));

		__m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < numSimdTris; lane++)
        {
			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]);
			}
									
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			// Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better perfromance
			int rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane]));
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane]));
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane]));

			__m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row);
			__m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row);
			__m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			__m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											rowIdx += 2 * SCREENW,
											sum0Row = _mm_add_epi32(sum0Row, bb0Inc),
											sum1Row = _mm_add_epi32(sum1Row, bb1Inc),
											sum2Row = _mm_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m128i alpha = sum0Row;
				__m128i beta = sum1Row;
				__m128i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m128 depth = zz[0];
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

				for(int c = startXx; c < endXx; c += 2,
												index += 4,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc),
												depth = _mm_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama);
					
					__m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]);
					__m128 mergedDepth = _mm_max_ps(depth, previousDepthValue);
					__m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask));
					_mm_store_ps(&pDepthBuffer[index], finalDepth);
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}