示例#1
0
// Exact component-wise equality test between two quaternions.
// _mm_cmpeq_ps produces an all-ones lane mask where components match;
// the quaternions are equal only when every one of the 128 bits is set.
KW_INLINE b8 kw_equals(kw_quat q1, kw_quat q2){
    __m128i lane_mask = _mm_castps_si128(_mm_cmpeq_ps(q1.simd, q2.simd));
    return _mm_test_all_ones(lane_mask);
}
示例#2
0
// Exact component-wise equality test between two 2D vectors.
// NOTE(review): the compare covers all four float lanes of the __m128,
// so this assumes kw_vec2 zero-pads (or otherwise keeps equal) its two
// unused upper lanes — confirm against the kw_vec2 definition.
KW_INLINE b8 kw_equals(kw_vec2 v1, kw_vec2 v2){
    // All-ones per lane where the corresponding components compare equal.
    __m128 temp = _mm_cmpeq_ps(v1.simd, v2.simd);
    // SSE4.1: nonzero only if every bit of the 128-bit mask is set.
    return _mm_test_all_ones(_mm_castps_si128(temp));
}
示例#3
0
 // Build a vector with every bit set (0xFFFFFFFF in each lane): any
 // non-NaN value compared for equality with itself yields an all-ones
 // lane mask. gen_zero() supplies a safe operand, since 0.0f == 0.0f
 // always holds (a NaN operand would break the identity).
 static inline __m128 gen_ones(void)
 {
     __m128 zeros = gen_zero();
     return _mm_cmpeq_ps(zeros, zeros);
 }
示例#4
0
File: kernels.cpp  Project: caomw/r4r
/* Chi-squared histogram kernel: accumulates x[i]*y[i] / (x[i]+y[i]) over
 * the m_n elements of both arrays (m_n and m_offset are class members not
 * visible here; m_offset appears to be the multiple-of-4 prefix length
 * handled by the SSE path — TODO confirm against the class definition).
 * Returns the sum widened to double.
 */
double CChiSquaredKernel<float>::Evaluate(float* x, float* y) {

#ifndef __SSE4_1__

    /* only cast at the end to guarantee that we get the same
     * result as when using SSE4 registers
     */
    float result, num;
    result = 0;

    for(size_t i=0; i<m_n; i++) {

        num = x[i]*y[i];

        if(num>0)               // this implies that x+y!=0 if x,y>0
            result += num/(x[i]+y[i]);

    }

    return static_cast<double>(result);

#else

    /* NOTE(review): these casts assume x and y are 16-byte aligned
     * (and they violate strict aliasing); _mm_loadu_ps-style access
     * would be needed otherwise — confirm the allocator's guarantee. */
    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;


    __m128 sum = _mm_set1_ps(0.0f);
    __m128 mzero = _mm_set1_ps(0.0f);

    for(size_t i=0; i<m_offset/4; i++) {

        __m128 num = _mm_mul_ps(px[i],py[i]);
        __m128 denom = _mm_add_ps(px[i],py[i]);
        /* NOTE(review): _mm_rcp_ps is only a ~12-bit reciprocal
         * approximation, so this branch does NOT reproduce the scalar
         * branch exactly despite the comment there; _mm_div_ps would. */
        __m128 invdenom = _mm_rcp_ps(denom);

        // find nonzeros in numerator
        // Lanes where num == 0 get a zero factor instead of 1/denom,
        // which also guards against denom == 0 in those lanes.
        // NOTE(review): the scalar path skips num < 0 terms, but this
        // mask only skips num == 0 — the two paths diverge for inputs
        // with mixed signs; presumably inputs are non-negative histograms.
        __m128 nans = _mm_cmpeq_ps(num,mzero);
        __m128 factors = _mm_blendv_ps(invdenom,mzero,nans);

        // compute product
        __m128 temp = _mm_mul_ps(num,factors);

        // add
        sum = _mm_add_ps(sum,temp);

    }

    // Horizontal sum of the four partial accumulators.
    float result[4] = {0,0,0,0};
    _mm_storeu_ps(result,sum);

    float fresult  = result[0] + result[1] + result[2] + result[3];

    // add offset
    // Scalar tail: the remaining m_n - m_offset elements.
    float num;

    for(size_t i=m_offset; i<m_n; i++)  {

        num = x[i]*y[i];

        if(num>0)                      // this implies that x+y!=0 if x,y>0
            fresult += num/(x[i]+y[i]);

    }

    return static_cast<double>(fresult);

#endif

}
示例#5
0
// --------------------------------------------------------------
// --------------------------------------------------------------
vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter)
// --------------------------------------------------------------
{
    // Variant whose escape test is carried out in float.
    // Per SIMD lane: counts the iterations before the Mandelbrot orbit
    // of c = a + i*b escapes |z|^2 > 4, capped at max_iter, and returns
    // the four counts as packed 32-bit integers.

    vuint32   iter  = _mm_set1_epi32(0);
    vfloat32  fiter = _mm_set_ps(0,0,0,0);

    vfloat32 x,y,t,t2,zero,un,deux,quatre;
    // (exercise scaffold marker — "complete here")
    // NOTE(review): 'test' is declared but never used.
    int test,i = 0;
    // initialize constants and the z = 0 starting point
    x      = _mm_set_ps(0,0,0,0);
    y      = _mm_set_ps(0,0,0,0);
    deux   = _mm_set_ps(2,2,2,2);   // 2.0 per lane
    quatre = _mm_set_ps(4,4,4,4);   // escape threshold |z|^2 = 4
    un     = _mm_set_ps(1,1,1,1);
    zero   = _mm_set_ps(0,0,0,0);

    // iteration zero: z1 = z0^2 + c with z0 = 0
    t  = _mm_mul_ps(x, x);
    t2 = _mm_mul_ps(y, y);

    y  = _mm_mul_ps(x,y);
    y  = _mm_mul_ps(y,deux);
    y  = _mm_add_ps(y,b);

    x = _mm_sub_ps(t,t2);
    x = _mm_add_ps(x,a);

    // main loop: stop at max_iter or when all four lanes have diverged.
    // On the first pass t still holds x*x (all sign bits clear), so
    // _mm_movemask_ps(t) == 0 and the loop is always entered; afterwards
    // t is the "lane diverged" compare mask (movemask 15 = all diverged).
    while(i<max_iter && _mm_movemask_ps(t) != 15) {


        t  = _mm_mul_ps(x, x);      // x^2
        t2 = _mm_mul_ps(y, y);      // y^2

        // z <- z^2 + c
        y  = _mm_mul_ps(_mm_mul_ps(x,y),deux);
        y  = _mm_add_ps(y,b);

        x = _mm_sub_ps(t,t2);
        x = _mm_add_ps(x,a);

        t2 = _mm_add_ps(t,t2);      // |z|^2 (of the previous z)

        t2 = _mm_cmple_ps(t2,quatre);   // all-ones where still bounded

        // add 1.0 to the float iteration count of still-active lanes only
        t = _mm_blendv_ps(zero,un,t2);

        fiter = _mm_add_ps(fiter,t);

        // diverged-lane mask for the loop condition (1.0 -> active)
        t = _mm_cmpeq_ps(t, zero);
        //display_vfloat32(t,"%f\t","T :: ");
        //printf(" MASK::%d \n",_mm_movemask_ps(t));

        i+=1;
    }


    // convert the float per-lane counts to integers
    iter = _mm_cvtps_epi32(fiter);

    return iter;
}
// Min/max-selective blur: for each pixel, replaces it with either its
// neighborhood minimum (erode) or maximum (dilate) over a (2r+1)x(2r+1)
// window — whichever is closer to the original value. Runs in place if
// src and dest share data.
// NOTE(review): T comes from an enclosing template<class T> declaration
// not visible in this view; it must match src.depth() for the scalar
// remainder loop to index correctly — confirm at the call sites.
void blurRemoveMinMax_(Mat& src, Mat& dest, const int r)
{
	const Size ksize = Size(2*r+1,2*r+1);
	if(src.data!=dest.data)src.copyTo(dest);

	// xv = per-pixel window maximum, nv = per-pixel window minimum
	Mat xv;
	Mat nv;
	Mat element = Mat::ones(2*r+1,2*r+1,CV_8U);
	dilate(src,xv,element);
	erode(src,nv,element);

	// mind = |src - min|, maxd = |src - max|, mask = the smaller distance
	Mat mind;
	Mat maxd;
	Mat mask;
	absdiff(src,nv,mind);//can move to loop
	absdiff(src,xv,maxd);//
	min(mind,maxd,mask);//

	T* n = nv.ptr<T>(0);
	T* x = xv.ptr<T>(0);
	T* d = dest.ptr<T>(0);
	T* nd = mind.ptr<T>(0);
	T* mk = mask.ptr<T>(0);

	int remsize = src.size().area();

#if CV_SSE4_1
	// Vectorized selection: pick the window min where |src-min| is the
	// smaller distance (mind == mask), otherwise the window max.
	// NOTE(review): _mm_load_* requires 16-byte alignment — this relies
	// on cv::Mat's default allocator alignment; confirm for submatrices.
	if(src.depth()==CV_8U)
	{

		const int ssesize = src.size().area()/16;
		remsize = src.size().area()-ssesize*16;	
		for(int i=0;i<ssesize;i++)
		{
			__m128i mmk = _mm_load_si128((__m128i*)mk);
			__m128i mnd = _mm_load_si128((__m128i*)nd);

			__m128i mmn = _mm_load_si128((__m128i*)n);
			__m128i mmx = _mm_load_si128((__m128i*)x);
			__m128i msk =  _mm_cmpeq_epi8(mnd,mmk);
			_mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
			nd+=16;
			mk+=16;
			d+=16;
			n+=16;
			x+=16;
		}
	}
	else if(src.depth()==CV_16S || src.depth()==CV_16U)
	{

		const int ssesize = src.size().area()/8;
		remsize = src.size().area()-ssesize*8;	
		for(int i=0;i<ssesize;i++)
		{
			__m128i mmk = _mm_load_si128((__m128i*)mk);
			__m128i mnd = _mm_load_si128((__m128i*)nd);

			__m128i mmn = _mm_load_si128((__m128i*)n);
			__m128i mmx = _mm_load_si128((__m128i*)x);
			__m128i msk =  _mm_cmpeq_epi16(mnd,mmk);
			_mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
			nd+=8;
			mk+=8;
			d+=8;
			n+=8;
			x+=8;
		}
	}
	else if(src.depth()==CV_32F)
	{

		const int ssesize = src.size().area()/4;
		remsize = src.size().area()-ssesize*4;	
		for(int i=0;i<ssesize;i++)
		{
			__m128 mmk = _mm_load_ps((float*)mk);
			__m128 mnd = _mm_load_ps((float*)nd);

			__m128 mmn = _mm_load_ps((float*)n);
			__m128 mmx = _mm_load_ps((float*)x);
			__m128 msk =  _mm_cmpeq_ps(mnd,mmk);
			_mm_store_ps((float*)d,_mm_blendv_ps(mmx,mmn,msk));
			nd+=4;
			mk+=4;
			d+=4;
			n+=4;
			x+=4;
		}
	}
	else if(src.depth()==CV_64F)
	{

		const int ssesize = src.size().area()/2;
		remsize = src.size().area()-ssesize*2;	
		for(int i=0;i<ssesize;i++)
		{
			__m128d mmk = _mm_load_pd((double*)mk);
			__m128d mnd = _mm_load_pd((double*)nd);

			__m128d mmn = _mm_load_pd((double*)n);
			__m128d mmx = _mm_load_pd((double*)x);
			__m128d msk =  _mm_cmpeq_pd(mnd,mmk);
			_mm_store_pd((double*)d,_mm_blendv_pd(mmx,mmn,msk));
			nd+=2;
			mk+=2;
			d+=2;
			n+=2;
			x+=2;
		}
	}
#endif
	// Scalar remainder: the pointers were already advanced past the
	// vectorized prefix above, so indexing from 0 covers the tail
	// (or the whole image when no SSE path ran).
	for(int i=0;i<remsize;i++)
	{
		{
			if(nd[i]==mk[i])
			{
				d[i]=n[i];
			}
			else
			{
				d[i]=x[i];
			}
		}
	}
}
示例#7
0
// Clang codegen regression test for the _mm_cmpeq_ps intrinsic.
// The "CHECK" comments below are FileCheck directives that verify the
// emitted LLVM IR (cmp.ps with predicate 0 = equal); they are
// load-bearing test assertions and must not be edited casually.
__m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: @test_mm_cmpeq_ps
  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
  return _mm_cmpeq_ps(__a, __b);
}
示例#8
0
// Returns true iff all four 32-bit lanes of val are zero.
//
// Fix: the previous implementation reinterpreted the integer vector as
// floats and compared with _mm_cmpeq_ps. That misreports a lane holding
// the bit pattern 0x80000000 as zero (as a float it is -0.0f, and
// -0.0f == 0.0f), and under DAZ mode it also misreports denormal bit
// patterns. Comparing in the integer domain is exact for every pattern.
inline static bool is_zero (Iu32vec4 &val) {
  __m128i v = (__m128i)val;
  __m128i eq = _mm_cmpeq_epi32(v, _mm_setzero_si128());
  // movemask_epi8 gathers one bit per byte: 0xffff means all 16 bytes
  // (hence all four lanes) compared equal to zero.
  return _mm_movemask_epi8(eq) == 0xffff;
}
示例#9
0
// One 4-pixel-wide SIMD step of total-variation gradient descent:
// computes forward gradients, accumulates the multichannel TV norm into
// *tv, and adds the objective-gradient contributions for this pixel and
// its right/bottom neighbors. Uses GCC vector extensions (+, -, *, /,
// [] on __m128). p(), SQR, and struct aux are defined elsewhere in the
// project; the fixed [3] arrays assume nchannel <= 3.
static void compute_step_tv_inner_simd(unsigned w, unsigned h, unsigned nchannel, struct aux auxs[nchannel], unsigned x, unsigned y, double *tv) {
        const __m128 minf = _mm_set_ps1(INFINITY);
        const __m128 mzero = _mm_set_ps1(0.);

        __m128 g_xs[3] = {0};
        __m128 g_ys[3] = {0};
        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];
                __m128 here = _mm_load_ps(p(aux->fdata, x, y, w, h));
                // forward gradient x
                g_xs[c] = _mm_loadu_ps(p(aux->fdata, x+1, y, w, h)) - here;
                // forward gradient y
                g_ys[c] = _mm_loadu_ps(p(aux->fdata, x, y+1, w, h)) - here;
        }
        // norm: sqrt of summed squared gradients over all channels
        __m128 g_norm = mzero;
        for(unsigned c = 0; c < nchannel; c++) {
                g_norm += SQR(g_xs[c]);
                g_norm += SQR(g_ys[c]);
        }
        g_norm = _mm_sqrt_ps(g_norm);

        // accumulate the scalar TV objective from the four lanes
        float alpha = 1./sqrtf(nchannel);
        *tv += alpha * g_norm[0];
        *tv += alpha * g_norm[1];
        *tv += alpha * g_norm[2];
        *tv += alpha * g_norm[3];

        __m128 malpha = _mm_set_ps1(alpha);

        // set zeroes to infinity, so the divisions below yield 0
        // instead of NaN where the gradient vanishes
        g_norm = _mm_or_ps(g_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g_norm, mzero)));

        // compute derivatives
        for(unsigned c = 0; c < nchannel; c++) {
                __m128 g_x = g_xs[c];
                __m128 g_y = g_ys[c];
                struct aux *aux = &auxs[c];

                // N.B. for numerical stability and same exact result as the c version,
                // we must calculate the objective gradient at x+1 before x
                {
                        float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
                        __m128 obj_r = _mm_loadu_ps(pobj_r);
                        obj_r += malpha * g_x / g_norm;
                        _mm_storeu_ps(pobj_r, obj_r);
                }

                {
                        float *pobj = p(aux->obj_gradient, x, y, w, h);
                        __m128 obj = _mm_load_ps(pobj);
                        obj += malpha * -(g_x + g_y) / g_norm;
                        _mm_store_ps(pobj, obj);
                }

                {
                        float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
                        __m128 obj_b = _mm_load_ps(pobj_b);
                        obj_b += malpha * g_y / g_norm;
                        _mm_store_ps(pobj_b, obj_b);
                }
        }
        // store the gradients for the second-order pass (see
        // compute_step_tv2_inner_simd, which reads temp[0]/temp[1])
        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];
                _mm_store_ps(p(aux->temp[0], x, y, w, h), g_xs[c]);
                _mm_store_ps(p(aux->temp[1], x, y, w, h), g_ys[c]);
        }
}
示例#10
0
// One 4-pixel-wide SIMD step of the second-order (TGV-like) term:
// builds the symmetrized Hessian entries from the first-order gradients
// stored in aux->temp[0..1], accumulates the weighted norm into *tv2,
// and spreads objective-gradient contributions over the 7-pixel stencil.
// Uses GCC vector extensions on __m128; p(), SQR, and struct aux come
// from elsewhere; the fixed [3] arrays assume nchannel <= 3.
static void compute_step_tv2_inner_simd(unsigned w, unsigned h, unsigned nchannel, struct aux auxs[nchannel], float alpha, unsigned x, unsigned y, double *tv2) {
        __m128 g_xxs[3] = {0};
        __m128 g_xy_syms[3] = {0};
        __m128 g_yys[3] = {0};

        const __m128 mtwo = _mm_set_ps1(2.);
        const __m128 minf = _mm_set_ps1(INFINITY);
        const __m128 mzero = _mm_set_ps1(0.);

        __m128 malpha = _mm_set_ps1(alpha * 1./sqrtf(nchannel));

        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];

                __m128 g_x = _mm_load_ps(p(aux->temp[0], x, y, w, h));
                __m128 g_y = _mm_load_ps(p(aux->temp[1], x, y, w, h));

                // backward x
                g_xxs[c] = g_x - _mm_loadu_ps(p(aux->temp[0], x-1, y, w, h));
                // backward x
                __m128 g_yx = g_y - _mm_loadu_ps(p(aux->temp[1], x-1, y, w, h));
                // backward y
                __m128 g_xy = g_x - _mm_load_ps(p(aux->temp[0], x, y-1, w, h));
                // backward y
                g_yys[c] = g_y - _mm_load_ps(p(aux->temp[1], x, y-1, w, h));
                // symmetrize the mixed derivative: (d/dy g_x + d/dx g_y) / 2
                g_xy_syms[c] = (g_xy + g_yx) / mtwo;
        }

        // norm: Frobenius-style norm of the symmetrized Hessian, the
        // mixed term counted twice for its two off-diagonal entries
        __m128 g2_norm = mzero;
        for(unsigned c = 0; c < nchannel; c++) {
                g2_norm += SQR(g_xxs[c]) + mtwo * SQR(g_xy_syms[c]) + SQR(g_yys[c]);
        }
        g2_norm = _mm_sqrt_ps(g2_norm);

        // accumulate the scalar objective from the four lanes
        __m128 alpha_norm = malpha * g2_norm;
        *tv2 += alpha_norm[0];
        *tv2 += alpha_norm[1];
        *tv2 += alpha_norm[2];
        *tv2 += alpha_norm[3];

        // set zeroes to infinity, so the divisions below yield 0
        // instead of NaN where the norm vanishes
        g2_norm = _mm_or_ps(g2_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g2_norm, mzero)));

        for(unsigned c = 0; c < nchannel; c++) {
                __m128 g_xx = g_xxs[c];
                __m128 g_yy = g_yys[c];
                __m128 g_xy_sym = g_xy_syms[c];
                struct aux *aux = &auxs[c];

                // N.B. for same exact result as the c version,
                // we must calculate the objective gradient from right to left
                {
                        float *pobj_ur = p(aux->obj_gradient, x+1, y-1, w, h);
                        __m128 obj_ur = _mm_loadu_ps(pobj_ur);
                        obj_ur += malpha * ((-g_xy_sym) / g2_norm);
                        _mm_storeu_ps(pobj_ur, obj_ur);
                }

                {
                        float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
                        __m128 obj_r = _mm_loadu_ps(pobj_r);
                        obj_r += malpha * ((g_xy_sym + g_xx) / g2_norm);
                        _mm_storeu_ps(pobj_r, obj_r);
                }

                {
                        float *pobj_u = p(aux->obj_gradient, x, y-1, w, h);
                        __m128 obj_u = _mm_load_ps(pobj_u);
                        obj_u += malpha * ((g_yy + g_xy_sym) / g2_norm);
                        _mm_store_ps(pobj_u, obj_u);
                }

                {
                        float *pobj = p(aux->obj_gradient, x, y, w, h);
                        __m128 obj = _mm_load_ps(pobj);
                        obj += malpha * (-(mtwo * g_xx + mtwo * g_xy_sym + mtwo * g_yy) / g2_norm);
                        _mm_store_ps(pobj, obj);
                }

                {
                        float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
                        __m128 obj_b = _mm_load_ps(pobj_b);
                        obj_b += malpha * ((g_yy + g_xy_sym) / g2_norm);
                        _mm_store_ps(pobj_b, obj_b);
                }

                {
                        float *pobj_l = p(aux->obj_gradient, x-1, y, w, h);
                        __m128 obj_l = _mm_loadu_ps(pobj_l);
                        obj_l += malpha * ((g_xy_sym + g_xx) / g2_norm);
                        _mm_storeu_ps(pobj_l, obj_l);
                }

                {
                        float *pobj_lb = p(aux->obj_gradient, x-1, y+1, w, h);
                        __m128 obj_lb = _mm_loadu_ps(pobj_lb);
                        obj_lb += malpha * ((-g_xy_sym) / g2_norm);
                        _mm_storeu_ps(pobj_lb, obj_lb);
                }
        }
}
示例#11
0
 // Lane-wise equality of two 4-float packs; each result lane is
 // all-ones (true) or all-zeros (false). Relies on vector4f converting
 // implicitly to __m128 and vector4fb constructing from the __m128 mask.
 inline vector4fb operator==(const vector4f& lhs, const vector4f& rhs)
 {
     return _mm_cmpeq_ps(lhs, rhs);
 }