// Returns true iff every lane of q1 equals the matching lane of q2 under
// IEEE float comparison (a NaN lane never compares equal).
KW_INLINE b8 kw_equals(kw_quat q1, kw_quat q2)
{
    const __m128i lane_eq = _mm_castps_si128(_mm_cmpeq_ps(q1.simd, q2.simd));
    return _mm_test_all_ones(lane_eq);
}
// Lanewise IEEE equality of two vectors: true only when all four SIMD lanes
// compare equal (NaN lanes compare unequal).
KW_INLINE b8 kw_equals(kw_vec2 v1, kw_vec2 v2)
{
    const __m128 eq = _mm_cmpeq_ps(v1.simd, v2.simd);
    return _mm_test_all_ones(_mm_castps_si128(eq));
}
// Produces a vector with every bit set: a zero vector always compares equal
// to itself under _mm_cmpeq_ps, yielding all-ones in each lane.
static inline __m128 gen_ones(void)
{
    const __m128 z = gen_zero();
    return _mm_cmpeq_ps(z, z);
}
// Chi-squared kernel value k(x, y) = sum_i x_i*y_i / (x_i + y_i) over the
// m_n components, skipping terms whose numerator is zero (which also guards
// the division when x_i + y_i == 0 for non-negative inputs).
double CChiSquaredKernel<float>::Evaluate(float* x, float* y)
{
#ifndef __SSE4_1__
    /* only cast at the end to guarantee that we get the same
     * result as when using SSE4 registers */
    float result, num;
    result = 0;
    for(size_t i=0; i<m_n; i++)
    {
        num = x[i]*y[i];
        if(num>0) // this implies that x+y!=0 if x,y>0
            result += num/(x[i]+y[i]);
    }
    return static_cast<double>(result);
#else
    // SSE4.1 path: process the first m_offset elements four at a time.
    // NOTE(review): dereferencing these casts requires x and y to be 16-byte
    // aligned — confirm with the allocator used by the caller.
    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;
    __m128 sum = _mm_set1_ps(0.0f);
    __m128 mzero = _mm_set1_ps(0.0f);
    for(size_t i=0; i<m_offset/4; i++)
    {
        __m128 num = _mm_mul_ps(px[i],py[i]);
        __m128 denom = _mm_add_ps(px[i],py[i]);
        // NOTE(review): _mm_rcp_ps is only a ~12-bit reciprocal approximation,
        // so this path can differ slightly from the scalar division above.
        __m128 invdenom = _mm_rcp_ps(denom);
        // find nonzeros in numerator
        __m128 nans = _mm_cmpeq_ps(num,mzero);
        // zero out lanes whose numerator is exactly zero (avoids 0 * inf).
        // NOTE(review): unlike the scalar path (num > 0), this keeps lanes
        // with a NEGATIVE numerator — results differ if inputs can be < 0.
        __m128 factors = _mm_blendv_ps(invdenom,mzero,nans);
        // compute product
        __m128 temp = _mm_mul_ps(num,factors);
        // add
        sum = _mm_add_ps(sum,temp);
    }
    // horizontal sum of the four partial lanes
    float result[4] = {0,0,0,0};
    _mm_storeu_ps(result,sum);
    float fresult = result[0] + result[1] + result[2] + result[3];
    // add offset: scalar tail for the remaining m_n - m_offset elements
    float num;
    for(size_t i=m_offset; i<m_n; i++)
    {
        num = x[i]*y[i];
        if(num>0) // this implies that x+y!=0 if x,y>0
            fresult += num/(x[i]+y[i]);
    }
    return static_cast<double>(fresult);
#endif
}
// -------------------------------------------------------------- vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter) // -------------------------------------------------------------- { // version avec test de sortie en float vuint32 iter = _mm_set1_epi32(0); vfloat32 fiter = _mm_set_ps(0,0,0,0); vfloat32 x,y,t,t2,zero,un,deux,quatre; // COMPLETER ICI int test,i = 0; // initialisation des variables x = _mm_set_ps(0,0,0,0); y = _mm_set_ps(0,0,0,0); deux = _mm_set_ps(2,2,2,2); quatre = _mm_set_ps(4,4,4,4); un = _mm_set_ps(1,1,1,1); zero = _mm_set_ps(0,0,0,0); // iteration zero t = _mm_mul_ps(x, x); t2 = _mm_mul_ps(y, y); y = _mm_mul_ps(x,y); y = _mm_mul_ps(y,deux); y = _mm_add_ps(y,b); x = _mm_sub_ps(t,t2); x = _mm_add_ps(x,a); // calcul while(i<max_iter && _mm_movemask_ps(t) != 15) { t = _mm_mul_ps(x, x); t2 = _mm_mul_ps(y, y); y = _mm_mul_ps(_mm_mul_ps(x,y),deux); y = _mm_add_ps(y,b); x = _mm_sub_ps(t,t2); x = _mm_add_ps(x,a); t2 = _mm_add_ps(t,t2); t2 = _mm_cmple_ps(t2,quatre); t = _mm_blendv_ps(zero,un,t2); fiter = _mm_add_ps(fiter,t); t = _mm_cmpeq_ps(t, zero); //display_vfloat32(t,"%f\t","T :: "); //printf(" MASK::%d \n",_mm_movemask_ps(t)); i+=1; } iter = _mm_cvtps_epi32(fiter); return iter; }
// Replaces each pixel with either its local-window minimum or maximum,
// whichever is closer to the original value (a min/max "shock"-style filter).
// NOTE(review): T is a template parameter declared outside this excerpt; it
// must match src's depth (uchar/short/float/double) — confirm at the caller.
// src and dest may alias; r is the window radius (kernel is (2r+1)x(2r+1)).
void blurRemoveMinMax_(Mat& src, Mat& dest, const int r)
{
    const Size ksize = Size(2*r+1,2*r+1);
    if(src.data!=dest.data)src.copyTo(dest);

    // Local maximum (xv, via dilate) and local minimum (nv, via erode).
    Mat xv;
    Mat nv;
    Mat element = Mat::ones(2*r+1,2*r+1,CV_8U);
    dilate(src,xv,element);
    erode(src,nv,element);

    // mind = |src - min|, maxd = |src - max|, mask = the smaller distance.
    Mat mind;
    Mat maxd;
    Mat mask;
    absdiff(src,nv,mind);//can move to loop
    absdiff(src,xv,maxd);//
    min(mind,maxd,mask);//

    T* n = nv.ptr<T>(0);
    T* x = xv.ptr<T>(0);
    T* d = dest.ptr<T>(0);
    T* nd = mind.ptr<T>(0);
    T* mk = mask.ptr<T>(0);
    int remsize = src.size().area();

#if CV_SSE4_1
    // SIMD fast paths per depth: select the min pixel where the min-distance
    // equals the winning distance, otherwise the max pixel. Pointers are
    // advanced in-loop, so the scalar tail below starts where SSE stopped.
    // NOTE(review): _mm_load/_mm_store need 16-byte alignment — this relies
    // on cv::Mat's default allocator; confirm for externally wrapped data.
    if(src.depth()==CV_8U)
    {
        const int ssesize = src.size().area()/16;
        remsize = src.size().area()-ssesize*16;
        for(int i=0;i<ssesize;i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);

            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi8(mnd,mmk); // lanes where min is closer
            _mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
            nd+=16;
            mk+=16;
            d+=16;
            n+=16;
            x+=16;
        }
    }
    else if(src.depth()==CV_16S || src.depth()==CV_16U)
    {
        const int ssesize = src.size().area()/8;
        remsize = src.size().area()-ssesize*8;
        for(int i=0;i<ssesize;i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);

            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi16(mnd,mmk);
            _mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
            nd+=8;
            mk+=8;
            d+=8;
            n+=8;
            x+=8;
        }
    }
    else if(src.depth()==CV_32F)
    {
        const int ssesize = src.size().area()/4;
        remsize = src.size().area()-ssesize*4;
        for(int i=0;i<ssesize;i++)
        {
            __m128 mmk = _mm_load_ps((float*)mk);
            __m128 mnd = _mm_load_ps((float*)nd);

            __m128 mmn = _mm_load_ps((float*)n);
            __m128 mmx = _mm_load_ps((float*)x);
            __m128 msk = _mm_cmpeq_ps(mnd,mmk);
            _mm_store_ps((float*)d,_mm_blendv_ps(mmx,mmn,msk));
            nd+=4;
            mk+=4;
            d+=4;
            n+=4;
            x+=4;
        }
    }
    else if(src.depth()==CV_64F)
    {
        const int ssesize = src.size().area()/2;
        remsize = src.size().area()-ssesize*2;
        for(int i=0;i<ssesize;i++)
        {
            __m128d mmk = _mm_load_pd((double*)mk);
            __m128d mnd = _mm_load_pd((double*)nd);

            __m128d mmn = _mm_load_pd((double*)n);
            __m128d mmx = _mm_load_pd((double*)x);
            __m128d msk = _mm_cmpeq_pd(mnd,mmk);
            _mm_store_pd((double*)d,_mm_blendv_pd(mmx,mmn,msk));
            nd+=2;
            mk+=2;
            d+=2;
            n+=2;
            x+=2;
        }
    }
#endif
    // Scalar tail (or the full image when SSE4.1 is unavailable): pick the
    // min-value pixel when the min distance won the tie, else the max pixel.
    for(int i=0;i<remsize;i++)
    {
        {
            if(nd[i]==mk[i])
            {
                d[i]=n[i];
            }
            else
            {
                d[i]=x[i];
            }
        }
    }
}
// Clang codegen regression test: verifies that _mm_cmpeq_ps lowers to the
// llvm.x86.sse.cmp.ps intrinsic with immediate predicate 0 (equal).
// The CHECK lines below are FileCheck directives — do not edit them.
__m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: @test_mm_cmpeq_ps
  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
  return _mm_cmpeq_ps(__a, __b);
}
// Returns true iff all four 32-bit unsigned lanes of val are exactly zero.
//
// Fix: the previous implementation reinterpreted the integer lanes as floats
// and compared with _mm_cmpeq_ps. That wrongly reported true for the bit
// pattern 0x80000000 (-0.0f compares equal to +0.0f) and could misfire on
// denormal bit patterns when DAZ is enabled. Comparing in the integer domain
// checks the actual bits.
inline static bool is_zero (Iu32vec4 &val)
{
    __m128i v = (__m128i)val;
    __m128i lane_eq = _mm_cmpeq_epi32(v, _mm_setzero_si128());
    // all 16 byte-lanes set <=> every 32-bit lane compared equal to zero
    return _mm_movemask_epi8(lane_eq) == 0xffff;
}
// Accumulates the total-variation (TV) objective and its gradient for the
// 4-pixel row starting at (x, y), across up to 3 channels, using SSE.
// NOTE(review): relies on GCC/Clang vector extensions (+, -, *, / and []
// applied directly to __m128) and on the project helpers p() and SQR();
// aligned loads/stores require 16-byte-aligned rows — confirm at the caller.
static void compute_step_tv_inner_simd(unsigned w, unsigned h, unsigned nchannel,
        struct aux auxs[nchannel],
        unsigned x, unsigned y, double *tv) {
    const __m128 minf = _mm_set_ps1(INFINITY);
    const __m128 mzero = _mm_set_ps1(0.);
    __m128 g_xs[3] = {0};
    __m128 g_ys[3] = {0};
    for(unsigned c = 0; c < nchannel; c++) {
        struct aux *aux = &auxs[c];
        __m128 here = _mm_load_ps(p(aux->fdata, x, y, w, h));
        // forward gradient x
        g_xs[c] = _mm_loadu_ps(p(aux->fdata, x+1, y, w, h)) - here;
        // forward gradient y
        g_ys[c] = _mm_loadu_ps(p(aux->fdata, x, y+1, w, h)) - here;
    }
    // norm: sqrt of the summed squared gradients over all channels
    __m128 g_norm = mzero;
    for(unsigned c = 0; c < nchannel; c++) {
        g_norm += SQR(g_xs[c]);
        g_norm += SQR(g_ys[c]);
    }
    g_norm = _mm_sqrt_ps(g_norm);

    float alpha = 1./sqrtf(nchannel);
    // horizontal add of the four lanes into the scalar objective
    *tv += alpha * g_norm[0];
    *tv += alpha * g_norm[1];
    *tv += alpha * g_norm[2];
    *tv += alpha * g_norm[3];

    __m128 malpha = _mm_set_ps1(alpha);

    // set zeroes to infinity (so the divisions below yield 0 instead of NaN)
    g_norm = _mm_or_ps(g_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g_norm, mzero)));

    // compute derivatives
    for(unsigned c = 0; c < nchannel; c++) {
        __m128 g_x = g_xs[c];
        __m128 g_y = g_ys[c];
        struct aux *aux = &auxs[c];
        // N.B. for numerical stability and same exact result as the c version,
        // we must calculate the objective gradient at x+1 before x
        {
            float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
            __m128 obj_r = _mm_loadu_ps(pobj_r);
            obj_r += malpha * g_x / g_norm;
            _mm_storeu_ps(pobj_r, obj_r);
        }
        {
            float *pobj = p(aux->obj_gradient, x, y, w, h);
            __m128 obj = _mm_load_ps(pobj);
            obj += malpha * -(g_x + g_y) / g_norm;
            _mm_store_ps(pobj, obj);
        }
        {
            float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
            __m128 obj_b = _mm_load_ps(pobj_b);
            obj_b += malpha * g_y / g_norm;
            _mm_store_ps(pobj_b, obj_b);
        }
    }
    // store the forward gradients for the second-order (TV2) pass
    for(unsigned c = 0; c < nchannel; c++) {
        struct aux *aux = &auxs[c];
        _mm_store_ps(p(aux->temp[0], x, y, w, h), g_xs[c]);
        _mm_store_ps(p(aux->temp[1], x, y, w, h), g_ys[c]);
    }
}
// Accumulates the second-order TV objective and its gradient for the 4-pixel
// row at (x, y), using the first-order gradients cached in aux->temp[0] and
// aux->temp[1] by the first-order pass.
// NOTE(review): relies on GCC/Clang vector extensions (arithmetic and []
// directly on __m128) and the project helpers p() / SQR(), as in the
// first-order routine; aligned loads require 16-byte-aligned rows.
static void compute_step_tv2_inner_simd(unsigned w, unsigned h, unsigned nchannel,
        struct aux auxs[nchannel],
        float alpha,
        unsigned x, unsigned y, double *tv2) {
    __m128 g_xxs[3] = {0};
    __m128 g_xy_syms[3] = {0};
    __m128 g_yys[3] = {0};

    const __m128 mtwo = _mm_set_ps1(2.);
    const __m128 minf = _mm_set_ps1(INFINITY);
    const __m128 mzero = _mm_set_ps1(0.);
    __m128 malpha = _mm_set_ps1(alpha * 1./sqrtf(nchannel));

    // Second derivatives via backward differences of the cached gradients.
    for(unsigned c = 0; c < nchannel; c++) {
        struct aux *aux = &auxs[c];
        __m128 g_x = _mm_load_ps(p(aux->temp[0], x, y, w, h));
        __m128 g_y = _mm_load_ps(p(aux->temp[1], x, y, w, h));

        // backward x
        g_xxs[c] = g_x - _mm_loadu_ps(p(aux->temp[0], x-1, y, w, h));
        // backward x
        __m128 g_yx = g_y - _mm_loadu_ps(p(aux->temp[1], x-1, y, w, h));
        // backward y
        __m128 g_xy = g_x - _mm_load_ps(p(aux->temp[0], x, y-1, w, h));
        // backward y
        g_yys[c] = g_y - _mm_load_ps(p(aux->temp[1], x, y-1, w, h));

        // symmetrize the mixed derivative
        g_xy_syms[c] = (g_xy + g_yx) / mtwo;
    }

    // norm of the Hessian-like quantity, summed over channels
    __m128 g2_norm = mzero;
    for(unsigned c = 0; c < nchannel; c++) {
        g2_norm += SQR(g_xxs[c]) + mtwo * SQR(g_xy_syms[c]) + SQR(g_yys[c]);
    }
    g2_norm = _mm_sqrt_ps(g2_norm);

    __m128 alpha_norm = malpha * g2_norm;
    // horizontal add of the four lanes into the scalar objective
    *tv2 += alpha_norm[0];
    *tv2 += alpha_norm[1];
    *tv2 += alpha_norm[2];
    *tv2 += alpha_norm[3];

    // set zeroes to infinity (so the divisions below yield 0 instead of NaN)
    g2_norm = _mm_or_ps(g2_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g2_norm, mzero)));

    for(unsigned c = 0; c < nchannel; c++) {
        __m128 g_xx = g_xxs[c];
        __m128 g_yy = g_yys[c];
        __m128 g_xy_sym = g_xy_syms[c];
        struct aux *aux = &auxs[c];
        // N.B. for same exact result as the c version,
        // we must calculate the objective gradient from right to left
        {
            float *pobj_ur = p(aux->obj_gradient, x+1, y-1, w, h);
            __m128 obj_ur = _mm_loadu_ps(pobj_ur);
            obj_ur += malpha * ((-g_xy_sym) / g2_norm);
            _mm_storeu_ps(pobj_ur, obj_ur);
        }
        {
            float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
            __m128 obj_r = _mm_loadu_ps(pobj_r);
            obj_r += malpha * ((g_xy_sym + g_xx) / g2_norm);
            _mm_storeu_ps(pobj_r, obj_r);
        }
        {
            float *pobj_u = p(aux->obj_gradient, x, y-1, w, h);
            __m128 obj_u = _mm_load_ps(pobj_u);
            obj_u += malpha * ((g_yy + g_xy_sym) / g2_norm);
            _mm_store_ps(pobj_u, obj_u);
        }
        {
            float *pobj = p(aux->obj_gradient, x, y, w, h);
            __m128 obj = _mm_load_ps(pobj);
            obj += malpha * (-(mtwo * g_xx + mtwo * g_xy_sym + mtwo * g_yy) / g2_norm);
            _mm_store_ps(pobj, obj);
        }
        {
            float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
            __m128 obj_b = _mm_load_ps(pobj_b);
            obj_b += malpha * ((g_yy + g_xy_sym) / g2_norm);
            _mm_store_ps(pobj_b, obj_b);
        }
        {
            float *pobj_l = p(aux->obj_gradient, x-1, y, w, h);
            __m128 obj_l = _mm_loadu_ps(pobj_l);
            obj_l += malpha * ((g_xy_sym + g_xx) / g2_norm);
            _mm_storeu_ps(pobj_l, obj_l);
        }
        {
            float *pobj_lb = p(aux->obj_gradient, x-1, y+1, w, h);
            __m128 obj_lb = _mm_loadu_ps(pobj_lb);
            obj_lb += malpha * ((-g_xy_sym) / g2_norm);
            _mm_storeu_ps(pobj_lb, obj_lb);
        }
    }
}
// Lanewise IEEE equality: each result lane is all-ones when the corresponding
// lanes compare equal, all-zeros otherwise (NaN lanes compare unequal).
inline vector4fb operator==(const vector4f& lhs, const vector4f& rhs)
{
    const __m128 eq_mask = _mm_cmpeq_ps(lhs, rhs);
    return eq_mask;
}