void *drawman(void *x)
{
    int c = col++;
    unsigned _m = mx, mxx = 16777216 / _m;
    double _x = xx, _y = yy, _w = wh;
    do {
        __m128d cr = _mm_set1_pd(_x + _w * c);
        for (int j = 0; j < 512; j += 2) {
            __m128d zr = cr,
                    zi = _mm_set_pd(_y + _w * j, _y + _w * (j + 1)), ci = zi,
                    zr2 = _mm_mul_pd(zr, zr), zi2 = _mm_mul_pd(zi, zi);
            unsigned mk = mx - 1;
            uint64_t kk[2] __attribute__((aligned(16))) = { mk, mk };
            __m128i k = _mm_load_si128((__m128i *)kk);
            do {
                /* z = z*z + c, with zr2/zi2 cached from the previous pass */
                zi = _mm_mul_pd(zi, zr);
                zi = _mm_add_pd(_mm_add_pd(zi, zi), ci);
                zr = _mm_add_pd(_mm_sub_pd(zr2, zi2), cr);
                zr2 = _mm_mul_pd(zr, zr);
                zi2 = _mm_mul_pd(zi, zi);
                /* lanes with |z|^2 < 4 yield all-ones */
                __m128d n = _mm_cmplt_pd(_mm_add_pd(zr2, zi2), _mm_set1_pd(4));
                if (!_mm_movemask_pd(n))
                    break;                                 /* both lanes escaped   */
                k = _mm_add_epi64(k, _mm_castpd_si128(n)); /* k -= still-live lanes */
            } while (--mk);
            _mm_store_si128((__m128i *)kk, k);
            manor[c][j]     = kk[1] * mxx >> 16;
            manor[c][j + 1] = kk[0] * mxx >> 16;
        }
        done[c >> 6] |= 1ULL << (c & 63);
        c = col++;
    } while (c < 512 && !pull);
    return 0;
}
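/*
 * A minimal, self-contained sketch (not from the sources above) of the
 * counting trick drawman() relies on: _mm_cmplt_pd() returns all-ones
 * (-1 as a 64-bit integer) in each lane that is still below the escape
 * radius, so adding the cast mask to a per-lane counter decrements it
 * once per live lane and leaves finished lanes untouched.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m128i k = _mm_set1_epi64x(10);                /* per-lane budget         */
    __m128d v = _mm_set_pd(5.0, 1.0);               /* high lane 5, low lane 1 */
    __m128d n = _mm_cmplt_pd(v, _mm_set1_pd(4.0));  /* -1 where v < 4          */
    k = _mm_add_epi64(k, _mm_castpd_si128(n));      /* k -= live lanes         */

    uint64_t kk[2];
    _mm_storeu_si128((__m128i *)kk, k);
    printf("%llu %llu\n", (unsigned long long)kk[0],
           (unsigned long long)kk[1]);              /* prints: 9 10 */
    return 0;
}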
int test_mm_movemask_pd(__m128d A) {
  // DAG-LABEL: test_mm_movemask_pd
  // DAG: call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_movemask_pd
  // ASM: movmskpd
  return _mm_movemask_pd(A);
}
#include <emmintrin.h>

/**
 * Processes two doubles at a time.
 */
int _mandelbrot_2(double const *const c_re_arg,
                  double const *const c_im_arg,
                  int max_iter)
{
    __m128d z_re = _mm_load_pd(c_re_arg);   /* requires 16-byte alignment */
    __m128d z_im = _mm_load_pd(c_im_arg);
    __m128d y_re;
    __m128d y_im;
    __m128d c_re = z_re;
    __m128d c_im = z_im;

    __m128i count = _mm_set1_epi64x(0);

    __m128d md;
    __m128d mt;
    __m128i mi  = _mm_set1_epi16(0xffff);
    __m128d two = _mm_set1_pd(2.0);
    __m128i one = _mm_set1_epi64x(1);

    for (int i = 0; i < max_iter; i += 1) {
        // y = z .* z;
        y_re = _mm_mul_pd(z_re, z_re);
        y_im = _mm_mul_pd(z_im, z_im);

        // y = z * z;
        y_re = _mm_sub_pd(y_re, y_im);
        y_im = _mm_mul_pd(z_re, z_im);
        y_im = _mm_add_pd(y_im, y_im);

        // z = z * z + c
        z_re = _mm_add_pd(y_re, c_re);
        z_im = _mm_add_pd(y_im, c_im);

        // if condition
        // md = _mm_add_pd(z_re, z_im);
        // md = _mm_cmplt_pd(md, four);
        md = _mm_cmplt_pd(z_re, two);
        mt = _mm_cmplt_pd(z_im, two);
        md = _mm_and_pd(md, mt);
        mi = _mm_and_si128(mi, _mm_castpd_si128(md));
        // PRINT_M128I(mi);
        if (!_mm_movemask_pd(md)) {
            break;
        }

        // count iterations for lanes that are still alive
        count = _mm_add_epi64(count, _mm_and_si128(mi, one));
    }

    int val;
    /* sum both lanes' counts */
    count = _mm_add_epi64(_mm_srli_si128(count, 8), count);
    val   = (int)_mm_cvtsi128_si64(count);
    return val;
}
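/*
 * A hypothetical call site for _mandelbrot_2() above (a sketch, assuming a
 * GCC-style alignment attribute): _mm_load_pd() requires 16-byte-aligned
 * pointers, so the two points are packed into aligned buffers. The return
 * value is the sum of both lanes' iteration counts; here the inside point
 * runs to max_iter and the escaping one contributes 0.
 */
#include <stdio.h>

int main(void) {
    double re[2] __attribute__((aligned(16))) = { -0.5, 2.0 };
    double im[2] __attribute__((aligned(16))) = {  0.0, 2.0 };
    /* lane 0: a point inside the set; lane 1: escapes immediately */
    printf("%d\n", _mandelbrot_2(re, im, 1000)); /* expected: 1000 */
    return 0;
}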
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.

 The function computes an integer i such that:
 \f[i \ge \texttt{value} > i-1\f]
 @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the result
 is not defined.
 */
CV_INLINE int cvCeil( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
    __m128d t = _mm_set_sd( value );
    int i = _mm_cvtsd_si32(t);
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
    int i = (int)value;
    return i + (i < value);
#else
    int i = cvRound(value);
    float diff = (float)(i - value);
    return i + (diff < 0);
#endif
}
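/*
 * A standalone sketch of the SSE2 branch in cvCeil() above (assuming the
 * default round-to-nearest MXCSR mode): _mm_cvtsd_si32() rounds value to
 * the nearest integer, and the movemask of the compare adds 1 exactly when
 * the rounded result landed below value.
 */
#include <emmintrin.h>
#include <stdio.h>

static int ceil_sse2(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);          /* round to nearest */
    /* +1 iff (double)i < value, i.e. rounding went downward */
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
}

int main(void) {
    printf("%d %d %d\n", ceil_sse2(1.2), ceil_sse2(-1.2), ceil_sse2(3.0));
    /* prints: 2 -1 3 */
    return 0;
}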
KFR_SINTRIN bool bittestall(const f64sse& x) { return !_mm_movemask_pd(*~x); }
KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); }
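/*
 * A plain-intrinsics sketch of what bittestall()/bittestany() above reduce
 * to (the helper names all_pd/any_pd are hypothetical): _mm_movemask_pd()
 * packs the two lane sign bits into bits 0-1 of the result, so "any" is a
 * nonzero mask and "all" is the full mask 0x3.
 */
#include <emmintrin.h>
#include <stdbool.h>
#include <stdio.h>

static bool all_pd(__m128d m) { return _mm_movemask_pd(m) == 0x3; }
static bool any_pd(__m128d m) { return _mm_movemask_pd(m) != 0; }

int main(void) {
    __m128d a = _mm_set_pd(1.0, 5.0);                /* high lane 1, low lane 5 */
    __m128d m = _mm_cmplt_pd(a, _mm_set1_pd(4.0));   /* true in high lane only  */
    printf("all=%d any=%d\n", all_pd(m), any_pd(m)); /* prints: all=0 any=1 */
    return 0;
}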
// The input must be in domain [-1686629712, 1686629712].
//
// I tried to optimize the double to int conversion by using `magic`, but
// it was actually slower than using `_mm_cvttpd_epi32()` and it didn't
// offer a greater domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
  SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
  SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
  SIMD_CONST_SI(int32_one, 1);
  SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
  SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
  SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
  SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

  DEFINE_DATA(sincos_coeff,
    1.58962301576546568060e-10,-2.50507477628578072866e-8,
    2.75573136213857245213e-6 ,-1.98412698295895385996e-4,
    8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0,

   -1.13585365213876817300e-11, 2.08757008419747316778e-9,
   -2.75573141792967388112e-7 , 2.48015872888517045348e-5,
   -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0);

  __m128d y;
  __m128d sign = x;                              // Sign bit.

  x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));      // Take the absolute value.
  y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));      // Integer part of `x * 4 / PI`.

  __m128i ival = _mm_cvttpd_epi32(y);            // Extract the integer part of y.
  __m128i ione = SIMD_GET_PI(int32_one);

  ival = _mm_add_epi32(ival, ione);              // j += 1.
  ival = _mm_andnot_si128(ione, ival);           // j &= ~1.

  y = _mm_cvtepi32_pd(ival);
  ival = _mm_unpacklo_epi32(ival, ival);

  sign = _mm_xor_pd(sign,                        // Swap the sign bit if `j & 4`.
    _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
  sign = _mm_and_pd(sign, SIMD_GET_PD(sign));    // Keep only the sign bit.

  // Get the polynomial selection mask (j & 2):
  //   1. `0x0000000000000000` => `0 <= x <= PI/4`
  //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 < x <= PI/2`
  ival = _mm_slli_epi32(ival, 30);
  ival = _mm_srai_epi32(ival, 31);

  // Extended precision modular arithmetic:
  //   x = ((x - y * DP1) - y * DP2) - y * DP3
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

  // Get the polynomial coefficients for each lane (sin/cos).
  __m128d poly_mask = _mm_castsi128_pd(ival);
  const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
    static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

  __m128d xx = _mm_mul_pd(x, x);
  y = coeff[0];
  y = Simd128::mad(y, xx, coeff[1]);
  y = Simd128::mad(y, xx, coeff[2]);
  y = Simd128::mad(y, xx, coeff[3]);
  y = Simd128::mad(y, xx, coeff[4]);
  y = Simd128::mad(y, xx, coeff[5]);
  y = _mm_mul_pd(y, xx);

  __m128d x_or_xx = _mm_or_pd(
    _mm_and_pd(xx, poly_mask),
    _mm_andnot_pd(poly_mask, x));

  y = _mm_mul_pd(y, x_or_xx);
  y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
  y = _mm_add_pd(y, coeff[7]);

  return _mm_xor_pd(y, sign);
}
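/*
 * A reduced sketch of the selection trick in sin_cephes_pd() (the table
 * below is made up, not Cephes data): _mm_movemask_pd() turns the per-lane
 * polynomial-selection mask into an index 0..3 that picks a table row
 * holding the right coefficient for every lane combination.
 */
#include <emmintrin.h>
#include <stdio.h>

/* rows indexed by the movemask: bit 0 = low lane, bit 1 = high lane */
static const double table[4][2] = {
    { 10.0, 10.0 }, /* 0: both lanes use coefficient A (10)      */
    { 20.0, 10.0 }, /* 1: low lane uses B (20), high lane uses A */
    { 10.0, 20.0 }, /* 2: low lane uses A, high lane uses B      */
    { 20.0, 20.0 }, /* 3: both lanes use B                       */
};

int main(void) {
    __m128d x    = _mm_set_pd(3.0, 1.0);               /* high 3, low 1  */
    __m128d mask = _mm_cmpgt_pd(x, _mm_set1_pd(2.0));  /* high lane only */
    __m128d row  = _mm_loadu_pd(table[_mm_movemask_pd(mask)]); /* row 2  */

    double out[2];
    _mm_storeu_pd(out, row);
    printf("%g %g\n", out[0], out[1]); /* prints: 10 20 */
    return 0;
}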
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, double *x,
                              t_nblist *nl, gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;
    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half        = _mm_set1_pd(0.5);
    const __m128d three       = _mm_set1_pd(3.0);
    const __m128d one         = _mm_set1_pd(1.0);
    const __m128d two         = _mm_set1_pd(2.0);
    const __m128d zero        = _mm_set1_pd(0.0);
    const __m128d four        = _mm_set1_pd(4.0);

    const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5  = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4    = _mm_set1_pd(STILL_P4);

    factor    = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii * 3;
        is3 = 3 * nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3 + 1];
        shZ = shiftvec[is3 + 2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i + 1];

        ix  = _mm_set1_pd(shX + x[ii3 + 0]);
        iy  = _mm_set1_pd(shY + x[ii3 + 1]);
        iz  = _mm_set1_pd(shZ + x[ii3 + 2]);

        /* Polarization energy for atom ai */
        gpi = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius + ii);
        prod_ai = _mm_set1_pd(STILL_P4 * vsolv[ii]);

        for (k = nj0; k < nj1 - 1; k += 2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k + 1];

            j3A  = 3 * jnrA;
            j3B  = 3 * jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x + j3A, x + j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius + jnrA, gb_radius + jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv + jnrA, vsolv + jnrB, vaj);

            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_pd(rinv, rinv);
            rinv4 = _mm_mul_pd(rinv2, rinv2);
            rinv6 = _mm_mul_pd(rinv4, rinv2);

            rvdw  = _mm_add_pd(rai, raj);
            ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd(_mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_pd(ratio, still_p5inv);
                theta = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf   = _mm_mul_pd(term, term);
                dccf  = _mm_mul_pd(_mm_mul_pd(two, term),
                                   _mm_mul_pd(sinq, theta));
            }

            prod = _mm_mul_pd(still_p4, vaj);
            icf4 = _mm_mul_pd(ccf, rinv4);
            icf6 = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work + jnrA, work + jnrB,
                                        _mm_mul_pd(prod_ai, icf4));

            gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        if (k < nj1)
        {
            jnrA = jjnr[k];

            j3A  = 3 * jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x + j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius + jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv + jnrA, vaj);

            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_sd(rinv, rinv);
            rinv4 = _mm_mul_sd(rinv2, rinv2);
            rinv6 = _mm_mul_sd(rinv4, rinv2);

            rvdw  = _mm_add_sd(rai, raj);
            ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd(_mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_sd(ratio, still_p5inv);
                theta = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf   = _mm_mul_sd(term, term);
                dccf  = _mm_mul_sd(_mm_mul_sd(two, term),
                                   _mm_mul_sd(sinq, theta));
            }

            prod = _mm_mul_sd(still_p4, vaj);
            icf4 = _mm_mul_sd(ccf, rinv4);
            icf6 = _mm_mul_sd(_mm_sub_sd(_mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work + jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        gmx_mm_update_1pot_pd(gpi, work + ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai          = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2            = gpi_ai * gpi_ai;
            born->bRad[i]   = factor * gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
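/*
 * A minimal sketch of the branch-avoidance pattern used twice above: when
 * _mm_movemask_pd() reports that no lane passed the cutoff, the expensive
 * call (gmx_mm_sincos_pd() in the original; just a message here) is
 * skipped for the whole pair of lanes at once.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d ratio  = _mm_set_pd(0.9, 0.8);        /* both lanes above cutoff  */
    __m128d cutoff = _mm_set1_pd(0.5);
    __m128d mask   = _mm_cmple_pd(ratio, cutoff); /* ratio <= cutoff per lane */

    if (0 == _mm_movemask_pd(mask)) {
        puts("fast path: no lane needs the expensive function");
    } else {
        puts("slow path: at least one lane does");
    }
    return 0;
}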