void *drawman(void *x)
{
    int c = col++;
    unsigned _m = mx, mxx = 16777216 / _m;
    double _x = xx, _y = yy, _w = wh;
    do {
        __m128d cr = _mm_set1_pd(_x + _w * c);
        for (int j = 0; j < 512; j += 2) {
            __m128d zr = cr,
                    zi = _mm_set_pd(_y + _w * j, _y + _w * (j + 1)), ci = zi,
                    zr2 = _mm_mul_pd(zr, zr), zi2 = _mm_mul_pd(zi, zi);
            unsigned mk = mx - 1;
            uint64_t kk[2] __attribute__((aligned(16))) = { mk, mk };
            __m128i k = _mm_load_si128((__m128i *)kk);
            do {
                /* z = z*z + c, with zr2/zi2 cached from the previous pass */
                zi = _mm_mul_pd(zi, zr);
                zi = _mm_add_pd(_mm_add_pd(zi, zi), ci);
                zr = _mm_add_pd(_mm_sub_pd(zr2, zi2), cr);
                zr2 = _mm_mul_pd(zr, zr);
                zi2 = _mm_mul_pd(zi, zi);
                /* lanes with |z|^2 < 4 yield all-ones */
                __m128d n = _mm_cmplt_pd(_mm_add_pd(zr2, zi2), _mm_set1_pd(4));
                if (!_mm_movemask_pd(n))
                    break;                                 /* both lanes escaped   */
                k = _mm_add_epi64(k, _mm_castpd_si128(n)); /* k -= still-live lanes */
            } while (--mk);
            _mm_store_si128((__m128i *)kk, k);
            manor[c][j]     = kk[1] * mxx >> 16;
            manor[c][j + 1] = kk[0] * mxx >> 16;
        }
        done[c >> 6] |= 1ULL << (c & 63);
        c = col++;
    } while (c < 512 && !pull);
    return 0;
}
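/*
 * A minimal, self-contained sketch (not from the sources above) of the
 * counting trick drawman() relies on: _mm_cmplt_pd() returns all-ones
 * (-1 as a 64-bit integer) in each lane that is still below the escape
 * radius, so adding the cast mask to a per-lane counter decrements it
 * once per live lane and leaves finished lanes untouched.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m128i k = _mm_set1_epi64x(10);                /* per-lane budget         */
    __m128d v = _mm_set_pd(5.0, 1.0);               /* high lane 5, low lane 1 */
    __m128d n = _mm_cmplt_pd(v, _mm_set1_pd(4.0));  /* -1 where v < 4          */
    k = _mm_add_epi64(k, _mm_castpd_si128(n));      /* k -= live lanes         */

    uint64_t kk[2];
    _mm_storeu_si128((__m128i *)kk, k);
    printf("%llu %llu\n", (unsigned long long)kk[0],
           (unsigned long long)kk[1]);              /* prints: 9 10 */
    return 0;
}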
int test_mm_movemask_pd(__m128d A) {
  // DAG-LABEL: test_mm_movemask_pd
  // DAG: call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_movemask_pd
  // ASM: movmskpd
  return _mm_movemask_pd(A);
}
#include <emmintrin.h>

/**
 * Processes two doubles at a time.
 */
int _mandelbrot_2(double const *const c_re_arg,
                  double const *const c_im_arg,
                  int max_iter)
{
    __m128d z_re = _mm_load_pd(c_re_arg);   /* requires 16-byte alignment */
    __m128d z_im = _mm_load_pd(c_im_arg);
    __m128d y_re;
    __m128d y_im;
    __m128d c_re = z_re;
    __m128d c_im = z_im;

    __m128i count = _mm_set1_epi64x(0);

    __m128d md;
    __m128d mt;
    __m128i mi  = _mm_set1_epi16(0xffff);
    __m128d two = _mm_set1_pd(2.0);
    __m128i one = _mm_set1_epi64x(1);

    for (int i = 0; i < max_iter; i += 1) {
        // y = z .* z;
        y_re = _mm_mul_pd(z_re, z_re);
        y_im = _mm_mul_pd(z_im, z_im);

        // y = z * z;
        y_re = _mm_sub_pd(y_re, y_im);
        y_im = _mm_mul_pd(z_re, z_im);
        y_im = _mm_add_pd(y_im, y_im);

        // z = z * z + c
        z_re = _mm_add_pd(y_re, c_re);
        z_im = _mm_add_pd(y_im, c_im);

        // if condition
        // md = _mm_add_pd(z_re, z_im);
        // md = _mm_cmplt_pd(md, four);
        md = _mm_cmplt_pd(z_re, two);
        mt = _mm_cmplt_pd(z_im, two);
        md = _mm_and_pd(md, mt);
        mi = _mm_and_si128(mi, _mm_castpd_si128(md));
        // PRINT_M128I(mi);
        if (!_mm_movemask_pd(md)) {
            break;
        }

        // count iterations for lanes that are still alive
        count = _mm_add_epi64(count, _mm_and_si128(mi, one));
    }

    int val;
    /* sum both lanes' counts */
    count = _mm_add_epi64(_mm_srli_si128(count, 8), count);
    val   = (int)_mm_cvtsi128_si64(count);
    return val;
}
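/*
 * A hypothetical call site for _mandelbrot_2() above (a sketch, assuming a
 * GCC-style alignment attribute): _mm_load_pd() requires 16-byte-aligned
 * pointers, so the two points are packed into aligned buffers. The return
 * value is the sum of both lanes' iteration counts; here the inside point
 * runs to max_iter and the escaping one contributes 0.
 */
#include <stdio.h>

int main(void) {
    double re[2] __attribute__((aligned(16))) = { -0.5, 2.0 };
    double im[2] __attribute__((aligned(16))) = {  0.0, 2.0 };
    /* lane 0: a point inside the set; lane 1: escapes immediately */
    printf("%d\n", _mandelbrot_2(re, im, 1000)); /* expected: 1000 */
    return 0;
}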
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.

 The function computes an integer i such that:
 \f[i \ge \texttt{value} > i-1\f]
 @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the result
 is not defined.
 */
CV_INLINE int cvCeil( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
    __m128d t = _mm_set_sd( value );
    int i = _mm_cvtsd_si32(t);
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
    int i = (int)value;
    return i + (i < value);
#else
    int i = cvRound(value);
    float diff = (float)(i - value);
    return i + (diff < 0);
#endif
}
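/*
 * A standalone sketch of the SSE2 branch in cvCeil() above (assuming the
 * default round-to-nearest MXCSR mode): _mm_cvtsd_si32() rounds value to
 * the nearest integer, and the movemask of the compare adds 1 exactly when
 * the rounded result landed below value.
 */
#include <emmintrin.h>
#include <stdio.h>

static int ceil_sse2(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);          /* round to nearest */
    /* +1 iff (double)i < value, i.e. rounding went downward */
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
}

int main(void) {
    printf("%d %d %d\n", ceil_sse2(1.2), ceil_sse2(-1.2), ceil_sse2(3.0));
    /* prints: 2 -1 3 */
    return 0;
}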
KFR_SINTRIN bool bittestall(const f64sse& x) { return !_mm_movemask_pd(*~x); }
KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); }
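/*
 * A plain-intrinsics sketch of what bittestall()/bittestany() above reduce
 * to (the helper names all_pd/any_pd are hypothetical): _mm_movemask_pd()
 * packs the two lane sign bits into bits 0-1 of the result, so "any" is a
 * nonzero mask and "all" is the full mask 0x3.
 */
#include <emmintrin.h>
#include <stdbool.h>
#include <stdio.h>

static bool all_pd(__m128d m) { return _mm_movemask_pd(m) == 0x3; }
static bool any_pd(__m128d m) { return _mm_movemask_pd(m) != 0; }

int main(void) {
    __m128d a = _mm_set_pd(1.0, 5.0);                /* high lane 1, low lane 5 */
    __m128d m = _mm_cmplt_pd(a, _mm_set1_pd(4.0));   /* true in high lane only  */
    printf("all=%d any=%d\n", all_pd(m), any_pd(m)); /* prints: all=0 any=1 */
    return 0;
}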
// The input must be in domain [-1686629712, 1686629712].
//
// I tried to optimize the double to int conversion by using `magic`, but
// it was actually slower than using `_mm_cvttpd_epi32()` and it didn't
// offer a greater domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
  SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
  SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
  SIMD_CONST_SI(int32_one, 1);
  SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
  SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
  SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
  SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

  DEFINE_DATA(sincos_coeff,
    1.58962301576546568060e-10,-2.50507477628578072866e-8,
    2.75573136213857245213e-6 ,-1.98412698295895385996e-4,
    8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0,

   -1.13585365213876817300e-11, 2.08757008419747316778e-9,
   -2.75573141792967388112e-7 , 2.48015872888517045348e-5,
   -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0);

  __m128d y;
  __m128d sign = x;                              // Sign bit.

  x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));      // Take the absolute value.
  y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));      // Integer part of `x * 4 / PI`.

  __m128i ival = _mm_cvttpd_epi32(y);            // Extract the integer part of y.
  __m128i ione = SIMD_GET_PI(int32_one);

  ival = _mm_add_epi32(ival, ione);              // j += 1.
  ival = _mm_andnot_si128(ione, ival);           // j &= ~1.

  y = _mm_cvtepi32_pd(ival);
  ival = _mm_unpacklo_epi32(ival, ival);

  sign = _mm_xor_pd(sign,                        // Swap the sign bit if `j & 4`.
    _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
  sign = _mm_and_pd(sign, SIMD_GET_PD(sign));    // Keep only the sign bit.

  // Get the polynomial selection mask (j & 2):
  //   1. `0x0000000000000000` => `0 <= x <= PI/4`
  //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 < x <= PI/2`
  ival = _mm_slli_epi32(ival, 30);
  ival = _mm_srai_epi32(ival, 31);

  // Extended precision modular arithmetic:
  //   x = ((x - y * DP1) - y * DP2) - y * DP3
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

  // Get the polynomial coefficients for each lane (sin/cos).
  __m128d poly_mask = _mm_castsi128_pd(ival);
  const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
    static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

  __m128d xx = _mm_mul_pd(x, x);
  y = coeff[0];
  y = Simd128::mad(y, xx, coeff[1]);
  y = Simd128::mad(y, xx, coeff[2]);
  y = Simd128::mad(y, xx, coeff[3]);
  y = Simd128::mad(y, xx, coeff[4]);
  y = Simd128::mad(y, xx, coeff[5]);
  y = _mm_mul_pd(y, xx);

  __m128d x_or_xx = _mm_or_pd(
    _mm_and_pd(xx, poly_mask),
    _mm_andnot_pd(poly_mask, x));

  y = _mm_mul_pd(y, x_or_xx);
  y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
  y = _mm_add_pd(y, coeff[7]);

  return _mm_xor_pd(y, sign);
}
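/*
 * A reduced sketch of the selection trick in sin_cephes_pd() (the table
 * below is made up, not Cephes data): _mm_movemask_pd() turns the per-lane
 * polynomial-selection mask into an index 0..3 that picks a table row
 * holding the right coefficient for every lane combination.
 */
#include <emmintrin.h>
#include <stdio.h>

/* rows indexed by the movemask: bit 0 = low lane, bit 1 = high lane */
static const double table[4][2] = {
    { 10.0, 10.0 }, /* 0: both lanes use coefficient A (10)      */
    { 20.0, 10.0 }, /* 1: low lane uses B (20), high lane uses A */
    { 10.0, 20.0 }, /* 2: low lane uses A, high lane uses B      */
    { 20.0, 20.0 }, /* 3: both lanes use B                       */
};

int main(void) {
    __m128d x    = _mm_set_pd(3.0, 1.0);               /* high 3, low 1  */
    __m128d mask = _mm_cmpgt_pd(x, _mm_set1_pd(2.0));  /* high lane only */
    __m128d row  = _mm_loadu_pd(table[_mm_movemask_pd(mask)]); /* row 2  */

    double out[2];
    _mm_storeu_pd(out, row);
    printf("%g %g\n", out[0], out[1]); /* prints: 10 20 */
    return 0;
}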
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, double *x,
                              t_nblist *nl, gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;
    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half        = _mm_set1_pd(0.5);
    const __m128d three       = _mm_set1_pd(3.0);
    const __m128d one         = _mm_set1_pd(1.0);
    const __m128d two         = _mm_set1_pd(2.0);
    const __m128d zero        = _mm_set1_pd(0.0);
    const __m128d four        = _mm_set1_pd(4.0);

    const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5  = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4    = _mm_set1_pd(STILL_P4);

    factor    = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii * 3;
        is3 = 3 * nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3 + 1];
        shZ = shiftvec[is3 + 2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i + 1];

        ix  = _mm_set1_pd(shX + x[ii3 + 0]);
        iy  = _mm_set1_pd(shY + x[ii3 + 1]);
        iz  = _mm_set1_pd(shZ + x[ii3 + 2]);

        /* Polarization energy for atom ai */
        gpi = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius + ii);
        prod_ai = _mm_set1_pd(STILL_P4 * vsolv[ii]);

        for (k = nj0; k < nj1 - 1; k += 2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k + 1];

            j3A  = 3 * jnrA;
            j3B  = 3 * jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x + j3A, x + j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius + jnrA, gb_radius + jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv + jnrA, vsolv + jnrB, vaj);

            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_pd(rinv, rinv);
            rinv4 = _mm_mul_pd(rinv2, rinv2);
            rinv6 = _mm_mul_pd(rinv4, rinv2);

            rvdw  = _mm_add_pd(rai, raj);
            ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd(_mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_pd(ratio, still_p5inv);
                theta = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf   = _mm_mul_pd(term, term);
                dccf  = _mm_mul_pd(_mm_mul_pd(two, term),
                                   _mm_mul_pd(sinq, theta));
            }

            prod = _mm_mul_pd(still_p4, vaj);
            icf4 = _mm_mul_pd(ccf, rinv4);
            icf6 = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work + jnrA, work + jnrB,
                                        _mm_mul_pd(prod_ai, icf4));

            gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        if (k < nj1)
        {
            jnrA = jjnr[k];

            j3A  = 3 * jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x + j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius + jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv + jnrA, vaj);

            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_sd(rinv, rinv);
            rinv4 = _mm_mul_sd(rinv2, rinv2);
            rinv6 = _mm_mul_sd(rinv4, rinv2);

            rvdw  = _mm_add_sd(rai, raj);
            ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd(_mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_sd(ratio, still_p5inv);
                theta = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf   = _mm_mul_sd(term, term);
                dccf  = _mm_mul_sd(_mm_mul_sd(two, term),
                                   _mm_mul_sd(sinq, theta));
            }

            prod = _mm_mul_sd(still_p4, vaj);
            icf4 = _mm_mul_sd(ccf, rinv4);
            icf6 = _mm_mul_sd(_mm_sub_sd(_mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work + jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        gmx_mm_update_1pot_pd(gpi, work + ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai          = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2            = gpi_ai * gpi_ai;
            born->bRad[i]   = factor * gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
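/*
 * A minimal sketch of the branch-avoidance pattern used twice above: when
 * _mm_movemask_pd() reports that no lane passed the cutoff, the expensive
 * call (gmx_mm_sincos_pd() in the original; just a message here) is
 * skipped for the whole pair of lanes at once.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d ratio  = _mm_set_pd(0.9, 0.8);        /* both lanes above cutoff  */
    __m128d cutoff = _mm_set1_pd(0.5);
    __m128d mask   = _mm_cmple_pd(ratio, cutoff); /* ratio <= cutoff per lane */

    if (0 == _mm_movemask_pd(mask)) {
        puts("fast path: no lane needs the expensive function");
    } else {
        puts("slow path: at least one lane does");
    }
    return 0;
}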