Пример #1
0
__m128d test_mm_min_sd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_min_sd
  // DAG: call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_min_sd
  // ASM: minsd
  return _mm_min_sd(A, B);
}
Пример #2
0
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
            if (sum < 0.0) sum = 0.0;
            if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
test (__m128d s1, __m128d s2)
{
  return _mm_min_sd (s1, s2); 
}
Пример #4
0
int 
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
	int i,k,n,ii,is3,ii3,nj0,nj1,offset;
	int jnrA,jnrB,j3A,j3B;
    int *mdtype;
	double shX,shY,shZ;
    int *jjnr;
    double *shiftvec;
    
	double gpi_ai,gpi2;
	double factor;
	double *gb_radius;
    double *vsolv;
    double *work;
    double *dadx;
    
	__m128d ix,iy,iz;
	__m128d jx,jy,jz;
	__m128d dx,dy,dz;
	__m128d tx,ty,tz;
	__m128d rsq,rinv,rinv2,rinv4,rinv6;
	__m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
	__m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
	__m128d mask,icf4,icf6,mask_cmp;
	    
	const __m128d half   = _mm_set1_pd(0.5);
	const __m128d three  = _mm_set1_pd(3.0);
	const __m128d one    = _mm_set1_pd(1.0);
	const __m128d two    = _mm_set1_pd(2.0);
	const __m128d zero   = _mm_set1_pd(0.0);
	const __m128d four   = _mm_set1_pd(4.0);
	
	const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
	const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
	const __m128d still_p4     = _mm_set1_pd(STILL_P4);
    
	factor  = 0.5 * ONE_4PI_EPS0;
    
    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
	jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;
    
	jnrA = jnrB = 0;
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    
	n = 0;
    
	for(i=0;i<natoms;i++)
	{
		work[i]=0;
	}
    
	for(i=0;i<nl->nri;i++)
	{
        ii     = nl->iinr[i];
		ii3	   = ii*3;
        is3    = 3*nl->shift[i];     
        shX    = shiftvec[is3];  
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];      
        nj1    = nl->jindex[i+1];    
        
        ix     = _mm_set1_pd(shX+x[ii3+0]);
		iy     = _mm_set1_pd(shY+x[ii3+1]);
		iz     = _mm_set1_pd(shZ+x[ii3+2]);
		

		/* Polarization energy for atom ai */
		gpi    = _mm_setzero_pd();
		
        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

		for(k=nj0;k<nj1-1;k+=2)
		{
			jnrA        = jjnr[k];   
			jnrB        = jjnr[k+1];
            
            j3A         = 3*jnrA;  
			j3B         = 3*jnrB;
            
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
            
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
			GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
            
			dx          = _mm_sub_pd(ix,jx);
			dy          = _mm_sub_pd(iy,jy);
			dz          = _mm_sub_pd(iz,jz);
            
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv,rinv);
            rinv4       = _mm_mul_pd(rinv2,rinv2);
            rinv6       = _mm_mul_pd(rinv4,rinv2);
            
            rvdw        = _mm_add_pd(rai,raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
            
            mask_cmp    = _mm_cmple_pd(ratio,still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if( 0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else 
            {
                ratio       = _mm_min_pd(ratio,still_p5inv);
                theta       = _mm_mul_pd(ratio,still_pip5);
                gmx_mm_sincos_pd(theta,&sinq,&cosq);
                term        = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
                ccf         = _mm_mul_pd(term,term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two,term),
                                         _mm_mul_pd(sinq,theta));
            }

            prod        = _mm_mul_pd(still_p4,vaj);
            icf4        = _mm_mul_pd(ccf,rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
                        
            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
            
            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
            
            _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
            dadx+=2;
		} 
        
        if(k<nj1)
		{
			jnrA        = jjnr[k];   
            
            j3A         = 3*jnrA;  
            
            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
            
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
			GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
            
			dx          = _mm_sub_sd(ix,jx);
			dy          = _mm_sub_sd(iy,jy);
			dz          = _mm_sub_sd(iz,jz);
            
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv,rinv);
            rinv4       = _mm_mul_sd(rinv2,rinv2);
            rinv6       = _mm_mul_sd(rinv4,rinv2);
            
            rvdw        = _mm_add_sd(rai,raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
            
            mask_cmp    = _mm_cmple_sd(ratio,still_p5inv);
            
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if( 0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else 
            {
                ratio       = _mm_min_sd(ratio,still_p5inv);
                theta       = _mm_mul_sd(ratio,still_pip5);
                gmx_mm_sincos_pd(theta,&sinq,&cosq);
                term        = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
                ccf         = _mm_mul_sd(term,term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two,term),
                                         _mm_mul_sd(sinq,theta));
            }
            
            prod        = _mm_mul_sd(still_p4,vaj);
            icf4        = _mm_mul_sd(ccf,rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
            
            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
            
            _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
            dadx+=2;
		} 
        gmx_mm_update_1pot_pd(gpi,work+ii);
	}
    
	/* Sum up the polarization energy from other nodes */
	if(PARTDECOMP(cr))
	{
		gmx_sum(natoms, work, cr);
	}
	else if(DOMAINDECOMP(cr))
	{
		dd_atom_sum_real(cr->dd, work);
	}
	
	/* Compute the radii */
	for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
	{		
		if(born->use[i] != 0)
		{
			gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
			gpi2             = gpi_ai * gpi_ai;
			born->bRad[i]   = factor*gmx_invsqrt(gpi2);
			fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
		}
	}
    
	/* Extra (local) communication required for DD */
	if(DOMAINDECOMP(cr))
	{
		dd_atom_spread_real(cr->dd, born->bRad);
		dd_atom_spread_real(cr->dd, fr->invsqrta);
	}
    
	return 0;	
}