Example #1
0
static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d xmm1;
  double max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_pd(max);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_min_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
  }
  for (; n > 0; n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
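A minimal driver for the routine above, assuming its definition sits in the same translation unit with <emmintrin.h> included; the buffer contents and the bound of 1.0 are illustrative only:

#include <stdio.h>

int main (void)
{
  double src[8]  = { 0.2, 1.7, -3.0, 0.9, 2.5, 1.0, 4.2, 0.1 };
  double dest[8];
  double max = 1.0;

  /* clamp every element of src to at most 1.0 */
  clamphigh_f64_sse (dest, src, 8, &max);

  for (int i = 0; i < 8; i++)
    printf ("%g ", dest[i]);   /* prints: 0.2 1 -3 0.9 1 1 1 0.1 */
  printf ("\n");
  return 0;
}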
Example #2
0
__m128d test_mm_min_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_min_pd
  // DAG: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_min_pd
  // ASM: minpd
  return _mm_min_pd(A, B);
}
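Outside the FileCheck harness, the lane-wise behaviour that minpd is expected to have can be observed directly. Note the asymmetric NaN rule: _mm_min_pd(a, b) evaluates (a < b) ? a : b per lane, so a NaN in either operand selects the second operand. The program below is a stand-alone sketch, not part of the test suite above:

#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_pd(NAN, 3.0);   /* high lane = NaN, low lane = 3.0 */
  __m128d b = _mm_set_pd(1.0, 2.0);   /* high lane = 1.0, low lane = 2.0 */
  double out[2];

  _mm_storeu_pd(out, _mm_min_pd(a, b));
  printf("%g %g\n", out[0], out[1]);  /* prints "2 1": the NaN lane falls back to b */
  return 0;
}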
Example #3
0
__SIMDd _SIMD_min_pd(__SIMDd a, __SIMDd b)
{
#ifdef  USE_SSE
  return _mm_min_pd(a,b); 
#elif defined USE_AVX
  return _mm256_min_pd(a,b); 
#elif defined USE_IBM
  return vec_min(a,b);
#endif
}
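The wrapper above presupposes a __SIMDd type selected by the same configuration macros. One plausible set of definitions, assumed here purely for illustration (the real header may differ), is:

#ifdef  USE_SSE
  #include <emmintrin.h>
  typedef __m128d       __SIMDd;   /* 2 doubles per vector */
#elif defined USE_AVX
  #include <immintrin.h>
  typedef __m256d       __SIMDd;   /* 4 doubles per vector */
#elif defined USE_IBM
  #include <altivec.h>
  typedef vector double __SIMDd;   /* 2 doubles per vector (VSX) */
#endif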
Example #4
0
BI_FORCE_INLINE inline sse_double min(const sse_double x,
    const sse_double y) {
  sse_double res;
  res.packed = _mm_min_pd(x.packed, y.packed);
  return res;
}
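This relies on an sse_double wrapper that exposes the raw vector through a packed member. A minimal stand-in, again an assumption rather than the project's actual definition, could be:

#include <emmintrin.h>

union sse_double {
  __m128d packed;   /* two packed doubles, as used by min() above */
  double  d[2];     /* scalar view of the same lanes */
};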
Example #5
0
AABB3d TriangleItemHandler::clip(
    const size_t                        item_index,
    const size_t                        dimension,
    const double                        slab_min,
    const double                        slab_max) const
{
    const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index];

    if (vertex_info.m_motion_segment_count > 0)
    {
        AABB3d triangle_bbox = m_triangle_bboxes[item_index];

        if (triangle_bbox.min[dimension] < slab_min)
            triangle_bbox.min[dimension] = slab_min;

        if (triangle_bbox.max[dimension] > slab_max)
            triangle_bbox.max[dimension] = slab_max;

        return triangle_bbox;
    }

#ifdef APPLESEED_USE_SSE

    APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const double v0d = v0[dimension];
    const double v1d = v1[dimension];
    const double v2d = v2[dimension];

    const int v0_ge_min = v0d >= slab_min ? 1 : 0;
    const int v0_le_max = v0d <= slab_max ? 1 : 0;
    const int v1_ge_min = v1d >= slab_min ? 1 : 0;
    const int v1_le_max = v1d <= slab_max ? 1 : 0;
    const int v2_ge_min = v2d >= slab_min ? 1 : 0;
    const int v2_le_max = v2d <= slab_max ? 1 : 0;

    __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max());
    __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max());

    const __m128d v0_xy = _mm_load_pd(&v0.x);
    const __m128d v0_zz = _mm_set1_pd(v0.z);
    const __m128d v1_xy = _mm_load_pd(&v1.x);
    const __m128d v1_zz = _mm_set1_pd(v1.z);
    const __m128d v2_xy = _mm_load_pd(&v2.x);
    const __m128d v2_zz = _mm_set1_pd(v2.z);

    if (v0_ge_min & v0_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz);
    }

    if (v1_ge_min & v1_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz);
    }

    if (v2_ge_min & v2_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz);
    }

    const int v0v1_cross_min = v0_ge_min ^ v1_ge_min;
    const int v0v1_cross_max = v0_le_max ^ v1_le_max;
    const int v1v2_cross_min = v1_ge_min ^ v2_ge_min;
    const int v1v2_cross_max = v1_le_max ^ v2_le_max;
    const int v2v0_cross_min = v2_ge_min ^ v0_ge_min;
    const int v2v0_cross_max = v2_le_max ^ v0_le_max;

    if (v0v1_cross_min | v0v1_cross_max)
    {
        const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]);

        if (v0v1_cross_min)
        {
            const double t = (slab_min - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v0v1_cross_max)
        {
            const double t = (slab_max - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v1v2_cross_min | v1v2_cross_max)
    {
        const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]);

        if (v1v2_cross_min)
        {
            const double t = (slab_min - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v1v2_cross_max)
        {
            const double t = (slab_max - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v2v0_cross_min | v2v0_cross_max)
    {
        const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]);

        if (v2v0_cross_min)
        {
            const double t = (slab_min - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v2v0_cross_max)
        {
            const double t = (slab_max - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    APPLESEED_SIMD4_ALIGN AABB3d bbox;

    _mm_store_pd(&bbox.min.x, bbox_min_xy);
    _mm_store_sd(&bbox.min.z, bbox_min_zz);
    _mm_storeu_pd(&bbox.max.x, bbox_max_xy);
    _mm_store_sd(&bbox.max.z, bbox_max_zz);

    if (bbox.min[dimension] < slab_min)
        bbox.min[dimension] = slab_min;

    if (bbox.max[dimension] > slab_max)
        bbox.max[dimension] = slab_max;

#else

    const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0;
    const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0;
    const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0;
    const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0;
    const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0;
    const int v2_le_max = v2[dimension] <= slab_max ? 1 : 0;

    AABB3d bbox;
    bbox.invalidate();

    if (v0_ge_min & v0_le_max)
        bbox.insert(v0);

    if (v1_ge_min & v1_le_max)
        bbox.insert(v1);

    if (v2_ge_min & v2_le_max)
        bbox.insert(v2);

    if (v0_ge_min != v1_ge_min)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min));

    if (v0_le_max != v1_le_max)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max));

    if (v1_ge_min != v2_ge_min)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min));

    if (v1_le_max != v2_le_max)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max));

    if (v2_ge_min != v0_ge_min)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min));

    if (v2_le_max != v0_le_max)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max));

#endif

    return bbox;
}
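The scalar fallback relies on a segment_plane_intersection() helper that mirrors the interpolation done in the SSE branch: t = (abscissa - a[dimension]) / (b[dimension] - a[dimension]), then p = a * (1 - t) + b * t. A minimal sketch of such a helper, assuming appleseed's Vector3d supports the usual arithmetic operators, might look like:

Vector3d segment_plane_intersection(
    const Vector3d& a,
    const Vector3d& b,
    const size_t    dimension,
    const double    abscissa)
{
    // parameter of the crossing point along the segment [a, b]
    const double t = (abscissa - a[dimension]) / (b[dimension] - a[dimension]);
    assert(t >= 0.0 && t <= 1.0);

    // same lerp as the _mm_mul_pd/_mm_add_pd sequence in the SSE branch
    return a * (1.0 - t) + b * t;
}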
Example #6
0
static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept  { return _mm_min_pd (a, b); }
Example #7
0
 inline F64vec2 min(const F64vec2 &l, const F64vec2 &r)
 {
     return _mm_min_pd(l, r);
 }
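F64vec2 is assumed here to be the dvec.h-style wrapper class that converts implicitly to and from __m128d, which is what lets min() forward straight to _mm_min_pd. A hypothetical call site under that assumption:

void clamp_to_upper(double *out, const F64vec2 &value, const F64vec2 &upper)
{
    F64vec2 clamped = min(value, upper);   // lane-wise minimum via minpd
    _mm_storeu_pd(out, clamped);           // implicit F64vec2 -> __m128d conversion
}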
Example #8
0
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
	int i,k,n,ii,is3,ii3,nj0,nj1,offset;
	int jnrA,jnrB,j3A,j3B;
    int *mdtype;
	double shX,shY,shZ;
    int *jjnr;
    double *shiftvec;
    
	double gpi_ai,gpi2;
	double factor;
	double *gb_radius;
    double *vsolv;
    double *work;
    double *dadx;
    
	__m128d ix,iy,iz;
	__m128d jx,jy,jz;
	__m128d dx,dy,dz;
	__m128d tx,ty,tz;
	__m128d rsq,rinv,rinv2,rinv4,rinv6;
	__m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
	__m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
	__m128d mask,icf4,icf6,mask_cmp;
	    
	const __m128d half   = _mm_set1_pd(0.5);
	const __m128d three  = _mm_set1_pd(3.0);
	const __m128d one    = _mm_set1_pd(1.0);
	const __m128d two    = _mm_set1_pd(2.0);
	const __m128d zero   = _mm_set1_pd(0.0);
	const __m128d four   = _mm_set1_pd(4.0);
	
	const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
	const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
	const __m128d still_p4     = _mm_set1_pd(STILL_P4);
    
	factor  = 0.5 * ONE_4PI_EPS0;
    
    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
	jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;
    
	jnrA = jnrB = 0;
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    
	n = 0;
    
	for(i=0;i<natoms;i++)
	{
		work[i]=0;
	}
    
	for(i=0;i<nl->nri;i++)
	{
        ii     = nl->iinr[i];
		ii3	   = ii*3;
        is3    = 3*nl->shift[i];     
        shX    = shiftvec[is3];  
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];      
        nj1    = nl->jindex[i+1];    
        
        ix     = _mm_set1_pd(shX+x[ii3+0]);
		iy     = _mm_set1_pd(shY+x[ii3+1]);
		iz     = _mm_set1_pd(shZ+x[ii3+2]);
		

		/* Polarization energy for atom ai */
		gpi    = _mm_setzero_pd();
		
        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

		for(k=nj0;k<nj1-1;k+=2)
		{
			jnrA        = jjnr[k];   
			jnrB        = jjnr[k+1];
            
            j3A         = 3*jnrA;  
			j3B         = 3*jnrB;
            
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
            
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
			GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
            
			dx          = _mm_sub_pd(ix,jx);
			dy          = _mm_sub_pd(iy,jy);
			dz          = _mm_sub_pd(iz,jz);
            
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv,rinv);
            rinv4       = _mm_mul_pd(rinv2,rinv2);
            rinv6       = _mm_mul_pd(rinv4,rinv2);
            
            rvdw        = _mm_add_pd(rai,raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
            
            mask_cmp    = _mm_cmple_pd(ratio,still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if( 0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else 
            {
                ratio       = _mm_min_pd(ratio,still_p5inv);
                theta       = _mm_mul_pd(ratio,still_pip5);
                gmx_mm_sincos_pd(theta,&sinq,&cosq);
                term        = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
                ccf         = _mm_mul_pd(term,term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two,term),
                                         _mm_mul_pd(sinq,theta));
            }

            prod        = _mm_mul_pd(still_p4,vaj);
            icf4        = _mm_mul_pd(ccf,rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
                        
            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
            
            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
            
            _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
            dadx+=2;
		} 
        
        if(k<nj1)
		{
			jnrA        = jjnr[k];   
            
            j3A         = 3*jnrA;  
            
            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
            
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
			GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
            
			dx          = _mm_sub_sd(ix,jx);
			dy          = _mm_sub_sd(iy,jy);
			dz          = _mm_sub_sd(iz,jz);
            
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv,rinv);
            rinv4       = _mm_mul_sd(rinv2,rinv2);
            rinv6       = _mm_mul_sd(rinv4,rinv2);
            
            rvdw        = _mm_add_sd(rai,raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
            
            mask_cmp    = _mm_cmple_sd(ratio,still_p5inv);
            
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if( 0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else 
            {
                ratio       = _mm_min_sd(ratio,still_p5inv);
                theta       = _mm_mul_sd(ratio,still_pip5);
                gmx_mm_sincos_pd(theta,&sinq,&cosq);
                term        = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
                ccf         = _mm_mul_sd(term,term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two,term),
                                         _mm_mul_sd(sinq,theta));
            }
            
            prod        = _mm_mul_sd(still_p4,vaj);
            icf4        = _mm_mul_sd(ccf,rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
            
            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
            
            _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
            dadx+=2;
		} 
        gmx_mm_update_1pot_pd(gpi,work+ii);
	}
    
	/* Sum up the polarization energy from other nodes */
	if(PARTDECOMP(cr))
	{
		gmx_sum(natoms, work, cr);
	}
	else if(DOMAINDECOMP(cr))
	{
		dd_atom_sum_real(cr->dd, work);
	}
	
	/* Compute the radii */
	for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
	{		
		if(born->use[i] != 0)
		{
			gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
			gpi2             = gpi_ai * gpi_ai;
			born->bRad[i]   = factor*gmx_invsqrt(gpi2);
			fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
		}
	}
    
	/* Extra (local) communication required for DD */
	if(DOMAINDECOMP(cr))
	{
		dd_atom_spread_real(cr->dd, born->bRad);
		dd_atom_spread_real(cr->dd, fr->invsqrta);
	}
    
	return 0;	
}
Example #9
0
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
//#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__))
#ifdef OGDF_SSE3_EXTENSIONS

	const int n            = C.numberOfNodes();

#ifdef _OPENMP
	const int work = 256;
	const int nThreadsRep  = min(omp_get_max_threads(), 1 + n*n/work);
	const int nThreadsPrev = min(omp_get_max_threads(), 1 + n  /work);
#endif

	const double k       = m_idealEdgeLength;
	const double kSquare = k*k;
	const double c_rep   = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal

	const double minDist       = 10e-6;//100*DBL_EPSILON;
	const double minDistSquare = minDist*minDist;

	double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
	double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

	__m128d mm_kSquare       = _mm_set1_pd(kSquare);
	__m128d mm_minDist       = _mm_set1_pd(minDist);
	__m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
	__m128d mm_c_rep         = _mm_set1_pd(c_rep);

	#pragma omp parallel num_threads(nThreadsRep)
	{
		double tx = m_txNull;
		double ty = m_tyNull;
		int cF = 1;

		for(int i = 1; i <= m_iterations; i++)
		{
			// repulsive forces

			#pragma omp for
			for(int v = 0; v < n; ++v)
			{
				__m128d mm_disp_xv = _mm_setzero_pd();
				__m128d mm_disp_yv = _mm_setzero_pd();

				__m128d mm_xv = _mm_set1_pd(C.m_x[v]);
				__m128d mm_yv = _mm_set1_pd(C.m_y[v]);

				int u;
				for(u = 0; u+1 < v; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				int uStart = u+2;
				if(u == v) ++u;
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				for(u = uStart; u < n; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
				mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

				_mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
				_mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
			}

			// attractive forces

			#pragma omp single
			for(int e = 0; e < C.numberOfEdges(); ++e)
			{
				int v = C.m_src[e];
				int u = C.m_tgt[e];

				double delta_x = C.m_x[v] - C.m_x[u];
				double delta_y = C.m_y[v] - C.m_y[u];

				double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

				disp_x[v] -= delta_x * dist / k;
				disp_y[v] -= delta_y * dist / k;

				disp_x[u] += delta_x * dist / k;
				disp_y[u] += delta_y * dist / k;
			}

			// limit the maximum displacement to the temperature (m_tx,m_ty)

			__m128d mm_tx = _mm_set1_pd(tx);
			__m128d mm_ty = _mm_set1_pd(ty);

			#pragma omp for nowait
			for(int v = 0; v < n-1; v += 2)
			{
				__m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
				__m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

				__m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd(
					_mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv))
				));

				_mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx))
				));
				_mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty))
				));
			}
			#pragma omp single nowait
			{
				if(n % 2) {
					int v = n-1;
					double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));

					C.m_x[v] += disp_x[v] / dist * min(dist,tx);
					C.m_y[v] += disp_y[v] / dist * min(dist,ty);
				}
			}

			cool(tx,ty,cF);

			#pragma omp barrier
		}
	}

	System::alignedMemoryFree(disp_x);
	System::alignedMemoryFree(disp_y);

#else
	mainStep(C);
#endif
}
Example #10
0
__m128d
test (__m128d s1, __m128d s2)
{
  return _mm_min_pd (s1, s2); 
}
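A sketch of how such a kernel could be checked against the scalar definition of minpd; the driver below is illustrative and not taken from the test suite:

#include <emmintrin.h>
#include <assert.h>

int main (void)
{
  double a[2] = { 5.0, -2.0 };
  double b[2] = { 3.0,  4.0 };
  double r[2];

  _mm_storeu_pd (r, test (_mm_loadu_pd (a), _mm_loadu_pd (b)));
  assert (r[0] == 3.0 && r[1] == -2.0);   /* lane-wise minimum */
  return 0;
}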