/* Clamp each of the n doubles in src1 to at most *src2_1 and write the
 * results to dest.  Uses an SSE2 vector loop for the bulk of the work.
 *
 * dest    - output array (any alignment; aligned internally)
 * src1    - input array (may be unaligned)
 * n       - number of elements
 * src2_1  - pointer to the upper clamp bound
 */
static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d xmm1;
  double max = *src2_1;

  /* Scalar prologue: advance until dest is 16-byte aligned so the vector
     loop may use an aligned store.  Cast through unsigned long long rather
     than long: on LLP64 targets (64-bit Windows) long is only 32 bits and a
     pointer-to-long cast is ill-formed/truncating.  */
  for (; ((unsigned long long)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_pd (max);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd (src1);   /* src1 may still be unaligned */
    xmm0 = _mm_min_pd (xmm0, xmm1);
    _mm_store_pd (dest, xmm0);    /* dest is aligned after the prologue */
    dest += 2;
    src1 += 2;
  }
  /* Scalar epilogue for a trailing odd element.  */
  for (; n > 0; n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
// Compiler regression check: verifies that _mm_min_pd lowers to the
// llvm.x86.sse2.min.pd intrinsic at the IR level and ultimately to the
// MINPD instruction.  The DAG:/ASM: lines below are FileCheck directives
// consumed by the test harness -- do not edit or reflow them.
__m128d test_mm_min_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_min_pd
  // DAG: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_min_pd
  // ASM: minpd
  return _mm_min_pd(A, B);
}
// Backend-dispatched packed-double minimum for the generic __SIMDd type:
// SSE (__m128d), AVX (__m256d), or IBM AltiVec/VSX, selected at compile time.
//
// NOTE(review): if none of USE_SSE / USE_AVX / USE_IBM is defined, every
// branch is preprocessed away and this non-void function has no return
// statement (undefined behavior at the call site).  Consider an #else with
// #error or a scalar fallback; confirm the build system always defines
// exactly one backend macro.
__SIMDd _SIMD_min_pd(__SIMDd a, __SIMDd b)
{
#ifdef USE_SSE
  return _mm_min_pd(a,b);
#elif defined USE_AVX
  return _mm256_min_pd(a,b);
#elif defined USE_IBM
  return vec_min(a,b);
#endif
}
// Element-wise (per-lane) minimum of two packed-double SIMD values.
// Note: operand order is preserved as (x, y) because the underlying
// minpd instruction is not symmetric in the presence of NaNs.
BI_FORCE_INLINE inline sse_double min(const sse_double x, const sse_double y)
{
    sse_double result;
    result.packed = _mm_min_pd(x.packed, y.packed);
    return result;
}
//
// Clip triangle #item_index against the axis-aligned slab
// [slab_min, slab_max] along `dimension` and return the bounding box of
// the clipped geometry.  The clipped bbox is built from (a) the triangle
// vertices that lie inside the slab and (b) the intersection points of
// triangle edges with the two slab planes.
//
AABB3d TriangleItemHandler::clip(
    const size_t            item_index,
    const size_t            dimension,
    const double            slab_min,
    const double            slab_max) const
{
    const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index];

    // Motion-blurred triangles: fall back to clamping the precomputed
    // bounding box of the whole motion path instead of exact clipping.
    if (vertex_info.m_motion_segment_count > 0)
    {
        AABB3d triangle_bbox = m_triangle_bboxes[item_index];
        if (triangle_bbox.min[dimension] < slab_min)
            triangle_bbox.min[dimension] = slab_min;
        if (triangle_bbox.max[dimension] > slab_max)
            triangle_bbox.max[dimension] = slab_max;
        return triangle_bbox;
    }

#ifdef APPLESEED_USE_SSE

    // Triangle vertices, 16-byte aligned so (x, y) pairs can use aligned loads.
    APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const double v0d = v0[dimension];
    const double v1d = v1[dimension];
    const double v2d = v2[dimension];

    // Classify each vertex against the two slab planes (1 = inside that plane).
    const int v0_ge_min = v0d >= slab_min ? 1 : 0;
    const int v0_le_max = v0d <= slab_max ? 1 : 0;
    const int v1_ge_min = v1d >= slab_min ? 1 : 0;
    const int v1_le_max = v1d <= slab_max ? 1 : 0;
    const int v2_ge_min = v2d >= slab_min ? 1 : 0;
    const int v2_le_max = v2d <= slab_max ? 1 : 0;

    // Running bounds, tracked as an (x, y) pair and a duplicated (z, z) pair,
    // initialized to an empty (inverted) interval.
    __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max());
    __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max());

    const __m128d v0_xy = _mm_load_pd(&v0.x);
    const __m128d v0_zz = _mm_set1_pd(v0.z);
    const __m128d v1_xy = _mm_load_pd(&v1.x);
    const __m128d v1_zz = _mm_set1_pd(v1.z);
    const __m128d v2_xy = _mm_load_pd(&v2.x);
    const __m128d v2_zz = _mm_set1_pd(v2.z);

    // Grow the bounds by every vertex that lies inside the slab.
    if (v0_ge_min & v0_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz);
    }

    if (v1_ge_min & v1_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz);
    }

    if (v2_ge_min & v2_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz);
    }

    // An edge crosses a slab plane iff its endpoints classify differently.
    const int v0v1_cross_min = v0_ge_min ^ v1_ge_min;
    const int v0v1_cross_max = v0_le_max ^ v1_le_max;
    const int v1v2_cross_min = v1_ge_min ^ v2_ge_min;
    const int v1v2_cross_max = v1_le_max ^ v2_le_max;
    const int v2v0_cross_min = v2_ge_min ^ v0_ge_min;
    const int v2v0_cross_max = v2_le_max ^ v0_le_max;

    // Edge v0-v1: insert its intersection(s) with the crossed plane(s).
    // t is the parametric position of the intersection along the edge;
    // the point is the lerp (1-t)*start + t*end, evaluated in SSE.
    if (v0v1_cross_min | v0v1_cross_max)
    {
        const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]);

        if (v0v1_cross_min)
        {
            const double t = (slab_min - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v0v1_cross_max)
        {
            const double t = (slab_max - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Edge v1-v2.
    if (v1v2_cross_min | v1v2_cross_max)
    {
        const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]);

        if (v1v2_cross_min)
        {
            const double t = (slab_min - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v1v2_cross_max)
        {
            const double t = (slab_max - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Edge v2-v0.
    if (v2v0_cross_min | v2v0_cross_max)
    {
        const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]);

        if (v2v0_cross_min)
        {
            const double t = (slab_min - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v2v0_cross_max)
        {
            const double t = (slab_max - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Write the accumulated bounds back into an AABB3d laid out as six
    // consecutive doubles (min.xyz then max.xyz).  min.x is 16-byte aligned;
    // max.x sits at offset 24 and is NOT 16-byte aligned, hence the
    // unaligned store for the max (x, y) pair.
    APPLESEED_SIMD4_ALIGN AABB3d bbox;
    _mm_store_pd(&bbox.min.x, bbox_min_xy);
    _mm_store_sd(&bbox.min.z, bbox_min_zz);
    _mm_storeu_pd(&bbox.max.x, bbox_max_xy);
    _mm_store_sd(&bbox.max.z, bbox_max_zz);

    // Guard against floating-point drift in the interpolated points.
    if (bbox.min[dimension] < slab_min)
        bbox.min[dimension] = slab_min;
    if (bbox.max[dimension] > slab_max)
        bbox.max[dimension] = slab_max;

#else

    // Scalar reference path: same algorithm without SIMD.
    const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0;
    const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0;
    const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0;
    const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0;
    const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0;
    const int v2_le_max = v2[dimension] <= slab_max ? 1 : 0;

    AABB3d bbox;
    bbox.invalidate();

    // Vertices inside the slab.
    if (v0_ge_min & v0_le_max)
        bbox.insert(v0);
    if (v1_ge_min & v1_le_max)
        bbox.insert(v1);
    if (v2_ge_min & v2_le_max)
        bbox.insert(v2);

    // Edge/plane intersections.
    if (v0_ge_min != v1_ge_min)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min));
    if (v0_le_max != v1_le_max)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max));
    if (v1_ge_min != v2_ge_min)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min));
    if (v1_le_max != v2_le_max)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max));
    if (v2_ge_min != v0_ge_min)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min));
    if (v2_le_max != v0_le_max)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max));

#endif

    return bbox;
}
/** Returns the element-wise (per-lane) minimum of two packed doubles.
    Lowers to the SSE2 `minpd` instruction; operand order kept as (a, b)
    since minpd is not symmetric for NaN inputs. */
static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept
{
    const ParallelType result = _mm_min_pd (a, b);
    return result;
}
/// Per-element minimum of two F64vec2 vectors (SSE2 minpd).
/// Operand order (l, r) is preserved: minpd is not NaN-symmetric.
inline F64vec2 min(const F64vec2 &l, const F64vec2 &r)
{
    return F64vec2(_mm_min_pd(l, r));
}
/* Compute generalized-Born radii with the Still model using a
 * double-precision SSE2 kernel (two j-atoms per iteration).
 *
 * For each i-atom in the neighbor list the pairwise polarization
 * contribution is accumulated into gpi (for atom i) and into work[]
 * (for the j-atoms); the per-pair derivative factors are streamed into
 * fr->dadx for later use by the force kernel.  Finally the Born radii
 * born->bRad[] and their inverse square roots fr->invsqrta[] are derived
 * from the accumulated polarization energies.
 *
 * Returns 0 on success.
 *
 * NOTE(review): top, atype, offset, mdtype, tx/ty/tz, vai, res, tmp, mask,
 * three and zero are declared/passed but unused in this kernel -- presumably
 * kept for signature/layout parity with sibling kernels; confirm.
 */
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int          i,k,n,ii,is3,ii3,nj0,nj1,offset;
    int          jnrA,jnrB,j3A,j3B;
    int          *mdtype;
    double       shX,shY,shZ;
    int          *jjnr;
    double       *shiftvec;
    double       gpi_ai,gpi2;
    double       factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d      ix,iy,iz;
    __m128d      jx,jy,jz;
    __m128d      dx,dy,dz;
    __m128d      tx,ty,tz;
    __m128d      rsq,rinv,rinv2,rinv4,rinv6;
    __m128d      ratio,gpi,rai,raj,vai,vaj,rvdw;
    __m128d      ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
    __m128d      mask,icf4,icf6,mask_cmp;

    /* Broadcast scalar constants used by the Still CCF formula. */
    const __m128d half        = _mm_set1_pd(0.5);
    const __m128d three       = _mm_set1_pd(3.0);
    const __m128d one         = _mm_set1_pd(1.0);
    const __m128d two         = _mm_set1_pd(2.0);
    const __m128d zero        = _mm_set1_pd(0.0);
    const __m128d four        = _mm_set1_pd(4.0);
    const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5  = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4    = _mm_set1_pd(STILL_P4);

    factor = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();

    n = 0;

    /* Clear the per-atom accumulation buffer. */
    for(i=0;i<natoms;i++)
    {
        work[i]=0;
    }

    /* Loop over neighbor-list entries (i-atoms). */
    for(i=0;i<nl->nri;i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];

        /* Periodic shift for this list entry. */
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Broadcast the (shifted) i-atom coordinates to both SIMD lanes. */
        ix = _mm_set1_pd(shX+x[ii3+0]);
        iy = _mm_set1_pd(shY+x[ii3+1]);
        iz = _mm_set1_pd(shZ+x[ii3+2]);

        /* Polarization energy for atom ai */
        gpi = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

        /* Vector loop: two j-atoms per iteration. */
        for(k=nj0;k<nj1-1;k+=2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];

            j3A = 3*jnrA;
            j3B = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);

            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq   = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_pd(rinv,rinv);
            rinv4 = _mm_mul_pd(rinv2,rinv2);
            rinv6 = _mm_mul_pd(rinv4,rinv2);

            /* ratio = r^2 / (rai+raj)^2, compared against the Still cutoff. */
            rvdw  = _mm_add_pd(rai,raj);
            ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));

            mask_cmp = _mm_cmple_pd(ratio,still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if( 0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                /* Close-contact function ccf = (0.5*(1-cos(theta)))^2 and its derivative factor. */
                ratio = _mm_min_pd(ratio,still_p5inv);
                theta = _mm_mul_pd(ratio,still_pip5);
                gmx_mm_sincos_pd(theta,&sinq,&cosq);
                term  = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
                ccf   = _mm_mul_pd(term,term);
                dccf  = _mm_mul_pd(_mm_mul_pd(two,term), _mm_mul_pd(sinq,theta));
            }

            prod = _mm_mul_pd(still_p4,vaj);
            icf4 = _mm_mul_pd(ccf,rinv4);
            icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);

            /* Scatter the j-atom contributions; accumulate the i-atom side in gpi. */
            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));

            gpi = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );

            /* Stream derivative data: j-side then i-side factors, 2 doubles each. */
            _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
            dadx+=2;
        }

        /* Epilogue: one leftover j-atom handled with scalar (_sd) ops. */
        if(k<nj1)
        {
            jnrA = jjnr[k];
            j3A = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_sd(iy,jy);
            dz = _mm_sub_sd(iz,jz);

            rsq   = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_sd(rinv,rinv);
            rinv4 = _mm_mul_sd(rinv2,rinv2);
            rinv6 = _mm_mul_sd(rinv4,rinv2);

            rvdw  = _mm_add_sd(rai,raj);
            ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));

            mask_cmp = _mm_cmple_sd(ratio,still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if( 0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_sd(ratio,still_p5inv);
                theta = _mm_mul_sd(ratio,still_pip5);
                gmx_mm_sincos_pd(theta,&sinq,&cosq);
                term  = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
                ccf   = _mm_mul_sd(term,term);
                dccf  = _mm_mul_sd(_mm_mul_sd(two,term), _mm_mul_sd(sinq,theta));
            }

            prod = _mm_mul_sd(still_p4,vaj);
            icf4 = _mm_mul_sd(ccf,rinv4);
            icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));

            gpi = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );

            _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
            dadx+=2;
        }

        /* Horizontal-add gpi and accumulate into work[ii]. */
        gmx_mm_update_1pot_pd(gpi,work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
    {
        if(born->use[i] != 0)
        {
            gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
            gpi2   = gpi_ai * gpi_ai;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if(DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
// Fruchterman-Reingold main iteration loop, SSE3 + OpenMP variant.
// Each iteration computes repulsive forces (vectorized, two nodes per step),
// then attractive forces along edges (scalar, single thread), then applies
// the displacements limited by the current temperature (tx, ty).
// Falls back to the scalar mainStep() when SSE3 support is not compiled in.
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
//#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__))
#ifdef OGDF_SSE3_EXTENSIONS
    const int n = C.numberOfNodes();

#ifdef _OPENMP
    // Scale the thread counts with the amount of work (n^2 pair interactions
    // for repulsion, n for the rest).
    const int work = 256;
    const int nThreadsRep  = min(omp_get_max_threads(), 1 + n*n/work);
    const int nThreadsPrev = min(omp_get_max_threads(), 1 + n /work);
#endif

    const double k = m_idealEdgeLength;
    const double kSquare = k*k;
    const double c_rep = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal

    const double minDist = 10e-6;//100*DBL_EPSILON;
    const double minDistSquare = minDist*minDist;

    // 16-byte-aligned displacement buffers so _mm_load_pd/_mm_store_pd work.
    double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
    double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

    __m128d mm_kSquare       = _mm_set1_pd(kSquare);
    __m128d mm_minDist       = _mm_set1_pd(minDist);
    __m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
    __m128d mm_c_rep         = _mm_set1_pd(c_rep);

#pragma omp parallel num_threads(nThreadsRep)
    {
        // Per-thread copies of the cooling temperature and cooling counter.
        double tx = m_txNull;
        double ty = m_tyNull;
        int cF = 1;

        for(int i = 1; i <= m_iterations; i++)
        {
            // repulsive forces
#pragma omp for
            for(int v = 0; v < n; ++v)
            {
                __m128d mm_disp_xv = _mm_setzero_pd();
                __m128d mm_disp_yv = _mm_setzero_pd();

                __m128d mm_xv = _mm_set1_pd(C.m_x[v]);
                __m128d mm_yv = _mm_set1_pd(C.m_y[v]);

                // The u-loops below cover all nodes u != v: full pairs below v,
                // a scalar fix-up around v (skipping v itself), then full pairs
                // above v, then a scalar tail for odd n.
                int u;
                for(u = 0; u+1 < v; u += 2)
                {
                    __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
                    __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));
                    // Clamp the squared distance to avoid division blow-up for
                    // (near-)coincident nodes.
                    __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) );
                    __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
                    mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
                    mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
                    //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
                    //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
                }
                int uStart = u+2;
                if(u == v) ++u;     // never compute a node's repulsion against itself
                if(u < n)
                {
                    __m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
                    __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));
                    __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) );
                    __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
                    mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
                    mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
                    //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
                    //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
                }
                for(u = uStart; u < n; u += 2)
                {
                    __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
                    __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));
                    __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) );
                    __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
                    mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
                    mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
                    //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
                    //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
                }
                if(u < n)
                {
                    __m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
                    __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));
                    __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) );
                    __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
                    mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
                    mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
                    //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
                    //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
                }

                // Horizontal add collapses the two lanes into the final sum.
                mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
                mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

                _mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
                _mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
            }

            // attractive forces
#pragma omp single
            for(int e = 0; e < C.numberOfEdges(); ++e)
            {
                int v = C.m_src[e];
                int u = C.m_tgt[e];

                double delta_x = C.m_x[v] - C.m_x[u];
                double delta_y = C.m_y[v] - C.m_y[u];

                double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

                disp_x[v] -= delta_x * dist / k;
                disp_y[v] -= delta_y * dist / k;

                disp_x[u] += delta_x * dist / k;
                disp_y[u] += delta_y * dist / k;
            }

            // limit the maximum displacement to the temperature (m_tx,m_ty)
            __m128d mm_tx = _mm_set1_pd(tx);
            __m128d mm_ty = _mm_set1_pd(ty);

#pragma omp for nowait
            for(int v = 0; v < n-1; v += 2)
            {
                __m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
                __m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

                __m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd( _mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv)) ));

                // Move each node along its displacement direction by at most
                // the temperature: x += disp/|disp| * min(|disp|, t).
                _mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]), _mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx)) ));
                _mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]), _mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty)) ));
            }
#pragma omp single nowait
            {
                // Scalar tail: the last node when n is odd.
                if(n % 2)
                {
                    int v = n-1;
                    double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));
                    C.m_x[v] += disp_x[v] / dist * min(dist,tx);
                    C.m_y[v] += disp_y[v] / dist * min(dist,ty);
                }
            }

            // Cool down the (thread-local) temperature for the next iteration.
            cool(tx,ty,cF);

#pragma omp barrier
        }
    }

    System::alignedMemoryFree(disp_x);
    System::alignedMemoryFree(disp_y);
#else
    mainStep(C);
#endif
}
/* Return the element-wise minimum of two packed-double vectors.
 *
 * Fix: the original declaration had no return type (implicit int), which is
 * invalid in C++ and in C99+, and wrong regardless -- the function returns
 * the __m128d produced by _mm_min_pd.
 */
__m128d
test (__m128d s1, __m128d s2)
{
  return _mm_min_pd (s1, s2);
}