/*
 * clamplow_f64_sse:
 *   Writes max(src1[i], *src2_1) to dest[i] for i in [0, n).
 *
 *   dest     destination array of n doubles
 *   src1     source array of n doubles (may be unaligned)
 *   n        number of elements; n <= 0 is a no-op
 *   src2_1   pointer to the lower bound applied to every element
 *
 *   Leading elements are handled one at a time until dest is 16-byte
 *   aligned, the bulk is processed two doubles at a time with SSE2, and a
 *   scalar loop finishes any leftover element.
 *
 *   NaN handling is the same on every path: a NaN input element is passed
 *   through unchanged, exactly as the scalar test `x < min` does.
 */
static void
clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d xmm1;
  double min = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }

  xmm1 = _mm_set1_pd(min);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    /* Operand order matters: MAXPD returns its SECOND operand when either
     * input is NaN (or both are zero).  Putting the data second makes the
     * vector path compute `min > x ? min : x`, which matches the scalar
     * loops bit for bit; the previous order replaced NaN inputs by min. */
    xmm0 = _mm_max_pd(xmm1, xmm0);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
  }

  /* Scalar tail for the last element when the remaining count is odd */
  for (; n > 0; n--) {
    double x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }
}
// Codegen test wrapper around _mm_max_pd: returns the lane-wise maximum of
// the two packed-double arguments. The check directives inside the body are
// consumed by the test tooling and must stay exactly as written.
__m128d test_mm_max_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_max_pd
  // DAG: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_max_pd
  // ASM: maxpd
  const __m128d lanewise_max = _mm_max_pd(A, B);
  return lanewise_max;
}
/*
 * Lane-wise maximum of two packed-double SIMD values, dispatched at compile
 * time to the backend selected by USE_SSE, USE_AVX or USE_IBM (checked in
 * that order).
 *
 * NOTE(review): when none of the three macros is defined the function body
 * is empty and control falls off the end without returning a value; callers
 * presumably always build with exactly one backend selected - confirm.
 */
__SIMDd _SIMD_max_pd(__SIMDd a, __SIMDd b)
{
#if defined(USE_SSE)
    return _mm_max_pd(a, b);       /* 128-bit SSE2 */
#elif defined(USE_AVX)
    return _mm256_max_pd(a, b);    /* 256-bit AVX */
#elif defined(USE_IBM)
    return vec_max(a, b);          /* vec_max backend */
#endif
}
/*
 * Thin wrapper returning the lane-wise maximum of s1 and s2 via _mm_max_pd.
 *
 * The definition previously had no return type, which is ill-formed in C++
 * and in C99 and later; __m128d is the type of the returned expression.
 */
__m128d
test (__m128d s1, __m128d s2)
{
  return _mm_max_pd (s1, s2);
}
/**
 * Lane-wise maximum of two sse_double values.
 *
 * Applies _mm_max_pd to the `packed` members of x and y and returns the
 * result wrapped in a new sse_double.
 */
BI_FORCE_INLINE inline sse_double max(const sse_double x, const sse_double y)
{
    sse_double larger;
    larger.packed = _mm_max_pd(x.packed, y.packed);
    return larger;
}
// Compute a bounding box of the part of triangle `item_index` that lies inside
// the slab [slab_min, slab_max] along axis `dimension`.
//
//   item_index   index into m_triangle_vertex_infos (and m_triangle_bboxes)
//   dimension    axis index used to subscript vertices and bbox corners
//   slab_min     lower slab plane position along `dimension`
//   slab_max     upper slab plane position along `dimension`
//
// Triangles with a non-zero motion segment count are not clipped exactly:
// their stored bounding box is returned, clamped to the slab along `dimension`.
// For the other triangles the box is built from the vertices that are inside
// the slab plus the points where the triangle edges cross the two slab planes.
// Two implementations are provided, selected by APPLESEED_USE_SSE.
AABB3d TriangleItemHandler::clip(
    const size_t item_index,
    const size_t dimension,
    const double slab_min,
    const double slab_max) const
{
    const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index];

    // Moving triangle: only clamp the precomputed bounding box to the slab.
    if (vertex_info.m_motion_segment_count > 0)
    {
        AABB3d triangle_bbox = m_triangle_bboxes[item_index];

        if (triangle_bbox.min[dimension] < slab_min)
            triangle_bbox.min[dimension] = slab_min;

        if (triangle_bbox.max[dimension] > slab_max)
            triangle_bbox.max[dimension] = slab_max;

        return triangle_bbox;
    }

#ifdef APPLESEED_USE_SSE

    // Local copies of the three vertices; the alignment macro is required by
    // the aligned loads (_mm_load_pd) performed on &v.x below.
    APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    // Vertex coordinates along the clipping axis.
    const double v0d = v0[dimension];
    const double v1d = v1[dimension];
    const double v2d = v2[dimension];

    // Per-vertex classification against each slab plane, stored as 0/1 so the
    // flags can be combined with bitwise operators below.
    const int v0_ge_min = v0d >= slab_min ? 1 : 0;
    const int v0_le_max = v0d <= slab_max ? 1 : 0;
    const int v1_ge_min = v1d >= slab_min ? 1 : 0;
    const int v1_le_max = v1d <= slab_max ? 1 : 0;
    const int v2_ge_min = v2d >= slab_min ? 1 : 0;
    const int v2_le_max = v2d <= slab_max ? 1 : 0;

    // The box is accumulated in four registers: (x, y) and (z, z) for each of
    // min and max. Start from an inverted (empty) box.
    __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max());
    __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max());

    // Vertices in the same (x, y) / (z, z) layout.
    const __m128d v0_xy = _mm_load_pd(&v0.x);
    const __m128d v0_zz = _mm_set1_pd(v0.z);
    const __m128d v1_xy = _mm_load_pd(&v1.x);
    const __m128d v1_zz = _mm_set1_pd(v1.z);
    const __m128d v2_xy = _mm_load_pd(&v2.x);
    const __m128d v2_zz = _mm_set1_pd(v2.z);

    // Insert every vertex that lies inside the slab.
    if (v0_ge_min & v0_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz);
    }

    if (v1_ge_min & v1_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz);
    }

    if (v2_ge_min & v2_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz);
    }

    // An edge crosses a slab plane when its two endpoints are classified
    // differently against that plane.
    const int v0v1_cross_min = v0_ge_min ^ v1_ge_min;
    const int v0v1_cross_max = v0_le_max ^ v1_le_max;
    const int v1v2_cross_min = v1_ge_min ^ v2_ge_min;
    const int v1v2_cross_max = v1_le_max ^ v2_le_max;
    const int v2v0_cross_min = v2_ge_min ^ v0_ge_min;
    const int v2v0_cross_max = v2_le_max ^ v0_le_max;

    // Edge v0-v1: insert its intersection(s) with the slab planes. The
    // intersection point is the linear interpolation v0 * (1 - t) + v1 * t.
    if (v0v1_cross_min | v0v1_cross_max)
    {
        // The reciprocal is shared by both plane intersections of this edge.
        const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]);

        if (v0v1_cross_min)
        {
            const double t = (slab_min - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v0v1_cross_max)
        {
            const double t = (slab_max - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Edge v1-v2: same treatment.
    if (v1v2_cross_min | v1v2_cross_max)
    {
        const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]);

        if (v1v2_cross_min)
        {
            const double t = (slab_min - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v1v2_cross_max)
        {
            const double t = (slab_max - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Edge v2-v0: same treatment.
    if (v2v0_cross_min | v2v0_cross_max)
    {
        const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]);

        if (v2v0_cross_min)
        {
            const double t = (slab_min - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v2v0_cross_max)
        {
            const double t = (slab_max - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Write the accumulated registers back into an AABB3d. min.x uses an
    // aligned store while max.x uses an unaligned one (presumably because max
    // follows the three doubles of min and is not 16-byte aligned - confirm
    // against the AABB3d layout). Only the low lane of the zz registers is
    // stored.
    APPLESEED_SIMD4_ALIGN AABB3d bbox;
    _mm_store_pd(&bbox.min.x, bbox_min_xy);
    _mm_store_sd(&bbox.min.z, bbox_min_zz);
    _mm_storeu_pd(&bbox.max.x, bbox_max_xy);
    _mm_store_sd(&bbox.max.z, bbox_max_zz);

    // Clamp the box to the slab along the clipping axis. Note that the
    // non-SSE path below does not perform this final clamp.
    if (bbox.min[dimension] < slab_min)
        bbox.min[dimension] = slab_min;

    if (bbox.max[dimension] > slab_max)
        bbox.max[dimension] = slab_max;

#else

    const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    // Per-vertex classification against each slab plane (0 or 1).
    const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0;
    const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0;
    const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0;
    const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0;
    const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0;
    const int v2_le_max = v2[dimension] <= slab_max ? 1 : 0;

    AABB3d bbox;
    bbox.invalidate();

    // Insert every vertex that lies inside the slab.
    if (v0_ge_min & v0_le_max)
        bbox.insert(v0);

    if (v1_ge_min & v1_le_max)
        bbox.insert(v1);

    if (v2_ge_min & v2_le_max)
        bbox.insert(v2);

    // Insert the intersection of each edge with each slab plane it crosses.
    if (v0_ge_min != v1_ge_min)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min));

    if (v0_le_max != v1_le_max)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max));

    if (v1_ge_min != v2_ge_min)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min));

    if (v1_le_max != v2_le_max)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max));

    if (v2_ge_min != v0_ge_min)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min));

    if (v2_le_max != v0_le_max)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max));

#endif

    return bbox;
}
/* xvm_expma:
 *   Compute the component-wise exponential minus <a>:
 *       r[i] <-- e^x[i] - a        for i in [0, N)
 *
 *   r   destination array; must be 16-byte aligned in the SSE2 build
 *   x   source array; must be 16-byte aligned in the SSE2 build
 *   a   constant subtracted from every exponential
 *   N   number of elements; only r[0..N-1] is written
 *
 *   The following comments apply to the SSE2 version of this code:
 *
 *   Computation is done four doubles at a time by doing computation in
 *   parallel on two vectors of two doubles using SSE2 intrinsics. If N is not
 *   a multiple of 4, the remaining elements are computed using the stdlib
 *   exp(), so nothing is read or written past index N-1.
 *
 *   The computation is done by first doing a range reduction of the argument
 *   of the type e^x = 2^k * e^f choosing k and f so that f is in [-0.5, 0.5].
 *   Then 2^k can be computed exactly using bit operations to build the double
 *   result and e^f can be efficiently computed with enough precision using a
 *   polynomial approximation.
 *
 *   The polynomial approximation is done with an 11th order polynomial
 *   computed by the Remez algorithm with the Sollya suite, instead of the
 *   more classical Pade polynomial form, because it is better suited to
 *   parallel execution. In order to achieve the same precision, a Pade form
 *   seems to require three fewer multiplications but needs a very costly
 *   division, so it would be less efficient.
 *
 *   According to the original author the maximum error is less than 1lsb and
 *   special cases are correctly handled:
 *       +inf or +oor  -->   return +inf
 *       -inf or -oor  -->   return 0.0 (before <a> is subtracted)
 *       qNaN or sNaN  -->   return qNaN
 *
 *   NOTE(review): k is obtained with a truncating conversion after adding
 *   0.5, which rounds toward zero; for negative arguments f may therefore
 *   fall outside [-0.5, 0.5]. Confirm the accuracy claim for negative inputs.
 *
 *   This code is copyright 2004-2012 Thomas Lavergne and licenced under the
 *   BSD licence like the remaining of Wapiti.
 */
void xvm_expma(double r[], const double x[], double a, uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
  #define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v))))
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	const __m128i vl  = _mm_set1_epi64x(0x3ff0000000000000ULL);
	const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL);
	const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL);
	const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL);
	const __m128d hal = xvm_vconst(0x3fe0000000000000ULL);
	const __m128d nan = xvm_vconst(0xfff8000000000000ULL);
	const __m128d inf = xvm_vconst(0x7ff0000000000000ULL);
	const __m128d c1  = xvm_vconst(0x3fe62e4000000000ULL);
	const __m128d c2  = xvm_vconst(0x3eb7f7d1cf79abcaULL);
	const __m128d p0  = xvm_vconst(0x3feffffffffffffeULL);
	const __m128d p1  = xvm_vconst(0x3ff000000000000bULL);
	const __m128d p2  = xvm_vconst(0x3fe0000000000256ULL);
	const __m128d p3  = xvm_vconst(0x3fc5555555553a2aULL);
	const __m128d p4  = xvm_vconst(0x3fa55555554e57d3ULL);
	const __m128d p5  = xvm_vconst(0x3f81111111362f4fULL);
	const __m128d p6  = xvm_vconst(0x3f56c16c25f3bae1ULL);
	const __m128d p7  = xvm_vconst(0x3f2a019fc9310c33ULL);
	const __m128d p8  = xvm_vconst(0x3efa01825f3cb28bULL);
	const __m128d p9  = xvm_vconst(0x3ec71e2bd880fdd8ULL);
	const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL);
	const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL);
	const __m128d va  = _mm_set1_pd(a);
	// Only full groups of four are vectorized; the previous bound (n < N)
	// touched up to three elements past the end when N % 4 != 0.
	const uint64_t Nv = N - (N % 4);
	for (uint64_t n = 0; n < Nv; n += 4) {
		__m128d mn1, mn2, mi1, mi2;
		__m128d t1,  t2,  d1,  d2;
		__m128d v1,  v2,  w1,  w2;
		__m128i k1,  k2;
		__m128d f1,  f2;
		// Load the next four values
		__m128d x1 = _mm_load_pd(x + n    );
		__m128d x2 = _mm_load_pd(x + n + 2);
		// Check for out of ranges, infinites and NaN
		mn1 = _mm_cmpneq_pd(x1, x1);      mn2 = _mm_cmpneq_pd(x2, x2);
		mi1 = _mm_cmpgt_pd(x1, ehi);      mi2 = _mm_cmpgt_pd(x2, ehi);
		x1  = _mm_max_pd(x1, elo);        x2  = _mm_max_pd(x2, elo);
		// Range reduction: we search k and f such that e^x = 2^k * e^f
		// with f in [-0.5, 0.5]
		t1  = _mm_mul_pd(x1, l2e);        t2  = _mm_mul_pd(x2, l2e);
		t1  = _mm_add_pd(t1, hal);        t2  = _mm_add_pd(t2, hal);
		k1  = _mm_cvttpd_epi32(t1);       k2  = _mm_cvttpd_epi32(t2);
		d1  = _mm_cvtepi32_pd(k1);        d2  = _mm_cvtepi32_pd(k2);
		t1  = _mm_mul_pd(d1, c1);         t2  = _mm_mul_pd(d2, c1);
		f1  = _mm_sub_pd(x1, t1);         f2  = _mm_sub_pd(x2, t2);
		t1  = _mm_mul_pd(d1, c2);         t2  = _mm_mul_pd(d2, c2);
		f1  = _mm_sub_pd(f1, t1);         f2  = _mm_sub_pd(f2, t2);
		// Evaluation of e^f using a 11th order polynom in Horner form
		v1  = _mm_mul_pd(f1, p11);        v2  = _mm_mul_pd(f2, p11);
		v1  = _mm_add_pd(v1, p10);        v2  = _mm_add_pd(v2, p10);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p9);         v2  = _mm_add_pd(v2, p9);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p8);         v2  = _mm_add_pd(v2, p8);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p7);         v2  = _mm_add_pd(v2, p7);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p6);         v2  = _mm_add_pd(v2, p6);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p5);         v2  = _mm_add_pd(v2, p5);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p4);         v2  = _mm_add_pd(v2, p4);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p3);         v2  = _mm_add_pd(v2, p3);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p2);         v2  = _mm_add_pd(v2, p2);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p1);         v2  = _mm_add_pd(v2, p1);
		v1  = _mm_mul_pd(v1, f1);         v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p0);         v2  = _mm_add_pd(v2, p0);
		// Evaluation of 2^k using bitops to achieve exact computation:
		// k is shifted into the exponent field of the high 32-bit half of
		// each double and added to the bit pattern of 1.0.
		k1  = _mm_slli_epi32(k1, 20);     k2  = _mm_slli_epi32(k2, 20);
		k1  = _mm_shuffle_epi32(k1, 0x72); k2  = _mm_shuffle_epi32(k2, 0x72);
		k1  = _mm_add_epi32(k1, vl);      k2  = _mm_add_epi32(k2, vl);
		w1  = _mm_castsi128_pd(k1);       w2  = _mm_castsi128_pd(k2);
		// Return to full range to subtract <a>
		v1  = _mm_mul_pd(v1, w1);         v2  = _mm_mul_pd(v2, w2);
		v1  = _mm_sub_pd(v1, va);         v2  = _mm_sub_pd(v2, va);
		// Finally apply infinite and NaN where needed
		v1  = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2));
		v1  = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2));
		// Store the results
		_mm_store_pd(r + n,     v1);
		_mm_store_pd(r + n + 2, v2);
	}
	// Remaining N % 4 elements: plain stdlib exp(), as documented above
	for (uint64_t n = Nv; n < N; n++)
		r[n] = exp(x[n]) - a;
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = exp(x[n]) - a;
#endif
}
/** Returns the lane-wise maximum of a and b, computed with _mm_max_pd. */
static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept
{
    const ParallelType larger = _mm_max_pd (a, b);
    return larger;
}
// Lane-wise maximum of two F64vec2 values: both operands are handed to
// _mm_max_pd and the intrinsic's result is wrapped back into an F64vec2.
inline F64vec2 max(const F64vec2 &l, const F64vec2 &r)
{
    return F64vec2(_mm_max_pd(l, r));
}
// Runs m_iterations rounds of the force-directed layout on C using SSE3
// intrinsics (and OpenMP when available); falls back to mainStep(C) when
// OGDF_SSE3_EXTENSIONS is not defined.
//
// Each iteration
//   1. computes, for every node v, the repulsive displacement caused by all
//      other nodes (two nodes per SIMD step, weighted by C.m_nodeWeight),
//   2. adds the attractive displacement along every edge (single thread),
//   3. moves every node by its displacement, limited to the current
//      temperature (tx, ty), and then cools the temperature.
//
// The packed loads below use the aligned form, so C.m_x, C.m_y and
// C.m_nodeWeight must be 16-byte aligned arrays.
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
#ifdef OGDF_SSE3_EXTENSIONS

	const int n = C.numberOfNodes();

#ifdef _OPENMP
	const int work = 256;
	const int nThreadsRep = min(omp_get_max_threads(), 1 + n*n/work);
#endif

	const double k       = m_idealEdgeLength;
	const double kSquare = k*k;
	// Original note: "0.2 = factor for repulsive forces as suggested by
	// Warshal"; the factor actually applied here is 0.052.
	const double c_rep   = 0.052 * kSquare;

	// Lower bound on distances, avoids division by (almost) zero.
	const double minDist       = 10e-6;//100*DBL_EPSILON;
	const double minDistSquare = minDist*minDist;

	// Per-node displacement accumulated during one iteration.
	double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
	double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

	__m128d mm_minDist       = _mm_set1_pd(minDist);
	__m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
	__m128d mm_c_rep         = _mm_set1_pd(c_rep);

	#pragma omp parallel num_threads(nThreadsRep)
	{
		// Every thread keeps (and cools) its own copy of the temperature.
		double tx = m_txNull;
		double ty = m_tyNull;
		int cF = 1;

		for(int i = 1; i <= m_iterations; i++)
		{
			// repulsive forces

			#pragma omp for
			for(int v = 0; v < n; ++v)
			{
				__m128d mm_disp_xv = _mm_setzero_pd();
				__m128d mm_disp_yv = _mm_setzero_pd();

				__m128d mm_xv = _mm_set1_pd(C.m_x[v]);
				__m128d mm_yv = _mm_set1_pd(C.m_y[v]);

				// Pairs (u, u+1) lying entirely before v.
				int u;
				for(u = 0; u+1 < v; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
				}

				// The pair containing v is handled as a single node: the
				// partner of v (v-1 when v is odd, v+1 when v is even), if any.
				int uStart = u+2;
				if(u == v) ++u;
				if(u < n)
				{
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
				}

				// Full pairs after v. The bound is u+1 < n so that a packed
				// load never touches index n: with the former bound (u < n)
				// and an odd n, the last step read one element past the end
				// of m_x, m_y and m_nodeWeight and the scalar tail below was
				// unreachable.
				for(u = uStart; u+1 < n; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
				}

				// Last node when it has no partner (odd n).
				if(u < n)
				{
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
				}

				// Sum both lanes and scale by the repulsion constant.
				mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
				mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

				_mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
				_mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
			}

			// attractive forces

			#pragma omp single
			for(int e = 0; e < C.numberOfEdges(); ++e)
			{
				int v = C.m_src[e];
				int u = C.m_tgt[e];

				double delta_x = C.m_x[v] - C.m_x[u];
				double delta_y = C.m_y[v] - C.m_y[u];

				double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

				disp_x[v] -= delta_x * dist / k;
				disp_y[v] -= delta_y * dist / k;

				disp_x[u] += delta_x * dist / k;
				disp_y[u] += delta_y * dist / k;
			}

			// limit the maximum displacement to the temperature (m_tx,m_ty)

			__m128d mm_tx = _mm_set1_pd(tx);
			__m128d mm_ty = _mm_set1_pd(ty);

			#pragma omp for nowait
			for(int v = 0; v < n-1; v += 2)
			{
				__m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
				__m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

				__m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd(
					_mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv))
				));

				_mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx))
				));
				_mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty))
				));
			}

			// Scalar update of the last node when n is odd.
			#pragma omp single nowait
			{
				if(n % 2) {
					int v = n-1;
					double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));

					C.m_x[v] += disp_x[v] / dist * min(dist,tx);
					C.m_y[v] += disp_y[v] / dist * min(dist,ty);
				}
			}

			cool(tx,ty,cF);

			// All positions must be updated before the next iteration reads them.
			#pragma omp barrier
		}
	}

	System::alignedMemoryFree(disp_x);
	System::alignedMemoryFree(disp_y);

#else
	mainStep(C);
#endif
}