__m128d test_mm_div_sd(__m128d A, __m128d B) { // DAG-LABEL: test_mm_div_sd // DAG: fdiv double // // ASM-LABEL: test_mm_div_sd // ASM: divsd return _mm_div_sd(A, B); }
void CalcGravity(int sp, PSpot* allSpot,int* length) { __m128d force1 = _mm_set1_pd(0); __m128d force2 = _mm_set1_pd(0); PSpot* spotSp = &allSpot[sp]; for(int i=0;i<sp;i++) { __m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1); __m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2); __m128d r = Length(diff1, diff2); if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass)) { if (allSpot[i].mass > spotSp->mass) { allSpot[i].heading1 = _mm_add_pd( allSpot[i].heading1, _mm_mul_pd( _mm_sub_pd(spotSp->heading1, allSpot[i].heading1), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].heading2 = _mm_add_sd( allSpot[i].heading2, _mm_mul_sd( _mm_sub_sd(spotSp->heading2, allSpot[i].heading2), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].mass += spotSp->mass; allSpot[i].qmass = pow(allSpot[i].mass, 0.33333); spotSp->mass = 0; (*length)--; PSpot temp; temp = allSpot[sp]; allSpot[sp] = allSpot[*length]; allSpot[*length] = temp; return; } else { spotSp->heading1 = _mm_add_pd( spotSp->heading1, _mm_mul_pd( _mm_sub_pd(allSpot[i].heading1, spotSp->heading1), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->heading2 = _mm_add_sd( spotSp->heading2, _mm_mul_sd( _mm_sub_sd(allSpot[i].heading2, spotSp->heading2), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->mass += allSpot[i].mass; spotSp->qmass = pow(spotSp->mass, 0.33333); allSpot[i].mass = 0; (*length)--; PSpot temp; temp = allSpot[i]; allSpot[i] = allSpot[*length]; allSpot[*length] = temp; return; } } //float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]); __m128d r1 = r; r1.m128d_f64[1] = G; __m128d r2 = r; r2.m128d_f64[1] = spotSp->mass; __m128d r3 = r; r3.m128d_f64[1] = allSpot[i].mass; __m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3); __m128d r5 = _mm_shuffle_pd(r4, r4, 3); r4 = _mm_shuffle_pd(r4, r4, 0); __m128d r6 = _mm_div_pd(r5, r4); force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6)); force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6)); } for(int i=sp+1;i<*length;i++) { __m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1); __m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2); __m128d r = Length(diff1, diff2); if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass)) { if (allSpot[i].mass > spotSp->mass) { allSpot[i].heading1 = _mm_add_pd( allSpot[i].heading1, _mm_mul_pd( _mm_sub_pd(spotSp->heading1, allSpot[i].heading1), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].heading2 = _mm_add_sd( allSpot[i].heading2, _mm_mul_sd( _mm_sub_sd(spotSp->heading2, allSpot[i].heading2), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].mass += spotSp->mass; allSpot[i].qmass = pow(allSpot[i].mass, 0.33333); spotSp->mass = 0; (*length)--; PSpot temp; temp = allSpot[sp]; allSpot[sp] = allSpot[*length]; allSpot[*length] = temp; return; } else { spotSp->heading1 = _mm_add_pd( spotSp->heading1, _mm_mul_pd( _mm_sub_pd(allSpot[i].heading1, spotSp->heading1), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->heading2 = _mm_add_sd( spotSp->heading2, _mm_mul_sd( _mm_sub_sd(allSpot[i].heading2, spotSp->heading2), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->mass += allSpot[i].mass; spotSp->qmass = pow(spotSp->mass, 0.33333); allSpot[i].mass = 0; (*length)--; PSpot temp; temp = allSpot[i]; allSpot[i] = allSpot[*length]; allSpot[*length] = temp; return; } } //float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]); __m128d r1 = r; r1.m128d_f64[1] = G; __m128d r2 = r; r2.m128d_f64[1] = spotSp->mass; __m128d r3 = r; r3.m128d_f64[1] = allSpot[i].mass; __m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3); __m128d r5 = _mm_shuffle_pd(r4, r4, 3); r4 = _mm_shuffle_pd(r4, r4, 0); __m128d r6 = _mm_div_pd(r5, r4); force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6)); force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6)); } force1 = _mm_div_pd(force1, _mm_set1_pd(spotSp->mass)); force2 = _mm_div_sd(force2, _mm_set1_pd(spotSp->mass)); __m128d forcef = Length(force1, force2); if (forcef.m128d_f64[0] > 0) { double gate = 0.001f; double step = gate / forcef.m128d_f64[0]; if (spotSp->process + step < 1) { spotSp->process += step; } else { step = 1 - spotSp->process; spotSp->process = 1; } __m128d stepd = _mm_set1_pd(step); spotSp->heading1 = _mm_add_pd(spotSp->heading1,_mm_mul_pd(force1,stepd)); spotSp->heading2 = _mm_add_sd(spotSp->heading2,_mm_mul_sd(force2,stepd)); spotSp->pos1 = _mm_add_pd(spotSp->pos1, _mm_mul_pd(spotSp->heading1,stepd)); spotSp->pos2 = _mm_add_sd(spotSp->pos2, _mm_mul_sd(spotSp->heading2,stepd)); } else { spotSp->pos1 = _mm_add_pd(spotSp->pos1, spotSp->heading1); spotSp->pos2 = _mm_add_sd(spotSp->pos2, spotSp->heading2); spotSp->process = 1; } }
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C) { //#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__)) #ifdef OGDF_SSE3_EXTENSIONS const int n = C.numberOfNodes(); #ifdef _OPENMP const int work = 256; const int nThreadsRep = min(omp_get_max_threads(), 1 + n*n/work); const int nThreadsPrev = min(omp_get_max_threads(), 1 + n /work); #endif const double k = m_idealEdgeLength; const double kSquare = k*k; const double c_rep = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal const double minDist = 10e-6;//100*DBL_EPSILON; const double minDistSquare = minDist*minDist; double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); __m128d mm_kSquare = _mm_set1_pd(kSquare); __m128d mm_minDist = _mm_set1_pd(minDist); __m128d mm_minDistSquare = _mm_set1_pd(minDistSquare); __m128d mm_c_rep = _mm_set1_pd(c_rep); #pragma omp parallel num_threads(nThreadsRep) { double tx = m_txNull; double ty = m_tyNull; int cF = 1; for(int i = 1; i <= m_iterations; i++) { // repulsive forces #pragma omp for for(int v = 0; v < n; ++v) { __m128d mm_disp_xv = _mm_setzero_pd(); __m128d mm_disp_yv = _mm_setzero_pd(); __m128d mm_xv = _mm_set1_pd(C.m_x[v]); __m128d mm_yv = _mm_set1_pd(C.m_y[v]); int u; for(u = 0; u+1 < v; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } int uStart = u+2; if(u == v) ++u; if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } for(u = uStart; u < n; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv); mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv); _mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep)); _mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep)); } // attractive forces #pragma omp single for(int e = 0; e < C.numberOfEdges(); ++e) { int v = C.m_src[e]; int u = C.m_tgt[e]; double delta_x = C.m_x[v] - C.m_x[u]; double delta_y = C.m_y[v] - C.m_y[u]; double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y)); disp_x[v] -= delta_x * dist / k; disp_y[v] -= delta_y * dist / k; disp_x[u] += delta_x * dist / k; disp_y[u] += delta_y * dist / k; } // limit the maximum displacement to the temperature (m_tx,m_ty) __m128d mm_tx = _mm_set1_pd(tx); __m128d mm_ty = _mm_set1_pd(ty); #pragma omp for nowait for(int v = 0; v < n-1; v += 2) { __m128d mm_disp_xv = _mm_load_pd(&disp_x[v]); __m128d mm_disp_yv = _mm_load_pd(&disp_y[v]); __m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd( _mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv)) )); _mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]), _mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx)) )); _mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]), _mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty)) )); } #pragma omp single nowait { if(n % 2) { int v = n-1; double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v])); C.m_x[v] += disp_x[v] / dist * min(dist,tx); C.m_y[v] += disp_y[v] / dist * min(dist,ty); } } cool(tx,ty,cF); #pragma omp barrier } } System::alignedMemoryFree(disp_x); System::alignedMemoryFree(disp_y); #else mainStep(C); #endif }