示例#1
0
__m128d test_mm_div_sd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_div_sd
  // DAG: fdiv double
  //
  // ASM-LABEL: test_mm_div_sd
  // ASM: divsd
  return _mm_div_sd(A, B);
}
示例#2
0
void CalcGravity(int sp, PSpot* allSpot,int* length)
{
	__m128d force1 = _mm_set1_pd(0);
	__m128d force2 = _mm_set1_pd(0);
	
	PSpot* spotSp = &allSpot[sp];

	for(int i=0;i<sp;i++)
	{
		__m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1);
		__m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2);

		__m128d r = Length(diff1, diff2);

		if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass))
		{
			if (allSpot[i].mass > spotSp->mass)
			{
				allSpot[i].heading1 = 
					_mm_add_pd(
						allSpot[i].heading1,
						_mm_mul_pd(
							_mm_sub_pd(spotSp->heading1, allSpot[i].heading1),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].heading2 = 
					_mm_add_sd(
						allSpot[i].heading2,
						_mm_mul_sd(
							_mm_sub_sd(spotSp->heading2, allSpot[i].heading2),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].mass += spotSp->mass;
				allSpot[i].qmass = pow(allSpot[i].mass, 0.33333);
				spotSp->mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[sp];
				allSpot[sp] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
			else
			{
				spotSp->heading1 = 
					_mm_add_pd(
						spotSp->heading1,
						_mm_mul_pd(
							_mm_sub_pd(allSpot[i].heading1, spotSp->heading1),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->heading2 = 
					_mm_add_sd(
						spotSp->heading2,
						_mm_mul_sd(
							_mm_sub_sd(allSpot[i].heading2, spotSp->heading2),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->mass += allSpot[i].mass;
				spotSp->qmass = pow(spotSp->mass, 0.33333);
				allSpot[i].mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[i];
				allSpot[i] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
		}

		//float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]);

		__m128d r1 = r;
		r1.m128d_f64[1] = G;
		__m128d r2 = r;
		r2.m128d_f64[1] = spotSp->mass;
		__m128d r3 = r;
		r3.m128d_f64[1] = allSpot[i].mass;
		__m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3);
		__m128d r5 = _mm_shuffle_pd(r4, r4, 3);
		r4 = _mm_shuffle_pd(r4, r4, 0);
		__m128d r6 = _mm_div_pd(r5, r4);

		force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6));
		force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6));
	}

	for(int i=sp+1;i<*length;i++)
	{
		__m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1);
		__m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2);

		__m128d r = Length(diff1, diff2);

		if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass))
		{
			if (allSpot[i].mass > spotSp->mass)
			{
				allSpot[i].heading1 = 
					_mm_add_pd(
						allSpot[i].heading1,
						_mm_mul_pd(
							_mm_sub_pd(spotSp->heading1, allSpot[i].heading1),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].heading2 = 
					_mm_add_sd(
						allSpot[i].heading2,
						_mm_mul_sd(
							_mm_sub_sd(spotSp->heading2, allSpot[i].heading2),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].mass += spotSp->mass;
				allSpot[i].qmass = pow(allSpot[i].mass, 0.33333);
				spotSp->mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[sp];
				allSpot[sp] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
			else
			{
				spotSp->heading1 = 
					_mm_add_pd(
						spotSp->heading1,
						_mm_mul_pd(
							_mm_sub_pd(allSpot[i].heading1, spotSp->heading1),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->heading2 = 
					_mm_add_sd(
						spotSp->heading2,
						_mm_mul_sd(
							_mm_sub_sd(allSpot[i].heading2, spotSp->heading2),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->mass += allSpot[i].mass;
				spotSp->qmass = pow(spotSp->mass, 0.33333);
				allSpot[i].mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[i];
				allSpot[i] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
		}

		//float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]);

		__m128d r1 = r;
		r1.m128d_f64[1] = G;
		__m128d r2 = r;
		r2.m128d_f64[1] = spotSp->mass;
		__m128d r3 = r;
		r3.m128d_f64[1] = allSpot[i].mass;
		__m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3);
		__m128d r5 = _mm_shuffle_pd(r4, r4, 3);
		r4 = _mm_shuffle_pd(r4, r4, 0);
		__m128d r6 = _mm_div_pd(r5, r4);

		force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6));
		force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6));
	}

	force1 = _mm_div_pd(force1, _mm_set1_pd(spotSp->mass));
	force2 = _mm_div_sd(force2, _mm_set1_pd(spotSp->mass));

	__m128d forcef = Length(force1, force2);

	if (forcef.m128d_f64[0] > 0)
	{
		double gate = 0.001f;
		double step = gate / forcef.m128d_f64[0];

		if (spotSp->process + step < 1)
		{
			spotSp->process += step;
		}
		else
		{
			step = 1 - spotSp->process;
			spotSp->process = 1;
		}

		__m128d stepd = _mm_set1_pd(step);

		spotSp->heading1 = _mm_add_pd(spotSp->heading1,_mm_mul_pd(force1,stepd));
		spotSp->heading2 = _mm_add_sd(spotSp->heading2,_mm_mul_sd(force2,stepd));

		spotSp->pos1 = _mm_add_pd(spotSp->pos1, _mm_mul_pd(spotSp->heading1,stepd));
		spotSp->pos2 = _mm_add_sd(spotSp->pos2, _mm_mul_sd(spotSp->heading2,stepd));
	}
	else
	{
		spotSp->pos1 = _mm_add_pd(spotSp->pos1, spotSp->heading1);
		spotSp->pos2 = _mm_add_sd(spotSp->pos2, spotSp->heading2);
		spotSp->process = 1;
	}
}
示例#3
0
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
//#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__))
#ifdef OGDF_SSE3_EXTENSIONS

	const int n            = C.numberOfNodes();

#ifdef _OPENMP
	const int work = 256;
	const int nThreadsRep  = min(omp_get_max_threads(), 1 + n*n/work);
	const int nThreadsPrev = min(omp_get_max_threads(), 1 + n  /work);
#endif

	const double k       = m_idealEdgeLength;
	const double kSquare = k*k;
	const double c_rep   = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal

	const double minDist       = 10e-6;//100*DBL_EPSILON;
	const double minDistSquare = minDist*minDist;

	double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
	double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

	__m128d mm_kSquare       = _mm_set1_pd(kSquare);
	__m128d mm_minDist       = _mm_set1_pd(minDist);
	__m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
	__m128d mm_c_rep         = _mm_set1_pd(c_rep);

	#pragma omp parallel num_threads(nThreadsRep)
	{
		double tx = m_txNull;
		double ty = m_tyNull;
		int cF = 1;

		for(int i = 1; i <= m_iterations; i++)
		{
			// repulsive forces

			#pragma omp for
			for(int v = 0; v < n; ++v)
			{
				__m128d mm_disp_xv = _mm_setzero_pd();
				__m128d mm_disp_yv = _mm_setzero_pd();

				__m128d mm_xv = _mm_set1_pd(C.m_x[v]);
				__m128d mm_yv = _mm_set1_pd(C.m_y[v]);

				int u;
				for(u = 0; u+1 < v; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				int uStart = u+2;
				if(u == v) ++u;
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				for(u = uStart; u < n; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
				mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

				_mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
				_mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
			}

			// attractive forces

			#pragma omp single
			for(int e = 0; e < C.numberOfEdges(); ++e)
			{
				int v = C.m_src[e];
				int u = C.m_tgt[e];

				double delta_x = C.m_x[v] - C.m_x[u];
				double delta_y = C.m_y[v] - C.m_y[u];

				double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

				disp_x[v] -= delta_x * dist / k;
				disp_y[v] -= delta_y * dist / k;

				disp_x[u] += delta_x * dist / k;
				disp_y[u] += delta_y * dist / k;
			}

			// limit the maximum displacement to the temperature (m_tx,m_ty)

			__m128d mm_tx = _mm_set1_pd(tx);
			__m128d mm_ty = _mm_set1_pd(ty);

			#pragma omp for nowait
			for(int v = 0; v < n-1; v += 2)
			{
				__m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
				__m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

				__m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd(
					_mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv))
				));

				_mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx))
				));
				_mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty))
				));
			}
			#pragma omp single nowait
			{
				if(n % 2) {
					int v = n-1;
					double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));

					C.m_x[v] += disp_x[v] / dist * min(dist,tx);
					C.m_y[v] += disp_y[v] / dist * min(dist,ty);
				}
			}

			cool(tx,ty,cF);

			#pragma omp barrier
		}
	}

	System::alignedMemoryFree(disp_x);
	System::alignedMemoryFree(disp_y);

#else
	mainStep(C);
#endif
}