Exemplo n.º 1
0
int lis_quad_sqrt(LIS_QUAD *a, const LIS_QUAD *b)
{
	LIS_QUAD_DECLAR;

	#ifndef USE_SSE2
		LIS_QUAD_SQRT(a->hi,a->lo,b->hi,b->lo);
	#else
		LIS_QUAD_SQRT_SSE2(a->hi,a->lo,b->hi,b->lo);
	#endif
	return LIS_SUCCESS;
}
Exemplo n.º 2
0
LIS_INT lis_vector_nrm2ex_mm(LIS_VECTOR vx, LIS_QUAD_PTR *val)
{
	LIS_INT i,n;
	LIS_SCALAR *x,*xl;
	LIS_QUAD_PTR dotm2,dotm,tmpm;
	#ifdef _OPENMP
		LIS_INT is,ie,nprocs,my_rank;
		LIS_SCALAR *gt;
	#endif
	#ifdef USE_MPI
		MPI_Comm comm;
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = vx->n;
	x  = vx->value;
	xl = vx->value_lo;
	dotm2.hi = &vx->work[0];
	dotm2.lo = &vx->work[2];
	dotm.hi = &vx->work[8];
	dotm.lo = &vx->work[9];
	tmpm.hi = &vx->work[10];
	tmpm.lo = &vx->work[11];
	#ifdef USE_MPI
		comm   = vx->comm;
	#endif
	#ifdef _OPENMP
		gt     = lis_vec_tmp;
		nprocs = omp_get_max_threads();
		#ifndef USE_SSE2
			#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#else
			#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
		#endif
		{
			my_rank = omp_get_thread_num();
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			#ifndef USE_FMA2_SSE2
				gt[my_rank*LIS_VEC_TMP_PADD] = gt[my_rank*LIS_VEC_TMP_PADD+1] = 0.0;
				#pragma cdir nodep
				for(i=is;i<ie;i++)
				{
					LIS_QUAD_FSA(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],x[i],xl[i]);
				}
			#else
				gt[my_rank*LIS_VEC_TMP_PADD  ] = gt[my_rank*LIS_VEC_TMP_PADD+1] = 0.0;
				gt[my_rank*LIS_VEC_TMP_PADD+2] = gt[my_rank*LIS_VEC_TMP_PADD+3] = 0.0;
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(i=is;i<ie-1;i+=2)
				{
					LIS_QUAD_FSA2_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],x[i],xl[i]);
				}
				LIS_QUAD_ADD_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD+3]);
				for(;i<ie;i++)
				{
					LIS_QUAD_FSA_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],x[i],xl[i]);
				}
			#endif
		}
		dotm.hi[0] = dotm.lo[0] = 0.0;
		for(i=0;i<nprocs;i++)
		{
			#ifndef USE_SSE2
				LIS_QUAD_ADD(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#else
				LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#endif
		}
	#else
		#ifndef USE_FMA2_SSE2
			dotm.hi[0] = dotm.lo[0] = 0.0;
			#pragma cdir nodep
			for(i=0;i<n;i++)
			{
				LIS_QUAD_FSA(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#else
			dotm2.hi[0] = dotm2.hi[1] = 0.0;
			dotm2.lo[0] = dotm2.lo[1] = 0.0;
			for(i=0;i<n-1;i+=2)
			{
				LIS_QUAD_FSA2_SSE2(dotm2.hi[0],dotm2.lo[0],dotm2.hi[0],dotm2.lo[0],x[i],xl[i]);
			}
			LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm2.hi[0],dotm2.lo[0],dotm2.hi[1],dotm2.lo[1]);
			for(;i<n;i++)
			{
				LIS_QUAD_FSA_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#endif
	#endif
	#ifdef USE_MPI
		MPI_Allreduce(dotm.hi,tmpm.hi,1,LIS_MPI_MSCALAR,LIS_MPI_MSUM,comm);
		#ifndef USE_SSE2
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#else
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#endif
	#else
		#ifndef USE_SSE2
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#else
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#endif
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}