/*
 * lis_quad_sub - double-double subtraction, a = b - c.
 *
 * Implemented as an addition of b and the negated components of c, using
 * the SSE2 flavour of the add macro when USE_SSE2 is defined.
 *
 * a : destination (hi/lo pair is overwritten)
 * b : minuend
 * c : subtrahend
 */
void lis_quad_sub(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c)
{
	LIS_QUAD_DECLAR;

	#ifdef USE_SSE2
		LIS_QUAD_ADD_SSE2(a->hi,a->lo,b->hi,b->lo,-c->hi,-c->lo);
	#else
		LIS_QUAD_ADD(a->hi,a->lo,b->hi,b->lo,-c->hi,-c->lo);
	#endif
}
/* Example #2 */
/*
 * lis_reduce_mp - exchange hi/lo vector parts with neighbours and accumulate.
 *
 * For every neighbour the entries of X selected by import_index (offset by
 * commtable->pad) are packed into commtable->wr as a block of hi words
 * followed by a block of lo words and posted with MPI_Isend.  The matching
 * data from each neighbour is received into commtable->ws with the same
 * layout and, once all receives have completed, added in double-double
 * arithmetic to the entries of X selected by export_index.
 *
 * commtable : neighbour list, index tables, buffers and request arrays
 * X         : vector whose value / value_lo arrays are read and updated
 *
 * Always returns LIS_SUCCESS; MPI return codes are not inspected.
 */
LIS_INT lis_reduce_mp(LIS_COMMTABLE commtable, LIS_VECTOR X)
{
	LIS_INT nb,k,first,count,slot;
	LIS_INT nneib,pad;
	LIS_SCALAR *xhi,*xlo;
	LIS_SCALAR *inbuf,*outbuf;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	xhi    = X->value;
	xlo    = X->value_lo;
	inbuf  = commtable->ws;
	outbuf = commtable->wr;
	pad    = commtable->pad;
	nneib  = commtable->neibpetot;

	/* Pack and post one send per neighbour: [hi block | lo block]. */
	for(nb=0;nb<nneib;nb++)
	{
		first = commtable->import_ptr[nb];
		count = commtable->import_ptr[nb+1] - first;
		for(k=0;k<count;k++)
		{
			slot = commtable->import_index[first+k] + pad;
			outbuf[2*first+k]       = xhi[slot];
			outbuf[2*first+count+k] = xlo[slot];
		}
		MPI_Isend(&outbuf[2*first],2*count,MPI_DOUBLE,commtable->neibpe[nb],0,commtable->comm,&commtable->req1[nb]);
	}

	/* Post the matching receives. */
	for(nb=0;nb<nneib;nb++)
	{
		first = commtable->export_ptr[nb];
		count = commtable->export_ptr[nb+1] - first;
		MPI_Irecv(&inbuf[2*first],2*count,MPI_DOUBLE,commtable->neibpe[nb],0,commtable->comm,&commtable->req2[nb]);
	}
	MPI_Waitall(nneib, commtable->req2, commtable->sta2);

	/* Accumulate the received hi/lo pairs into X. */
	for(nb=0;nb<nneib;nb++)
	{
		first = commtable->export_ptr[nb];
		count = commtable->export_ptr[nb+1] - first;
		for(k=0;k<count;k++)
		{
			slot = commtable->export_index[first+k];
			#ifdef USE_SSE2
				LIS_QUAD_ADD_SSE2(xhi[slot],xlo[slot],xhi[slot],xlo[slot],inbuf[2*first+k],inbuf[2*first+count+k]);
			#else
				LIS_QUAD_ADD(xhi[slot],xlo[slot],xhi[slot],xlo[slot],inbuf[2*first+k],inbuf[2*first+count+k]);
			#endif
		}
	}

	/* The send buffer must stay untouched until the sends have completed. */
	MPI_Waitall(nneib, commtable->req1, commtable->sta1);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/* Example #3 */
/*
 * lis_mpi_msum - element-wise double-double sum, inoutvec[i] += invec[i].
 *
 * The signature matches that of an MPI user-defined reduction function.
 *
 * invec    : incoming contributions
 * inoutvec : accumulator, updated in place
 * len      : pointer to the number of elements to combine
 * datatype : not used
 */
void lis_mpi_msum(LIS_QUAD *invec, LIS_QUAD *inoutvec, LIS_INT *len, MPI_Datatype *datatype)
{
	LIS_INT	k;
	LIS_QUAD *src,*acc;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	src = invec;
	acc = inoutvec;
	for(k=0;k<*len;k++,src++,acc++)
	{
		#ifdef USE_SSE2
			LIS_QUAD_ADD_SSE2(acc->hi,acc->lo,acc->hi,acc->lo,src->hi,src->lo);
		#else
			LIS_QUAD_ADD(acc->hi,acc->lo,acc->hi,acc->lo,src->hi,src->lo);
		#endif
	}

	LIS_DEBUG_FUNC_OUT;
}
/*
 * lis_matvec_ilu - scatter-style product of the ILU storage with a vector.
 *
 * Y[0..A->np) is cleared, then for every row i < LU->n and every stored
 * entry j < LU->nnz[i] the term LU->value[i][j] * X[i] is added to
 * Y[LU->index[i][j]].
 *
 * When USE_QUAD_PRECISION is defined and X->precision is not
 * LIS_PRECISION_DEFAULT, the same accumulation is carried out on the
 * (value, value_lo) pairs of X and Y through the LIS_QUAD_* macros.
 *
 * OpenMP builds: because different rows may scatter into the same Y entry,
 * each thread accumulates into its own np-long slice of a scratch buffer
 * (slice k starts at offset k*np); the slices are summed into Y afterwards.
 *
 * A  : supplies np (and the communication table in MPI builds)
 * LU : supplies n, nnz[], index[][] and value[][]
 * X  : input vector
 * Y  : output vector (overwritten)
 *
 * Always returns LIS_SUCCESS.
 *
 * NOTE(review): the results of lis_malloc() are used without a NULL check.
 * NOTE(review): x and y are not referenced directly in this body; they are
 * kept because LIS_MATVEC_SENDRECV may expand to code that uses them -
 * confirm before removing.
 */
LIS_INT lis_matvec_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n,np;
	LIS_SCALAR *x,*y;
	#ifdef _OPENMP
		LIS_INT nprocs,k;
		LIS_SCALAR t,*w;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_INT j0,j1;
		#ifdef _OPENMP
				LIS_SCALAR *ww,*wwl;
		#endif
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	np = A->np;
	n  = LU->n;
	x  = X->value;
	y  = Y->value;

	#ifdef USE_QUAD_PRECISION
	if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		/* ---- working-precision path ---- */
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs::w" );
			#pragma omp parallel private(i,j,k,jj,t)
			{
				k = omp_get_thread_num();
				/* Clear every per-thread slice. */
				#pragma omp for
				for(j=0;j<nprocs;j++)
				{
					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
				}
				/* Scatter into this thread's slice only. */
				#pragma omp for 
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj = k*np + LU->index[i][j];
						w[jj] += LU->value[i][j] * X->value[i];
					}
				}
				/* Reduce the slices into Y. */
				#pragma omp for 
				for(i=0;i<np;i++)
				{
					t = 0.0;
					for(j=0;j<nprocs;j++)
					{
						t += w[j*np+i];
					}
					Y->value[i] = t;
				}
			}
			lis_free(w);
		#else
			for(i=0;i<np;i++)
			{
				Y->value[i] = 0.0;
			}
			for(i=0;i<n;i++)
			{
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					Y->value[jj] += LU->value[i][j] * X->value[i];
				}
			}
		#endif
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		/* ---- double-double path (hi in value, lo in value_lo) ---- */
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifdef _OPENMP
			#ifndef USE_FMA2_SSE2
				nprocs = omp_get_max_threads();
				/* One allocation: hi slices first, lo slices after them. */
				ww  = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs_mp::ww" );
				wwl = &ww[nprocs*np];
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for 
					for(i=0;i<n;i++)
					{
						for(j=0;j<LU->nnz[i];j++)
						{
							jj  = k*np + LU->index[i][j];
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#else
								LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for 
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifndef USE_SSE2
								LIS_QUAD_ADD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#else
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#else
				nprocs = omp_get_max_threads();
				ww  = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvect_crs_mp2::ww" );
				wwl = &ww[nprocs*np];
				#pragma omp parallel private(i,j,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0; i<n; i++)
					{
						/* Entries are processed two at a time ... */
						for(j=0;j<LU->nnz[i]-1;j+=2)
						{
							j0  = k*np + LU->index[i][j];
							j1  = k*np + LU->index[i][j+1];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
						/*
						 * ... followed by the odd trailing entry, if any.  It must
						 * land in this thread's slice (k*np offset) exactly like the
						 * paired entries; without the offset every thread would
						 * update slice 0 concurrently and lose contributions.
						 */
						for(;j<LU->nnz[i];j++)
						{
							j0  = k*np + LU->index[i][j];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for 
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifdef USE_SSE2
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#endif
		#else
			#ifndef USE_FMA2_SSE2
				for(i=0;i<np;i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj  = LU->index[i][j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#else
							LIS_QUAD_FMAD_SSE2(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#else
				for(i=0; i<np; i++)
				{
					Y->value[i]  = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0; i<n; i++)
				{
					/* Pairs of entries first, then the odd one out. */
					for(j=0;j<LU->nnz[i]-1;j+=2)
					{
						j0  = LU->index[i][j];
						j1  = LU->index[i][j+1];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD2_SSE2_STSD(Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
					for(;j<LU->nnz[i];j++)
					{
						j0  = LU->index[i][j];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD_SSE2(Y->value[j0],Y->value_lo[j0],Y->value[j0],Y->value_lo[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#endif
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matvect_ilu - gather-style product of the ILU storage with a vector.
 *
 * For every row i < LU->n:
 *     Y[i] = sum over j < LU->nnz[i] of LU->value[i][j] * X[LU->index[i][j]]
 *
 * When USE_QUAD_PRECISION is defined and X->precision is not
 * LIS_PRECISION_DEFAULT, the sum is formed on the (value, value_lo) pairs
 * of X and Y through the LIS_QUAD_* macros.  With USE_FMA2_SSE2 the entries
 * of a row are consumed two at a time into the two-lane accumulator tt,
 * the lanes are combined into Y[i], and an odd trailing entry is added last.
 *
 * Each row writes only its own Y[i], so rows can be distributed over
 * threads with a plain work-sharing loop.
 *
 * A  : supplies the communication table in MPI builds
 * LU : supplies n, nnz[], index[][] and value[][]
 * X  : input vector
 * Y  : output vector (entries 0..LU->n-1 are overwritten)
 *
 * Always returns LIS_SUCCESS.
 *
 * NOTE(review): x and y are not referenced directly in this body; they are
 * kept because LIS_MATVEC_SENDRECV may expand to code that uses them -
 * confirm before removing.
 */
LIS_INT lis_matvect_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n;
	LIS_SCALAR t,*x,*y;
	LIS_QUAD_DECLAR;
	#ifdef USE_QUAD_PRECISION
		LIS_INT	j0,j1;
		LIS_QUAD_PD	tt;
	#endif

	LIS_DEBUG_FUNC_IN;

	n = LU->n;
	x = X->value;
	y = Y->value;

	#ifdef USE_QUAD_PRECISION
	if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		/* ---- working-precision path ---- */
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
		#pragma omp parallel for private(i,j,jj,t)
		#endif
		for(i=0;i<n;i++)
		{
			t = 0.0;
			for(j=0;j<LU->nnz[i];j++)
			{
				jj = LU->index[i][j];
				t += LU->value[i][j] * X->value[jj];
			}
			Y->value[i] = t;
		}
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		/* ---- double-double path (hi in value, lo in value_lo) ---- */
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifndef USE_FMA2_SSE2
			/*
			 * The rows must be *shared* between threads ("parallel for").
			 * A bare "parallel" region would make every thread execute the
			 * complete i-loop and race on Y->value[i] / Y->value_lo[i].
			 * The pragma is guarded like the one in the FMA2 branch below.
			 */
			#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel for private(i,j,jj,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel for private(i,j,jj,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			#endif
			for(i=0;i<n;i++)
			{
				Y->value[i] = Y->value_lo[i] = 0.0;
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					#ifndef USE_SSE2
						LIS_QUAD_FMAD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#else
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#endif
				}
			}
		#else
			#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel for private(i,j,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel for private(i,j,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			#endif
			for(i=0;i<n;i++)
			{
				/* Two-lane accumulator for the paired entries of this row. */
				tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
				for(j=0;j<LU->nnz[i]-1;j+=2)
				{
					j0 = LU->index[i][j];
					j1 = LU->index[i][j+1];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],X->value[j0],X->value_lo[j0],X->value[j1],X->value_lo[j1],LU->value[i][j]);
					#endif
				}
				/* Combine the two lanes into Y[i] ... */
				#ifdef USE_SSE2
					LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],tt.hi[0],tt.lo[0],tt.hi[1],tt.lo[1]);
				#endif
				/* ... then add the odd trailing entry, if any. */
				for(;j<LU->nnz[i];j++)
				{
					j0 = LU->index[i][j];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[j0],X->value_lo[j0],LU->value[i][j]);
					#endif
				}
			}
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/* Example #6 */
/*
 * lis_psolve_adds - iterated application of the selected preconditioner.
 *
 * Starting from X = 0 and R = B, the following sweep is executed
 * options[LIS_OPTIONS_ADDS_ITER] + 1 times:
 *     1. clear the entries n..np-1 of R,
 *     2. W = lis_psolve_xxx[options[LIS_OPTIONS_PRECON]](R),
 *     3. X += W                       (first n entries),
 *     4. unless this was the last sweep: R = B - A*X (first n entries).
 *
 * When USE_QUAD_PRECISION is defined and solver->precision is not
 * LIS_PRECISION_DEFAULT, steps 1, 3 and 4 operate on the
 * (value, value_lo) pairs through the LIS_QUAD_ADD macros.
 *
 * solver : provides the preconditioner, its work vectors and the options
 * B      : right-hand side
 * X      : result (overwritten)
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_psolve_adds(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
	LIS_INT i,sweep,n,np,nsweeps,kind;
	LIS_SCALAR *b,*x,*w,*r,*rl;
	LIS_VECTOR W,R;
	LIS_PRECON precon;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	precon  = solver->precon;
	W       = precon->work[0];
	R       = precon->work[1];
	n       = precon->A->n;
	np      = precon->A->np;
	nsweeps = solver->options[LIS_OPTIONS_ADDS_ITER];
	kind    = solver->options[LIS_OPTIONS_PRECON];
	b       = B->value;
	x       = X->value;
	w       = W->value;
	r       = R->value;
	rl      = R->value_lo;

	#ifdef USE_QUAD_PRECISION
	if( solver->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		lis_vector_set_all(0.0,X);
		lis_vector_copy(B,R);
		for(sweep=0;sweep<=nsweeps;sweep++)
		{
			/* Clear the entries beyond the first n before the local solve. */
			for(i=n;i<np;i++)
			{
				r[i] = 0.0;
			}

			lis_psolve_xxx[kind](solver,R,W);
			#ifdef _OPENMP
			#pragma omp parallel for private(i)
			#endif
			for(i=0;i<n;i++)
			{
				x[i] += w[i];
			}

			/* No residual update is needed after the final correction. */
			if(sweep==nsweeps) break;

			lis_matvec(precon->A,X,R);
			#ifdef _OPENMP
			#pragma omp parallel for private(i)
			#endif
			for(i=0;i<n;i++)
			{
				r[i] = b[i] - r[i];
			}
		}
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		lis_vector_set_allex_nm(0.0,X);
		lis_vector_copyex_mm(B,R);
		for(sweep=0;sweep<=nsweeps;sweep++)
		{
			/* Clear hi and lo parts beyond the first n entries. */
			for(i=n;i<np;i++)
			{
				r[i]  = 0.0;
				rl[i] = 0.0;
			}

			lis_psolve_xxx[kind](solver,R,W);

			/* X += W in double-double arithmetic. */
			for(i=0;i<n;i++)
			{
				#ifdef USE_SSE2
					LIS_QUAD_ADD_SSE2(x[i],X->value_lo[i],x[i],X->value_lo[i],w[i],W->value_lo[i]);
				#else
					LIS_QUAD_ADD(x[i],X->value_lo[i],x[i],X->value_lo[i],w[i],W->value_lo[i]);
				#endif
			}

			if(sweep!=nsweeps)
			{
				/* R = B - A*X in double-double arithmetic. */
				lis_matvec(precon->A,X,R);
				for(i=0;i<n;i++)
				{
					#ifdef USE_SSE2
						LIS_QUAD_ADD_SSE2(r[i],rl[i],b[i],B->value_lo[i],-r[i],-rl[i]);
					#else
						LIS_QUAD_ADD(r[i],rl[i],b[i],B->value_lo[i],-r[i],-rl[i]);
					#endif
				}
			}
		}
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/* Example #7 */
/*
 * lis_vector_nrm2ex_mm - norm of a double-double vector, result in *val.
 *
 * The first vx->n (hi, lo) pairs of vx are folded into an accumulator with
 * the LIS_QUAD_FSA* macros (presumably "acc += x*x" - confirm against the
 * macro definitions), the accumulator is summed across MPI ranks when
 * USE_MPI is defined, and LIS_QUAD_SQRT* writes the square root of the
 * total to val->hi[0] / val->lo[0].
 *
 * Scratch storage comes from vx->work:
 *     work[0..1] / work[2..3] : two-lane hi / lo accumulator (dotm2)
 *     work[8], work[9]        : hi / lo of the local sum (dotm)
 *     work[10], work[11]      : hi / lo of the MPI-reduced sum (tmpm)
 *
 * vx  : input vector (value, value_lo, n, work and - with MPI - comm are used)
 * val : receives the result
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_nrm2ex_mm(LIS_VECTOR vx, LIS_QUAD_PTR *val)
{
	LIS_INT i,n;
	LIS_SCALAR *x,*xl;
	LIS_QUAD_PTR dotm2,dotm,tmpm;
	#ifdef _OPENMP
		LIS_INT is,ie,nprocs,my_rank;
		LIS_SCALAR *gt;
	#endif
	#ifdef USE_MPI
		MPI_Comm comm;
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = vx->n;
	x  = vx->value;
	xl = vx->value_lo;
	/* Point the accumulators at fixed slots of the vector's work array. */
	dotm2.hi = &vx->work[0];
	dotm2.lo = &vx->work[2];
	dotm.hi = &vx->work[8];
	dotm.lo = &vx->work[9];
	tmpm.hi = &vx->work[10];
	tmpm.lo = &vx->work[11];
	#ifdef USE_MPI
		comm   = vx->comm;
	#endif
	#ifdef _OPENMP
		/* Each thread owns the LIS_VEC_TMP_PADD-strided slot gt[my_rank*PADD ..]. */
		gt     = lis_vec_tmp;
		nprocs = omp_get_max_threads();
		#ifndef USE_SSE2
			#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#else
			#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
		#endif
		{
			my_rank = omp_get_thread_num();
			/* is..ie is this thread's index range (set by LIS_GET_ISIE). */
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			#ifndef USE_FMA2_SSE2
				/* Slot layout: [+0] hi, [+1] lo. */
				gt[my_rank*LIS_VEC_TMP_PADD] = gt[my_rank*LIS_VEC_TMP_PADD+1] = 0.0;
				#pragma cdir nodep
				for(i=is;i<ie;i++)
				{
					LIS_QUAD_FSA(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],x[i],xl[i]);
				}
			#else
				/* Slot layout while pairing: [+0],[+1] hi lanes, [+2],[+3] lo lanes. */
				gt[my_rank*LIS_VEC_TMP_PADD  ] = gt[my_rank*LIS_VEC_TMP_PADD+1] = 0.0;
				gt[my_rank*LIS_VEC_TMP_PADD+2] = gt[my_rank*LIS_VEC_TMP_PADD+3] = 0.0;
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(i=is;i<ie-1;i+=2)
				{
					LIS_QUAD_FSA2_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],x[i],xl[i]);
				}
				/* Combine lane 0 ([+0],[+2]) and lane 1 ([+1],[+3]); result goes to [+0] hi, [+1] lo. */
				LIS_QUAD_ADD_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD+3]);
				/* Odd trailing element of this thread's range, if any. */
				for(;i<ie;i++)
				{
					LIS_QUAD_FSA_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],x[i],xl[i]);
				}
			#endif
		}
		/* Serial reduction of the per-thread partial sums into dotm. */
		dotm.hi[0] = dotm.lo[0] = 0.0;
		for(i=0;i<nprocs;i++)
		{
			#ifndef USE_SSE2
				LIS_QUAD_ADD(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#else
				LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#endif
		}
	#else
		#ifndef USE_FMA2_SSE2
			dotm.hi[0] = dotm.lo[0] = 0.0;
			#pragma cdir nodep
			for(i=0;i<n;i++)
			{
				LIS_QUAD_FSA(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#else
			/* Two elements per step into the two-lane accumulator dotm2 ... */
			dotm2.hi[0] = dotm2.hi[1] = 0.0;
			dotm2.lo[0] = dotm2.lo[1] = 0.0;
			for(i=0;i<n-1;i+=2)
			{
				LIS_QUAD_FSA2_SSE2(dotm2.hi[0],dotm2.lo[0],dotm2.hi[0],dotm2.lo[0],x[i],xl[i]);
			}
			/* ... lanes combined into dotm, then the odd trailing element. */
			LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm2.hi[0],dotm2.lo[0],dotm2.hi[1],dotm2.lo[1]);
			for(;i<n;i++)
			{
				LIS_QUAD_FSA_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#endif
	#endif
	#ifdef USE_MPI
		/* One element is reduced starting at dotm.hi; this relies on dotm.lo
		   (work[9]) directly following dotm.hi (work[8]), and likewise for tmpm. */
		MPI_Allreduce(dotm.hi,tmpm.hi,1,LIS_MPI_MSCALAR,LIS_MPI_MSUM,comm);
		#ifndef USE_SSE2
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#else
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#endif
	#else
		#ifndef USE_SSE2
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#else
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#endif
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}