/*
 * lis_matrix_elements_copy_dia
 *
 * Duplicates the arrays of a DIA-format matrix: the nnd entries of `index`
 * go to `o_index`, and the `value` array goes to `o_value`.
 *
 * The value copy is split by row range: each thread asks LIS_GET_ISIE for
 * its [first,last) rows and copies the nnd*(last-first) scalars starting at
 * offset first*nnd.  Without OpenMP there is a single "thread" 0 of 1.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_elements_copy_dia(LIS_INT n, LIS_INT nnd, LIS_INT *index, LIS_SCALAR *value,
                 LIS_INT *o_index, LIS_SCALAR *o_value)
{
  LIS_INT      row_first,row_last;
  LIS_INT      nthreads,tid;

  LIS_DEBUG_FUNC_IN;

  /* The index array is small (one entry per diagonal): copy it serially. */
  memcpy(o_index,index,nnd*sizeof(LIS_INT));

  #ifdef _OPENMP
    nthreads = omp_get_max_threads();
  #else
    nthreads = 1;
  #endif

  #ifdef _OPENMP
  #pragma omp parallel private(row_first,row_last,tid)
  #endif
  {
    LIS_SCALAR *dst;
    LIS_SCALAR *src;

    #ifdef _OPENMP
      tid = omp_get_thread_num();
    #else
      tid = 0;
    #endif
    LIS_GET_ISIE(tid,nthreads,n,row_first,row_last);

    /* This thread's slab starts nnd scalars per preceding row into the array. */
    dst = &o_value[row_first*nnd];
    src = &value[row_first*nnd];
    memcpy(dst,src,(row_last-row_first)*nnd*sizeof(LIS_SCALAR));
  }

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
}
Exemplo n.º 2
0
/*
 * lis_vector_axpyzex_mmmm
 *
 * Applies the LIS_QUAD_FMA family of macros element-wise over the n = vx->n
 * entries of vx, vy and vz, handing each macro the (value, value_lo) pair of
 * every vector together with the (hi, lo) pair of alpha.  Going by the
 * function and macro names this is presumably z = alpha*x + y in
 * double-double ("quad") arithmetic; the macros are defined elsewhere, so
 * confirm the operand order there.
 *
 *   alpha : scalar as a hi/lo pointer pair; only element [0] of each is read.
 *   vx,vy : operands whose value / value_lo arrays are passed as inputs.
 *   vz    : vector whose value / value_lo arrays receive the result.
 *
 * Side effect: when USE_FMA2_SSE2 is defined, vx->work[4..7] is overwritten
 * (alpha is duplicated into two adjacent slots for the two-at-a-time macro).
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_axpyzex_mmmm(LIS_QUAD_PTR alpha, LIS_VECTOR vx, LIS_VECTOR vy, LIS_VECTOR vz)
{
	LIS_INT i,n,is,ie,nprocs,my_rank;
	LIS_QUAD_PTR aa;
	LIS_SCALAR *x,*y,*z;
	LIS_SCALAR *xl,*yl,*zl;
	/* Presumably declares the temporaries (p1, p2, sh, ...) that the
	   LIS_QUAD_* macros and the private() clauses below refer to. */
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n    = vx->n;
	x    = vx->value;
	y    = vy->value;
	z    = vz->value;
	xl   = vx->value_lo;
	yl   = vy->value_lo;
	zl   = vz->value_lo;
	/* aa points into vx's scratch area; it is only written and read in the
	   USE_FMA2_SSE2 branch. */
	aa.hi = &vx->work[4];
	aa.lo = &vx->work[6];
	/* Scalar path: one element per iteration.  The omp pragma is not guarded
	   by _OPENMP; a compiler without OpenMP simply ignores it. */
	#ifndef USE_FMA2_SSE2
	    #pragma cdir nodep
		#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
		for(i=0; i<n; i++)
		{
			LIS_QUAD_FMA(z[i],zl[i],y[i],yl[i],alpha.hi[0],alpha.lo[0],x[i],xl[i]);
		}
	#else
		/* Paired path: each thread takes a contiguous index range and
		   processes it two elements at a time. */
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
		#else
			nprocs = 1;
		#endif
		/* Duplicate alpha so the two-element macro sees the same scalar in
		   both slots. */
		aa.hi[0] = aa.hi[1] = alpha.hi[0];
		aa.lo[0] = aa.lo[1] = alpha.lo[0];

		#ifdef _OPENMP
		#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
		#endif
		{
			#ifdef _OPENMP
				my_rank = omp_get_thread_num();
			#else
				my_rank = 0;
			#endif
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			/* Main loop handles elements i and i+1 together. */
			for(i=is;i<ie-1;i+=2)
			{
				LIS_QUAD_FMA2_SSE2(z[i],zl[i],y[i],yl[i],aa.hi[0],aa.lo[0],x[i],xl[i]);
			}
			/* Tail: at most one leftover element when the range length is odd. */
			for(;i<ie;i++)
			{
				LIS_QUAD_FMA_SSE2(z[i],zl[i],y[i],yl[i],alpha.hi[0],alpha.lo[0],x[i],xl[i]);
			}
		}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
Exemplo n.º 3
0
/*
 * lis_vector_scaleex_nm
 *
 * Updates vx in place by applying the LIS_QUAD_MULD family of macros to each
 * (value, value_lo) pair with the plain scalar alpha.  Going by the names
 * this is presumably x = alpha*x with x held in double-double ("quad")
 * precision; the macros are defined elsewhere.
 *
 *   alpha : ordinary LIS_SCALAR multiplier.
 *   vx    : vector whose value / value_lo arrays are read and overwritten.
 *
 * Side effect: when USE_FMA2_SSE2 is defined, vx->work[0] and vx->work[1]
 * are overwritten with alpha.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_scaleex_nm(LIS_SCALAR alpha, LIS_VECTOR vx)
{
	LIS_INT i,n,is,ie,nprocs,my_rank;
	LIS_SCALAR *aa;
	LIS_SCALAR *x,*xl;
	/* Presumably declares the temporaries named in the private() clauses. */
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = vx->n;
	x  = vx->value;
	xl = vx->value_lo;
	/* aa aliases the start of vx's scratch area (used by the paired path). */
	aa = vx->work;
	/* Scalar path: one element per iteration. */
	/* NOTE(review): when USE_SSE2 is defined without USE_FMA2_SSE2 the
	   private() clause lists the SSE2 temporaries, yet the loop body still
	   calls LIS_QUAD_MULD rather than LIS_QUAD_MULD_SSE2 (the macro the tail
	   loop below uses) -- confirm this combination is intended/buildable. */
	#ifndef USE_FMA2_SSE2
	    #pragma cdir nodep
		#ifndef USE_SSE2
			#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#else
			#pragma omp parallel for private(i,bh,ch,sh,th,bl,sl,tl,p1,p2,t0,t1,t2,is,ie,my_rank)
		#endif
		for(i=0; i<n; i++)
		{
			LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],alpha);
		}
	#else
		/* Paired path: each thread takes a contiguous index range and
		   processes it two elements at a time. */
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
		#else
			nprocs = 1;
		#endif
		/* Two adjacent copies of alpha for the two-element macro. */
		aa[0] = aa[1] = alpha;
		#ifdef _OPENMP
		#ifndef USE_SSE2
			#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#else
			#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
		#endif
		#endif
		{
			#ifdef _OPENMP
				my_rank = omp_get_thread_num();
			#else
				my_rank = 0;
			#endif
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			/* Main loop handles elements i and i+1 together. */
			for(i=is;i<ie-1;i+=2)
			{
				LIS_QUAD_MULD2_SSE2(x[i],xl[i],x[i],xl[i],aa[0]);
			}
			/* Tail: at most one leftover element when the range length is odd. */
			for(;i<ie;i++)
			{
				LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],alpha);
			}
		}
	#endif
	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matrix_get_diagonal_dia
 *
 * Writes the main diagonal of the DIA-format matrix A into d[0..n-1].
 *
 *   A : matrix; A->n rows, A->nnd stored diagonals whose offsets are listed
 *       in A->index and whose entries live in A->value.
 *   d : output array with room for at least A->n scalars.
 *
 * When A is split (A->is_splited), the diagonal is taken from A->D->value.
 * Otherwise the stored diagonal with offset 0 is located in A->index and
 * copied out.  If no stored diagonal has offset 0, every main-diagonal entry
 * is an implicit zero, so d is filled with zeros (previously this case read
 * past the end of A->value).
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_get_diagonal_dia(LIS_MATRIX A, LIS_SCALAR d[])
{
  LIS_INT i,j;
  LIS_INT n,nnd;
  #ifdef _OPENMP
    LIS_INT is,ie,my_rank,nprocs;
  #endif

  LIS_DEBUG_FUNC_IN;

  n    = A->n;
  nnd  = A->nnd;
  if( A->is_splited )
  {
    #ifdef _OPENMP
    #pragma omp parallel for private(i)
    #endif
    for(i=0; i<n; i++)
    {
      d[i] = A->D->value[i];
    }
  }
  else
  {
    /* Locate the slot that holds the offset-0 (main) diagonal. */
    for(j=0;j<nnd;j++)
    {
      if( A->index[j]==0 ) break;
    }

    if( j==nnd )
    {
      /* The main diagonal is not stored: all of its entries are zero. */
      #ifdef _OPENMP
      #pragma omp parallel for private(i)
      #endif
      for(i=0; i<n; i++)
      {
        d[i] = 0.0;
      }
    }
    else
    {
      #ifdef _OPENMP
        nprocs  = omp_get_max_threads();
        #pragma omp parallel private(is,ie,my_rank)
        {
          my_rank = omp_get_thread_num();
          LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
          /* In the OpenMP layout each thread's rows [is,ie) form their own
             slab starting at is*nnd, with diagonal j at offset j*(ie-is)
             inside that slab. */
          memcpy(&d[is],&A->value[is*nnd+j*(ie-is)],(ie-is)*sizeof(LIS_SCALAR));
        }
      #else
        /* Serial layout: diagonal j occupies value[j*n .. j*n+n-1]. */
        for(i=0;i<n;i++)
        {
          d[i] = A->value[j*n+i];
        }
      #endif
    }
  }
  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
}
Exemplo n.º 5
0
/*
 * lis_matvec_ell
 *
 * Matrix-vector product y = A*x for a matrix stored in ELL format, where
 * slot s of row r lives at position s*n + r of the value/index arrays.
 *
 *   A : ELL matrix (A->n rows).
 *   x : input vector, gathered through the column indices.
 *   y : output vector, rows 0..n-1 are overwritten.
 *
 * Unsplit matrices are processed per thread over a contiguous row range
 * (single range without OpenMP).  Split matrices (A->is_splited) are handled
 * serially as diagonal + lower part + upper part.
 */
void lis_matvec_ell(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
{
	LIS_INT row,slot,base,row_lo,row_hi;
	LIS_INT nrows,width,nthreads,tid;

	nrows = A->n;

	if( !A->is_splited )
	{
		width = A->maxnzr;
		#ifdef _OPENMP
			nthreads = omp_get_max_threads();
		#else
			nthreads = 1;
		#endif
		#ifdef _OPENMP
		#pragma omp parallel private(row,slot,base,row_lo,row_hi,tid)
		#endif
		{
			#ifdef _OPENMP
				tid = omp_get_thread_num();
			#else
				tid = 0;
			#endif
			LIS_GET_ISIE(tid,nthreads,nrows,row_lo,row_hi);

			/* Clear this thread's rows, then accumulate slot by slot. */
			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			for(row=row_lo; row<row_hi; row++)
			{
				y[row] = 0.0;
			}
			for(slot=0; slot<width; slot++)
			{
				base = slot*nrows;
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(row=row_lo; row<row_hi; row++)
				{
					y[row] += A->value[base + row] * x[A->index[base + row]];
				}
			}
		}
		return;
	}

	/* Split storage: start from the diagonal contribution ... */
	#ifdef USE_VEC_COMP
	#pragma cdir nodep
	#endif
	for(row=0; row<nrows; row++)
	{
		y[row] = A->D->value[row]*x[row];
	}
	/* ... then add the strictly lower part ... */
	for(slot=0; slot<A->L->maxnzr; slot++)
	{
		base = slot*nrows;
		#ifdef USE_VEC_COMP
		#pragma cdir nodep
		#endif
		for(row=0; row<nrows; row++)
		{
			y[row] += A->L->value[base + row] * x[A->L->index[base + row]];
		}
	}
	/* ... and finally the strictly upper part. */
	for(slot=0; slot<A->U->maxnzr; slot++)
	{
		base = slot*nrows;
		#ifdef USE_VEC_COMP
		#pragma cdir nodep
		#endif
		for(row=0; row<nrows; row++)
		{
			y[row] += A->U->value[base + row] * x[A->U->index[base + row]];
		}
	}
}
Exemplo n.º 6
0
/*
 * lis_matvect_ell
 *
 * Transposed matrix-vector product for an ELL matrix: every stored entry
 * (row i, column c) contributes value*x[i] to y[c].
 *
 *   A : ELL matrix; A->n rows, A->np is the length used for the per-thread
 *       scratch rows and for the reduced output in the OpenMP path.
 *   x : input vector, read at indices 0..n-1.
 *   y : output vector.
 *
 * Split matrices (A->is_splited) are handled serially as diagonal + lower +
 * upper.  For unsplit matrices the OpenMP build scatters into a private
 * length-np accumulator per thread and then sums the accumulators into
 * y[0..np-1]; the serial build scatters straight into y.
 *
 * If the OpenMP scratch buffer cannot be allocated, the function falls back
 * to a serial scatter into y instead of dereferencing a NULL pointer (the
 * function returns void, so there is no way to report the failure).
 */
void lis_matvect_ell(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
{
	LIS_INT i,j,jj;
	LIS_INT n,np,maxnzr;
	#ifdef _OPENMP
		LIS_INT k,is,ie,nprocs;
		LIS_SCALAR t;
		LIS_SCALAR *w;
	#endif

	n      = A->n;
	np     = A->np;
	if( A->is_splited )
	{
		/* Diagonal contribution initialises y. */
		#ifdef USE_VEC_COMP
		#pragma cdir nodep
		#endif
		for(i=0; i<n; i++)
		{
			y[i] = A->D->value[i]*x[i];
		}
		/* Scatter the lower part. */
		for(j=0;j<A->L->maxnzr;j++)
		{
			jj = j*n;
			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			for(i=0;i<n;i++)
			{
				y[A->L->index[jj + i]] += A->L->value[jj + i] * x[i];
			}
		}
		/* Scatter the upper part. */
		for(j=0;j<A->U->maxnzr;j++)
		{
			jj = j*n;
			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			for(i=0;i<n;i++)
			{
				y[A->U->index[jj + i]] += A->U->value[jj + i] * x[i];
			}
		}
	}
	else
	{
		#ifdef _OPENMP
			maxnzr = A->maxnzr;
			nprocs = omp_get_max_threads();
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_ell::w" );
			if( w==NULL )
			{
				/* No scratch memory: compute the same result serially,
				   writing the same y[0..np-1] range as the parallel path. */
				for(i=0;i<np;i++)
				{
					y[i] = 0.0;
				}
				for(j=0;j<maxnzr;j++)
				{
					jj = j*n;
					for(i=0;i<n;i++)
					{
						y[A->index[jj + i]] += A->value[jj + i] * x[i];
					}
				}
			}
			else
			{
				#pragma omp parallel private(i,j,t,jj,k,is,ie)
				{
					k = omp_get_thread_num();
					LIS_GET_ISIE(k,nprocs,n,is,ie);
					/* Zero all accumulators; the implicit barrier at the end
					   of the omp-for keeps the scatter below from starting
					   before every row of w is cleared. */
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					/* Each thread scatters its rows into its own accumulator. */
					for(j=0;j<maxnzr;j++)
					{
						jj = j*n;
						#ifdef USE_VEC_COMP
						#pragma cdir nodep
						#endif
						for(i=is;i<ie;i++)
						{
							w[k*np + A->index[jj + i]] += A->value[jj + i] * x[i];
						}
					}
					/* All scatters must finish before the reduction reads w. */
					#pragma omp barrier
					#pragma omp for 
					#ifdef USE_VEC_COMP
					#pragma cdir nodep
					#endif
					for(i=0;i<np;i++)
					{
						t = 0.0;
						for(j=0;j<nprocs;j++)
						{
							t += w[j*np+i];
						}
						y[i] = t;
					}
				}
				lis_free(w);
			}
		#else
			maxnzr = A->maxnzr;
			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			for(i=0; i<n; i++)
			{
				y[i] = 0.0;
			}
			for(j=0;j<maxnzr;j++)
			{
				jj = j*n;
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(i=0;i<n;i++)
				{
					y[A->index[jj + i]] += A->value[jj + i] * x[i];
				}
			}
		#endif
	}
}
Exemplo n.º 7
0
/*
 * lis_ranges_create
 *
 * Determines how a global index range is divided among processes and
 * reports this process's share.
 *
 *   comm     : communicator (only used in the MPI build).
 *   local_n  : in/out, number of local rows.
 *   global_n : in/out, total number of rows.
 *   ranges   : out, MPI build: newly allocated array of nprocs+1 prefix
 *              offsets (the caller receives ownership); serial build: NULL.
 *   is, ie   : out, this process's half-open range [is,ie).
 *   nprocs, my_rank : out, communicator size and rank (1 and 0 when serial).
 *
 * Two modes:
 *   - local size not given (the local sizes sum to 0 across processes in
 *     the MPI build, or *local_n is 0 when serial): global_n is the input
 *     and is divided with LIS_GET_ISIE; local_n is derived.
 *   - local sizes given: they are gathered and prefix-summed; global_n is
 *     derived.
 *
 * Returns LIS_SUCCESS, or LIS_OUT_OF_MEMORY if the offset table cannot be
 * allocated (MPI build only).
 */
LIS_INT lis_ranges_create(LIS_Comm comm, LIS_INT *local_n, LIS_INT *global_n, LIS_INT **ranges, LIS_INT *is, LIS_INT *ie, LIS_INT *nprocs, LIS_INT *my_rank)
{
#ifdef USE_MPI
  LIS_INT    p;
  LIS_INT    total_local;
  LIS_INT    *offsets;
  int        comm_size,comm_rank;

  LIS_DEBUG_FUNC_IN;

  MPI_Comm_size(comm,&comm_size);
  MPI_Comm_rank(comm,&comm_rank);

  *nprocs  = comm_size;
  *my_rank = comm_rank;

  offsets = (LIS_INT *)lis_malloc( (*nprocs+1)*sizeof(LIS_INT),"lis_ranges_create::tranges" );
  if( offsets==NULL )
  {
    LIS_SETERR_MEM((*nprocs+1)*sizeof(LIS_INT));
    return LIS_OUT_OF_MEMORY;
  }

  /* A zero sum means no process supplied a local size. */
  MPI_Allreduce(local_n,&total_local,1,LIS_MPI_INT,MPI_SUM,comm);
  if( total_local!=0 )
  {
    /* Gather the local sizes into slots 1..nprocs, then prefix-sum them. */
    MPI_Allgather(local_n,1,LIS_MPI_INT,&offsets[1],1,LIS_MPI_INT,comm);
    offsets[0] = 0;
    for(p=1;p<=*nprocs;p++)
    {
      offsets[p] += offsets[p-1];
    }
    *global_n = offsets[*nprocs];
    *is       = offsets[*my_rank];
    *ie       = offsets[*my_rank+1];
  }
  else
  {
    /* Split global_n; every process publishes the end of its range. */
    LIS_GET_ISIE(*my_rank,*nprocs,*global_n,*is,*ie);
    *local_n = *ie-*is;
    MPI_Allgather(ie,1,LIS_MPI_INT,&offsets[1],1,LIS_MPI_INT,comm);
    offsets[0] = 0;
  }
  *ranges = offsets;

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#else
  LIS_DEBUG_FUNC_IN;

  /* Serial build: one process owns everything and no table is created. */
  *nprocs  = 1;
  *my_rank = 0;

  if( *local_n!=0 )
  {
    *global_n = *local_n;
    *is       = 0;
    *ie       = *local_n;
  }
  else
  {
    *local_n = *global_n;
    *is      = 0;
    *ie      = *global_n;
  }
  *ranges = NULL;

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#endif
}
Exemplo n.º 8
0
/*
 * lis_matrix_solvet_csr
 *
 * Copies B into X and then updates X in place with a scatter-style sweep
 * over the split CSR parts of A (A->L, A->U) and the scaling vector
 * A->WD->value.  Judging by the name and the scatter access pattern this is
 * presumably the transposed triangular solve; the code itself shows:
 *
 *   LIS_MATRIX_LOWER : rows ascending; x[i] is scaled by WD[i] and then
 *                      subtracted, weighted by row i of U, from x[U col].
 *   LIS_MATRIX_UPPER : rows descending; same with row i of L.
 *   LIS_MATRIX_SSOR  : an ascending pass with U that uses t = x[i]*WD[i] for
 *                      the updates without storing it to x[i], followed by a
 *                      descending pass with L that does store t to x[i].
 *
 * Any other flag value leaves X as a plain copy of B (the switch has no
 * default case).
 *
 * In the SSOR case with OpenMP, each thread works on its own row range
 * [is,ie) and skips updates that would land outside that range.  With
 * USE_QUAD_PRECISION and a non-default precision B, the SSOR case carries
 * the low words (X->value_lo) through the LIS_QUAD_* macros; the LOWER and
 * UPPER cases only touch X->value.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_solvet_csr(LIS_MATRIX A, LIS_VECTOR B, LIS_VECTOR X, LIS_INT flag)
{
	LIS_INT i,j,jj,n;
	LIS_SCALAR t;
	LIS_SCALAR *x;
	#ifdef _OPENMP
		LIS_INT is,ie,my_rank,nprocs;
	#endif
	#ifdef USE_QUAD_PRECISION
		/* w1 holds the scaled x[i] as a hi/lo pair; w2 is not used below. */
		LIS_QUAD w1,w2;
		LIS_SCALAR *xl;
	#endif
	/* Presumably declares the temporaries named in the private() clauses. */
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = A->n;
	x  = X->value;
	#ifdef USE_QUAD_PRECISION
		xl = X->value_lo;
	#endif

	/* X <- B; the "ex_mm" copy is used when B is not default precision. */
	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			lis_vector_copy(B,X);
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			lis_vector_copyex_mm(B,X);
		}
	#endif
	switch(flag)
	{
	case LIS_MATRIX_LOWER:
		/* Ascending sweep: finalise x[i], then push it along row i of U. */
		for(i=0;i<n;i++)
		{
			x[i]   = x[i] * A->WD->value[i];
			for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
			{
				x[A->U->index[j]] -= A->U->value[j] * x[i];
			}
		}
		break;
	case LIS_MATRIX_UPPER:
		/* Descending sweep: finalise x[i], then push it along row i of L. */
		for(i=n-1;i>=0;i--)
		{
			x[i]   = x[i] * A->WD->value[i];
			for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
			{
				x[A->L->index[j]] -= A->L->value[j] * x[i];
			}
		}
		break;
	case LIS_MATRIX_SSOR:
	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#pragma omp parallel private(i,j,jj,t,is,ie,my_rank)
				{
					my_rank = omp_get_thread_num();
					LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
					/* Pass 1 (ascending, U): x[i] is not overwritten here. */
					for(i=is;i<ie;i++)
					{
						t   = x[i] * A->WD->value[i];
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							jj = A->U->index[j];
							/* Stay inside this thread's row block. */
							if( jj<is || jj>=ie ) continue;
							x[jj] -= A->U->value[j] * t;
						}
					}
					/* Pass 2 (descending, L): store the scaled value. */
					for(i=ie-1;i>=is;i--)
					{
						t    = x[i] * A->WD->value[i];
						x[i] = t;
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							jj = A->L->index[j];
							/* Only the lower bound is tested here, unlike
							   the quad-precision variant further down. */
							if( jj<is ) continue;
							x[jj] -= A->L->value[j] * t;
						}
					}
				}
			#else
				/* Serial pass 1 (ascending, U). */
				for(i=0;i<n;i++)
				{
					t   = x[i] * A->WD->value[i];
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						x[A->U->index[j]] -= A->U->value[j] * t;
					}
				}
				/* Serial pass 2 (descending, L). */
				for(i=n-1;i>=0;i--)
				{
					t    = x[i] * A->WD->value[i];
					x[i] = t;
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						x[A->L->index[j]] -= A->L->value[j] * t;
					}
				}
			#endif
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			/* Same two passes with (x, xl) pairs; the commented-out lines
			   show the plain-precision statement each macro call replaces. */
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					my_rank = omp_get_thread_num();
					LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
					for(i=is;i<ie;i++)
					{
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						/* t   = x[i] * A->WD->value[i]; */
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							jj = A->U->index[j];
							if( jj<is || jj>=ie ) continue;
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
							#else
								LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
							#endif
							/* x[A->U->index[j]] -= A->U->value[j] * t; */
						}
					}
					for(i=ie-1;i>=is;i--)
					{
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						x[i]  = w1.hi;
						xl[i] = w1.lo;
						/* t    = x[i] * A->WD->value[i]; */
						/* x[i] = t; */
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							jj = A->L->index[j];
							if( jj<is || jj>=ie ) continue;
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
							#else
								LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
							#endif
							/* x[A->L->index[j]] -= A->L->value[j] * t; */
						}
					}
				}
			#else
				for(i=0;i<n;i++)
				{
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					/* t   = x[i] * A->WD->value[i]; */
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						jj = A->U->index[j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
						#endif
						/* x[A->U->index[j]] -= A->U->value[j] * t; */
					}
				}
				for(i=n-1;i>=0;i--)
				{
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					x[i]  = w1.hi;
					xl[i] = w1.lo;
					/* t    = x[i] * A->WD->value[i]; */
					/* x[i] = t; */
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						jj = A->L->index[j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
						#endif
						/* x[A->L->index[j]] -= A->L->value[j] * t; */
					}
				}
			#endif
		}
	#endif
		break;
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_psolvet_ilut_csr
 *
 * Applies the stored ILUT factors (solver->precon->L, ->U and the diagonal
 * vector ->D) to B in scatter form and leaves the result in X.  Judging by
 * the name this is the transposed preconditioner solve.
 *
 *   solver : supplies the factors and the system size solver->A->n.
 *   B      : right-hand side (read only).
 *   X      : result; overwritten.
 *
 * Steps: X <- B; an ascending sweep scales x[i] by D[i] and pushes it along
 * row i of U; a descending sweep pushes x[i] along row i of L.  With OpenMP
 * each thread sweeps only its own row range [is,ie).
 *
 * With USE_QUAD_PRECISION and a non-default-precision B the sweeps operate
 * on (value, value_lo) pairs through the LIS_QUAD_* macros.  In that case B
 * must be copied with lis_vector_copyex_mm in BOTH builds: the serial build
 * previously called lis_vector_copy and then read xl[] in the sweeps, while
 * the OpenMP build already used lis_vector_copyex_mm.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_psolvet_ilut_csr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
#ifdef _OPENMP
  LIS_INT i,j,jj,n;
  LIS_INT is,ie,my_rank,nprocs;
  LIS_SCALAR *x;
  LIS_MATRIX_ILU L,U;
  LIS_VECTOR D;
  LIS_PRECON  precon;
  LIS_QUAD_DECLAR;
  #ifdef USE_QUAD_PRECISION
    LIS_SCALAR *xl;
  #endif

  LIS_DEBUG_FUNC_IN;

  precon = solver->precon;
  L = precon->L;
  U = precon->U;
  D = precon->D;
  x = X->value;
  #ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
  #endif
  n = solver->A->n;
  nprocs = omp_get_max_threads();

  #ifdef USE_QUAD_PRECISION
    if( B->precision==LIS_PRECISION_DEFAULT )
    {
  #endif
      lis_vector_copy(B,X);
      #pragma omp parallel private(i,j,jj,is,ie,my_rank)
      {
        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        /* Ascending sweep: scale by D, then scatter along row i of U. */
        for(i=is;i<ie;i++)
        {
          x[i] = D->value[i]*x[i];
          for(j=0;j<U->nnz[i];j++)
          {
            jj     = U->index[i][j];
            x[jj] -= U->value[i][j] * x[i];
          }
        }
        /* Descending sweep: scatter along row i of L. */
        for(i=ie-1;i>=is;i--)
        {
          for(j=0;j<L->nnz[i];j++)
          {
            jj     = L->index[i][j];
            x[jj] -= L->value[i][j] * x[i];
          }
        }
      }
  #ifdef USE_QUAD_PRECISION
    }
    else
    {
      lis_vector_copyex_mm(B,X);
      nprocs = omp_get_max_threads();
      #ifndef USE_SSE2
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
      #else
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
      #endif
      {
        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        for(i=is;i<ie;i++)
        {
          #ifndef USE_SSE2
            LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
          #else
            LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
          #endif
/*          x[i] = D->value[i]*x[i];*/
          for(j=0;j<U->nnz[i];j++)
          {
            jj     = U->index[i][j];
            #ifndef USE_SSE2
              LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
            #else
              LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
            #endif
/*            x[jj] -= U->value[i][j] * x[i];*/
          }
        }
        for(i=ie-1;i>=is;i--)
        {
          for(j=0;j<L->nnz[i];j++)
          {
            jj     = L->index[i][j];
            #ifndef USE_SSE2
              LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
            #else
              LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
            #endif
/*            x[jj] -= L->value[i][j] * x[i];*/
          }
        }
      }
    }
  #endif

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#else
  LIS_INT i,j,jj,n;
  LIS_SCALAR *x;
  LIS_MATRIX_ILU L,U;
  LIS_VECTOR D;
  LIS_PRECON  precon;
  LIS_QUAD_DECLAR;
  #ifdef USE_QUAD_PRECISION
    LIS_SCALAR *xl;
  #endif


  LIS_DEBUG_FUNC_IN;

  precon = solver->precon;
  L = precon->L;
  U = precon->U;
  D = precon->D;
  x = X->value;
  #ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
  #endif
  n = solver->A->n;

  #ifdef USE_QUAD_PRECISION
    if( B->precision==LIS_PRECISION_DEFAULT )
    {
  #endif
      lis_vector_copy(B,X);
      /* Ascending sweep: scale by D, then scatter along row i of U. */
      for(i=0; i<n; i++)
      {
        x[i] = D->value[i]*x[i];
        for(j=0;j<U->nnz[i];j++)
        {
          jj     = U->index[i][j];
          x[jj] -= U->value[i][j] * x[i];
        }
      }
      /* Descending sweep: scatter along row i of L. */
      for(i=n-1; i>=0; i--)
      {
        for(j=0;j<L->nnz[i];j++)
        {
          jj     = L->index[i][j];
          x[jj] -= L->value[i][j] * x[i];
        }
      }
  #ifdef USE_QUAD_PRECISION
    }
    else
    {
      /* The sweeps below read xl[], so the low words of B must be copied
         too (matches the OpenMP build). */
      lis_vector_copyex_mm(B,X);
      for(i=0; i<n; i++)
      {
        #ifndef USE_SSE2
          LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
        #else
          LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
        #endif
/*        x[i] = D->value[i]*x[i];*/
        for(j=0;j<U->nnz[i];j++)
        {
          jj     = U->index[i][j];
          #ifndef USE_SSE2
            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
          #else
            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
          #endif
/*          x[jj] -= U->value[i][j] * x[i];*/
        }
      }
      for(i=n-1; i>=0; i--)
      {
        for(j=0;j<L->nnz[i];j++)
        {
          jj     = L->index[i][j];
          #ifndef USE_SSE2
            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
          #else
            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
          #endif
/*          x[jj] -= L->value[i][j] * x[i];*/
        }
      }
    }
  #endif

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#endif
}
/*
 * lis_precon_create_ilut_csr
 *
 * Builds an incomplete LU factorisation with thresholding for the CSR
 * matrix solver->A and stores the result in precon->L, precon->U (row-wise
 * ILU containers) and precon->D (reciprocal pivots).
 *
 *   solver : provides A and the parameters read from solver->params:
 *            drop tolerance (LIS_PARAMS_DROP), fill rate (LIS_PARAMS_RATE)
 *            and gamma (LIS_PARAMS_GAMMA; read but not used below).
 *   precon : receives L, U and D on success.
 *
 * Per row i the code
 *   1. computes tolnorm = tol * (mean absolute value of the row entries);
 *   2. loads the row into the work arrays: lower entries at positions
 *      0..lenl-1, the diagonal at position i, upper entries at i+1..i+lenu,
 *      with iw[col] giving the position of a column (-1 = absent);
 *   3. eliminates the lower entries in ascending column order against the
 *      already computed rows of U, skipping fill-in smaller than tolnorm;
 *   4. stores D[i] = 1/pivot and keeps at most lfil entries each for L and
 *      U, chosen via lis_sort_di on the absolute values.
 * lfil = (LIS_INT)((nnz / (2n)) * m).
 *
 * In the OpenMP build each thread factors only its own row range [is,ie)
 * and ignores columns outside it, using its own slice (offset my_rank*n) of
 * the work arrays.
 *
 * Fixes relative to the previous version:
 *   - OpenMP build: the (LIS_INT) cast applied to nnz/(2n) alone, before the
 *     multiplication by m, so lfil differed from the serial build.
 *   - Work arrays already allocated are released when a later allocation
 *     fails.
 *   - Serial build: the out-of-memory report for w now states the size that
 *     was actually requested, (n+1) scalars.
 *
 * NOTE(review): on the error returns L, U and D are still not released; the
 * matching destroy routines are not visible from this function.
 *
 * Returns LIS_SUCCESS, LIS_OUT_OF_MEMORY, or the error code of a failed
 * helper call.
 */
LIS_INT lis_precon_create_ilut_csr(LIS_SOLVER solver, LIS_PRECON precon)
{
#ifdef _OPENMP
  LIS_INT        err;
  LIS_INT        i,j,k,ii,jj,kk;
  LIS_INT        is,ie,my_rank,nprocs;
  LIS_INT        n,nr,nnz,lfil,len;
  LIS_SCALAR    gamma,t,tol,toldd,m;
  LIS_MATRIX    A;
  LIS_MATRIX_ILU  L,U;
  LIS_VECTOR    D;

  LIS_SCALAR    tnorm, tolnorm;
  LIS_SCALAR    fact,lxu,*wn,*w;
  LIS_INT        lenu,lenl,col,jpos,jrow,upos,para;
  LIS_INT        *jbuf,*iw;

  LIS_DEBUG_FUNC_IN;


  A      = solver->A;
  n      = A->n;
  tol    = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
  m      = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
  gamma  = solver->params[LIS_PARAMS_GAMMA-LIS_OPTIONS_LEN];
  /* Truncate only after scaling by m, as the serial build does. */
  lfil   = (LIS_INT)(((double)A->nnz/(2.0*n))*m);
  nprocs = omp_get_max_threads();

  L      = NULL;
  U      = NULL;


  err = lis_matrix_ilu_create(n,1,&L);
  if( err ) return err;
  err = lis_matrix_ilu_create(n,1,&U);
  if( err ) return err;
  err = lis_matrix_ilu_setCR(L);
  if( err ) return err;
  err = lis_matrix_ilu_setCR(U);
  if( err ) return err;
  err = lis_vector_duplicate(A,&D);
  if( err )
  {
    return err;
  }

  /* Work arrays: one slice per thread, addressed as my_rank*n + position. */
  w   = (LIS_SCALAR *)lis_malloc(nprocs*(n+1)*sizeof(LIS_SCALAR),"lis_precon_create_ilut_csr::w");
  if( w==NULL )
  {
    LIS_SETERR_MEM(nprocs*(n+1)*sizeof(LIS_SCALAR));
    return LIS_OUT_OF_MEMORY;
  }
  wn = (LIS_SCALAR *)lis_malloc(nprocs*n*sizeof(LIS_SCALAR),"lis_precon_create_ilut_csr::w");
  if( wn==NULL )
  {
    LIS_SETERR_MEM(nprocs*n*sizeof(LIS_SCALAR));
    lis_free2(1,w);
    return LIS_OUT_OF_MEMORY;
  }

  jbuf   = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_ilut_csr::iw");
  if( jbuf==NULL )
  {
    LIS_SETERR_MEM(nprocs*n*sizeof(LIS_INT));
    lis_free2(2,w,wn);
    return LIS_OUT_OF_MEMORY;
  }
  iw   = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_ilut_csr::iw");
  if( iw==NULL )
  {
    LIS_SETERR_MEM(nprocs*n*sizeof(LIS_INT));
    lis_free2(3,w,wn,jbuf);
    return LIS_OUT_OF_MEMORY;
  }


  #pragma omp parallel private(is,ie,my_rank,i,j,k,jj,tnorm,tolnorm,len,lenu,lenl,col,t,jpos,jrow,fact,lxu,upos)
  {
    my_rank  = omp_get_thread_num();
    LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

    /* Mark every column of this thread's block as "not in the work row". */
    for(i=is;i<ie;i++) iw[my_rank*n+i] = -1;

    for(i=is;i<ie;i++)
    {
      /* Mean absolute value over the entries that fall inside the block. */
      tnorm = 0;
      k = 0;
      for(j=A->ptr[i];j<A->ptr[i+1];j++)
      {
        jj = A->index[j];
        if( jj<is || jj>=ie ) continue;
        tnorm += fabs(A->value[j]);
        k++;
      }
      tnorm   = tnorm / (double)k;
      tolnorm = tol * tnorm;

      lenu = 0;
      lenl = 0;
      jbuf[my_rank*n+i] = i;
      w[my_rank*n+i] = 0;
      iw[my_rank*n+i] = i;

      /* Load row i: lower part packed from 0, upper part packed after i. */
      for(j=A->ptr[i];j<A->ptr[i+1];j++)
      {
        col = A->index[j];
        if( col<is || col>=ie ) continue;
        t = A->value[j];
        if( col < i )
        {
          jbuf[my_rank*n+lenl] = col;
          iw[my_rank*n+col] = lenl;
          w[my_rank*n+lenl] = t;
          lenl++;
        }
        else if( col == i )
        {
          w[my_rank*n+i] = t;
        }
        else
        {
          lenu++;
          jpos = i + lenu;
          jbuf[my_rank*n+jpos] = col;
          iw[my_rank*n+col] = jpos;
          w[my_rank*n+jpos] = t;
        }
      }

      j = -1;
      len = 0;

      /* Eliminate the lower entries in ascending column order. */
      while( ++j < lenl )
      {
        /* Selection step: bring the smallest remaining column to slot j. */
        jrow = jbuf[my_rank*n+j];
        jpos = j;
        for(k=j+1;k<lenl;k++)
        {
          if( jbuf[my_rank*n+k]<jrow )
          {
            jrow = jbuf[my_rank*n+k];
            jpos = k;
          }
        }
        if( jpos!=j )
        {
          col = jbuf[my_rank*n+j];
          jbuf[my_rank*n+j] = jbuf[my_rank*n+jpos];
          jbuf[my_rank*n+jpos] = col;
          iw[my_rank*n+jrow] = j;
          iw[my_rank*n+col] = jpos;
          t = w[my_rank*n+j];
          w[my_rank*n+j] = w[my_rank*n+jpos];
          w[my_rank*n+jpos] = t;
        }
        /* Multiplier for row jrow (D holds reciprocal pivots). */
        fact = w[my_rank*n+j] * D->value[jrow];
        w[my_rank*n+j] = fact;
        iw[my_rank*n+jrow] = -1;

        /* Combine with row jrow of U; small new fill-in is dropped. */
        for(k=0;k<U->nnz[jrow];k++)
        {
          col = U->index[jrow][k];
          jpos = iw[my_rank*n+col];
          lxu = -fact * U->value[jrow][k];

          if( fabs(lxu) < tolnorm && jpos==-1 ) continue;
          if( col >= i )
          {
            if( jpos == -1 )
            {
              lenu++;
              upos = i + lenu;
              jbuf[my_rank*n+upos] = col;
              iw[my_rank*n+col] = upos;
              w[my_rank*n+upos] = lxu;
            }
            else
            {
              w[my_rank*n+jpos] += lxu;
            }
          }
          else
          {
            if( jpos == -1 )
            {
              jbuf[my_rank*n+lenl] = col;
              iw[my_rank*n+col] = lenl;
              w[my_rank*n+lenl] = lxu;
              lenl++;
            }
            else
            {
              w[my_rank*n+jpos] += lxu;
            }
          }
        }
      }

      /* Clear the column markers of the diagonal and the upper part. */
      iw[my_rank*n+i] = -1;
      for(j=0;j<lenu;j++)
      {
        iw[ my_rank*n+jbuf[my_rank*n+i+j+1] ] = -1;
      }

      D->value[i] = 1.0 / w[my_rank*n+i];


      /* Select and store the L entries; iw is reused as the permutation. */
      len = _min(lfil,lenl);
      for(j=0;j<lenl;j++)
      {
        wn[my_rank*n+j] = fabs(w[my_rank*n+j]);
        iw[my_rank*n+j] = j;
      }
      lis_sort_di(0,lenl-1,&wn[my_rank*n],&iw[my_rank*n]);
      lis_sort_i(0,len-1,&iw[my_rank*n]);
      
      L->nnz[i] = len;
      if( len>0 )
      {
        L->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
        L->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
      }
      for(j=0;j<len;j++)
      {
        jpos = iw[my_rank*n+j];
        L->index[i][j] = jbuf[my_rank*n+jpos];
        L->value[i][j] = w[my_rank*n+jpos];
      }
      for(j=0;j<lenl;j++) iw[my_rank*n+j] = -1;

      /* Same selection for the U entries (positions i+1..i+lenu). */
      len = _min(lfil,lenu);
      for(j=0;j<lenu;j++)
      {
        wn[my_rank*n+j] = fabs(w[my_rank*n+i+j+1]);
        iw[my_rank*n+j] = i+j+1;
      }
      lis_sort_di(0,lenu-1,&wn[my_rank*n],&iw[my_rank*n]);
      lis_sort_i(0,len-1,&iw[my_rank*n]);
      
      U->nnz[i] = len;
      if( len>0 )
      {
        U->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
        U->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
      }
      for(j=0;j<len;j++)
      {
        jpos = iw[my_rank*n+j];
        U->index[i][j] = jbuf[my_rank*n+jpos];
        U->value[i][j] = w[my_rank*n+jpos];
      }
      for(j=0;j<lenu;j++) iw[my_rank*n+j] = -1;
    }
  }

  precon->L  = L;
  precon->U  = U;
  precon->D  = D;

  lis_free2(4,w,iw,wn,jbuf);

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#else
  LIS_INT        err;
  LIS_INT        i,j,k;
  LIS_INT        n,lfil,len;
  LIS_SCALAR    gamma,t,tol,m;
  LIS_MATRIX    A;
  LIS_MATRIX_ILU  L,U;
  LIS_VECTOR    D;

  LIS_SCALAR    tnorm, tolnorm;
  LIS_SCALAR    fact,lxu,*wn,*w;
  LIS_INT        lenu,lenl,col,jpos,jrow,upos;
  LIS_INT        *jbuf,*iw;

  LIS_DEBUG_FUNC_IN;


  A      = solver->A;
  n      = A->n;
  tol    = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
  m      = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
  gamma  = solver->params[LIS_PARAMS_GAMMA-LIS_OPTIONS_LEN];
  lfil   = (LIS_INT)(((double)A->nnz/(2.0*n))*m);

  L      = NULL;
  U      = NULL;


  err = lis_matrix_ilu_create(n,1,&L);
  if( err ) return err;
  err = lis_matrix_ilu_create(n,1,&U);
  if( err ) return err;
  err = lis_matrix_ilu_setCR(L);
  if( err ) return err;
  err = lis_matrix_ilu_setCR(U);
  if( err ) return err;
  err = lis_vector_duplicate(A,&D);
  if( err )
  {
    return err;
  }

  w   = (LIS_SCALAR *)lis_malloc((n+1)*sizeof(LIS_SCALAR),"lis_precon_create_ilut_csr::w");
  if( w==NULL )
  {
    LIS_SETERR_MEM((n+1)*sizeof(LIS_SCALAR));
    return LIS_OUT_OF_MEMORY;
  }
  wn = (LIS_SCALAR *)lis_malloc(n*sizeof(LIS_SCALAR),"lis_precon_create_ilut_csr::w");
  if( wn==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
    lis_free2(1,w);
    return LIS_OUT_OF_MEMORY;
  }

  jbuf   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_ilut_csr::iw");
  if( jbuf==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_INT));
    lis_free2(2,w,wn);
    return LIS_OUT_OF_MEMORY;
  }
  iw   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_ilut_csr::iw");
  if( iw==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_INT));
    lis_free2(3,w,wn,jbuf);
    return LIS_OUT_OF_MEMORY;
  }


  /* Mark every column as "not in the work row". */
  for(i=0;i<n;i++) iw[i] = -1;

  for(i=0;i<n;i++)
  {
    /* Mean absolute value of the stored entries of row i. */
    tnorm = 0;
    for(j=A->ptr[i];j<A->ptr[i+1];j++)
    {
      tnorm += fabs(A->value[j]);
    }
    tnorm   = tnorm / (double)(A->ptr[i+1]-A->ptr[i]);
    tolnorm = tol * tnorm;

    lenu = 0;
    lenl = 0;
    jbuf[i] = i;
    w[i] = 0;
    iw[i] = i;

    /* Load row i: lower part packed from 0, upper part packed after i. */
    for(j=A->ptr[i];j<A->ptr[i+1];j++)
    {
      col = A->index[j];
      #ifdef USE_MPI
        /* Skip columns beyond the local n rows. */
        if( col>n-1 ) continue;
      #endif
      t = A->value[j];
      if( col < i )
      {
        jbuf[lenl] = col;
        iw[col] = lenl;
        w[lenl] = t;
        lenl++;
      }
      else if( col == i )
      {
        w[i] = t;
      }
      else
      {
        lenu++;
        jpos = i + lenu;
        jbuf[jpos] = col;
        iw[col] = jpos;
        w[jpos] = t;
      }
    }

    j = -1;
    len = 0;

    /* Eliminate the lower entries in ascending column order. */
    while( ++j < lenl )
    {
      /* Selection step: bring the smallest remaining column to slot j. */
      jrow = jbuf[j];
      jpos = j;
      for(k=j+1;k<lenl;k++)
      {
        if( jbuf[k]<jrow )
        {
          jrow = jbuf[k];
          jpos = k;
        }
      }
      if( jpos!=j )
      {
        col = jbuf[j];
        jbuf[j] = jbuf[jpos];
        jbuf[jpos] = col;
        iw[jrow] = j;
        iw[col] = jpos;
        t = w[j];
        w[j] = w[jpos];
        w[jpos] = t;
      }
      /* Multiplier for row jrow (D holds reciprocal pivots). */
      fact = w[j] * D->value[jrow];
      w[j] = fact;
      iw[jrow] = -1;

      /* Combine with row jrow of U; small new fill-in is dropped. */
      for(k=0;k<U->nnz[jrow];k++)
      {
        col = U->index[jrow][k];
        jpos = iw[col];
        lxu = -fact * U->value[jrow][k];

        if( fabs(lxu) < tolnorm && jpos==-1 ) continue;
        if( col >= i )
        {
          if( jpos == -1 )
          {
            lenu++;
            upos = i + lenu;
            jbuf[upos] = col;
            iw[col] = upos;
            w[upos] = lxu;
          }
          else
          {
            w[jpos] += lxu;
          }
        }
        else
        {
          if( jpos == -1 )
          {
            jbuf[lenl] = col;
            iw[col] = lenl;
            w[lenl] = lxu;
            lenl++;
          }
          else
          {
            w[jpos] += lxu;
          }
        }
      }
/*      for(kk=0;kk<bs;kk++)
      {
        w[bs*len+kk] = -buf_fact[kk];
      }
      jbuf[len] = jrow;
      len++;*/
    }

    /* Clear the column markers of the diagonal and the upper part. */
    iw[i] = -1;
    for(j=0;j<lenu;j++)
    {
      iw[ jbuf[i+j+1] ] = -1;
    }

    D->value[i] = 1.0 / w[i];


    /* Select and store the L entries; iw is reused as the permutation. */
    len = _min(lfil,lenl);
    for(j=0;j<lenl;j++)
    {
      wn[j] = fabs(w[j]);
      iw[j] = j;
    }
    lis_sort_di(0,lenl-1,wn,iw);
    lis_sort_i(0,len-1,iw);
    
    L->nnz[i] = len;
    if( len>0 )
    {
      L->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
      L->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
    }
    for(j=0;j<len;j++)
    {
      jpos = iw[j];
      L->index[i][j] = jbuf[jpos];
      L->value[i][j] = w[jpos];
    }
    for(j=0;j<lenl;j++) iw[j] = -1;

    /* Same selection for the U entries (positions i+1..i+lenu). */
    len = _min(lfil,lenu);
    for(j=0;j<lenu;j++)
    {
      wn[j] = fabs(w[i+j+1]);
      iw[j] = i+j+1;
    }
    lis_sort_di(0,lenu-1,wn,iw);
    lis_sort_i(0,len-1,iw);
    
    U->nnz[i] = len;
    if( len>0 )
    {
      U->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
      U->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
    }
    for(j=0;j<len;j++)
    {
      jpos = iw[j];
      U->index[i][j] = jbuf[jpos];
      U->value[i][j] = w[jpos];
    }
    for(j=0;j<lenu;j++) iw[j] = -1;
  }

  precon->L  = L;
  precon->U  = U;
  precon->D  = D;

  lis_free2(4,w,iw,wn,jbuf);

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#endif
}
Exemplo n.º 11
0
/*
 * lis_vector_nrm2ex_mm
 *
 * Reduces the (value, value_lo) pairs of vx with the LIS_QUAD_FSA family of
 * macros, optionally sums the partial results across MPI processes, and
 * finally applies LIS_QUAD_SQRT, writing the result to val->hi[0] and
 * val->lo[0].  Going by the names this is presumably the 2-norm of vx in
 * double-double ("quad") precision (FSA = accumulate the square of each
 * element); the macros are defined elsewhere.
 *
 *   vx  : input vector; its work[] scratch slots 0..3 and 8..11 may be
 *         overwritten depending on the build options.
 *   val : output hi/lo pointer pair; element [0] of each is written.
 *
 * With OpenMP the per-thread partial sums live in the global lis_vec_tmp
 * array, LIS_VEC_TMP_PADD scalars apart, and are combined serially.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_nrm2ex_mm(LIS_VECTOR vx, LIS_QUAD_PTR *val)
{
	LIS_INT i,n;
	LIS_SCALAR *x,*xl;
	LIS_QUAD_PTR dotm2,dotm,tmpm;
	#ifdef _OPENMP
		LIS_INT is,ie,nprocs,my_rank;
		LIS_SCALAR *gt;
	#endif
	#ifdef USE_MPI
		MPI_Comm comm;
	#endif
	/* Presumably declares the temporaries named in the private() clauses. */
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = vx->n;
	x  = vx->value;
	xl = vx->value_lo;
	/* Scratch layout inside vx->work:
	     dotm2 : two-slot accumulator, hi at [0],[1] and lo at [2],[3]
	     dotm  : local sum, hi at [8], lo at [9]
	     tmpm  : MPI-reduced sum, hi at [10], lo at [11] */
	dotm2.hi = &vx->work[0];
	dotm2.lo = &vx->work[2];
	dotm.hi = &vx->work[8];
	dotm.lo = &vx->work[9];
	tmpm.hi = &vx->work[10];
	tmpm.lo = &vx->work[11];
	#ifdef USE_MPI
		comm   = vx->comm;
	#endif
	#ifdef _OPENMP
		gt     = lis_vec_tmp;
		nprocs = omp_get_max_threads();
		#ifndef USE_SSE2
			#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#else
			#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
		#endif
		{
			my_rank = omp_get_thread_num();
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			#ifndef USE_FMA2_SSE2
				/* One accumulator per thread: hi in slot 0, lo in slot 1. */
				gt[my_rank*LIS_VEC_TMP_PADD] = gt[my_rank*LIS_VEC_TMP_PADD+1] = 0.0;
				#pragma cdir nodep
				for(i=is;i<ie;i++)
				{
					LIS_QUAD_FSA(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],x[i],xl[i]);
				}
			#else
				/* Two accumulators per thread for the paired macro:
				   hi parts in slots 0,1 and lo parts in slots 2,3. */
				gt[my_rank*LIS_VEC_TMP_PADD  ] = gt[my_rank*LIS_VEC_TMP_PADD+1] = 0.0;
				gt[my_rank*LIS_VEC_TMP_PADD+2] = gt[my_rank*LIS_VEC_TMP_PADD+3] = 0.0;
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(i=is;i<ie-1;i+=2)
				{
					LIS_QUAD_FSA2_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],x[i],xl[i]);
				}
				/* Fold the two accumulators into (slot 0, slot 1). */
				LIS_QUAD_ADD_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+2],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD+3]);
				/* Tail: at most one leftover element. */
				for(;i<ie;i++)
				{
					LIS_QUAD_FSA_SSE2(gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],gt[my_rank*LIS_VEC_TMP_PADD],gt[my_rank*LIS_VEC_TMP_PADD+1],x[i],xl[i]);
				}
			#endif
		}
		/* Serial combination of the per-thread partial sums into dotm. */
		dotm.hi[0] = dotm.lo[0] = 0.0;
		for(i=0;i<nprocs;i++)
		{
			#ifndef USE_SSE2
				LIS_QUAD_ADD(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#else
				LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#endif
		}
	#else
		#ifndef USE_FMA2_SSE2
			/* Serial scalar path: accumulate directly into dotm. */
			dotm.hi[0] = dotm.lo[0] = 0.0;
			#pragma cdir nodep
			for(i=0;i<n;i++)
			{
				LIS_QUAD_FSA(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#else
			/* Serial paired path: accumulate two at a time into dotm2,
			   fold into dotm, then handle the possible leftover element. */
			dotm2.hi[0] = dotm2.hi[1] = 0.0;
			dotm2.lo[0] = dotm2.lo[1] = 0.0;
			for(i=0;i<n-1;i+=2)
			{
				LIS_QUAD_FSA2_SSE2(dotm2.hi[0],dotm2.lo[0],dotm2.hi[0],dotm2.lo[0],x[i],xl[i]);
			}
			LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm2.hi[0],dotm2.lo[0],dotm2.hi[1],dotm2.lo[1]);
			for(;i<n;i++)
			{
				LIS_QUAD_FSA_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#endif
	#endif
	#ifdef USE_MPI
		/* dotm.hi/lo and tmpm.hi/lo are adjacent work[] slots, so one element
		   of LIS_MPI_MSCALAR presumably covers a whole hi/lo pair. */
		MPI_Allreduce(dotm.hi,tmpm.hi,1,LIS_MPI_MSCALAR,LIS_MPI_MSUM,comm);
		#ifndef USE_SSE2
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#else
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#endif
	#else
		#ifndef USE_SSE2
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#else
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#endif
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}