Beispiel #1
0
/*
 * lis_vector_scaleex_nm
 *
 * Scales a vector in place by the scalar alpha.  The vector is stored as
 * two arrays, vx->value and vx->value_lo; element i of both arrays is
 * passed as input and output to a LIS_QUAD_MULD-family macro together
 * with alpha.
 *
 * Build variants:
 *   USE_FMA2_SSE2 defined : alpha is written to vx->work[0] and
 *       vx->work[1]; the index range [0,n) is split per thread with
 *       LIS_GET_ISIE; elements are processed two per iteration with
 *       LIS_QUAD_MULD2_SSE2 and a trailing single element, if any, with
 *       LIS_QUAD_MULD_SSE2.
 *   otherwise : a single loop applies LIS_QUAD_MULD to every element.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_vector_scaleex_nm(LIS_SCALAR alpha, LIS_VECTOR vx)
{
	LIS_SCALAR *x,*xl;
	LIS_SCALAR *aa;
	LIS_INT i,n,is,ie,nprocs,my_rank;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	x  = vx->value;
	xl = vx->value_lo;
	aa = vx->work;
	n  = vx->n;

	#ifdef USE_FMA2_SSE2
		/* Two adjacent copies of alpha; presumably read as a pair by
		 * LIS_QUAD_MULD2_SSE2 through aa[0] - confirm against the macro. */
		aa[0] = aa[1] = alpha;
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
		#else
			nprocs = 1;
		#endif
		#ifdef _OPENMP
		#ifdef USE_SSE2
			#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
		#else
			#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#endif
		#endif
		{
			#ifdef _OPENMP
				my_rank = omp_get_thread_num();
			#else
				my_rank = 0;
			#endif
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

			/* Main part of this thread's range: two elements per step. */
			for(i=is; i<ie-1; i+=2)
			{
				LIS_QUAD_MULD2_SSE2(x[i],xl[i],x[i],xl[i],aa[0]);
			}
			/* At most one element is left when the range length is odd. */
			for(; i<ie; i++)
			{
				LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],alpha);
			}
		}
	#else
	    #pragma cdir nodep
		#ifdef USE_SSE2
			#pragma omp parallel for private(i,bh,ch,sh,th,bl,sl,tl,p1,p2,t0,t1,t2,is,ie,my_rank)
		#else
			#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#endif
		for(i=0; i<n; i++)
		{
			LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],alpha);
		}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
 /*
  * lis_quad_mul_dd_d
  *
  * Function wrapper around the LIS_QUAD_MULD macros: the (hi,lo) pair of
  * *b and the plain double c are the inputs, the (hi,lo) pair of *a
  * receives the result.  The SSE2 form of the macro is selected when
  * USE_SSE2 is defined.
  *
  * a : destination pair (written)
  * b : source pair (read only)
  * c : double-precision factor
  */
 void lis_quad_mul_dd_d(LIS_QUAD *a, const LIS_QUAD *b, const double c)
{
	/* Temporaries used inside the macro expansion. */
	LIS_QUAD_DECLAR;

	#ifdef USE_SSE2
		LIS_QUAD_MULD_SSE2(a->hi,a->lo,b->hi,b->lo,c);
	#else
		LIS_QUAD_MULD(a->hi,a->lo,b->hi,b->lo,c);
	#endif
}
/*
 * lis_psolve_jacobi
 *
 * Applies the Jacobi preconditioner, i.e. solves
 *     M x = b,   M = D
 * by an element-wise product of B with the values stored in
 * solver->precon->D (presumably the already inverted diagonal, since the
 * code multiplies rather than divides - confirm against the setup code).
 *
 * solver : supplies precon->D (values and length n)
 * B      : right-hand side
 * X      : result; X->value is overwritten, and X->value_lo as well on
 *          the quad-precision path
 *
 * With USE_QUAD_PRECISION, a B whose precision is not
 * LIS_PRECISION_DEFAULT is multiplied through LIS_QUAD_MULD so that both
 * words of B are used and both words of X are produced.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
int lis_psolve_jacobi(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
	/* LIS_INT, like the other routines in this file, so that the length
	 * read from the vector is not narrowed when LIS_INT is wider than int. */
	LIS_INT i,n;
	LIS_SCALAR *b,*x,*d;
	LIS_PRECON	precon;
	LIS_QUAD_DECLAR;
	#ifdef USE_QUAD_PRECISION
		LIS_SCALAR *xl;
	#endif

	LIS_DEBUG_FUNC_IN;

	/*
	 *  Mx = b
	 *  M  = D
	 */

	precon = solver->precon;
	n = precon->D->n;
	d = precon->D->value;
	b = B->value;
	x = X->value;
	#ifdef USE_QUAD_PRECISION
		xl = X->value_lo;
	#endif

	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			/* Default precision: plain element-wise product. */
			#ifdef _OPENMP
			#pragma omp parallel for private(i)
			#endif
			for(i=0; i<n; i++)
			{
				x[i] = b[i] * d[i];
			}
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			/* Extended precision: (x,xl) = (B hi,B lo) * d[i]. */
			#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel for private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			#endif
			for(i=0; i<n; i++)
			{
				#ifndef USE_SSE2
					LIS_QUAD_MULD(x[i],xl[i],B->value[i],B->value_lo[i],d[i]);
				#else
					LIS_QUAD_MULD_SSE2(x[i],xl[i],B->value[i],B->value_lo[i],d[i]);
				#endif
				/* x[i] = b[i] * d[i]; */
			}
		}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
Beispiel #4
0
/*
 * lis_psolvet_sainv
 *
 * Applies the transposed SAINV preconditioner
 *     x  = M'b
 *     M' = W D^{-1} Z'
 * in three steps:
 *   1. lis_matvect_ilu(A,Z,B,X)
 *   2. t[i] = X[i] * d[i] for all i (d = precon->D; presumably it already
 *      holds the inverted entries, since the code multiplies - confirm)
 *   3. lis_matvec_ilu(A,W,t,X)
 * where W = precon->L, Z = precon->U and t = precon->temp.
 *
 * With USE_QUAD_PRECISION and a B whose precision is not
 * LIS_PRECISION_DEFAULT, step 2 goes through LIS_QUAD_MULD and fills
 * both t->value and t->value_lo.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_psolvet_sainv(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
	LIS_PRECON precon;
	LIS_MATRIX A;
	LIS_MATRIX_ILU W,Z;
	LIS_VECTOR t,d;
	LIS_INT i,n;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	precon = solver->precon;
	A = precon->A;
	Z = precon->U;
	W = precon->L;
	d = precon->D;
	t = precon->temp;
	n = W->n;

	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			/* Step 1: result of the transposed product goes to X. */
			lis_matvect_ilu(A,Z,B,X);

			/* Step 2: diagonal scaling into the scratch vector. */
			#ifdef _OPENMP
			#pragma omp parallel for private(i)
			#endif
			for(i=0; i<n; i++)
			{
				t->value[i] = X->value[i]*d->value[i];
			}

			/* Step 3: final product back into X. */
			lis_matvec_ilu(A,W,t,X);
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			/* Same three steps; only the scaling uses both words. */
			lis_matvect_ilu(A,Z,B,X);

			#ifdef _OPENMP
			#ifdef USE_SSE2
				#pragma omp parallel for private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#else
				#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#endif
			#endif
			for(i=0; i<n; i++)
			{
				/* (t hi,t lo) = (X hi,X lo) * d[i] */
				#ifdef USE_SSE2
					LIS_QUAD_MULD_SSE2(t->value[i],t->value_lo[i],X->value[i],X->value_lo[i],d->value[i]);
				#else
					LIS_QUAD_MULD(t->value[i],t->value_lo[i],X->value[i],X->value_lo[i],d->value[i]);
				#endif
			}

			lis_matvec_ilu(A,W,t,X);
		}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
Beispiel #5
0
/*
 * lis_matrix_solvet_csr
 *
 * Copies B into X and then updates X in place with one or two scatter
 * sweeps over the split parts of A (presumably the transposed triangular
 * solve, going by the function name - confirm against the caller).
 *
 * A    : supplies n, the parts A->L and A->U in CSR form (ptr/index/value)
 *        and the per-row factors A->WD->value.
 * B    : right-hand side; copied with lis_vector_copy for
 *        LIS_PRECISION_DEFAULT, with lis_vector_copyex_mm otherwise.
 * X    : result vector, overwritten.
 * flag : LIS_MATRIX_LOWER - forward sweep scattering along rows of A->U
 *        LIS_MATRIX_UPPER - backward sweep scattering along rows of A->L
 *        LIS_MATRIX_SSOR  - forward sweep with A->U followed by a
 *                           backward sweep with A->L
 *        Any other value leaves X as the plain copy of B (the switch has
 *        no default case).
 *
 * Notes:
 *  - The LOWER and UPPER cases only touch X->value, also when quad
 *    precision is compiled in.
 *  - In the OpenMP SSOR variants each thread sweeps its own index range
 *    [is,ie) obtained from LIS_GET_ISIE and skips updates whose target
 *    index lies outside that range, so the result is not the same as the
 *    serial sweep over [0,n).
 *  - w2 is declared but not used in this function.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_matrix_solvet_csr(LIS_MATRIX A, LIS_VECTOR B, LIS_VECTOR X, LIS_INT flag)
{
	LIS_INT i,j,jj,n;
	LIS_SCALAR t;
	LIS_SCALAR *x;
	#ifdef _OPENMP
		LIS_INT is,ie,my_rank,nprocs;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_QUAD w1,w2;
		LIS_SCALAR *xl;
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = A->n;
	x  = X->value;
	#ifdef USE_QUAD_PRECISION
		xl = X->value_lo;
	#endif

	/* Start from X = B; the quad path copies through lis_vector_copyex_mm. */
	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			lis_vector_copy(B,X);
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			lis_vector_copyex_mm(B,X);
		}
	#endif
	switch(flag)
	{
	case LIS_MATRIX_LOWER:
		/* Forward sweep: finalize x[i], then push its contribution to the
		 * entries listed in row i of A->U. */
		for(i=0;i<n;i++)
		{
			x[i]   = x[i] * A->WD->value[i];
			for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
			{
				x[A->U->index[j]] -= A->U->value[j] * x[i];
			}
		}
		break;
	case LIS_MATRIX_UPPER:
		/* Backward sweep: same pattern using rows of A->L. */
		for(i=n-1;i>=0;i--)
		{
			x[i]   = x[i] * A->WD->value[i];
			for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
			{
				x[A->L->index[j]] -= A->L->value[j] * x[i];
			}
		}
		break;
	case LIS_MATRIX_SSOR:
	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#pragma omp parallel private(i,j,jj,t,is,ie,my_rank)
				{
					my_rank = omp_get_thread_num();
					LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
					/* Forward sweep over this thread's rows.  The scaled
					 * value t is only used for the scatter; x[i] itself is
					 * not stored here. */
					for(i=is;i<ie;i++)
					{
						t   = x[i] * A->WD->value[i];
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							jj = A->U->index[j];
							/* Targets owned by another thread are skipped. */
							if( jj<is || jj>=ie ) continue;
							x[jj] -= A->U->value[j] * t;
						}
					}
					/* Backward sweep: x[i] is scaled and stored, then
					 * scattered along row i of A->L. */
					for(i=ie-1;i>=is;i--)
					{
						t    = x[i] * A->WD->value[i];
						x[i] = t;
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							jj = A->L->index[j];
							/* Only the lower bound of the range is checked here. */
							if( jj<is ) continue;
							x[jj] -= A->L->value[j] * t;
						}
					}
				}
			#else
				/* Serial forward sweep over all rows. */
				for(i=0;i<n;i++)
				{
					t   = x[i] * A->WD->value[i];
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						x[A->U->index[j]] -= A->U->value[j] * t;
					}
				}
				/* Serial backward sweep over all rows. */
				for(i=n-1;i>=0;i--)
				{
					t    = x[i] * A->WD->value[i];
					x[i] = t;
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						x[A->L->index[j]] -= A->L->value[j] * t;
					}
				}
			#endif
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			/* Quad-precision SSOR: same two sweeps, with w1 taking the role
			 * of t and every update applied to the (x,xl) pair.  The
			 * commented-out statements give the scalar equivalent. */
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					my_rank = omp_get_thread_num();
					LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
					for(i=is;i<ie;i++)
					{
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						/* t   = x[i] * A->WD->value[i]; */
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							jj = A->U->index[j];
							/* Targets owned by another thread are skipped. */
							if( jj<is || jj>=ie ) continue;
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
							#else
								LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
							#endif
							/* x[A->U->index[j]] -= A->U->value[j] * t; */
						}
					}
					for(i=ie-1;i>=is;i--)
					{
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						x[i]  = w1.hi;
						xl[i] = w1.lo;
						/* t    = x[i] * A->WD->value[i]; */
						/* x[i] = t; */
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							jj = A->L->index[j];
							/* Both bounds are checked here, unlike the
							 * default-precision OpenMP sweep above. */
							if( jj<is || jj>=ie ) continue;
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
							#else
								LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
							#endif
							/* x[A->L->index[j]] -= A->L->value[j] * t; */
						}
					}
				}
			#else
				/* Serial quad-precision forward sweep. */
				for(i=0;i<n;i++)
				{
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					/* t   = x[i] * A->WD->value[i]; */
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						jj = A->U->index[j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
						#endif
						/* x[A->U->index[j]] -= A->U->value[j] * t; */
					}
				}
				/* Serial quad-precision backward sweep. */
				for(i=n-1;i>=0;i--)
				{
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					x[i]  = w1.hi;
					xl[i] = w1.lo;
					/* t    = x[i] * A->WD->value[i]; */
					/* x[i] = t; */
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						jj = A->L->index[j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
						#endif
						/* x[A->L->index[j]] -= A->L->value[j] * t; */
					}
				}
			#endif
		}
	#endif
		break;
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_psolvet_ilut_csr
 *
 * Applies the transposed ILUT preconditioner stored in solver->precon
 * (L = precon->L, U = precon->U, D = precon->D) to B and writes the
 * result to X:
 *   1. X is initialised with a copy of B.
 *   2. Forward sweep: x[i] is multiplied by D->value[i] and its
 *      contribution is subtracted from the entries listed in row i of U
 *      (per-row arrays U->index[i][], U->value[i][], length U->nnz[i]).
 *   3. Backward sweep: the contribution of x[i] is subtracted from the
 *      entries listed in row i of L.
 *
 * With USE_QUAD_PRECISION and a B whose precision is not
 * LIS_PRECISION_DEFAULT, every update is done on the (x,xl) word pair
 * through LIS_QUAD_MULD / LIS_QUAD_FMAD, and B is copied with
 * lis_vector_copyex_mm so that xl starts from B as well.
 *
 * The function body exists in two complete variants selected by _OPENMP.
 * In the OpenMP variant each thread sweeps its own range [is,ie) from
 * LIS_GET_ISIE.
 * NOTE(review): unlike lis_matrix_solvet_csr, the OpenMP sweeps do not
 * filter target indices by [is,ie); this presumably relies on the
 * factors only referencing indices inside the owning thread's range -
 * confirm against the factorization routine.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_psolvet_ilut_csr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
#ifdef _OPENMP
  LIS_INT i,j,jj,n;
  LIS_INT is,ie,my_rank,nprocs;
  LIS_SCALAR *b,*x;
  LIS_MATRIX_ILU L,U;
  LIS_VECTOR D;
  LIS_PRECON  precon;
  LIS_QUAD_DECLAR;
  #ifdef USE_QUAD_PRECISION
    LIS_SCALAR *xl;
  #endif

  LIS_DEBUG_FUNC_IN;

  precon = solver->precon;
  L = precon->L;
  U = precon->U;
  D = precon->D;
  b = B->value;
  x = X->value;
  #ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
  #endif
  n = solver->A->n;
  nprocs = omp_get_max_threads();

  #ifdef USE_QUAD_PRECISION
    if( B->precision==LIS_PRECISION_DEFAULT )
    {
  #endif
      lis_vector_copy(B,X);
      #pragma omp parallel private(i,j,jj,is,ie,my_rank)
      {
        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        /* Forward sweep over this thread's rows. */
        for(i=is;i<ie;i++)
        {
          x[i] = D->value[i]*x[i];
          for(j=0;j<U->nnz[i];j++)
          {
            jj     = U->index[i][j];
            x[jj] -= U->value[i][j] * x[i];
          }
        }
        /* Backward sweep over this thread's rows. */
        for(i=ie-1;i>=is;i--)
        {
          for(j=0;j<L->nnz[i];j++)
          {
            jj     = L->index[i][j];
            x[jj] -= L->value[i][j] * x[i];
          }
        }
      }
  #ifdef USE_QUAD_PRECISION
    }
    else
    {
      /* Copy both words of B; the sweeps below read and write xl[]. */
      lis_vector_copyex_mm(B,X);
      nprocs = omp_get_max_threads();
      #ifndef USE_SSE2
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
      #else
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
      #endif
      {
        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        for(i=is;i<ie;i++)
        {
          #ifndef USE_SSE2
            LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
          #else
            LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
          #endif
/*          x[i] = D->value[i]*x[i];*/
          for(j=0;j<U->nnz[i];j++)
          {
            jj     = U->index[i][j];
            #ifndef USE_SSE2
              LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
            #else
              LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
            #endif
/*            x[jj] -= U->value[i][j] * x[i];*/
          }
        }
        for(i=ie-1;i>=is;i--)
        {
          for(j=0;j<L->nnz[i];j++)
          {
            jj     = L->index[i][j];
            #ifndef USE_SSE2
              LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
            #else
              LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
            #endif
/*            x[jj] -= L->value[i][j] * x[i];*/
          }
        }
      }
    }
  #endif

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#else
  LIS_INT i,j,jj,n;
  LIS_SCALAR *b,*x;
  LIS_MATRIX_ILU L,U;
  LIS_VECTOR D;
  LIS_PRECON  precon;
  LIS_QUAD_DECLAR;
  #ifdef USE_QUAD_PRECISION
    LIS_SCALAR *xl;
  #endif


  LIS_DEBUG_FUNC_IN;

  precon = solver->precon;
  L = precon->L;
  U = precon->U;
  D = precon->D;
  b = B->value;
  x = X->value;
  #ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
  #endif
  n = solver->A->n;

  #ifdef USE_QUAD_PRECISION
    if( B->precision==LIS_PRECISION_DEFAULT )
    {
  #endif
      lis_vector_copy(B,X);
      /* Forward sweep over all rows. */
      for(i=0; i<n; i++)
      {
        x[i] = D->value[i]*x[i];
        for(j=0;j<U->nnz[i];j++)
        {
          jj     = U->index[i][j];
          x[jj] -= U->value[i][j] * x[i];
        }
      }
      /* Backward sweep over all rows. */
      for(i=n-1; i>=0; i--)
      {
        for(j=0;j<L->nnz[i];j++)
        {
          jj     = L->index[i][j];
          x[jj] -= L->value[i][j] * x[i];
        }
      }
  #ifdef USE_QUAD_PRECISION
    }
    else
    {
      /* Copy both words of B, as the OpenMP variant does: the sweeps
       * below read and write xl[], which must not keep whatever X held
       * before the call. */
      lis_vector_copyex_mm(B,X);
      for(i=0; i<n; i++)
      {
        #ifndef USE_SSE2
          LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
        #else
          LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
        #endif
/*        x[i] = D->value[i]*x[i];*/
        for(j=0;j<U->nnz[i];j++)
        {
          jj     = U->index[i][j];
          #ifndef USE_SSE2
            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
          #else
            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
          #endif
/*          x[jj] -= U->value[i][j] * x[i];*/
        }
      }
      for(i=n-1; i>=0; i--)
      {
        for(j=0;j<L->nnz[i];j++)
        {
          jj     = L->index[i][j];
          #ifndef USE_SSE2
            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
          #else
            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
          #endif
/*          x[jj] -= L->value[i][j] * x[i];*/
        }
      }
    }
  #endif

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#endif
}