/*
 * lis_matvec_ilu
 *
 * For every stored entry j of every row i (0 <= i < LU->n) of the row-wise
 * stored matrix LU, adds  LU->value[i][j] * X[i]  into  Y[LU->index[i][j]].
 * Y[0 .. A->np) is cleared first, so on return Y holds the product of the
 * transpose of LU with X.
 * NOTE(review): the name reads like a plain (non-transposed) product, but
 * the loops below scatter along rows; confirm the intent against callers.
 *
 * Parameters
 *   A   supplies np (length of Y that is cleared/written) and, when built
 *       with USE_MPI, the communication table used before the product.
 *   LU  row-wise factor storage: n rows, nnz[i] entries per row, with
 *       column numbers in index[i][] and coefficients in value[i][].
 *   X   input vector (value[], and value_lo[] in the quad-precision path).
 *   Y   output vector (value[], and value_lo[] in the quad-precision path).
 *
 * When USE_QUAD_PRECISION is defined the double-double path is taken
 * whenever X->precision != LIS_PRECISION_DEFAULT.
 *
 * OpenMP builds give each thread its own scratch slab of np scalars
 * (slab k starts at offset k*np) so that the scatter is race-free; the
 * slabs are summed into Y afterwards.
 * NOTE(review): the lis_malloc() results are used without a NULL check.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_matvec_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n,np;
	/* x and y are not referenced directly below; they are kept because the
	   opaque LIS_MATVEC_SENDRECV macro may rely on them -- TODO confirm. */
	LIS_SCALAR *x,*y;
	#ifdef _OPENMP
		LIS_INT nprocs,k;
		LIS_SCALAR t,*w;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_INT j0,j1;
		#ifdef _OPENMP
				LIS_SCALAR *ww,*wwl;
		#endif
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	np = A->np;
	n  = LU->n;
	x  = X->value;
	y  = Y->value;

	#ifdef USE_QUAD_PRECISION
	if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		/* ---- default (LIS_SCALAR) precision ---- */
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
			/* nprocs slabs of np scalars, one slab per thread */
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs::w" );
			#pragma omp parallel private(i,j,k,jj,t)
			{
				k = omp_get_thread_num();
				/* clear all slabs */
				#pragma omp for
				for(j=0;j<nprocs;j++)
				{
					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
				}
				/* scatter row i into this thread's slab (offset k*np) */
				#pragma omp for 
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj = k*np + LU->index[i][j];
						w[jj] += LU->value[i][j] * X->value[i];
					}
				}
				/* reduce the per-thread slabs into Y */
				#pragma omp for 
				for(i=0;i<np;i++)
				{
					t = 0.0;
					for(j=0;j<nprocs;j++)
					{
						t += w[j*np+i];
					}
					Y->value[i] = t;
				}
			}
			lis_free(w);
		#else
			for(i=0;i<np;i++)
			{
				Y->value[i] = 0.0;
			}
			for(i=0;i<n;i++)
			{
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					Y->value[jj] += LU->value[i][j] * X->value[i];
				}
			}
		#endif
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		/* ---- quad (value[] + value_lo[]) precision ---- */
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifdef _OPENMP
			#ifndef USE_FMA2_SSE2
				nprocs = omp_get_max_threads();
				/* ww: per-thread slabs paired with Y->value,
				   wwl: per-thread slabs paired with Y->value_lo */
				ww  = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs_mp::ww" );
				wwl = &ww[nprocs*np];
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for 
					for(i=0;i<n;i++)
					{
						for(j=0;j<LU->nnz[i];j++)
						{
							jj  = k*np + LU->index[i][j];
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#else
								LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for 
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifndef USE_SSE2
								LIS_QUAD_ADD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#else
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#else
				nprocs = omp_get_max_threads();
				ww  = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvect_crs_mp2::ww" );
				wwl = &ww[nprocs*np];
				#pragma omp parallel private(i,j,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0; i<n; i++)
					{
						/* entries of row i are handled two at a time ... */
						for(j=0;j<LU->nnz[i]-1;j+=2)
						{
							j0  = k*np + LU->index[i][j];
							j1  = k*np + LU->index[i][j+1];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
						/* ... followed by the leftover entry when nnz[i] is odd */
						for(;j<LU->nnz[i];j++)
						{
							/* The leftover entry must go into this thread's own
							   slab as well; without the k*np offset every thread
							   would race on slab 0 and the sums would be wrong. */
							j0  = k*np + LU->index[i][j];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for 
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifdef USE_SSE2
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#endif
		#else
			#ifndef USE_FMA2_SSE2
				for(i=0;i<np;i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj  = LU->index[i][j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#else
							LIS_QUAD_FMAD_SSE2(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#else
				for(i=0; i<np; i++)
				{
					Y->value[i]  = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0; i<n; i++)
				{
					/* paired entries first, then the odd leftover entry */
					for(j=0;j<LU->nnz[i]-1;j+=2)
					{
						j0  = LU->index[i][j];
						j1  = LU->index[i][j+1];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD2_SSE2_STSD(Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
					for(;j<LU->nnz[i];j++)
					{
						j0  = LU->index[i][j];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD_SSE2(Y->value[j0],Y->value_lo[j0],Y->value[j0],Y->value_lo[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#endif
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/* Example #2 (separator left over from a code-snippet listing; its stray counter line read "0") */
/*
 * lis_matrix_solvet_csr
 *
 * Copies B into X and then updates X in place with column-oriented
 * (scatter) sweeps over the split CSR parts A->L and A->U, scaling by
 * A->WD->value[] (presumably the stored inverse diagonal -- TODO confirm).
 *
 * flag selects the sweep:
 *   LIS_MATRIX_LOWER  forward sweep (i = 0..n-1) over the rows of A->U.
 *   LIS_MATRIX_UPPER  backward sweep (i = n-1..0) over the rows of A->L.
 *   LIS_MATRIX_SSOR   forward sweep over A->U followed by a backward sweep
 *                     over A->L.
 *   any other value   no sweep; X is left as the copy of B (the switch has
 *                     no default case).
 *
 * Only the SSOR case has a quad-precision (x[] + xl[]) and an OpenMP
 * variant; the LOWER and UPPER cases always operate on x[] alone.
 * In the OpenMP variants each thread works on the index range [is, ie)
 * delivered by LIS_GET_ISIE and skips updates that fall outside it.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_matrix_solvet_csr(LIS_MATRIX A, LIS_VECTOR B, LIS_VECTOR X, LIS_INT flag)
{
	LIS_INT i,j,jj,n;
	LIS_SCALAR t;
	LIS_SCALAR *x;
	#ifdef _OPENMP
		LIS_INT is,ie,my_rank,nprocs;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_QUAD w1,w2;
		LIS_SCALAR *xl;
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = A->n;
	x  = X->value;
	#ifdef USE_QUAD_PRECISION
		xl = X->value_lo;
	#endif

	/* X <- B; the quad-precision build uses lis_vector_copyex_mm() when B is
	   not of default precision. */
	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			lis_vector_copy(B,X);
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			lis_vector_copyex_mm(B,X);
		}
	#endif
	switch(flag)
	{
	case LIS_MATRIX_LOWER:
		/* forward sweep: finalize x[i], then push it into the entries listed
		   in row i of U */
		for(i=0;i<n;i++)
		{
			x[i]   = x[i] * A->WD->value[i];
			for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
			{
				x[A->U->index[j]] -= A->U->value[j] * x[i];
			}
		}
		break;
	case LIS_MATRIX_UPPER:
		/* backward sweep: finalize x[i], then push it into the entries listed
		   in row i of L */
		for(i=n-1;i>=0;i--)
		{
			x[i]   = x[i] * A->WD->value[i];
			for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
			{
				x[A->L->index[j]] -= A->L->value[j] * x[i];
			}
		}
		break;
	case LIS_MATRIX_SSOR:
	#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
	#endif
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#pragma omp parallel private(i,j,jj,t,is,ie,my_rank)
				{
					my_rank = omp_get_thread_num();
					LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
					/* forward sweep over this thread's rows; x[i] itself is
					   not overwritten here, only the scaled value t is
					   scattered */
					for(i=is;i<ie;i++)
					{
						t   = x[i] * A->WD->value[i];
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							jj = A->U->index[j];
							/* leave entries owned by other threads alone */
							if( jj<is || jj>=ie ) continue;
							x[jj] -= A->U->value[j] * t;
						}
					}
					/* backward sweep: store the scaled value and scatter it */
					for(i=ie-1;i>=is;i--)
					{
						t    = x[i] * A->WD->value[i];
						x[i] = t;
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							jj = A->L->index[j];
							/* only the lower bound is tested in this variant */
							if( jj<is ) continue;
							x[jj] -= A->L->value[j] * t;
						}
					}
				}
			#else
				/* forward sweep: scatter t = x[i]*WD[i] through row i of U
				   without overwriting x[i] */
				for(i=0;i<n;i++)
				{
					t   = x[i] * A->WD->value[i];
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						x[A->U->index[j]] -= A->U->value[j] * t;
					}
				}
				/* backward sweep: x[i] <- x[i]*WD[i], then scatter through
				   row i of L */
				for(i=n-1;i>=0;i--)
				{
					t    = x[i] * A->WD->value[i];
					x[i] = t;
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						x[A->L->index[j]] -= A->L->value[j] * t;
					}
				}
			#endif
	#ifdef USE_QUAD_PRECISION
		}
		else
		{
			/* quad-precision SSOR: same two sweeps, with (x[], xl[]) pairs
			   and the scaled value held in w1.hi / w1.lo */
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					my_rank = omp_get_thread_num();
					LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
					for(i=is;i<ie;i++)
					{
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						/* default-precision counterpart: t = x[i] * A->WD->value[i]; */
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							jj = A->U->index[j];
							/* leave entries owned by other threads alone */
							if( jj<is || jj>=ie ) continue;
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
							#else
								LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
							#endif
							/* default-precision counterpart: x[A->U->index[j]] -= A->U->value[j] * t; */
						}
					}
					for(i=ie-1;i>=is;i--)
					{
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						x[i]  = w1.hi;
						xl[i] = w1.lo;
						/* default-precision counterpart: t = x[i] * A->WD->value[i]; */
						/* default-precision counterpart: x[i] = t; */
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							jj = A->L->index[j];
							/* leave entries owned by other threads alone */
							if( jj<is || jj>=ie ) continue;
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
							#else
								LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
							#endif
							/* default-precision counterpart: x[A->L->index[j]] -= A->L->value[j] * t; */
						}
					}
				}
			#else
				for(i=0;i<n;i++)
				{
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					/* default-precision counterpart: t = x[i] * A->WD->value[i]; */
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						jj = A->U->index[j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
						#endif
						/* default-precision counterpart: x[A->U->index[j]] -= A->U->value[j] * t; */
					}
				}
				for(i=n-1;i>=0;i--)
				{
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					x[i]  = w1.hi;
					xl[i] = w1.lo;
					/* default-precision counterpart: t = x[i] * A->WD->value[i]; */
					/* default-precision counterpart: x[i] = t; */
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						jj = A->L->index[j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
						#endif
						/* default-precision counterpart: x[A->L->index[j]] -= A->L->value[j] * t; */
					}
				}
			#endif
		}
	#endif
		break;
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matvect_ilu
 *
 * For each row i (0 <= i < LU->n) of the row-wise stored matrix LU,
 * computes
 *     Y[i] = sum over j of  LU->value[i][j] * X[ LU->index[i][j] ]
 * i.e. a row-oriented (gather) product of LU with X.
 * NOTE(review): the trailing "t" in the name suggests a transposed product,
 * but the loops below gather along rows; confirm the intent against callers.
 *
 * Parameters
 *   A   only used for the MPI halo exchange (USE_MPI builds).
 *   LU  row-wise factor storage: n rows, nnz[i] entries per row, with
 *       column numbers in index[i][] and coefficients in value[i][].
 *   X   input vector (value[], and value_lo[] in the quad-precision path).
 *   Y   output vector; entries 0 .. n-1 are overwritten.
 *
 * When USE_QUAD_PRECISION is defined the double-double path is taken
 * whenever X->precision != LIS_PRECISION_DEFAULT.
 *
 * Each row writes only Y[i], so all OpenMP variants split the row loop
 * across threads with a plain "parallel for".
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_matvect_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n;
	/* x and y are not referenced directly below; they are kept because the
	   opaque LIS_MATVEC_SENDRECV macro may rely on them -- TODO confirm. */
	LIS_SCALAR t,*x,*y;
	LIS_QUAD_DECLAR;
	#ifdef USE_QUAD_PRECISION
		LIS_INT	j0,j1;
		LIS_QUAD_PD	tt;
	#endif

	LIS_DEBUG_FUNC_IN;

	n = LU->n;
	x = X->value;
	y = Y->value;

	#ifdef USE_QUAD_PRECISION
	if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		/* ---- default (LIS_SCALAR) precision ---- */
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
		#pragma omp parallel for private(i,j,jj,t)
		#endif
		for(i=0;i<n;i++)
		{
			t = 0.0;
			for(j=0;j<LU->nnz[i];j++)
			{
				jj = LU->index[i][j];
				t += LU->value[i][j] * X->value[jj];
			}
			Y->value[i] = t;
		}
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		/* ---- quad (value[] + value_lo[]) precision ---- */
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifndef USE_FMA2_SSE2
			/* The row loop must be work-shared ("parallel for"): with a bare
			   "parallel" every thread would run the whole loop and race on
			   the zero-then-accumulate of Y->value[i] / Y->value_lo[i]. */
			#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel for private(i,j,jj,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel for private(i,j,jj,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			#endif
			for(i=0;i<n;i++)
			{
				Y->value[i] = Y->value_lo[i] = 0.0;
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					#ifndef USE_SSE2
						LIS_QUAD_FMAD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#else
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#endif
				}
			}
		#else
			#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel for private(i,j,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel for private(i,j,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			#endif
			for(i=0;i<n;i++)
			{
				/* tt carries two partial sums (elements [0] and [1]) */
				tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
				/* entries of row i are consumed two at a time; the macro is
				   handed LU->value[i][j] and presumably also reads the
				   adjacent [j+1] coefficient -- TODO confirm */
				for(j=0;j<LU->nnz[i]-1;j+=2)
				{
					j0 = LU->index[i][j];
					j1 = LU->index[i][j+1];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],X->value[j0],X->value_lo[j0],X->value[j1],X->value_lo[j1],LU->value[i][j]);
					#endif
				}
				/* Y[i] <- tt[0] + tt[1] */
				#ifdef USE_SSE2
					LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],tt.hi[0],tt.lo[0],tt.hi[1],tt.lo[1]);
				#endif
				/* leftover entry when nnz[i] is odd */
				for(;j<LU->nnz[i];j++)
				{
					j0 = LU->index[i][j];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[j0],X->value_lo[j0],LU->value[i][j]);
					#endif
				}
			}
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_psolvet_ilut_csr
 *
 * Copies B into X and then updates X in place using the factors stored in
 * solver->precon (L, U row-wise; D as a vector):
 *   1. forward sweep, i = 0..n-1:  x[i] <- D[i]*x[i], then for every entry
 *      j of row i of U:  x[U->index[i][j]] -= U->value[i][j] * x[i];
 *   2. backward sweep, i = n-1..0: for every entry j of row i of L:
 *      x[L->index[i][j]] -= L->value[i][j] * x[i].
 * Both sweeps scatter along rows, i.e. they apply the factors in transposed
 * form.
 *
 * Parameters
 *   solver  supplies solver->precon (L, U, D) and solver->A->n.
 *   B       right-hand side; read only.
 *   X       result; overwritten.
 *
 * When USE_QUAD_PRECISION is defined and B->precision is not
 * LIS_PRECISION_DEFAULT, the same sweeps are carried out on the
 * (X->value, X->value_lo) pairs.
 *
 * OpenMP builds let each thread run both sweeps over its own index range
 * [is, ie) obtained from LIS_GET_ISIE.  No filtering of jj against that
 * range is done here; presumably the factors only reference indices inside
 * the thread's own range -- TODO confirm.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_psolvet_ilut_csr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
#ifdef _OPENMP
  LIS_INT i,j,jj,n;
  LIS_INT is,ie,my_rank,nprocs;
  LIS_SCALAR *b,*x;
  LIS_MATRIX_ILU L,U;
  LIS_VECTOR D;
  LIS_PRECON  precon;
  LIS_QUAD_DECLAR;
  #ifdef USE_QUAD_PRECISION
    LIS_SCALAR *xl;
  #endif

  LIS_DEBUG_FUNC_IN;

  precon = solver->precon;
  L = precon->L;
  U = precon->U;
  D = precon->D;
  b = B->value;
  x = X->value;
  #ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
  #endif
  n = solver->A->n;
  nprocs = omp_get_max_threads();

  #ifdef USE_QUAD_PRECISION
    if( B->precision==LIS_PRECISION_DEFAULT )
    {
  #endif
      lis_vector_copy(B,X);
      #pragma omp parallel private(i,j,jj,is,ie,my_rank)
      {
        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        /* forward sweep over this thread's rows */
        for(i=is;i<ie;i++)
        {
          x[i] = D->value[i]*x[i];
          for(j=0;j<U->nnz[i];j++)
          {
            jj     = U->index[i][j];
            x[jj] -= U->value[i][j] * x[i];
          }
        }
        /* backward sweep over this thread's rows */
        for(i=ie-1;i>=is;i--)
        {
          for(j=0;j<L->nnz[i];j++)
          {
            jj     = L->index[i][j];
            x[jj] -= L->value[i][j] * x[i];
          }
        }
      }
  #ifdef USE_QUAD_PRECISION
    }
    else
    {
      lis_vector_copyex_mm(B,X);
      nprocs = omp_get_max_threads();
      #ifndef USE_SSE2
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
      #else
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
      #endif
      {
        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        for(i=is;i<ie;i++)
        {
          /* default-precision counterpart: x[i] = D->value[i]*x[i]; */
          #ifndef USE_SSE2
            LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
          #else
            LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
          #endif
          for(j=0;j<U->nnz[i];j++)
          {
            jj     = U->index[i][j];
            /* default-precision counterpart: x[jj] -= U->value[i][j] * x[i]; */
            #ifndef USE_SSE2
              LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
            #else
              LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
            #endif
          }
        }
        for(i=ie-1;i>=is;i--)
        {
          for(j=0;j<L->nnz[i];j++)
          {
            jj     = L->index[i][j];
            /* default-precision counterpart: x[jj] -= L->value[i][j] * x[i]; */
            #ifndef USE_SSE2
              LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
            #else
              LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
            #endif
          }
        }
      }
    }
  #endif

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#else
  LIS_INT i,j,jj,n;
  LIS_SCALAR *b,*x;
  LIS_MATRIX_ILU L,U;
  LIS_VECTOR D;
  LIS_PRECON  precon;
  LIS_QUAD_DECLAR;
  #ifdef USE_QUAD_PRECISION
    LIS_SCALAR *xl;
  #endif


  LIS_DEBUG_FUNC_IN;

  precon = solver->precon;
  L = precon->L;
  U = precon->U;
  D = precon->D;
  b = B->value;
  x = X->value;
  #ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
  #endif
  n = solver->A->n;

  #ifdef USE_QUAD_PRECISION
    if( B->precision==LIS_PRECISION_DEFAULT )
    {
  #endif
      lis_vector_copy(B,X);
      /* forward sweep */
      for(i=0; i<n; i++)
      {
        x[i] = D->value[i]*x[i];
        for(j=0;j<U->nnz[i];j++)
        {
          jj     = U->index[i][j];
          x[jj] -= U->value[i][j] * x[i];
        }
      }
      /* backward sweep */
      for(i=n-1; i>=0; i--)
      {
        for(j=0;j<L->nnz[i];j++)
        {
          jj     = L->index[i][j];
          x[jj] -= L->value[i][j] * x[i];
        }
      }
  #ifdef USE_QUAD_PRECISION
    }
    else
    {
      /* The sweeps below read and update xl[] as well as x[], so B is
         copied with lis_vector_copyex_mm(), exactly as the OpenMP
         quad-precision branch of this function does, rather than with
         lis_vector_copy(). */
      lis_vector_copyex_mm(B,X);
      for(i=0; i<n; i++)
      {
        /* default-precision counterpart: x[i] = D->value[i]*x[i]; */
        #ifndef USE_SSE2
          LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
        #else
          LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
        #endif
        for(j=0;j<U->nnz[i];j++)
        {
          jj     = U->index[i][j];
          /* default-precision counterpart: x[jj] -= U->value[i][j] * x[i]; */
          #ifndef USE_SSE2
            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
          #else
            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
          #endif
        }
      }
      for(i=n-1; i>=0; i--)
      {
        for(j=0;j<L->nnz[i];j++)
        {
          jj     = L->index[i][j];
          /* default-precision counterpart: x[jj] -= L->value[i][j] * x[i]; */
          #ifndef USE_SSE2
            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
          #else
            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
          #endif
        }
      }
    }
  #endif

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
#endif
}