/*
 * lis_vector_scaleex_nm - scale an extended-precision vector in place.
 *
 * Every element of vx is held as a (value[i], value_lo[i]) pair.  Each pair
 * is multiplied by alpha through the LIS_QUAD_MULD* macros and the result is
 * written back to the same two slots.
 *
 * alpha: scalar multiplier.
 * vx:    vector to scale; vx->n elements are processed.  In the
 *        USE_FMA2_SSE2 build vx->work[0] and vx->work[1] are overwritten
 *        with alpha.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_scaleex_nm(LIS_SCALAR alpha, LIS_VECTOR vx)
{
	LIS_INT i,n,is,ie,nprocs,my_rank;
	LIS_SCALAR *aa;
	LIS_SCALAR *x,*xl;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = vx->n;
	x  = vx->value;
	xl = vx->value_lo;
	aa = vx->work;

#ifndef USE_FMA2_SSE2
	/* One element per iteration. */
	#pragma cdir nodep
#ifdef _OPENMP
#ifndef USE_SSE2
	#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
#else
	#pragma omp parallel for private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
#endif
#endif
	for(i=0; i<n; i++)
	{
		/*
		 * Use the macro variant that matches the temporaries listed in the
		 * private clause above, as the other kernels in this file do.
		 */
#ifndef USE_SSE2
		LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],alpha);
#else
		LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],alpha);
#endif
	}
#else
	/* Two elements per iteration, with a scalar tail for an odd count. */
#ifdef _OPENMP
	nprocs = omp_get_max_threads();
#else
	nprocs = 1;
#endif
	/* The paired macro reads its multiplier from memory starting at aa[0]. */
	aa[0] = aa[1] = alpha;
#ifdef _OPENMP
#ifndef USE_SSE2
	#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
#else
	#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
#endif
#endif
	{
#ifdef _OPENMP
		my_rank = omp_get_thread_num();
#else
		my_rank = 0;
#endif
		/* [is,ie) is this thread's share of the n elements. */
		LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
		for(i=is;i<ie-1;i+=2)
		{
			LIS_QUAD_MULD2_SSE2(x[i],xl[i],x[i],xl[i],aa[0]);
		}
		/* At most one element is left when the range length is odd. */
		for(;i<ie;i++)
		{
			LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],alpha);
		}
	}
#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_quad_mul_dd_d - multiply an extended-precision value by a double.
 *
 * a: destination; a->hi and a->lo receive the result.
 * b: extended-precision operand (b->hi, b->lo).
 * c: double operand.
 *
 * The work is done by LIS_QUAD_MULD, or by its SSE2 variant when USE_SSE2
 * is defined.
 */
void lis_quad_mul_dd_d(LIS_QUAD *a, const LIS_QUAD *b, const double c)
{
	LIS_QUAD_DECLAR;

#ifdef USE_SSE2
	LIS_QUAD_MULD_SSE2(a->hi,a->lo,b->hi,b->lo,c);
#else
	LIS_QUAD_MULD(a->hi,a->lo,b->hi,b->lo,c);
#endif
}
/*
 * lis_psolve_jacobi - apply the Jacobi preconditioner.
 *
 *   Mx = b
 *   M  = D
 *
 * Every entry of X is set to the matching entry of B multiplied by the
 * matching entry of solver->precon->D.
 * NOTE(review): since the code multiplies rather than divides, D presumably
 * already holds the inverted diagonal - confirm against the code that
 * builds the preconditioner.
 *
 * solver: supplies the preconditioner (solver->precon).
 * B:      right-hand side vector.
 * X:      result vector; precon->D->n entries are written.
 *
 * With USE_QUAD_PRECISION, a B whose precision is not LIS_PRECISION_DEFAULT
 * is multiplied as a (value, value_lo) pair and both words of X are written;
 * otherwise only X->value is written.
 *
 * Always returns LIS_SUCCESS.
 */
int lis_psolve_jacobi(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
	/* LIS_INT, like the sibling kernels, so that n is not narrowed to int. */
	LIS_INT i,n;
	LIS_SCALAR *b,*x,*d;
	LIS_PRECON precon;
	LIS_QUAD_DECLAR;
#ifdef USE_QUAD_PRECISION
	LIS_SCALAR *xl;
#endif

	LIS_DEBUG_FUNC_IN;

	precon = solver->precon;
	n = precon->D->n;
	d = precon->D->value;
	b = B->value;
	x = X->value;
#ifdef USE_QUAD_PRECISION
	xl = X->value_lo;
#endif

#ifdef USE_QUAD_PRECISION
	if( B->precision==LIS_PRECISION_DEFAULT )
	{
#endif
#ifdef _OPENMP
		#pragma omp parallel for private(i)
#endif
		for(i=0; i<n; i++)
		{
			x[i] = b[i] * d[i];
		}
#ifdef USE_QUAD_PRECISION
	}
	else
	{
#ifdef _OPENMP
#ifndef USE_SSE2
		#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
#else
		#pragma omp parallel for private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
#endif
#endif
		for(i=0; i<n; i++)
		{
			/* (x[i],xl[i]) = (B->value[i],B->value_lo[i]) * d[i] */
#ifndef USE_SSE2
			LIS_QUAD_MULD(x[i],xl[i],B->value[i],B->value_lo[i],d[i]);
#else
			LIS_QUAD_MULD_SSE2(x[i],xl[i],B->value[i],B->value_lo[i],d[i]);
#endif
		}
	}
#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_psolvet_sainv - transposed application of the SAINV preconditioner.
 *
 *   x  = M'b
 *   M' = W D^{-1} Z'
 *
 * Three steps, using precon->L as W, precon->U as Z, precon->D as d and
 * precon->temp as scratch t:
 *   1. X = lis_matvect_ilu(A, Z, B)
 *   2. t[i] = X[i] * d[i]     for i in [0, precon->L->n)
 *   3. X = lis_matvec_ilu(A, W, t)
 *
 * With USE_QUAD_PRECISION, step 2 works on (value, value_lo) pairs when
 * B->precision is not LIS_PRECISION_DEFAULT.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_psolvet_sainv(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
	LIS_INT i,n;
	LIS_MATRIX A;
	LIS_MATRIX_ILU W,Z;
	LIS_VECTOR t,d;
	LIS_PRECON precon;
#ifdef USE_QUAD_PRECISION
	LIS_INT quad_input;
#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	precon = solver->precon;
	A = precon->A;
	W = precon->L;
	Z = precon->U;
	d = precon->D;
	t = precon->temp;
	n = precon->L->n;

#ifdef USE_QUAD_PRECISION
	/* Decide the arithmetic up front, before any vector is touched. */
	quad_input = ( B->precision!=LIS_PRECISION_DEFAULT );
#endif

	/* Step 1 */
	lis_matvect_ilu(A,Z,B,X);

	/* Step 2 */
#ifdef USE_QUAD_PRECISION
	if( quad_input )
	{
#ifdef _OPENMP
#ifndef USE_SSE2
		#pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
#else
		#pragma omp parallel for private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
#endif
#endif
		for(i=0;i<n;i++)
		{
#ifndef USE_SSE2
			LIS_QUAD_MULD(t->value[i],t->value_lo[i],X->value[i],X->value_lo[i],d->value[i]);
#else
			LIS_QUAD_MULD_SSE2(t->value[i],t->value_lo[i],X->value[i],X->value_lo[i],d->value[i]);
#endif
		}
	}
	else
#endif
	{
#ifdef _OPENMP
		#pragma omp parallel for private(i)
#endif
		for(i=0;i<n;i++)
		{
			t->value[i] = X->value[i]*d->value[i];
		}
	}

	/* Step 3 */
	lis_matvec_ilu(A,W,t,X);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matrix_solvet_csr - transposed triangular sweeps on a split CSR matrix.
 *
 * X is first loaded with a copy of B and then updated in place using the
 * strict parts A->L / A->U (CSR arrays ptr, index, value) and the diagonal
 * scaling A->WD:
 *
 *   LIS_MATRIX_LOWER  ascending sweep:  x[i] *= WD[i], then row i of U is
 *                     scattered into x.
 *   LIS_MATRIX_UPPER  descending sweep: x[i] *= WD[i], then row i of L is
 *                     scattered into x.
 *   LIS_MATRIX_SSOR   ascending sweep over U followed by a descending sweep
 *                     over L; only the second sweep stores the scaled x[i].
 *
 * Any other flag value leaves X as the plain copy of B.
 *
 * In OpenMP builds the SSOR sweeps run per thread on the index range
 * [is,ie) obtained from LIS_GET_ISIE, and updates whose target index falls
 * outside the tested bounds are skipped.
 *
 * With USE_QUAD_PRECISION, a B whose precision is not LIS_PRECISION_DEFAULT
 * is copied with lis_vector_copyex_mm and the SSOR case works on
 * (value, value_lo) pairs.  The LOWER and UPPER cases only update X->value.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_solvet_csr(LIS_MATRIX A, LIS_VECTOR B, LIS_VECTOR X, LIS_INT flag)
{
	LIS_INT i,j,jj,n;
	LIS_SCALAR t;
	LIS_SCALAR *x;
#ifdef _OPENMP
	LIS_INT is,ie,my_rank,nprocs;
#endif
#ifdef USE_QUAD_PRECISION
	LIS_QUAD w1;
	LIS_SCALAR *xl;
#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n = A->n;
	x = X->value;
#ifdef USE_QUAD_PRECISION
	xl = X->value_lo;
#endif

	/* X starts out as a copy of B; the sweeps below work in place. */
#ifdef USE_QUAD_PRECISION
	if( B->precision!=LIS_PRECISION_DEFAULT )
	{
		lis_vector_copyex_mm(B,X);
	}
	else
#endif
	{
		lis_vector_copy(B,X);
	}

	switch(flag)
	{
	case LIS_MATRIX_LOWER:
		for(i=0;i<n;i++)
		{
			x[i] *= A->WD->value[i];
			for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
			{
				jj = A->U->index[j];
				x[jj] -= A->U->value[j] * x[i];
			}
		}
		break;

	case LIS_MATRIX_UPPER:
		for(i=n-1;i>=0;i--)
		{
			x[i] *= A->WD->value[i];
			for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
			{
				jj = A->L->index[j];
				x[jj] -= A->L->value[j] * x[i];
			}
		}
		break;

	case LIS_MATRIX_SSOR:
#ifdef USE_QUAD_PRECISION
		if( B->precision!=LIS_PRECISION_DEFAULT )
		{
			/* ---- (value, value_lo) pair arithmetic ---- */
#ifdef _OPENMP
			nprocs = omp_get_max_threads();
#ifndef USE_SSE2
			#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
#else
			#pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
#endif
			{
				my_rank = omp_get_thread_num();
				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

				/* Ascending sweep over U; w1 = x[i] * WD[i] is not stored. */
				for(i=is;i<ie;i++)
				{
#ifndef USE_SSE2
					LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
					LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						jj = A->U->index[j];
						if( jj>=is && jj<ie )
						{
							/* x[jj] -= U->value[j] * w1 */
#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#endif
						}
					}
				}

				/* Descending sweep over L; here the scaled value is stored. */
				for(i=ie-1;i>=is;i--)
				{
#ifndef USE_SSE2
					LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
					LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
					x[i] = w1.hi;
					xl[i] = w1.lo;
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						jj = A->L->index[j];
						if( jj>=is && jj<ie )
						{
							/* x[jj] -= L->value[j] * w1 */
#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#endif
						}
					}
				}
			}
#else
			/* Ascending sweep over U; w1 = x[i] * WD[i] is not stored. */
			for(i=0;i<n;i++)
			{
#ifndef USE_SSE2
				LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
				LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
				for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
				{
					jj = A->U->index[j];
					/* x[jj] -= U->value[j] * w1 */
#ifndef USE_SSE2
					LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#else
					LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#endif
				}
			}

			/* Descending sweep over L; here the scaled value is stored. */
			for(i=n-1;i>=0;i--)
			{
#ifndef USE_SSE2
				LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
				LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
				x[i] = w1.hi;
				xl[i] = w1.lo;
				for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
				{
					jj = A->L->index[j];
					/* x[jj] -= L->value[j] * w1 */
#ifndef USE_SSE2
					LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#else
					LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#endif
				}
			}
#endif
		}
		else
#endif
		{
			/* ---- plain LIS_SCALAR arithmetic ---- */
#ifdef _OPENMP
			nprocs = omp_get_max_threads();
			#pragma omp parallel private(i,j,jj,t,is,ie,my_rank)
			{
				my_rank = omp_get_thread_num();
				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

				/* Ascending sweep over U; the scaled value t is not stored. */
				for(i=is;i<ie;i++)
				{
					t = x[i] * A->WD->value[i];
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						jj = A->U->index[j];
						if( jj>=is && jj<ie )
						{
							x[jj] -= A->U->value[j] * t;
						}
					}
				}

				/* Descending sweep over L; only the lower bound is tested. */
				for(i=ie-1;i>=is;i--)
				{
					t = x[i] * A->WD->value[i];
					x[i] = t;
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						jj = A->L->index[j];
						if( jj>=is )
						{
							x[jj] -= A->L->value[j] * t;
						}
					}
				}
			}
#else
			/* Ascending sweep over U; the scaled value t is not stored. */
			for(i=0;i<n;i++)
			{
				t = x[i] * A->WD->value[i];
				for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
				{
					jj = A->U->index[j];
					x[jj] -= A->U->value[j] * t;
				}
			}

			/* Descending sweep over L; here the scaled value is stored. */
			for(i=n-1;i>=0;i--)
			{
				t = x[i] * A->WD->value[i];
				x[i] = t;
				for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
				{
					jj = A->L->index[j];
					x[jj] -= A->L->value[j] * t;
				}
			}
#endif
		}
		break;
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_psolvet_ilut_csr - transposed application of the ILUT preconditioner.
 *
 * X is loaded with a copy of B and then updated in place with the factors
 * held in solver->precon (L, U stored per row as nnz[i], index[i][], 
 * value[i][]; D as a vector):
 *
 *   1. ascending sweep:  x[i] = D[i] * x[i], then
 *                        x[jj] -= U->value[i][j] * x[i] for each entry of
 *                        row i of U;
 *   2. descending sweep: x[jj] -= L->value[i][j] * x[i] for each entry of
 *                        row i of L.
 *
 * The function has two complete bodies: an OpenMP one, where each thread
 * sweeps its own index range [is,ie) from LIS_GET_ISIE, and a serial one.
 * NOTE(review): the OpenMP sweeps do not test jj against [is,ie); this
 * presumably relies on the factors being built so that a thread's rows only
 * reference its own range - confirm against the factorisation code.
 *
 * With USE_QUAD_PRECISION, a B whose precision is not LIS_PRECISION_DEFAULT
 * is copied with lis_vector_copyex_mm and processed as (value, value_lo)
 * pairs in both bodies.
 *
 * solver: supplies precon (L, U, D) and the dimension solver->A->n.
 * B:      right-hand side vector.
 * X:      result vector, overwritten.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_psolvet_ilut_csr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
#ifdef _OPENMP
	LIS_INT i,j,jj,n;
	LIS_INT is,ie,my_rank,nprocs;
	LIS_SCALAR *x;
	LIS_MATRIX_ILU L,U;
	LIS_VECTOR D;
	LIS_PRECON precon;
	LIS_QUAD_DECLAR;
#ifdef USE_QUAD_PRECISION
	LIS_SCALAR *xl;
#endif

	LIS_DEBUG_FUNC_IN;

	precon = solver->precon;
	L = precon->L;
	U = precon->U;
	D = precon->D;
	x = X->value;
#ifdef USE_QUAD_PRECISION
	xl = X->value_lo;
#endif
	n = solver->A->n;
	nprocs = omp_get_max_threads();

#ifdef USE_QUAD_PRECISION
	if( B->precision==LIS_PRECISION_DEFAULT )
	{
#endif
		lis_vector_copy(B,X);
		#pragma omp parallel private(i,j,jj,is,ie,my_rank)
		{
			my_rank = omp_get_thread_num();
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			for(i=is;i<ie;i++)
			{
				x[i] = D->value[i]*x[i];
				for(j=0;j<U->nnz[i];j++)
				{
					jj = U->index[i][j];
					x[jj] -= U->value[i][j] * x[i];
				}
			}
			for(i=ie-1;i>=is;i--)
			{
				for(j=0;j<L->nnz[i];j++)
				{
					jj = L->index[i][j];
					x[jj] -= L->value[i][j] * x[i];
				}
			}
		}
#ifdef USE_QUAD_PRECISION
	}
	else
	{
		lis_vector_copyex_mm(B,X);
#ifndef USE_SSE2
		#pragma omp parallel private(i,j,jj,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
#else
		#pragma omp parallel private(i,j,jj,is,ie,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
#endif
		{
			my_rank = omp_get_thread_num();
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			for(i=is;i<ie;i++)
			{
				/* x[i] = D->value[i]*x[i]; */
#ifndef USE_SSE2
				LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
#else
				LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
#endif
				for(j=0;j<U->nnz[i];j++)
				{
					jj = U->index[i][j];
					/* x[jj] -= U->value[i][j] * x[i]; */
#ifndef USE_SSE2
					LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
#else
					LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
#endif
				}
			}
			for(i=ie-1;i>=is;i--)
			{
				for(j=0;j<L->nnz[i];j++)
				{
					jj = L->index[i][j];
					/* x[jj] -= L->value[i][j] * x[i]; */
#ifndef USE_SSE2
					LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
#else
					LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
#endif
				}
			}
		}
	}
#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
#else
	LIS_INT i,j,jj,n;
	LIS_SCALAR *x;
	LIS_MATRIX_ILU L,U;
	LIS_VECTOR D;
	LIS_PRECON precon;
	LIS_QUAD_DECLAR;
#ifdef USE_QUAD_PRECISION
	LIS_SCALAR *xl;
#endif

	LIS_DEBUG_FUNC_IN;

	precon = solver->precon;
	L = precon->L;
	U = precon->U;
	D = precon->D;
	x = X->value;
#ifdef USE_QUAD_PRECISION
	xl = X->value_lo;
#endif
	n = solver->A->n;

#ifdef USE_QUAD_PRECISION
	if( B->precision==LIS_PRECISION_DEFAULT )
	{
#endif
		lis_vector_copy(B,X);
		for(i=0; i<n; i++)
		{
			x[i] = D->value[i]*x[i];
			for(j=0;j<U->nnz[i];j++)
			{
				jj = U->index[i][j];
				x[jj] -= U->value[i][j] * x[i];
			}
		}
		for(i=n-1; i>=0; i--)
		{
			for(j=0;j<L->nnz[i];j++)
			{
				jj = L->index[i][j];
				x[jj] -= L->value[i][j] * x[i];
			}
		}
#ifdef USE_QUAD_PRECISION
	}
	else
	{
		/*
		 * The sweeps below read xl[], so use the same extended copy as the
		 * OpenMP body rather than lis_vector_copy.
		 */
		lis_vector_copyex_mm(B,X);
		for(i=0; i<n; i++)
		{
			/* x[i] = D->value[i]*x[i]; */
#ifndef USE_SSE2
			LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
#else
			LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
#endif
			for(j=0;j<U->nnz[i];j++)
			{
				jj = U->index[i][j];
				/* x[jj] -= U->value[i][j] * x[i]; */
#ifndef USE_SSE2
				LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
#else
				LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
#endif
			}
		}
		for(i=n-1; i>=0; i--)
		{
			for(j=0;j<L->nnz[i];j++)
			{
				jj = L->index[i][j];
				/* x[jj] -= L->value[i][j] * x[i]; */
#ifndef USE_SSE2
				LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
#else
				LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
#endif
			}
		}
	}
#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
#endif
}