/*
 * lis_matvec_ilu
 *
 * Scatter-accumulates the row-stored factor LU against X into Y:
 *
 *     Y[k] = 0                                   for k in [0, A->np)
 *     Y[LU->index[i][j]] += LU->value[i][j]*X[i] for i in [0, LU->n),
 *                                                    j in [0, LU->nnz[i])
 *
 * i.e. each stored entry (i, index[i][j]) contributes to the output slot
 * named by its column index, which is the product with the transpose of
 * the row-wise stored LU.
 *
 * Parameters
 *   A   supplies np (length of the zeroed/written part of Y) and, in MPI
 *       builds, the communication table.
 *   LU  row-wise sparse storage: n rows, nnz[i] entries per row, with
 *       index[i][j] / value[i][j] giving column and coefficient.
 *   X   input vector (value[], and value_lo[] in quad precision).
 *   Y   output vector, overwritten.
 *
 * Returns LIS_SUCCESS, or LIS_OUT_OF_MEMORY when the OpenMP scratch buffer
 * cannot be allocated.
 *
 * OpenMP builds: because several rows may scatter into the same Y slot,
 * every thread accumulates into its own np-long slice of a scratch buffer
 * (slice k starts at offset k*np); the slices are summed into Y afterwards.
 */
LIS_INT lis_matvec_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n,np;
	LIS_SCALAR *x,*y;
	#ifdef _OPENMP
		LIS_INT nprocs,k;
		LIS_SCALAR t,*w;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_INT j0,j1;
		#ifdef _OPENMP
			LIS_SCALAR *ww,*wwl;
		#endif
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	np = A->np;
	n  = LU->n;
	/* x and y are not referenced by name below.  They are kept because
	 * LIS_MATVEC_SENDRECV is a macro defined elsewhere and may expand to
	 * code that names them -- NOTE(review): confirm before removing. */
	x  = X->value;
	y  = Y->value;

	#ifdef USE_QUAD_PRECISION
		if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs::w" );
			if( w==NULL )
			{
				/* Without the scratch buffer the scatter cannot run. */
				return LIS_OUT_OF_MEMORY;
			}
			#pragma omp parallel private(i,j,k,jj,t)
			{
				k = omp_get_thread_num();
				/* Clear every per-thread slice. */
				#pragma omp for
				for(j=0;j<nprocs;j++)
				{
					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
				}
				/* Scatter into this thread's private slice. */
				#pragma omp for
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj     = k*np + LU->index[i][j];
						w[jj] += LU->value[i][j] * X->value[i];
					}
				}
				/* Reduce the slices into Y. */
				#pragma omp for
				for(i=0;i<np;i++)
				{
					t = 0.0;
					for(j=0;j<nprocs;j++)
					{
						t += w[j*np+i];
					}
					Y->value[i] = t;
				}
			}
			lis_free(w);
		#else
			for(i=0;i<np;i++)
			{
				Y->value[i] = 0.0;
			}
			for(i=0;i<n;i++)
			{
				for(j=0;j<LU->nnz[i];j++)
				{
					jj            = LU->index[i][j];
					Y->value[jj] += LU->value[i][j] * X->value[i];
				}
			}
		#endif
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifdef _OPENMP
			#ifndef USE_FMA2_SSE2
				nprocs = omp_get_max_threads();
				/* One allocation holds the high words (ww) followed by
				 * the low words (wwl) of all per-thread slices. */
				ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs_mp::ww" );
				if( ww==NULL )
				{
					return LIS_OUT_OF_MEMORY;
				}
				wwl = &ww[nprocs*np];
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0;i<n;i++)
					{
						for(j=0;j<LU->nnz[i];j++)
						{
							jj = k*np + LU->index[i][j];
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#else
								LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifndef USE_SSE2
								LIS_QUAD_ADD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#else
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#else
				nprocs = omp_get_max_threads();
				ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvect_crs_mp2::ww" );
				if( ww==NULL )
				{
					return LIS_OUT_OF_MEMORY;
				}
				wwl = &ww[nprocs*np];
				#pragma omp parallel private(i,j,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0; i<n; i++)
					{
						/* Entries are consumed two at a time ... */
						for(j=0;j<LU->nnz[i]-1;j+=2)
						{
							j0 = k*np + LU->index[i][j];
							j1 = k*np + LU->index[i][j+1];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
						/* ... and a possible odd leftover is handled here.
						 * It must target this thread's slice (offset k*np)
						 * exactly like the paired loop above; writing to
						 * the unshifted index would make all threads race
						 * on slice 0. */
						for(;j<LU->nnz[i];j++)
						{
							j0 = k*np + LU->index[i][j];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifdef USE_SSE2
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#endif
		#else
			#ifndef USE_FMA2_SSE2
				for(i=0;i<np;i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj = LU->index[i][j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#else
							LIS_QUAD_FMAD_SSE2(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#else
				for(i=0; i<np; i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0; i<n; i++)
				{
					for(j=0;j<LU->nnz[i]-1;j+=2)
					{
						j0 = LU->index[i][j];
						j1 = LU->index[i][j+1];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD2_SSE2_STSD(Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
					for(;j<LU->nnz[i];j++)
					{
						j0 = LU->index[i][j];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD_SSE2(Y->value[j0],Y->value_lo[j0],Y->value[j0],Y->value_lo[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#endif
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matrix_solvet_csr
 *
 * Copies B into X and then updates X in place with column-oriented sweeps
 * over the split parts of A (A->WD, A->U, A->L); which sweeps run is
 * selected by `flag`:
 *
 *   LIS_MATRIX_LOWER  ascending i:  x[i] *= WD[i], then every entry of
 *                     row i of A->U subtracts value*x[i] from x[index].
 *   LIS_MATRIX_UPPER  descending i: same, using rows of A->L.
 *   LIS_MATRIX_SSOR   an ascending pass over A->U followed by a descending
 *                     pass over A->L; only the second pass stores the
 *                     scaled value back into x[i].
 *
 * Any other flag value leaves X equal to the copy of B.
 *
 * In quad-precision builds the copy takes both words when B is not of
 * default precision, and only the SSOR case has a double-double variant
 * (operating on x[] / xl[]).  In OpenMP builds the SSOR case splits
 * [0,n) into per-thread ranges [is,ie) via LIS_GET_ISIE and skips updates
 * whose target falls outside the thread's range as coded below.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_solvet_csr(LIS_MATRIX A, LIS_VECTOR B, LIS_VECTOR X, LIS_INT flag)
{
	LIS_INT i,j,col,n;
	LIS_SCALAR scaled;
	LIS_SCALAR *x;
	#ifdef _OPENMP
		LIS_INT is,ie,tid,nprocs;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_QUAD w1,w2;
		LIS_SCALAR *xl;
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n = A->n;
	x = X->value;
	#ifdef USE_QUAD_PRECISION
		xl = X->value_lo;
	#endif

	/* Start from X = B. */
	#ifdef USE_QUAD_PRECISION
	if( B->precision!=LIS_PRECISION_DEFAULT )
	{
		lis_vector_copyex_mm(B,X);
	}
	else
	#endif
	{
		lis_vector_copy(B,X);
	}

	if( flag==LIS_MATRIX_LOWER )
	{
		for(i=0;i<n;i++)
		{
			x[i] *= A->WD->value[i];
			for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
			{
				col     = A->U->index[j];
				x[col] -= A->U->value[j] * x[i];
			}
		}
	}
	else if( flag==LIS_MATRIX_UPPER )
	{
		for(i=n-1;i>=0;i--)
		{
			x[i] *= A->WD->value[i];
			for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
			{
				col     = A->L->index[j];
				x[col] -= A->L->value[j] * x[i];
			}
		}
	}
	else if( flag==LIS_MATRIX_SSOR )
	{
		#ifdef USE_QUAD_PRECISION
		if( B->precision==LIS_PRECISION_DEFAULT )
		{
		#endif
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#pragma omp parallel private(i,j,col,scaled,is,ie,tid)
				{
					tid = omp_get_thread_num();
					LIS_GET_ISIE(tid,nprocs,n,is,ie);

					/* Ascending pass: x[i] itself is left untouched. */
					for(i=is;i<ie;i++)
					{
						scaled = x[i] * A->WD->value[i];
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							col = A->U->index[j];
							/* Only targets inside this thread's range. */
							if( col>=is && col<ie )
							{
								x[col] -= A->U->value[j] * scaled;
							}
						}
					}
					/* Descending pass: stores the scaled value. */
					for(i=ie-1;i>=is;i--)
					{
						scaled = x[i] * A->WD->value[i];
						x[i]   = scaled;
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							col = A->L->index[j];
							/* This path only tests the lower bound. */
							if( col>=is )
							{
								x[col] -= A->L->value[j] * scaled;
							}
						}
					}
				}
			#else
				/* Ascending pass: x[i] itself is left untouched. */
				for(i=0;i<n;i++)
				{
					scaled = x[i] * A->WD->value[i];
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						x[A->U->index[j]] -= A->U->value[j] * scaled;
					}
				}
				/* Descending pass: stores the scaled value. */
				for(i=n-1;i>=0;i--)
				{
					scaled = x[i] * A->WD->value[i];
					x[i]   = scaled;
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						x[A->L->index[j]] -= A->L->value[j] * scaled;
					}
				}
			#endif
		#ifdef USE_QUAD_PRECISION
		}
		else
		{
			#ifdef _OPENMP
				nprocs = omp_get_max_threads();
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,col,is,ie,w1,tid,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,col,is,ie,w1,tid,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					tid = omp_get_thread_num();
					LIS_GET_ISIE(tid,nprocs,n,is,ie);

					for(i=is;i<ie;i++)
					{
						/* w1 = (x[i],xl[i]) * WD[i]; not stored back yet. */
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
						{
							col = A->U->index[j];
							if( col>=is && col<ie )
							{
								/* (x,xl)[col] -= U value * w1 */
								#ifndef USE_SSE2
									LIS_QUAD_FMAD(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->U->value[j]);
								#else
									LIS_QUAD_FMAD_SSE2(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->U->value[j]);
								#endif
							}
						}
					}
					for(i=ie-1;i>=is;i--)
					{
						#ifndef USE_SSE2
							LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#else
							LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
						#endif
						/* Second pass keeps the scaled value. */
						x[i]  = w1.hi;
						xl[i] = w1.lo;
						for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
						{
							col = A->L->index[j];
							if( col>=is && col<ie )
							{
								/* (x,xl)[col] -= L value * w1 */
								#ifndef USE_SSE2
									LIS_QUAD_FMAD(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->L->value[j]);
								#else
									LIS_QUAD_FMAD_SSE2(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->L->value[j]);
								#endif
							}
						}
					}
				}
			#else
				for(i=0;i<n;i++)
				{
					/* w1 = (x[i],xl[i]) * WD[i]; not stored back yet. */
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
					{
						col = A->U->index[j];
						/* (x,xl)[col] -= U value * w1 */
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->U->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->U->value[j]);
						#endif
					}
				}
				for(i=n-1;i>=0;i--)
				{
					#ifndef USE_SSE2
						LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
					#endif
					/* Second pass keeps the scaled value. */
					x[i]  = w1.hi;
					xl[i] = w1.lo;
					for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
					{
						col = A->L->index[j];
						/* (x,xl)[col] -= L value * w1 */
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->L->value[j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[col],xl[col],x[col],xl[col],w1.hi,w1.lo,-A->L->value[j]);
						#endif
					}
				}
			#endif
		}
		#endif
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matvect_ilu
 *
 * Row-wise gather product of the row-stored factor LU with X:
 *
 *     Y[i] = sum over j in [0, LU->nnz[i]) of
 *            LU->value[i][j] * X[LU->index[i][j]]      for i in [0, LU->n)
 *
 * Parameters
 *   A   only used in MPI builds (communication table / send-recv macro).
 *   LU  row-wise sparse storage (n, nnz[], index[][], value[][]).
 *   X   input vector (value[], and value_lo[] in quad precision).
 *   Y   output vector; entries [0, LU->n) are overwritten.
 *
 * Always returns LIS_SUCCESS.
 *
 * Each output row is independent, so OpenMP builds distribute the row loop
 * with "parallel for" in every precision path.
 */
LIS_INT lis_matvect_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n;
	LIS_SCALAR t,*x,*y;
	LIS_QUAD_DECLAR;
	#ifdef USE_QUAD_PRECISION
		LIS_INT j0,j1;
		LIS_QUAD_PD tt;
	#endif

	LIS_DEBUG_FUNC_IN;

	n = LU->n;
	/* x and y are not referenced by name below.  They are kept because
	 * LIS_MATVEC_SENDRECV is a macro defined elsewhere and may expand to
	 * code that names them -- NOTE(review): confirm before removing. */
	x = X->value;
	y = Y->value;

	#ifdef USE_QUAD_PRECISION
		if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
		#pragma omp parallel for private(i,j,jj,t)
		#endif
		for(i=0;i<n;i++)
		{
			t = 0.0;
			for(j=0;j<LU->nnz[i];j++)
			{
				jj = LU->index[i][j];
				t += LU->value[i][j] * X->value[jj];
			}
			Y->value[i] = t;
		}
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifndef USE_FMA2_SSE2
			/* The row loop must be work-shared ("parallel for").  A bare
			 * "parallel" would make every thread execute all rows and
			 * concurrently zero/accumulate the same Y entries.  The
			 * _OPENMP guard matches the other pragmas in this function. */
			#ifdef _OPENMP
				#ifndef USE_SSE2
					#pragma omp parallel for private(i,j,jj,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel for private(i,j,jj,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
			#endif
			for(i=0;i<n;i++)
			{
				Y->value[i] = Y->value_lo[i] = 0.0;
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					#ifndef USE_SSE2
						LIS_QUAD_FMAD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#else
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#endif
				}
			}
		#else
			#ifdef _OPENMP
				#ifndef USE_SSE2
					#pragma omp parallel for private(i,j,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel for private(i,j,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
			#endif
			for(i=0;i<n;i++)
			{
				/* tt holds two running partial sums (lanes 0 and 1). */
				tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
				/* Entries are consumed two at a time ... */
				for(j=0;j<LU->nnz[i]-1;j+=2)
				{
					j0 = LU->index[i][j];
					j1 = LU->index[i][j+1];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],X->value[j0],X->value_lo[j0],X->value[j1],X->value_lo[j1],LU->value[i][j]);
					#endif
				}
				/* ... the two lanes are then combined into Y[i]
				 * (presumably Y = lane0 + lane1; see the macro) ... */
				#ifdef USE_SSE2
					LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],tt.hi[0],tt.lo[0],tt.hi[1],tt.lo[1]);
				#endif
				/* ... and a possible odd leftover entry is added last. */
				for(;j<LU->nnz[i];j++)
				{
					j0 = LU->index[i][j];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[j0],X->value_lo[j0],LU->value[i][j]);
					#endif
				}
			}
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_psolvet_ilut_csr
 *
 * Applies the solver's preconditioner factors (precon->L, precon->U,
 * precon->D) to B in a column-oriented fashion, writing the result to X:
 *
 *   1. X = B
 *   2. ascending i:  x[i] = D[i]*x[i];
 *                    x[U->index[i][j]] -= U->value[i][j]*x[i]  for all j
 *   3. descending i: x[L->index[i][j]] -= L->value[i][j]*x[i]  for all j
 *
 * Parameters
 *   solver  supplies precon (L, U, D) and the dimension solver->A->n.
 *   B       right-hand side; not modified.
 *   X       result vector; overwritten.
 *
 * Always returns LIS_SUCCESS.
 *
 * Quad-precision builds: when B is not of default precision, both the high
 * and low words are copied with lis_vector_copyex_mm before the
 * double-double sweeps, in the serial build as well as under OpenMP,
 * because the sweeps read xl[] straight away.
 *
 * OpenMP builds: each thread sweeps only rows [is,ie) obtained from
 * LIS_GET_ISIE and applies every update of those rows without a range
 * test.  NOTE(review): this is race-free only if all L/U indices of a
 * thread's rows lie inside that thread's range -- confirm against the
 * routine that builds the factors.
 */
LIS_INT lis_psolvet_ilut_csr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
	LIS_INT i,j,jj,n;
	#ifdef _OPENMP
		LIS_INT is,ie,my_rank,nprocs;
	#endif
	LIS_SCALAR *x;
	LIS_MATRIX_ILU L,U;
	LIS_VECTOR D;
	LIS_PRECON precon;
	LIS_QUAD_DECLAR;
	#ifdef USE_QUAD_PRECISION
		LIS_SCALAR *xl;
	#endif

	LIS_DEBUG_FUNC_IN;

	precon = solver->precon;
	L = precon->L;
	U = precon->U;
	D = precon->D;
	x = X->value;
	#ifdef USE_QUAD_PRECISION
		xl = X->value_lo;
	#endif
	n = solver->A->n;
	#ifdef _OPENMP
		nprocs = omp_get_max_threads();
	#endif

	#ifdef USE_QUAD_PRECISION
	if( B->precision==LIS_PRECISION_DEFAULT )
	{
	#endif
		lis_vector_copy(B,X);
		#ifdef _OPENMP
			#pragma omp parallel private(i,j,jj,is,ie,my_rank)
			{
				my_rank = omp_get_thread_num();
				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
				for(i=is;i<ie;i++)
				{
					x[i] = D->value[i]*x[i];
					for(j=0;j<U->nnz[i];j++)
					{
						jj     = U->index[i][j];
						x[jj] -= U->value[i][j] * x[i];
					}
				}
				for(i=ie-1;i>=is;i--)
				{
					for(j=0;j<L->nnz[i];j++)
					{
						jj     = L->index[i][j];
						x[jj] -= L->value[i][j] * x[i];
					}
				}
			}
		#else
			for(i=0; i<n; i++)
			{
				x[i] = D->value[i]*x[i];
				for(j=0;j<U->nnz[i];j++)
				{
					jj     = U->index[i][j];
					x[jj] -= U->value[i][j] * x[i];
				}
			}
			for(i=n-1; i>=0; i--)
			{
				for(j=0;j<L->nnz[i];j++)
				{
					jj     = L->index[i][j];
					x[jj] -= L->value[i][j] * x[i];
				}
			}
		#endif
	#ifdef USE_QUAD_PRECISION
	}
	else
	{
		/* Copy both words of B: the sweeps below read xl[]. */
		lis_vector_copyex_mm(B,X);
		#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel private(i,j,jj,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel private(i,j,jj,is,ie,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			{
				my_rank = omp_get_thread_num();
				LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
				for(i=is;i<ie;i++)
				{
					/* (x,xl)[i] = D[i] * (x,xl)[i] */
					#ifndef USE_SSE2
						LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
					#else
						LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
					#endif
					for(j=0;j<U->nnz[i];j++)
					{
						jj = U->index[i][j];
						/* (x,xl)[jj] -= U value * (x,xl)[i] */
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
						#endif
					}
				}
				for(i=ie-1;i>=is;i--)
				{
					for(j=0;j<L->nnz[i];j++)
					{
						jj = L->index[i][j];
						/* (x,xl)[jj] -= L value * (x,xl)[i] */
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
						#else
							LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
						#endif
					}
				}
			}
		#else
			for(i=0; i<n; i++)
			{
				/* (x,xl)[i] = D[i] * (x,xl)[i] */
				#ifndef USE_SSE2
					LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
				#else
					LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
				#endif
				for(j=0;j<U->nnz[i];j++)
				{
					jj = U->index[i][j];
					/* (x,xl)[jj] -= U value * (x,xl)[i] */
					#ifndef USE_SSE2
						LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
					#else
						LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
					#endif
				}
			}
			for(i=n-1; i>=0; i--)
			{
				for(j=0;j<L->nnz[i];j++)
				{
					jj = L->index[i][j];
					/* (x,xl)[jj] -= L value * (x,xl)[i] */
					#ifndef USE_SSE2
						LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
					#else
						LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
					#endif
				}
			}
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}