/*
 * Copy the index and value arrays of a matrix into caller-provided output
 * arrays.
 *
 *   n        number of rows; the value array holds n*nnd entries
 *   nnd      number of entries in index[] (and entries per row in value[])
 *   index    source index array (nnd entries)
 *   value    source value array
 *   o_index  destination for index[]
 *   o_value  destination for value[]
 *
 * The index array is copied in one piece.  The value array is copied in
 * per-thread chunks: each thread obtains a row range [first,last) from
 * LIS_GET_ISIE and copies the (last-first)*nnd values that start at
 * first*nnd.  Without OpenMP a single "thread" (rank 0 of 1) is used.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_elements_copy_dia(LIS_INT n, LIS_INT nnd, LIS_INT *index, LIS_SCALAR *value, LIS_INT *o_index, LIS_SCALAR *o_value)
{
    LIS_INT nprocs,my_rank;
    LIS_INT first,last;

    LIS_DEBUG_FUNC_IN;

#ifdef _OPENMP
    nprocs = omp_get_max_threads();
#else
    nprocs = 1;
#endif

    /* The index array is small; copy it serially. */
    memcpy(o_index,index,nnd*sizeof(LIS_INT));

#ifdef _OPENMP
    #pragma omp parallel private(first,last,my_rank)
#endif
    {
        LIS_INT count;

#ifdef _OPENMP
        my_rank = omp_get_thread_num();
#else
        my_rank = 0;
#endif
        /* Row range handled by this thread. */
        LIS_GET_ISIE(my_rank,nprocs,n,first,last);

        count = (last-first)*nnd;
        memcpy(o_value + first*nnd,value + first*nnd,count*sizeof(LIS_SCALAR));
    }

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * Element-wise quad-precision (hi/lo pair) update of vz from alpha, vx and
 * vy.  For every i in [0,n) the LIS_QUAD_FMA family of macros is applied to
 * (z[i],zl[i]) with inputs (y[i],yl[i]), alpha and (x[i],xl[i]).
 * NOTE(review): going by the name this is presumably z = alpha*x + y; the
 * exact arithmetic lives in the LIS_QUAD_FMA* macros, which are not visible
 * here.
 *
 *   alpha  quad scalar (alpha.hi[0], alpha.lo[0])
 *   vx,vy  input vectors (value / value_lo)
 *   vz     output vector
 *
 * With USE_FMA2_SSE2 the elements are processed two at a time; alpha is
 * first duplicated into vx->work[4..7] so the paired macro can read it as a
 * two-element operand.  A trailing odd element is handled by the scalar
 * SSE2 macro.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_axpyzex_mmmm(LIS_QUAD_PTR alpha, LIS_VECTOR vx, LIS_VECTOR vy, LIS_VECTOR vz)
{
    LIS_INT i,n,is,ie,nprocs,my_rank;
    LIS_QUAD_PTR aa;
    LIS_SCALAR *x,*y,*z;
    LIS_SCALAR *xl,*yl,*zl;
    LIS_QUAD_DECLAR;

    LIS_DEBUG_FUNC_IN;

    n  = vx->n;

    /* high words */
    x  = vx->value;
    y  = vy->value;
    z  = vz->value;

    /* low words */
    xl = vx->value_lo;
    yl = vy->value_lo;
    zl = vz->value_lo;

    /* scratch space inside vx used for the duplicated alpha */
    aa.hi = &vx->work[4];
    aa.lo = &vx->work[6];

#ifndef USE_FMA2_SSE2
    #pragma cdir nodep
    #pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
    for(i=0; i<n; i++)
    {
        LIS_QUAD_FMA(z[i],zl[i],y[i],yl[i],alpha.hi[0],alpha.lo[0],x[i],xl[i]);
    }
#else
#ifdef _OPENMP
    nprocs = omp_get_max_threads();
#else
    nprocs = 1;
#endif

    /* Duplicate alpha into both lanes. */
    aa.hi[1] = alpha.hi[0];
    aa.hi[0] = aa.hi[1];
    aa.lo[1] = alpha.lo[0];
    aa.lo[0] = aa.lo[1];

#ifdef _OPENMP
    #pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
#endif
    {
#ifdef _OPENMP
        my_rank = omp_get_thread_num();
#else
        my_rank = 0;
#endif
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        /* Pairs of elements. */
        i = is;
        while( i<ie-1 )
        {
            LIS_QUAD_FMA2_SSE2(z[i],zl[i],y[i],yl[i],aa.hi[0],aa.lo[0],x[i],xl[i]);
            i += 2;
        }

        /* Remaining element, if the range length is odd. */
        while( i<ie )
        {
            LIS_QUAD_FMA_SSE2(z[i],zl[i],y[i],yl[i],alpha.hi[0],alpha.lo[0],x[i],xl[i]);
            i++;
        }
    }
#endif

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * In-place scaling of a quad-precision (hi/lo pair) vector by an ordinary
 * scalar: for every i in [0,n) the LIS_QUAD_MULD family of macros is
 * applied to (x[i],xl[i]) with multiplier alpha, writing back to the same
 * element.
 *
 *   alpha  scalar multiplier
 *   vx     vector that is scaled in place (value / value_lo)
 *
 * With USE_FMA2_SSE2 the elements are processed two at a time; alpha is
 * duplicated into vx->work[0..1] for the paired macro, and a trailing odd
 * element is handled by the scalar SSE2 macro.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_scaleex_nm(LIS_SCALAR alpha, LIS_VECTOR vx)
{
    LIS_INT i,n,is,ie,nprocs,my_rank;
    LIS_SCALAR *aa;
    LIS_SCALAR *x,*xl;
    LIS_QUAD_DECLAR;

    LIS_DEBUG_FUNC_IN;

    n  = vx->n;
    x  = vx->value;
    xl = vx->value_lo;
    aa = vx->work;

#ifndef USE_FMA2_SSE2
    #pragma cdir nodep
#ifndef USE_SSE2
    #pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
#else
    #pragma omp parallel for private(i,bh,ch,sh,th,bl,sl,tl,p1,p2,t0,t1,t2,is,ie,my_rank)
#endif
    for(i=0; i<n; i++)
    {
        LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],alpha);
    }
#else
#ifdef _OPENMP
    nprocs = omp_get_max_threads();
#else
    nprocs = 1;
#endif

    /* Duplicate alpha into both lanes. */
    aa[1] = alpha;
    aa[0] = aa[1];

#ifdef _OPENMP
#ifndef USE_SSE2
    #pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
#else
    #pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
#endif
#endif
    {
#ifdef _OPENMP
        my_rank = omp_get_thread_num();
#else
        my_rank = 0;
#endif
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        /* Pairs of elements. */
        i = is;
        while( i<ie-1 )
        {
            LIS_QUAD_MULD2_SSE2(x[i],xl[i],x[i],xl[i],aa[0]);
            i += 2;
        }

        /* Remaining element, if the range length is odd. */
        while( i<ie )
        {
            LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],alpha);
            i++;
        }
    }
#endif

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * Extract the main diagonal of A into d[0..n-1].
 *
 *   A  matrix; reads A->n, A->nnd, A->is_splited and either A->D->value
 *      (split storage) or A->index / A->value (unsplit storage)
 *   d  output array with room for n entries
 *
 * Split storage: the diagonal is copied straight from A->D->value.
 *
 * Unsplit storage: the stored diagonal whose offset A->index[j] is 0 is
 * located and copied.  The value layout differs between builds: with OpenMP
 * each thread's row range [is,ie) is stored as its own block starting at
 * is*nnd, with diagonal j at j*(ie-is) inside it; without OpenMP diagonal j
 * occupies value[j*n .. j*n+n-1].
 *
 * If no stored diagonal has offset 0, every diagonal element is an
 * unstored zero, so d is filled with zeros.  (Previously the search fell
 * off the end with j == nnd and the copy read past the value array.)
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_get_diagonal_dia(LIS_MATRIX A, LIS_SCALAR d[])
{
    LIS_INT i,j;
    LIS_INT n,nnd;
#ifdef _OPENMP
    LIS_INT is,ie,my_rank,nprocs;
#endif

    LIS_DEBUG_FUNC_IN;

    n   = A->n;
    nnd = A->nnd;

    if( A->is_splited )
    {
#ifdef _OPENMP
        #pragma omp parallel for private(i)
#endif
        for(i=0; i<n; i++)
        {
            d[i] = A->D->value[i];
        }
    }
    else
    {
        /* Find the stored diagonal with offset 0 (the main diagonal). */
        for(j=0;j<nnd;j++)
        {
            if( A->index[j]==0 ) break;
        }

        if( j==nnd )
        {
            /* Main diagonal is not stored: all its entries are zero. */
            for(i=0;i<n;i++)
            {
                d[i] = 0.0;
            }
        }
        else
        {
#ifdef _OPENMP
            nprocs = omp_get_max_threads();
            #pragma omp parallel private(is,ie,my_rank)
            {
                my_rank = omp_get_thread_num();
                LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
                /* Diagonal j inside this thread's block of the value array. */
                memcpy(&d[is],&A->value[is*nnd+j*(ie-is)],(ie-is)*sizeof(LIS_SCALAR));
            }
#else
            for(i=0;i<n;i++)
            {
                d[i] = A->value[j*n+i];
            }
#endif
        }
    }

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * Matrix-vector product y = A*x for a matrix whose entries are stored
 * column-slab-wise: slab j occupies value[j*n .. j*n+n-1] and
 * index[j*n + i] is the column of the j-th stored entry of row i.
 *
 *   A  matrix; reads A->n, A->is_splited and either A->D / A->L / A->U
 *      (split storage) or A->maxnzr / A->index / A->value
 *   x  input vector
 *   y  output vector (n entries are written)
 *
 * Split storage: y starts as D*x and the L and U parts are then added,
 * serially.  Unsplit storage: each thread zeroes and accumulates its own
 * row range [is,ie) obtained from LIS_GET_ISIE.
 */
void lis_matvec_ell(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
{
    LIS_INT i,j,off,is,ie;
    LIS_INT n,maxnzr,nprocs,my_rank;

    n = A->n;

    if( !A->is_splited )
    {
        maxnzr = A->maxnzr;
#ifdef _OPENMP
        nprocs = omp_get_max_threads();
#else
        nprocs = 1;
#endif

#ifdef _OPENMP
        #pragma omp parallel private(i,j,off,is,ie,my_rank)
#endif
        {
#ifdef _OPENMP
            my_rank = omp_get_thread_num();
#else
            my_rank = 0;
#endif
            LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

            /* Clear this thread's part of the result. */
#ifdef USE_VEC_COMP
            #pragma cdir nodep
#endif
            for(i=is;i<ie;i++)
            {
                y[i] = 0.0;
            }

            /* Add one slab at a time. */
            for(j=0;j<maxnzr;j++)
            {
                off = j*n;
#ifdef USE_VEC_COMP
                #pragma cdir nodep
#endif
                for(i=is;i<ie;i++)
                {
                    y[i] += A->value[off + i] * x[A->index[off + i]];
                }
            }
        }
        return;
    }

    /* Split storage: diagonal first ... */
#ifdef USE_VEC_COMP
    #pragma cdir nodep
#endif
    for(i=0; i<n; i++)
    {
        y[i] = A->D->value[i]*x[i];
    }

    /* ... then the strictly lower part ... */
    for(j=0;j<A->L->maxnzr;j++)
    {
        off = j*n;
#ifdef USE_VEC_COMP
        #pragma cdir nodep
#endif
        for(i=0;i<n;i++)
        {
            y[i] += A->L->value[off + i] * x[A->L->index[off + i]];
        }
    }

    /* ... and the strictly upper part. */
    for(j=0;j<A->U->maxnzr;j++)
    {
        off = j*n;
#ifdef USE_VEC_COMP
        #pragma cdir nodep
#endif
        for(i=0;i<n;i++)
        {
            y[i] += A->U->value[off + i] * x[A->U->index[off + i]];
        }
    }
}
/*
 * Transposed matrix-vector product y = A^T * x for a matrix stored
 * slab-wise: slab j occupies value[j*n .. j*n+n-1] and index[j*n + i] is
 * the column of the j-th stored entry of row i.  Each stored entry of row i
 * contributes value*x[i] to y[column].
 *
 *   A  matrix; reads A->n, A->np, A->is_splited and either A->D / A->L /
 *      A->U (split storage) or A->maxnzr / A->index / A->value
 *   x  input vector
 *   y  output vector
 *
 * Split storage: y[0..n-1] starts as D*x and the L and U contributions are
 * scattered into it, serially.
 *
 * Unsplit storage with OpenMP: because different rows scatter into the same
 * y entries, each thread accumulates into its own np-long slice of a
 * scratch buffer w, and the slices are summed into y[0..np-1] afterwards.
 * If the scratch buffer cannot be allocated the product is computed
 * serially instead of dereferencing a NULL pointer (this function has no
 * way to report an error).
 *
 * Unsplit storage without OpenMP: y[0..n-1] is zeroed and the contributions
 * are scattered serially.
 */
void lis_matvect_ell(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
{
    LIS_INT i,j,jj;
    LIS_INT n,np,maxnzr;
#ifdef _OPENMP
    LIS_INT k,is,ie,nprocs;
    LIS_SCALAR t;
    LIS_SCALAR *w;
#endif

    n  = A->n;
    np = A->np;

    if( A->is_splited )
    {
#ifdef USE_VEC_COMP
        #pragma cdir nodep
#endif
        for(i=0; i<n; i++)
        {
            y[i] = A->D->value[i]*x[i];
        }
        for(j=0;j<A->L->maxnzr;j++)
        {
            jj = j*n;
#ifdef USE_VEC_COMP
            #pragma cdir nodep
#endif
            for(i=0;i<n;i++)
            {
                y[A->L->index[jj + i]] += A->L->value[jj + i] * x[i];
            }
        }
        for(j=0;j<A->U->maxnzr;j++)
        {
            jj = j*n;
#ifdef USE_VEC_COMP
            #pragma cdir nodep
#endif
            for(i=0;i<n;i++)
            {
                y[A->U->index[jj + i]] += A->U->value[jj + i] * x[i];
            }
        }
    }
    else
    {
        maxnzr = A->maxnzr;
#ifdef _OPENMP
        nprocs = omp_get_max_threads();
        w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_ell::w" );
        if( w!=NULL )
        {
            #pragma omp parallel private(i,j,t,jj,k,is,ie)
            {
                k = omp_get_thread_num();
                LIS_GET_ISIE(k,nprocs,n,is,ie);

                /* Clear every per-thread slice (implicit barrier at the end). */
                #pragma omp for
                for(j=0;j<nprocs;j++)
                {
                    memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
                }

                /* Scatter this thread's rows into its private slice. */
                for(j=0;j<maxnzr;j++)
                {
                    jj = j*n;
#ifdef USE_VEC_COMP
                    #pragma cdir nodep
#endif
                    for(i=is;i<ie;i++)
                    {
                        w[k*np + A->index[jj + i]] += A->value[jj + i] * x[i];
                    }
                }

                /* All slices must be complete before they are summed. */
                #pragma omp barrier
                #pragma omp for
#ifdef USE_VEC_COMP
                #pragma cdir nodep
#endif
                for(i=0;i<np;i++)
                {
                    t = 0.0;
                    for(j=0;j<nprocs;j++)
                    {
                        t += w[j*np+i];
                    }
                    y[i] = t;
                }
            }
            lis_free(w);
        }
        else
        {
            /*
             * No scratch memory: compute the same result serially.  The
             * parallel path defines y[0..np-1], so the same range is
             * cleared here.
             */
            for(i=0;i<np;i++)
            {
                y[i] = 0.0;
            }
            for(j=0;j<maxnzr;j++)
            {
                jj = j*n;
                for(i=0;i<n;i++)
                {
                    y[A->index[jj + i]] += A->value[jj + i] * x[i];
                }
            }
        }
#else
#ifdef USE_VEC_COMP
        #pragma cdir nodep
#endif
        for(i=0; i<n; i++)
        {
            y[i] = 0.0;
        }
        for(j=0;j<maxnzr;j++)
        {
            jj = j*n;
#ifdef USE_VEC_COMP
            #pragma cdir nodep
#endif
            for(i=0;i<n;i++)
            {
                y[A->index[jj + i]] += A->value[jj + i] * x[i];
            }
        }
#endif
    }
}
/*
 * Determine how a global index range of length *global_n is divided among
 * the processes of comm, and report this process's share.
 *
 *   comm      communicator (only used when built with USE_MPI)
 *   local_n   in/out: local size
 *   global_n  in/out: global size
 *   ranges    out: with MPI, a newly allocated array of *nprocs+1 prefix
 *             offsets (ranges[p] .. ranges[p+1] belongs to process p);
 *             without MPI, NULL
 *   is, ie    out: this process's half-open range [*is,*ie)
 *   nprocs    out: number of processes (1 without MPI)
 *   my_rank   out: rank of this process (0 without MPI)
 *
 * Two modes, selected by the incoming local sizes:
 *   - all local sizes are zero (with MPI: their sum over comm is zero):
 *     *global_n is taken as given and split with LIS_GET_ISIE; *local_n is
 *     set from the resulting range;
 *   - otherwise the local sizes are taken as given; *global_n becomes their
 *     sum and the ranges are their prefix sums.
 *
 * Returns LIS_SUCCESS, or LIS_OUT_OF_MEMORY if the ranges array cannot be
 * allocated (MPI builds only).
 */
LIS_INT lis_ranges_create(LIS_Comm comm, LIS_INT *local_n, LIS_INT *global_n, LIS_INT **ranges, LIS_INT *is, LIS_INT *ie, LIS_INT *nprocs, LIS_INT *my_rank)
{
#ifdef USE_MPI
    LIS_INT p,total;
    LIS_INT *tranges;
    int int_nprocs,int_my_rank;

    LIS_DEBUG_FUNC_IN;

    MPI_Comm_size(comm,&int_nprocs);
    MPI_Comm_rank(comm,&int_my_rank);
    *nprocs  = int_nprocs;
    *my_rank = int_my_rank;

    tranges = (LIS_INT *)lis_malloc( (*nprocs+1)*sizeof(LIS_INT),"lis_ranges_create::tranges" );
    if( tranges==NULL )
    {
        LIS_SETERR_MEM((*nprocs+1)*sizeof(LIS_INT));
        return LIS_OUT_OF_MEMORY;
    }

    /* Did any process specify a local size? */
    MPI_Allreduce(local_n,&total,1,LIS_MPI_INT,MPI_SUM,comm);
    if( total==0 )
    {
        /* No: split the global size and publish every process's end offset. */
        LIS_GET_ISIE(*my_rank,*nprocs,*global_n,*is,*ie);
        *local_n = *ie-*is;
        MPI_Allgather(ie,1,LIS_MPI_INT,&tranges[1],1,LIS_MPI_INT,comm);
        tranges[0] = 0;
    }
    else
    {
        /* Yes: gather the local sizes and turn them into prefix sums. */
        MPI_Allgather(local_n,1,LIS_MPI_INT,&tranges[1],1,LIS_MPI_INT,comm);
        tranges[0] = 0;
        for(p=0;p<*nprocs;p++)
        {
            tranges[p+1] += tranges[p];
        }
        *global_n = tranges[*nprocs];
        *is       = tranges[*my_rank];
        *ie       = tranges[*my_rank+1];
    }

    *ranges = tranges;
#else
    LIS_DEBUG_FUNC_IN;

    *nprocs  = 1;
    *my_rank = 0;

    if( *local_n==0 )
    {
        /* Only the global size was given: this process owns all of it. */
        *local_n = *global_n;
        *is      = 0;
        *ie      = *global_n;
    }
    else
    {
        /* The local size was given: it is also the global size. */
        *global_n = *local_n;
        *is       = 0;
        *ie       = *local_n;
    }

    /* A single process needs no ranges table. */
    *ranges = NULL;
#endif

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * Transposed triangular / SSOR-type solve on a split matrix whose L and U
 * parts are stored row-wise (ptr/index/value) and whose WD vector is
 * multiplied into the solution.
 * NOTE(review): WD presumably holds the inverted (working) diagonal;
 * confirm against the code that fills it.
 *
 *   A     matrix; reads A->n, A->WD, A->L, A->U
 *   B     right-hand side (copied into X first)
 *   X     solution vector, updated in place
 *   flag  LIS_MATRIX_LOWER, LIS_MATRIX_UPPER or LIS_MATRIX_SSOR; any other
 *         value leaves X as a copy of B
 *
 * Because the transpose is applied, every row i scatters its scaled value
 * into the entries x[index[j]] of its stored columns instead of gathering
 * from them.
 *
 * In OpenMP builds the SSOR case is block-local: each thread sweeps its own
 * row range [is,ie) and skips couplings that leave the range.  When
 * USE_QUAD_PRECISION is defined and B is not in default precision, the SSOR
 * case uses the hi/lo pair arithmetic macros.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matrix_solvet_csr(LIS_MATRIX A, LIS_VECTOR B, LIS_VECTOR X, LIS_INT flag)
{
    LIS_INT i,j,jj,n;
    LIS_SCALAR t;
    LIS_SCALAR *x;
#ifdef _OPENMP
    LIS_INT is,ie,my_rank,nprocs;
#endif
#ifdef USE_QUAD_PRECISION
    LIS_QUAD w1,w2;
    LIS_SCALAR *xl;
#endif
    LIS_QUAD_DECLAR;

    LIS_DEBUG_FUNC_IN;

    n = A->n;
    x = X->value;

    /* X <- B, including the low words when B carries them. */
#ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
    if( B->precision!=LIS_PRECISION_DEFAULT )
    {
        lis_vector_copyex_mm(B,X);
    }
    else
#endif
    {
        lis_vector_copy(B,X);
    }

    switch(flag)
    {
    case LIS_MATRIX_LOWER:
        /* Forward sweep scattering through the rows of U. */
        for(i=0;i<n;i++)
        {
            x[i] *= A->WD->value[i];
            for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
            {
                jj = A->U->index[j];
                x[jj] -= A->U->value[j] * x[i];
            }
        }
        break;

    case LIS_MATRIX_UPPER:
        /* Backward sweep scattering through the rows of L. */
        for(i=n-1;i>=0;i--)
        {
            x[i] *= A->WD->value[i];
            for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
            {
                jj = A->L->index[j];
                x[jj] -= A->L->value[j] * x[i];
            }
        }
        break;

    case LIS_MATRIX_SSOR:
#ifdef USE_QUAD_PRECISION
        if( B->precision==LIS_PRECISION_DEFAULT )
        {
#endif
#ifdef _OPENMP
            nprocs = omp_get_max_threads();
            #pragma omp parallel private(i,j,jj,t,is,ie,my_rank)
            {
                my_rank = omp_get_thread_num();
                LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

                /* Forward sweep; x[i] itself is not overwritten here. */
                for(i=is;i<ie;i++)
                {
                    t = x[i] * A->WD->value[i];
                    for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
                    {
                        jj = A->U->index[j];
                        if( jj>=is && jj<ie )
                        {
                            x[jj] -= A->U->value[j] * t;
                        }
                    }
                }

                /* Backward sweep; x[i] is finalised. */
                for(i=ie-1;i>=is;i--)
                {
                    t    = x[i] * A->WD->value[i];
                    x[i] = t;
                    for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
                    {
                        jj = A->L->index[j];
                        if( jj>=is )
                        {
                            x[jj] -= A->L->value[j] * t;
                        }
                    }
                }
            }
#else
            /* Forward sweep; x[i] itself is not overwritten here. */
            for(i=0;i<n;i++)
            {
                t = x[i] * A->WD->value[i];
                for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
                {
                    x[A->U->index[j]] -= A->U->value[j] * t;
                }
            }

            /* Backward sweep; x[i] is finalised. */
            for(i=n-1;i>=0;i--)
            {
                t    = x[i] * A->WD->value[i];
                x[i] = t;
                for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
                {
                    x[A->L->index[j]] -= A->L->value[j] * t;
                }
            }
#endif
#ifdef USE_QUAD_PRECISION
        }
        else
        {
#ifdef _OPENMP
            nprocs = omp_get_max_threads();
#ifndef USE_SSE2
            #pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
#else
            #pragma omp parallel private(i,j,jj,is,ie,w1,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
#endif
            {
                my_rank = omp_get_thread_num();
                LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

                /* Forward sweep: w1 = x[i]*WD[i], scattered through U. */
                for(i=is;i<ie;i++)
                {
#ifndef USE_SSE2
                    LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
                    LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
                    for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
                    {
                        jj = A->U->index[j];
                        if( jj>=is && jj<ie )
                        {
                            /* x[jj] -= U[j] * w1 */
#ifndef USE_SSE2
                            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#else
                            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#endif
                        }
                    }
                }

                /* Backward sweep: x[i] = w1 = x[i]*WD[i], scattered through L. */
                for(i=ie-1;i>=is;i--)
                {
#ifndef USE_SSE2
                    LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
                    LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
                    x[i]  = w1.hi;
                    xl[i] = w1.lo;
                    for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
                    {
                        jj = A->L->index[j];
                        if( jj>=is && jj<ie )
                        {
                            /* x[jj] -= L[j] * w1 */
#ifndef USE_SSE2
                            LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#else
                            LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#endif
                        }
                    }
                }
            }
#else
            /* Forward sweep: w1 = x[i]*WD[i], scattered through U. */
            for(i=0;i<n;i++)
            {
#ifndef USE_SSE2
                LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
                LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
                for(j=A->U->ptr[i];j<A->U->ptr[i+1];j++)
                {
                    jj = A->U->index[j];
                    /* x[jj] -= U[j] * w1 */
#ifndef USE_SSE2
                    LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#else
                    LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->U->value[j]);
#endif
                }
            }

            /* Backward sweep: x[i] = w1 = x[i]*WD[i], scattered through L. */
            for(i=n-1;i>=0;i--)
            {
#ifndef USE_SSE2
                LIS_QUAD_MULD(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#else
                LIS_QUAD_MULD_SSE2(w1.hi,w1.lo,x[i],xl[i],A->WD->value[i]);
#endif
                x[i]  = w1.hi;
                xl[i] = w1.lo;
                for(j=A->L->ptr[i];j<A->L->ptr[i+1];j++)
                {
                    jj = A->L->index[j];
                    /* x[jj] -= L[j] * w1 */
#ifndef USE_SSE2
                    LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#else
                    LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],w1.hi,w1.lo,-A->L->value[j]);
#endif
                }
            }
#endif
        }
#endif
        break;
    }

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * Apply the transpose of the ILUT preconditioner held in solver->precon to
 * B, storing the result in X.
 *
 *   solver  reads solver->precon (L, U, D) and solver->A->n
 *   B       input vector
 *   X       output vector; first set to a copy of B, then updated in place
 *
 * Algorithm (per row range [is,ie)):
 *   1. ascending i:  x[i] = D[i]*x[i], then x[jj] -= U[i][j]*x[i] for every
 *      stored entry (jj = U->index[i][j]) of row i of U;
 *   2. descending i: x[jj] -= L[i][j]*x[i] for every stored entry of row i
 *      of L.
 * With OpenMP each thread handles the range LIS_GET_ISIE assigns to it;
 * without OpenMP the single range is [0,n).
 *
 * When USE_QUAD_PRECISION is defined and B is not in default precision the
 * same sweeps run on hi/lo pairs.  In that case B is copied with
 * lis_vector_copyex_mm in both the OpenMP and the serial build: the serial
 * build used to call lis_vector_copy here, which is the routine the
 * default-precision path uses and, unlike the OpenMP build, presumably
 * left X's low words uncopied before the sweeps read and updated them.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_psolvet_ilut_csr(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
    LIS_INT i,j,jj,n;
    LIS_INT is,ie;
#ifdef _OPENMP
    LIS_INT my_rank,nprocs;
#endif
    LIS_SCALAR *x;
    LIS_MATRIX_ILU L,U;
    LIS_VECTOR D;
    LIS_PRECON precon;
    LIS_QUAD_DECLAR;
#ifdef USE_QUAD_PRECISION
    LIS_SCALAR *xl;
#endif

    LIS_DEBUG_FUNC_IN;

    precon = solver->precon;
    L = precon->L;
    U = precon->U;
    D = precon->D;
    x = X->value;
#ifdef USE_QUAD_PRECISION
    xl = X->value_lo;
#endif
    n = solver->A->n;
#ifdef _OPENMP
    nprocs = omp_get_max_threads();
#endif

#ifdef USE_QUAD_PRECISION
    if( B->precision==LIS_PRECISION_DEFAULT )
    {
#endif
        lis_vector_copy(B,X);

#ifdef _OPENMP
        #pragma omp parallel private(i,j,jj,is,ie,my_rank)
#endif
        {
#ifdef _OPENMP
            my_rank = omp_get_thread_num();
            LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
#else
            is = 0;
            ie = n;
#endif
            /* Scale by D and scatter through the rows of U. */
            for(i=is;i<ie;i++)
            {
                x[i] = D->value[i]*x[i];
                for(j=0;j<U->nnz[i];j++)
                {
                    jj = U->index[i][j];
                    x[jj] -= U->value[i][j] * x[i];
                }
            }

            /* Scatter through the rows of L, last row first. */
            for(i=ie-1;i>=is;i--)
            {
                for(j=0;j<L->nnz[i];j++)
                {
                    jj = L->index[i][j];
                    x[jj] -= L->value[i][j] * x[i];
                }
            }
        }
#ifdef USE_QUAD_PRECISION
    }
    else
    {
        /* Copy both the high and the low words of B. */
        lis_vector_copyex_mm(B,X);

#ifdef _OPENMP
#ifndef USE_SSE2
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
#else
        #pragma omp parallel private(i,j,jj,is,ie,my_rank,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
#endif
#endif
        {
#ifdef _OPENMP
            my_rank = omp_get_thread_num();
            LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
#else
            is = 0;
            ie = n;
#endif
            /* Scale by D and scatter through the rows of U. */
            for(i=is;i<ie;i++)
            {
                /* x[i] = D[i]*x[i] */
#ifndef USE_SSE2
                LIS_QUAD_MULD(x[i],xl[i],x[i],xl[i],D->value[i]);
#else
                LIS_QUAD_MULD_SSE2(x[i],xl[i],x[i],xl[i],D->value[i]);
#endif
                for(j=0;j<U->nnz[i];j++)
                {
                    jj = U->index[i][j];
                    /* x[jj] -= U[i][j]*x[i] */
#ifndef USE_SSE2
                    LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
#else
                    LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-U->value[i][j]);
#endif
                }
            }

            /* Scatter through the rows of L, last row first. */
            for(i=ie-1;i>=is;i--)
            {
                for(j=0;j<L->nnz[i];j++)
                {
                    jj = L->index[i][j];
                    /* x[jj] -= L[i][j]*x[i] */
#ifndef USE_SSE2
                    LIS_QUAD_FMAD(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
#else
                    LIS_QUAD_FMAD_SSE2(x[jj],xl[jj],x[jj],xl[jj],x[i],xl[i],-L->value[i][j]);
#endif
                }
            }
        }
    }
#endif

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * Build an incomplete LU factorisation with a drop tolerance and a fill
 * limit for the row-wise stored (ptr/index/value) matrix solver->A and
 * store the factors in precon->L, precon->U and precon->D (D holds the
 * reciprocals of the pivots).
 *
 * Parameters read from solver->params:
 *   LIS_PARAMS_DROP  drop tolerance; update terms smaller than
 *                    tol * (mean absolute value of the row) that would
 *                    create a new entry are discarded
 *   LIS_PARAMS_RATE  fill rate m; at most
 *                    lfil = (LIS_INT)(nnz/(2n) * m) entries are kept in
 *                    each row of L and of U (the largest in magnitude)
 *
 * Per row i the working arrays hold the partially eliminated row:
 *   w     values; L part at [0,lenl), pivot at [i], U part at (i,i+lenu]
 *   jbuf  column numbers in the same positions as w
 *   iw    inverse map column -> position in w/jbuf, or -1
 *   wn    absolute values, used only for the magnitude sort
 *
 * With OpenMP every thread factorises its own diagonal block (rows and
 * columns in [is,ie), couplings outside the block are ignored) using its
 * own slice my_rank*n of the working arrays.
 *
 * Changes against the previous revision:
 *   - the OpenMP build computed lfil as (LIS_INT)(nnz/(2n)) * m, i.e. it
 *     truncated before scaling, and so produced a different fill limit
 *     than the serial build; both now use (LIS_INT)(nnz/(2n) * m);
 *   - working arrays that were already allocated are released when a later
 *     allocation fails;
 *   - the out-of-memory report for w states the size actually requested.
 *
 * NOTE(review): L, U and D are still not released on the error paths (no
 * destroy routine is visible from here), and the per-row malloc() results
 * are not checked.
 *
 * Returns LIS_SUCCESS, LIS_OUT_OF_MEMORY, or the error code of a failing
 * creation routine.
 */
LIS_INT lis_precon_create_ilut_csr(LIS_SOLVER solver, LIS_PRECON precon)
{
    LIS_INT err;
    LIS_INT i,j,k;
    LIS_INT nprocs;
#ifdef _OPENMP
    LIS_INT jj;
    LIS_INT is,ie,my_rank;
#endif
    LIS_INT n,lfil,len;
    LIS_SCALAR t,tol,m;
    LIS_MATRIX A;
    LIS_MATRIX_ILU L,U;
    LIS_VECTOR D;
    LIS_SCALAR tnorm, tolnorm;
    LIS_SCALAR fact,lxu,*wn,*w;
    LIS_INT lenu,lenl,col,jpos,jrow,upos;
    LIS_INT *jbuf,*iw;

    LIS_DEBUG_FUNC_IN;

    A    = solver->A;
    n    = A->n;
    tol  = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
    m    = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
    /* Scale first, truncate afterwards. */
    lfil = (LIS_INT)(((double)A->nnz/(2.0*n))*m);
#ifdef _OPENMP
    nprocs = omp_get_max_threads();
#else
    nprocs = 1;
#endif

    L = NULL;
    U = NULL;

    err = lis_matrix_ilu_create(n,1,&L);
    if( err ) return err;
    err = lis_matrix_ilu_create(n,1,&U);
    if( err ) return err;
    err = lis_matrix_ilu_setCR(L);
    if( err ) return err;
    err = lis_matrix_ilu_setCR(U);
    if( err ) return err;
    err = lis_vector_duplicate(A,&D);
    if( err )
    {
        return err;
    }

    /* One slice of each working array per thread. */
    w = (LIS_SCALAR *)lis_malloc(nprocs*(n+1)*sizeof(LIS_SCALAR),"lis_precon_create_ilut_csr::w");
    if( w==NULL )
    {
        LIS_SETERR_MEM(nprocs*(n+1)*sizeof(LIS_SCALAR));
        return LIS_OUT_OF_MEMORY;
    }
    wn = (LIS_SCALAR *)lis_malloc(nprocs*n*sizeof(LIS_SCALAR),"lis_precon_create_ilut_csr::w");
    if( wn==NULL )
    {
        LIS_SETERR_MEM(nprocs*n*sizeof(LIS_SCALAR));
        lis_free2(1,w);
        return LIS_OUT_OF_MEMORY;
    }
    jbuf = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_ilut_csr::iw");
    if( jbuf==NULL )
    {
        LIS_SETERR_MEM(nprocs*n*sizeof(LIS_INT));
        lis_free2(2,w,wn);
        return LIS_OUT_OF_MEMORY;
    }
    iw = (LIS_INT *)lis_malloc(nprocs*n*sizeof(LIS_INT),"lis_precon_create_ilut_csr::iw");
    if( iw==NULL )
    {
        LIS_SETERR_MEM(nprocs*n*sizeof(LIS_INT));
        lis_free2(3,w,wn,jbuf);
        return LIS_OUT_OF_MEMORY;
    }

#ifdef _OPENMP
    #pragma omp parallel private(is,ie,my_rank,i,j,k,jj,tnorm,tolnorm,len,lenu,lenl,col,t,jpos,jrow,fact,lxu,upos)
    {
        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);

        for(i=is;i<ie;i++) iw[my_rank*n+i] = -1;

        for(i=is;i<ie;i++)
        {
            /* Mean absolute value of the in-block entries of row i. */
            tnorm = 0;
            k = 0;
            for(j=A->ptr[i];j<A->ptr[i+1];j++)
            {
                jj = A->index[j];
                if( jj<is || jj>=ie ) continue;
                tnorm += fabs(A->value[j]);
                k++;
            }
            tnorm   = tnorm / (double)k;
            tolnorm = tol * tnorm;

            /* Unpack row i into the working arrays. */
            lenu = 0;
            lenl = 0;
            jbuf[my_rank*n+i] = i;
            w[my_rank*n+i]    = 0;
            iw[my_rank*n+i]   = i;

            for(j=A->ptr[i];j<A->ptr[i+1];j++)
            {
                col = A->index[j];
                if( col<is || col>=ie ) continue;
                t = A->value[j];
                if( col < i )
                {
                    jbuf[my_rank*n+lenl] = col;
                    iw[my_rank*n+col]    = lenl;
                    w[my_rank*n+lenl]    = t;
                    lenl++;
                }
                else if( col == i )
                {
                    w[my_rank*n+i] = t;
                }
                else
                {
                    lenu++;
                    jpos = i + lenu;
                    jbuf[my_rank*n+jpos] = col;
                    iw[my_rank*n+col]    = jpos;
                    w[my_rank*n+jpos]    = t;
                }
            }

            j   = -1;
            len = 0;

            /* Eliminate the L part, smallest column first. */
            while( ++j < lenl )
            {
                /* Select the smallest remaining column and move it to j. */
                jrow = jbuf[my_rank*n+j];
                jpos = j;
                for(k=j+1;k<lenl;k++)
                {
                    if( jbuf[my_rank*n+k]<jrow )
                    {
                        jrow = jbuf[my_rank*n+k];
                        jpos = k;
                    }
                }
                if( jpos!=j )
                {
                    col = jbuf[my_rank*n+j];
                    jbuf[my_rank*n+j]    = jbuf[my_rank*n+jpos];
                    jbuf[my_rank*n+jpos] = col;
                    iw[my_rank*n+jrow]   = j;
                    iw[my_rank*n+col]    = jpos;
                    t = w[my_rank*n+j];
                    w[my_rank*n+j]    = w[my_rank*n+jpos];
                    w[my_rank*n+jpos] = t;
                }

                /* Multiplier for row jrow. */
                fact = w[my_rank*n+j] * D->value[jrow];
                w[my_rank*n+j]     = fact;
                iw[my_rank*n+jrow] = -1;

                /* Subtract fact * (row jrow of U) from the working row. */
                for(k=0;k<U->nnz[jrow];k++)
                {
                    col  = U->index[jrow][k];
                    jpos = iw[my_rank*n+col];
                    lxu  = -fact * U->value[jrow][k];

                    /* Drop small terms that would create a new entry. */
                    if( fabs(lxu) < tolnorm && jpos==-1 ) continue;

                    if( col >= i )
                    {
                        if( jpos == -1 )
                        {
                            lenu++;
                            upos = i + lenu;
                            jbuf[my_rank*n+upos] = col;
                            iw[my_rank*n+col]    = upos;
                            w[my_rank*n+upos]    = lxu;
                        }
                        else
                        {
                            w[my_rank*n+jpos] += lxu;
                        }
                    }
                    else
                    {
                        if( jpos == -1 )
                        {
                            jbuf[my_rank*n+lenl] = col;
                            iw[my_rank*n+col]    = lenl;
                            w[my_rank*n+lenl]    = lxu;
                            lenl++;
                        }
                        else
                        {
                            w[my_rank*n+jpos] += lxu;
                        }
                    }
                }
            }

            /* Reset the inverse map for the pivot and the U part. */
            iw[my_rank*n+i] = -1;
            for(j=0;j<lenu;j++)
            {
                iw[ my_rank*n+jbuf[my_rank*n+i+j+1] ] = -1;
            }

            D->value[i] = 1.0 / w[my_rank*n+i];

            /* Keep the len largest entries of the L part. */
            len = _min(lfil,lenl);
            for(j=0;j<lenl;j++)
            {
                wn[my_rank*n+j] = fabs(w[my_rank*n+j]);
                iw[my_rank*n+j] = j;
            }
            lis_sort_di(0,lenl-1,&wn[my_rank*n],&iw[my_rank*n]);
            lis_sort_i(0,len-1,&iw[my_rank*n]);

            L->nnz[i] = len;
            if( len>0 )
            {
                L->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
                L->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
            }
            for(j=0;j<len;j++)
            {
                jpos = iw[my_rank*n+j];
                L->index[i][j] = jbuf[my_rank*n+jpos];
                L->value[i][j] = w[my_rank*n+jpos];
            }
            for(j=0;j<lenl;j++) iw[my_rank*n+j] = -1;

            /* Keep the len largest entries of the U part. */
            len = _min(lfil,lenu);
            for(j=0;j<lenu;j++)
            {
                wn[my_rank*n+j] = fabs(w[my_rank*n+i+j+1]);
                iw[my_rank*n+j] = i+j+1;
            }
            lis_sort_di(0,lenu-1,&wn[my_rank*n],&iw[my_rank*n]);
            lis_sort_i(0,len-1,&iw[my_rank*n]);

            U->nnz[i] = len;
            if( len>0 )
            {
                U->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
                U->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
            }
            for(j=0;j<len;j++)
            {
                jpos = iw[my_rank*n+j];
                U->index[i][j] = jbuf[my_rank*n+jpos];
                U->value[i][j] = w[my_rank*n+jpos];
            }
            for(j=0;j<lenu;j++) iw[my_rank*n+j] = -1;
        }
    }
#else
    for(i=0;i<n;i++) iw[i] = -1;

    for(i=0;i<n;i++)
    {
        /* Mean absolute value of the stored entries of row i. */
        tnorm = 0;
        for(j=A->ptr[i];j<A->ptr[i+1];j++)
        {
            tnorm += fabs(A->value[j]);
        }
        tnorm   = tnorm / (double)(A->ptr[i+1]-A->ptr[i]);
        tolnorm = tol * tnorm;

        /* Unpack row i into the working arrays. */
        lenu = 0;
        lenl = 0;
        jbuf[i] = i;
        w[i]    = 0;
        iw[i]   = i;

        for(j=A->ptr[i];j<A->ptr[i+1];j++)
        {
            col = A->index[j];
#ifdef USE_MPI
            /* Columns beyond the local block are ignored. */
            if( col>n-1 ) continue;
#endif
            t = A->value[j];
            if( col < i )
            {
                jbuf[lenl] = col;
                iw[col]    = lenl;
                w[lenl]    = t;
                lenl++;
            }
            else if( col == i )
            {
                w[i] = t;
            }
            else
            {
                lenu++;
                jpos = i + lenu;
                jbuf[jpos] = col;
                iw[col]    = jpos;
                w[jpos]    = t;
            }
        }

        j   = -1;
        len = 0;

        /* Eliminate the L part, smallest column first. */
        while( ++j < lenl )
        {
            /* Select the smallest remaining column and move it to j. */
            jrow = jbuf[j];
            jpos = j;
            for(k=j+1;k<lenl;k++)
            {
                if( jbuf[k]<jrow )
                {
                    jrow = jbuf[k];
                    jpos = k;
                }
            }
            if( jpos!=j )
            {
                col = jbuf[j];
                jbuf[j]    = jbuf[jpos];
                jbuf[jpos] = col;
                iw[jrow]   = j;
                iw[col]    = jpos;
                t = w[j];
                w[j]    = w[jpos];
                w[jpos] = t;
            }

            /* Multiplier for row jrow. */
            fact = w[j] * D->value[jrow];
            w[j] = fact;
            iw[jrow] = -1;

            /* Subtract fact * (row jrow of U) from the working row. */
            for(k=0;k<U->nnz[jrow];k++)
            {
                col  = U->index[jrow][k];
                jpos = iw[col];
                lxu  = -fact * U->value[jrow][k];

                /* Drop small terms that would create a new entry. */
                if( fabs(lxu) < tolnorm && jpos==-1 ) continue;

                if( col >= i )
                {
                    if( jpos == -1 )
                    {
                        lenu++;
                        upos = i + lenu;
                        jbuf[upos] = col;
                        iw[col]    = upos;
                        w[upos]    = lxu;
                    }
                    else
                    {
                        w[jpos] += lxu;
                    }
                }
                else
                {
                    if( jpos == -1 )
                    {
                        jbuf[lenl] = col;
                        iw[col]    = lenl;
                        w[lenl]    = lxu;
                        lenl++;
                    }
                    else
                    {
                        w[jpos] += lxu;
                    }
                }
            }
        }

        /* Reset the inverse map for the pivot and the U part. */
        iw[i] = -1;
        for(j=0;j<lenu;j++)
        {
            iw[ jbuf[i+j+1] ] = -1;
        }

        D->value[i] = 1.0 / w[i];

        /* Keep the len largest entries of the L part. */
        len = _min(lfil,lenl);
        for(j=0;j<lenl;j++)
        {
            wn[j] = fabs(w[j]);
            iw[j] = j;
        }
        lis_sort_di(0,lenl-1,wn,iw);
        lis_sort_i(0,len-1,iw);

        L->nnz[i] = len;
        if( len>0 )
        {
            L->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
            L->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
        }
        for(j=0;j<len;j++)
        {
            jpos = iw[j];
            L->index[i][j] = jbuf[jpos];
            L->value[i][j] = w[jpos];
        }
        for(j=0;j<lenl;j++) iw[j] = -1;

        /* Keep the len largest entries of the U part. */
        len = _min(lfil,lenu);
        for(j=0;j<lenu;j++)
        {
            wn[j] = fabs(w[i+j+1]);
            iw[j] = i+j+1;
        }
        lis_sort_di(0,lenu-1,wn,iw);
        lis_sort_i(0,len-1,iw);

        U->nnz[i] = len;
        if( len>0 )
        {
            U->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
            U->value[i] = (LIS_SCALAR *)malloc(len*sizeof(LIS_SCALAR));
        }
        for(j=0;j<len;j++)
        {
            jpos = iw[j];
            U->index[i][j] = jbuf[jpos];
            U->value[i][j] = w[jpos];
        }
        for(j=0;j<lenu;j++) iw[j] = -1;
    }
#endif

    precon->L = L;
    precon->U = U;
    precon->D = D;

    lis_free2(4,w,iw,wn,jbuf);

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
/*
 * Quad-precision (hi/lo pair) norm of vx: the elements are folded into an
 * accumulator with the LIS_QUAD_FSA family of macros and LIS_QUAD_SQRT is
 * applied to the total, which is stored in val->hi[0] / val->lo[0].
 * NOTE(review): going by the name this is the 2-norm, i.e. FSA presumably
 * adds the square of each element; the macro bodies are not visible here.
 *
 *   vx   input vector (value / value_lo); vx->work is used as scratch:
 *        work[0..3] paired accumulator, work[8..9] local total,
 *        work[10..11] reduced total
 *   val  output quad value
 *
 * With OpenMP every thread accumulates its row range into its own slot of
 * the global scratch array lis_vec_tmp (slots are LIS_VEC_TMP_PADD entries
 * apart) and the slots are summed serially afterwards.  With USE_FMA2_SSE2
 * elements are accumulated two at a time into a two-lane accumulator whose
 * lanes are then added; a trailing odd element is folded in separately.
 * With MPI the local total is reduced over vx->comm before the square root.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_nrm2ex_mm(LIS_VECTOR vx, LIS_QUAD_PTR *val)
{
    LIS_INT i,n;
    LIS_SCALAR *x,*xl;
    LIS_QUAD_PTR dotm2,dotm,tmpm;
#ifdef _OPENMP
    LIS_INT is,ie,nprocs,my_rank;
    LIS_SCALAR *gt;
#endif
#ifdef USE_MPI
    MPI_Comm comm;
#endif
    LIS_QUAD_DECLAR;

    LIS_DEBUG_FUNC_IN;

    n  = vx->n;
    x  = vx->value;
    xl = vx->value_lo;

    /* Scratch locations inside the vector's work area. */
    dotm2.hi = &vx->work[0];
    dotm2.lo = &vx->work[2];
    dotm.hi  = &vx->work[8];
    dotm.lo  = &vx->work[9];
    tmpm.hi  = &vx->work[10];
    tmpm.lo  = &vx->work[11];
#ifdef USE_MPI
    comm = vx->comm;
#endif

#ifdef _OPENMP
    gt     = lis_vec_tmp;
    nprocs = omp_get_max_threads();
#ifndef USE_SSE2
    #pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
#else
    #pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
#endif
    {
        LIS_SCALAR *part;   /* this thread's accumulator slot */

        my_rank = omp_get_thread_num();
        LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
        part = &gt[my_rank*LIS_VEC_TMP_PADD];

#ifndef USE_FMA2_SSE2
        /* part[0] = hi, part[1] = lo */
        part[0] = part[1] = 0.0;
        #pragma cdir nodep
        for(i=is;i<ie;i++)
        {
            LIS_QUAD_FSA(part[0],part[1],part[0],part[1],x[i],xl[i]);
        }
#else
        /* part[0..1] = hi lanes, part[2..3] = lo lanes */
        part[0] = part[1] = 0.0;
        part[2] = part[3] = 0.0;
#ifdef USE_VEC_COMP
        #pragma cdir nodep
#endif
        for(i=is;i<ie-1;i+=2)
        {
            LIS_QUAD_FSA2_SSE2(part[0],part[2],part[0],part[2],x[i],xl[i]);
        }

        /* Add the two lanes; the result is left in part[0] (hi), part[1] (lo). */
        LIS_QUAD_ADD_SSE2(part[0],part[1],part[0],part[2],part[1],part[3]);

        /* Remaining element, if the range length is odd. */
        for(;i<ie;i++)
        {
            LIS_QUAD_FSA_SSE2(part[0],part[1],part[0],part[1],x[i],xl[i]);
        }
#endif
    }

    /* Sum the per-thread partial results. */
    dotm.hi[0] = dotm.lo[0] = 0.0;
    for(i=0;i<nprocs;i++)
    {
#ifndef USE_SSE2
        LIS_QUAD_ADD(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
#else
        LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
#endif
    }
#else
#ifndef USE_FMA2_SSE2
    dotm.hi[0] = dotm.lo[0] = 0.0;
    #pragma cdir nodep
    for(i=0;i<n;i++)
    {
        LIS_QUAD_FSA(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
    }
#else
    /* Two-lane accumulator. */
    dotm2.hi[0] = dotm2.hi[1] = 0.0;
    dotm2.lo[0] = dotm2.lo[1] = 0.0;
    for(i=0;i<n-1;i+=2)
    {
        LIS_QUAD_FSA2_SSE2(dotm2.hi[0],dotm2.lo[0],dotm2.hi[0],dotm2.lo[0],x[i],xl[i]);
    }

    /* Add the two lanes into the scalar accumulator. */
    LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm2.hi[0],dotm2.lo[0],dotm2.hi[1],dotm2.lo[1]);

    /* Remaining element, if n is odd. */
    for(;i<n;i++)
    {
        LIS_QUAD_FSA_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
    }
#endif
#endif

#ifdef USE_MPI
    /* Combine the local totals of all processes, then take the root. */
    MPI_Allreduce(dotm.hi,tmpm.hi,1,LIS_MPI_MSCALAR,LIS_MPI_MSUM,comm);
#ifndef USE_SSE2
    LIS_QUAD_SQRT(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
#else
    LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
#endif
#else
#ifndef USE_SSE2
    LIS_QUAD_SQRT(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
#else
    LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
#endif
#endif

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}