/*
 * lis_quad_sub - subtract two quad-precision values stored as (hi, lo) pairs.
 *
 * Computes *a = *b - *c by handing b and the negated hi/lo parts of c to the
 * quad-precision addition macro.  The SSE2 flavour of the macro is selected
 * when USE_SSE2 is defined, the scalar flavour otherwise.
 *
 * a : destination (written through a->hi and a->lo)
 * b : minuend
 * c : subtrahend
 */
void lis_quad_sub(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c)
{
	/* Temporaries required by the LIS_QUAD_* macros. */
	LIS_QUAD_DECLAR;

#ifdef USE_SSE2
	LIS_QUAD_ADD_SSE2(a->hi,a->lo,b->hi,b->lo,-c->hi,-c->lo);
#else
	LIS_QUAD_ADD(a->hi,a->lo,b->hi,b->lo,-c->hi,-c->lo);
#endif
}
/*
 * lis_reduce_mp - exchange quad-precision entries of X with neighbouring
 * processes and accumulate what is received into X.
 *
 * Steps, per neighbour listed in the communication table:
 *   1. Pack the hi and lo parts of the entries of X selected by
 *      import_index (shifted by pad) into wr and post a non-blocking send.
 *   2. Post a non-blocking receive into ws for the range described by
 *      export_ptr.
 *   3. After all receives complete, add each received (hi, lo) pair to the
 *      entry of X selected by export_index using quad-precision addition.
 *   4. Wait for all sends to complete.
 *
 * Buffer layout for a neighbour whose range starts at `first` and holds
 * `count` entries: count hi parts at offset 2*first, followed by count lo
 * parts at offset 2*first+count.
 *
 * Always returns LIS_SUCCESS; MPI return codes are not inspected.
 */
LIS_INT lis_reduce_mp(LIS_COMMTABLE commtable, LIS_VECTOR X)
{
	LIS_INT nb,i,first,count,nneib,pad,idx;
	LIS_SCALAR *x,*xl;
	LIS_SCALAR *ws,*wr;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	nneib = commtable->neibpetot;
	ws    = commtable->ws;
	wr    = commtable->wr;
	pad   = commtable->pad;
	x     = X->value;
	xl    = X->value_lo;

	/* Pack and send: hi parts first, then lo parts, for each neighbour. */
	for(nb=0;nb<nneib;nb++)
	{
		first = commtable->import_ptr[nb];
		count = commtable->import_ptr[nb+1] - first;
		for(i=0;i<count;i++)
		{
			idx = commtable->import_index[first+i] + pad;
			wr[2*first+i]       = x[idx];
			wr[2*first+count+i] = xl[idx];
		}
		MPI_Isend(&wr[2*first],2*count,MPI_DOUBLE,commtable->neibpe[nb],0,commtable->comm,&commtable->req1[nb]);
	}

	/* Post the matching receives. */
	for(nb=0;nb<nneib;nb++)
	{
		first = commtable->export_ptr[nb];
		count = commtable->export_ptr[nb+1] - first;
		MPI_Irecv(&ws[2*first],2*count,MPI_DOUBLE,commtable->neibpe[nb],0,commtable->comm,&commtable->req2[nb]);
	}

	/* The received data is consumed below, so the receives must be done. */
	MPI_Waitall(nneib, commtable->req2, commtable->sta2);

	/* Quad-precision equivalent of x[export_index] += received value. */
	for(nb=0;nb<nneib;nb++)
	{
		first = commtable->export_ptr[nb];
		count = commtable->export_ptr[nb+1] - first;
		for(i=0;i<count;i++)
		{
			idx = commtable->export_index[first+i];
#ifdef USE_SSE2
			LIS_QUAD_ADD_SSE2(x[idx],xl[idx],x[idx],xl[idx],ws[2*first+i],ws[2*first+count+i]);
#else
			LIS_QUAD_ADD(x[idx],xl[idx],x[idx],xl[idx],ws[2*first+i],ws[2*first+count+i]);
#endif
		}
	}

	/* The send buffer wr must not be reused until the sends have finished. */
	MPI_Waitall(nneib, commtable->req1, commtable->sta1);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_mpi_msum - element-wise quad-precision sum of two LIS_QUAD arrays.
 *
 * For every k in [0, *len): inoutvec[k] = inoutvec[k] + invec[k], computed
 * with the quad-precision addition macro on the hi/lo parts.
 *
 * invec    : first operand array (read only by this routine)
 * inoutvec : second operand array, overwritten with the sums
 * len      : pointer to the number of elements
 * datatype : not used by this routine
 *
 * NOTE(review): the parameter list has the shape of an MPI user-defined
 * reduction callback; presumably this is what backs LIS_MPI_MSUM - confirm
 * at the place where the MPI operation is created.
 */
void lis_mpi_msum(LIS_QUAD *invec, LIS_QUAD *inoutvec, LIS_INT *len, MPI_Datatype *datatype)
{
	LIS_INT k,count;
	LIS_QUAD *src,*dst;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	count = *len;
	for(k=0;k<count;k++)
	{
		src = &invec[k];
		dst = &inoutvec[k];
#ifdef USE_SSE2
		LIS_QUAD_ADD_SSE2(dst->hi,dst->lo,dst->hi,dst->lo,src->hi,src->lo);
#else
		LIS_QUAD_ADD(dst->hi,dst->lo,dst->hi,dst->lo,src->hi,src->lo);
#endif
	}

	LIS_DEBUG_FUNC_OUT;
}
/*
 * lis_matvec_ilu - scatter-form product of the row-stored ILU matrix LU with X.
 *
 * The first A->np entries of Y are cleared, then for every stored entry
 * (i, j) of LU (row i in [0, LU->n), column LU->index[i][j]):
 *
 *     Y[ LU->index[i][j] ] += LU->value[i][j] * X[i]
 *
 * Precision:
 *   - Without USE_QUAD_PRECISION, or when X->precision is
 *     LIS_PRECISION_DEFAULT, plain LIS_SCALAR arithmetic is used.
 *   - Otherwise the (value, value_lo) pairs of X and Y are combined with the
 *     LIS_QUAD_* macros (scalar, SSE2 or two-at-a-time FMA2 variants).
 *
 * Parallelism (_OPENMP): because different rows scatter into the same output
 * entry, every thread accumulates into its own slice of a temporary buffer
 * (thread k owns indices [k*np, (k+1)*np)); the slices are summed into Y
 * afterwards.
 *
 * With USE_MPI, X is exchanged with neighbouring processes first.
 *
 * Always returns LIS_SUCCESS.
 * NOTE(review): the result of lis_malloc is used without a NULL check.
 */
LIS_INT lis_matvec_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n,np;
	LIS_SCALAR *x,*y;
	#ifdef _OPENMP
		LIS_INT nprocs,k;
		LIS_SCALAR t,*w;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_INT j0,j1;
		#ifdef _OPENMP
			LIS_SCALAR *ww,*wwl;
		#endif
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	np = A->np;
	n  = LU->n;
	x  = X->value;
	y  = Y->value;

	#ifdef USE_QUAD_PRECISION
		if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs::w" );
			#pragma omp parallel private(i,j,k,jj,t)
			{
				k = omp_get_thread_num();
				/* Clear every per-thread slice. */
				#pragma omp for
				for(j=0;j<nprocs;j++)
				{
					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
				}
				/* Each thread scatters into its own slice only. */
				#pragma omp for
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj     = k*np + LU->index[i][j];
						w[jj] += LU->value[i][j] * X->value[i];
					}
				}
				/* Sum the slices into Y. */
				#pragma omp for
				for(i=0;i<np;i++)
				{
					t = 0.0;
					for(j=0;j<nprocs;j++)
					{
						t += w[j*np+i];
					}
					Y->value[i] = t;
				}
			}
			lis_free(w);
		#else
			for(i=0;i<np;i++)
			{
				Y->value[i] = 0.0;
			}
			for(i=0;i<n;i++)
			{
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					Y->value[jj] += LU->value[i][j] * X->value[i];
				}
			}
		#endif
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifdef _OPENMP
			#ifndef USE_FMA2_SSE2
				nprocs = omp_get_max_threads();
				/* One allocation: hi parts in ww, lo parts in wwl. */
				ww  = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs_mp::ww" );
				wwl = &ww[nprocs*np];
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0;i<n;i++)
					{
						for(j=0;j<LU->nnz[i];j++)
						{
							jj = k*np + LU->index[i][j];
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#else
								LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifndef USE_SSE2
								LIS_QUAD_ADD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#else
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#else
				nprocs = omp_get_max_threads();
				ww  = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvect_crs_mp2::ww" );
				wwl = &ww[nprocs*np];
				#pragma omp parallel private(i,j,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0; i<n; i++)
					{
						/* Entries of the row are processed two at a time. */
						for(j=0;j<LU->nnz[i]-1;j+=2)
						{
							j0 = k*np + LU->index[i][j];
							j1 = k*np + LU->index[i][j+1];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
						/* Leftover entry when the row length is odd. */
						for(;j<LU->nnz[i];j++)
						{
							/*
							 * The k*np offset keeps the update inside this
							 * thread's slice; without it every thread would
							 * write to slice 0 concurrently (data race).
							 */
							j0 = k*np + LU->index[i][j];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifdef USE_SSE2
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#endif
		#else
			#ifndef USE_FMA2_SSE2
				for(i=0;i<np;i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj = LU->index[i][j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#else
							LIS_QUAD_FMAD_SSE2(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#else
				for(i=0; i<np; i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0; i<n; i++)
				{
					/* Entries of the row are processed two at a time. */
					for(j=0;j<LU->nnz[i]-1;j+=2)
					{
						j0 = LU->index[i][j];
						j1 = LU->index[i][j+1];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD2_SSE2_STSD(Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
					/* Leftover entry when the row length is odd. */
					for(;j<LU->nnz[i];j++)
					{
						j0 = LU->index[i][j];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD_SSE2(Y->value[j0],Y->value_lo[j0],Y->value[j0],Y->value_lo[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#endif
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matvect_ilu - gather-form product of the row-stored ILU matrix LU with X.
 *
 * For every row i in [0, LU->n):
 *
 *     Y[i] = sum over j of  LU->value[i][j] * X[ LU->index[i][j] ]
 *
 * Precision:
 *   - Without USE_QUAD_PRECISION, or when X->precision is
 *     LIS_PRECISION_DEFAULT, plain LIS_SCALAR arithmetic is used.
 *   - Otherwise the (value, value_lo) pairs of X and Y are combined with the
 *     LIS_QUAD_* macros (scalar, SSE2 or two-at-a-time FMA2 variants).
 *
 * Parallelism (_OPENMP): rows are independent (each iteration writes only
 * Y[i]), so the row loop is distributed with "parallel for" in every branch.
 *
 * With USE_MPI, X is exchanged with neighbouring processes first.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_matvect_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n;
	LIS_SCALAR t,*x,*y;
	LIS_QUAD_DECLAR;
	#ifdef USE_QUAD_PRECISION
		LIS_INT j0,j1;
		LIS_QUAD_PD tt;
	#endif

	LIS_DEBUG_FUNC_IN;

	n = LU->n;
	x = X->value;
	y = Y->value;

	#ifdef USE_QUAD_PRECISION
		if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
		#pragma omp parallel for private(i,j,jj,t)
		#endif
		for(i=0;i<n;i++)
		{
			t = 0.0;
			for(j=0;j<LU->nnz[i];j++)
			{
				jj = LU->index[i][j];
				t += LU->value[i][j] * X->value[jj];
			}
			Y->value[i] = t;
		}
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifndef USE_FMA2_SSE2
			/*
			 * "parallel for" (not bare "parallel") so that the rows are
			 * split between threads.  With a bare parallel region every
			 * thread would run the whole loop and concurrently reset and
			 * accumulate the same Y entries.
			 */
			#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel for private(i,j,jj,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel for private(i,j,jj,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			#endif
			for(i=0;i<n;i++)
			{
				Y->value[i] = Y->value_lo[i] = 0.0;
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					#ifndef USE_SSE2
						LIS_QUAD_FMAD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#else
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[jj],X->value_lo[jj],LU->value[i][j]);
					#endif
				}
			}
		#else
			#ifdef _OPENMP
			#ifndef USE_SSE2
				#pragma omp parallel for private(i,j,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
			#else
				#pragma omp parallel for private(i,j,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
			#endif
			#endif
			for(i=0;i<n;i++)
			{
				/* Two partial sums, one per lane of the paired update. */
				tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
				for(j=0;j<LU->nnz[i]-1;j+=2)
				{
					j0 = LU->index[i][j];
					j1 = LU->index[i][j+1];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],X->value[j0],X->value_lo[j0],X->value[j1],X->value_lo[j1],LU->value[i][j]);
					#endif
				}
				/* Combine the two partial sums into Y[i]. */
				#ifdef USE_SSE2
					LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],tt.hi[0],tt.lo[0],tt.hi[1],tt.lo[1]);
				#endif
				/* Leftover entry when the row length is odd. */
				for(;j<LU->nnz[i];j++)
				{
					j0 = LU->index[i][j];
					#ifdef USE_SSE2
						LIS_QUAD_FMAD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],X->value[j0],X->value_lo[j0],LU->value[i][j]);
					#endif
				}
			}
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_psolve_adds - iterated application of the configured preconditioner.
 *
 * Using the work vectors W = precon->work[0] and R = precon->work[1]:
 *
 *     X = 0;  R = B;
 *     repeat (LIS_OPTIONS_ADDS_ITER + 1) times:
 *         R[n .. np-1] = 0
 *         W = psolve(R)            via lis_psolve_xxx[LIS_OPTIONS_PRECON]
 *         X[0 .. n-1] += W
 *         unless this was the last pass:
 *             R = A * X            via lis_matvec
 *             R[0 .. n-1] = B - R
 *
 * When USE_QUAD_PRECISION is defined and solver->precision is not
 * LIS_PRECISION_DEFAULT, the same sequence is carried out on the
 * (value, value_lo) pairs with the LIS_QUAD_* macros.
 *
 * Always returns LIS_SUCCESS; return codes of the called routines are not
 * inspected.
 */
LIS_INT lis_psolve_adds(LIS_SOLVER solver, LIS_VECTOR B, LIS_VECTOR X)
{
	LIS_INT i,pass,n,np,npass,ptype;
	LIS_SCALAR *b,*x,*w,*r,*rl;
	LIS_VECTOR W,R;
	LIS_PRECON precon;
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	precon = solver->precon;
	n      = precon->A->n;
	np     = precon->A->np;
	W      = precon->work[0];
	R      = precon->work[1];
	b      = B->value;
	x      = X->value;
	w      = W->value;
	r      = R->value;
	rl     = R->value_lo;
	npass  = solver->options[LIS_OPTIONS_ADDS_ITER];
	ptype  = solver->options[LIS_OPTIONS_PRECON];

	#ifdef USE_QUAD_PRECISION
	if( solver->precision==LIS_PRECISION_DEFAULT )
	{
	#endif
		lis_vector_set_all(0.0,X);
		lis_vector_copy(B,R);
		for(pass=0;pass<npass+1;pass++)
		{
			/* Clear the entries of R beyond the first n. */
			for(i=n;i<np;i++)
			{
				r[i] = 0.0;
			}

			lis_psolve_xxx[ptype](solver,R,W);

			#ifdef _OPENMP
			#pragma omp parallel for private(i)
			#endif
			for(i=0;i<n;i++)
			{
				x[i] += w[i];
			}

			/* No residual update is needed after the final correction. */
			if(pass==npass) break;

			lis_matvec(precon->A,X,R);
			#ifdef _OPENMP
			#pragma omp parallel for private(i)
			#endif
			for(i=0;i<n;i++)
			{
				r[i] = b[i] - r[i];
			}
		}
	#ifdef USE_QUAD_PRECISION
	}
	else
	{
		lis_vector_set_allex_nm(0.0,X);
		lis_vector_copyex_mm(B,R);
		for(pass=0;pass<npass+1;pass++)
		{
			/* Clear both parts of the entries of R beyond the first n. */
			for(i=n;i<np;i++)
			{
				r[i]  = 0.0;
				rl[i] = 0.0;
			}

			lis_psolve_xxx[ptype](solver,R,W);

			/* Quad-precision X += W. */
			for(i=0;i<n;i++)
			{
				#ifdef USE_SSE2
					LIS_QUAD_ADD_SSE2(X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],W->value[i],W->value_lo[i]);
				#else
					LIS_QUAD_ADD(X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],W->value[i],W->value_lo[i]);
				#endif
			}

			/* No residual update is needed after the final correction. */
			if(pass!=npass)
			{
				lis_matvec(precon->A,X,R);
				/* Quad-precision R = B - R. */
				for(i=0;i<n;i++)
				{
					#ifdef USE_SSE2
						LIS_QUAD_ADD_SSE2(R->value[i],R->value_lo[i],B->value[i],B->value_lo[i],-R->value[i],-R->value_lo[i]);
					#else
						LIS_QUAD_ADD(R->value[i],R->value_lo[i],B->value[i],B->value_lo[i],-R->value[i],-R->value_lo[i]);
					#endif
				}
			}
		}
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_vector_nrm2ex_mm - quad-precision norm of a vector stored as
 * (value, value_lo) pairs; the result is written to val->hi[0], val->lo[0].
 *
 * A quad-precision accumulator is built by applying the LIS_QUAD_FSA*
 * macros to every (x[i], xl[i]) pair, and LIS_QUAD_SQRT* is applied to the
 * total.
 * NOTE(review): the FSA macros presumably add the square of their last
 * operand pair to the accumulator - confirm against their definition.
 *
 * Scratch storage comes from vx->work:
 *   work[0..1] / work[2..3] : hi / lo parts of the two-lane partial sums
 *                             (serial USE_FMA2_SSE2 path)
 *   work[8], work[9]        : hi / lo of the local sum
 *   work[10], work[11]      : hi / lo of the sum returned by MPI_Allreduce
 *
 * With _OPENMP each thread accumulates into its own group of
 * LIS_VEC_TMP_PADD entries of lis_vec_tmp; the groups are summed serially
 * afterwards.  With USE_MPI the local sum is combined across vx->comm
 * before the square root is taken.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_nrm2ex_mm(LIS_VECTOR vx, LIS_QUAD_PTR *val)
{
	LIS_INT i,n;
	LIS_SCALAR *x,*xl;
	LIS_QUAD_PTR dotm2,dotm,tmpm;
	#ifdef _OPENMP
		LIS_INT is,ie,nprocs,my_rank;
		LIS_SCALAR *gt;
	#endif
	#ifdef USE_MPI
		MPI_Comm comm;
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = vx->n;
	x  = vx->value;
	xl = vx->value_lo;

	dotm2.hi = &vx->work[0];
	dotm2.lo = &vx->work[2];
	dotm.hi  = &vx->work[8];
	dotm.lo  = &vx->work[9];
	tmpm.hi  = &vx->work[10];
	tmpm.lo  = &vx->work[11];
	#ifdef USE_MPI
		comm = vx->comm;
	#endif

	#ifdef _OPENMP
		gt     = lis_vec_tmp;
		nprocs = omp_get_max_threads();
		#ifdef USE_SSE2
			#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
		#else
			#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
		#endif
		{
			/* Start of this thread's accumulator group inside gt. */
			LIS_SCALAR *acc;

			my_rank = omp_get_thread_num();
			LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
			acc = &gt[my_rank*LIS_VEC_TMP_PADD];
			#ifdef USE_FMA2_SSE2
				/* acc[0..1]: hi parts of two lanes, acc[2..3]: lo parts. */
				acc[0] = acc[1] = 0.0;
				acc[2] = acc[3] = 0.0;
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(i=is;i<ie-1;i+=2)
				{
					LIS_QUAD_FSA2_SSE2(acc[0],acc[2],acc[0],acc[2],x[i],xl[i]);
				}
				/* Fold the two lanes into (acc[0], acc[1]). */
				LIS_QUAD_ADD_SSE2(acc[0],acc[1],acc[0],acc[2],acc[1],acc[3]);
				/* Leftover element when the range length is odd. */
				for(;i<ie;i++)
				{
					LIS_QUAD_FSA_SSE2(acc[0],acc[1],acc[0],acc[1],x[i],xl[i]);
				}
			#else
				acc[0] = acc[1] = 0.0;
				#pragma cdir nodep
				for(i=is;i<ie;i++)
				{
					LIS_QUAD_FSA(acc[0],acc[1],acc[0],acc[1],x[i],xl[i]);
				}
			#endif
		}
		/* Serial reduction of the per-thread sums. */
		dotm.hi[0] = dotm.lo[0] = 0.0;
		for(i=0;i<nprocs;i++)
		{
			#ifdef USE_SSE2
				LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#else
				LIS_QUAD_ADD(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
			#endif
		}
	#else
		#ifdef USE_FMA2_SSE2
			dotm2.hi[0] = dotm2.hi[1] = 0.0;
			dotm2.lo[0] = dotm2.lo[1] = 0.0;
			for(i=0;i<n-1;i+=2)
			{
				LIS_QUAD_FSA2_SSE2(dotm2.hi[0],dotm2.lo[0],dotm2.hi[0],dotm2.lo[0],x[i],xl[i]);
			}
			/* Fold the two lanes into the local sum. */
			LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm2.hi[0],dotm2.lo[0],dotm2.hi[1],dotm2.lo[1]);
			/* Leftover element when n is odd. */
			for(;i<n;i++)
			{
				LIS_QUAD_FSA_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#else
			dotm.hi[0] = dotm.lo[0] = 0.0;
			#pragma cdir nodep
			for(i=0;i<n;i++)
			{
				LIS_QUAD_FSA(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
			}
		#endif
	#endif

	#ifdef USE_MPI
		/* One LIS_MPI_MSCALAR element starting at dotm.hi is reduced into tmpm.hi. */
		MPI_Allreduce(dotm.hi,tmpm.hi,1,LIS_MPI_MSCALAR,LIS_MPI_MSUM,comm);
		#ifdef USE_SSE2
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#else
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
		#endif
	#else
		#ifdef USE_SSE2
			LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#else
			LIS_QUAD_SQRT(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
		#endif
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}