/** Compute Open loop NGS mode wavefront error from mode vectors. */ double calc_rms(const dmat *mod, const dmat *mcc, int istep0){ double rms=0; for(long istep=istep0; istep<mod->ny; istep++){ rms+=dwdot(PCOL(mod,istep), mcc, PCOL(mod,istep)); } return rms/(mod->ny-istep0); }
/** test type I/II filter with ideal measurement to make sure it is implemented correctly. */ dmat* servo_test(dmat *input, double dt, int dtrat, dmat *sigma2n, dmat *gain){ if(input->ny==1){/*single mode. each column is for a mode.*/ input->ny=input->nx; input->nx=1; } int nmod=input->nx; dmat* pinput=input; dmat *merr=dnew(nmod,1); dcell *mreal=dcellnew(1,1); dmat *mres=dnew(nmod,input->ny); dmat *sigman=NULL; if(dnorm(sigma2n)>0){ sigman=dchol(sigma2n); } dcell *meas=dcellnew(1,1); dmat *noise=dnew(nmod, 1); SERVO_T *st2t=servo_new(NULL, NULL, 0, dt*dtrat, gain); rand_t rstat; seed_rand(&rstat, 1); dmat* pmres=mres; /*two step delay is ensured with the order of using, copy, acc*/ for(int istep=0; istep<input->ny; istep++){ memcpy(merr->p, PCOL(pinput,istep), nmod*sizeof(double)); dadd(&merr, 1, mreal->p[0], -1); memcpy(PCOL(pmres,istep),merr->p,sizeof(double)*nmod); if(istep % dtrat == 0){ dzero(meas->p[0]); } dadd(&meas->p[0], 1, merr, 1);/*average the error. */ dcellcp(&mreal, st2t->mint->p[0]); if((istep+1) % dtrat == 0){ if(dtrat!=1) dscale(meas->p[0], 1./dtrat); if(sigman){ drandn(noise, 1, &rstat); if(sigman->nx>0){ dmm(&meas->p[0], 1, sigman, noise, "nn", 1); }else{ dadd(&meas->p[0], 1, noise, sigman->p[0]); } } servo_filter(st2t, meas); } } dfree(sigman); dfree(merr); dcellfree(mreal); dcellfree(meas); servo_free(st2t); return mres; }
/* Add a PSD scaled by scale to another. The first column of each dmat is the frequency nu, and the second column is PSD. */ void add_psd2(dmat **pout, const dmat *in, double scale){ if(!*pout){ *pout=ddup(in); }else{ dmat *out=*pout; double *p1=PCOL(out,1); dmat *p2new=0; const long nx=out->nx; const double *p2=0; if(check_psd_match(out, in)){ p2=PCOL(in, 1); }else{ dmat *nu1=dsub(out,0, nx,0,1); p2new=dinterp1(in, 0, nu1, 1e-40); p2=PCOL(p2new,0); dfree(nu1); } for(long i=0; i<nx; i++){ p1[i]+=p2[i]*scale; } dfree(p2new); } }
/* * r[] is the residual vector distributed the same way as * matrix-vector product. */ static void redist_all_to_diag(int_t n, doublecomplex r[], Glu_persist_t *Glu_persist, LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[], doublecomplex work[]) { int_t i, ii, k, lk, lr, nsupers; int_t *ilsum, *xsup; int iam, knsupc, psrc, pkk; MPI_Status status; iam = grid->iam; nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; ilsum = Llu->ilsum; lr = 0; for (k = 0; k < nsupers; ++k) { pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); psrc = mv_sup_to_proc[k]; knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = ilsum[lk] + (lk+1)*XK_H; if ( iam == psrc ) { if ( iam != pkk ) { /* Send X component. */ MPI_Send( &r[lr], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, pkk, Xk, grid->comm ); } else { /* Local copy. */ for (i = 0; i < knsupc; ++i) work[i + ii] = r[i + lr]; } lr += knsupc; } else { if ( iam == pkk ) { /* Recv X component. */ MPI_Recv( &work[ii], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, psrc, Xk, grid->comm, &status ); } } } } /* REDIST_ALL_TO_DIAG */
int_t pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, int_t fst_row, int_t *ilsum, doublecomplex *x, ScalePermstruct_t *ScalePermstruct, Glu_persist_t *Glu_persist, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { /* * Purpose * ======= * Re-distribute B on the diagonal processes of the 2D process mesh. * * Note * ==== * This routine can only be called after the routine pxgstrs_init(), * in which the structures of the send and receive buffers are set up. * * Arguments * ========= * * B (input) doublecomplex* * The distributed right-hand side matrix of the possibly * equilibrated system. * * m_loc (input) int (local) * The local row dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * ldb (input) int (local) * Leading dimension of matrix B. * * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ilsum (input) int* (global) * Starting position of each supernode in a full array. * * x (output) doublecomplex* * The solution vector. It is valid only on the diagonal processes. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * grid (input) gridinfo_t* * The 2D process mesh. * * SOLVEstruct (input) SOLVEstruct_t* * Contains the information for the communication during the * solution phase. * * Return value * ============ * */ int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *perm_r, *perm_c; /* row and column permutation vectors */ int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; int_t *xsup, *supno; int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; int p, procs; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; xsup = Glu_persist->xsup; supno = Glu_persist->supno; SendCnt = gstrs_comm->B_to_X_SendCnt; SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; /* ------------------------------------------------------------ NOW COMMUNICATE THE ACTUAL DATA. ------------------------------------------------------------*/ k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; ptr_to_dbuf[p] = sdispls[p] * nrhs; } /* Copy the row indices and values to the send buffer. */ for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ gbi = BlockNum( irow ); p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ k = ptr_to_ibuf[p]; send_ibuf[k] = irow; k = ptr_to_dbuf[p]; RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ send_dbuf[k++] = B[i + j*ldb]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Communicate the (permuted) row indices. */ MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); /* Communicate the numerical values. */ MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* ------------------------------------------------------------ Copy buffer into X on the diagonal processes. ------------------------------------------------------------*/ ii = 0; for (p = 0; p < procs; ++p) { jj = rdispls_nrhs[p]; for (i = 0; i < RecvCnt[p]; ++i) { /* Only the diagonal processes do this; the off-diagonal processes have 0 RecvCnt. */ irow = recv_ibuf[ii]; /* The permuted row index. */ k = BlockNum( irow ); knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number. */ l = X_BLK( lk ); x[l - XK_H].r = k; /* Block number prepended in the header. */ x[l - XK_H].i = 0; irow = irow - FstBlockC(k); /* Relative row number in X-block */ RHS_ITERATE(j) { x[l + irow + j*knsupc] = recv_dbuf[jj++]; } ++ii; } } SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()"); #endif return 0; } /* pzReDistribute_B_to_X */
void pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, doublecomplex *B, int_t ldb, int nrhs, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int Pc, Pr, iam; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #if defined (ISEND_IRECV) || defined (BSEND) MPI_Request *send_req, recv_req; #endif /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for L-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerr_dist("PZGSTRS_BGLOBAL", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs_Bglobal()"); #endif /* Save the count to be altered so it can be used by subsequent call to PDGSTRS_BGLOBAL. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #if defined (ISEND_IRECV) || defined (BSEND) k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* * Copy B into X on the diagonal processes. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header. */ lsum[il - LSUM_H].i = 0; kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* Diagonal process. */ jj = X_BLK( lk ); x[jj - XK_H].r = k; /* Block number prepended in the header. */ x[jj - XK_H].i = 0; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ x[i + jj + j*knsupc] = B[i + ii + j*ldb]; } }
int_t pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, int_t fst_row, int_t *ilsum, doublecomplex *x, ScalePermstruct_t *ScalePermstruct, Glu_persist_t *Glu_persist, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *perm_r, *perm_c; /* row and column permutation vectors */ int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; int_t *xsup, *supno; int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; int p, procs; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; xsup = Glu_persist->xsup; supno = Glu_persist->supno; SendCnt = gstrs_comm->B_to_X_SendCnt; SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; /* ------------------------------------------------------------ NOW COMMUNICATE THE ACTUAL DATA. ------------------------------------------------------------*/ k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; ptr_to_dbuf[p] = sdispls[p] * nrhs; } /* Copy the row indices and values to the send buffer. */ for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ gbi = BlockNum( irow ); p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ k = ptr_to_ibuf[p]; send_ibuf[k] = irow; k = ptr_to_dbuf[p]; RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ send_dbuf[k++] = B[i + j*ldb]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Communicate the (permuted) row indices. */ MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); /* Communicate the numerical values. */ MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* ------------------------------------------------------------ Copy buffer into X on the diagonal processes. ------------------------------------------------------------*/ ii = 0; for (p = 0; p < procs; ++p) { jj = rdispls_nrhs[p]; for (i = 0; i < RecvCnt[p]; ++i) { /* Only the diagonal processes do this; the off-diagonal processes have 0 RecvCnt. */ irow = recv_ibuf[ii]; /* The permuted row index. */ k = BlockNum( irow ); knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number. */ l = X_BLK( lk ); x[l - XK_H].r = k; /* Block number prepended in the header. */ x[l - XK_H].i = 0; irow = irow - FstBlockC(k); /* Relative row number in X-block */ RHS_ITERATE(j) { x[l + irow + j*knsupc] = recv_dbuf[jj++]; } ++ii; } } SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()"); #endif return 0; } /* pzReDistribute_B_to_X */
/*! \brief * * <pre> * Purpose * ======= * Perform parallel triangular solves * U(k,:) := A(k,:) \ L(k,k). * Only the process column that owns block column *k* participates * in the work. * * Arguments * ========= * * m (input) int (global) * Number of rows in the matrix. * * k (input) int (global) * The row number of the block row to be factorized. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization; * See SuperLUStat_t structure defined in util.h. * </pre> */ static void pdgstrs2 /************************************************************************/ #ifdef _CRAY ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 ) #else ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat ) #endif { int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ int segsize; int_t nsupc; /* number of columns in the block */ int_t luptr, iukp, rukp; int_t b, gb, j, klst, knsupc, lk, nb; int_t *xsup = Glu_persist->xsup; int_t *usub; double *lusup, *uval; /* Quick return. */ lk = LBi( k, grid ); /* Local block number */ if ( !Llu->Unzval_br_ptr[lk] ) return; /* Initialization. */ iam = grid->iam; pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); klst = FstBlockC( k+1 ); knsupc = SuperSize( k ); usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ uval = Llu->Unzval_br_ptr[lk]; nb = usub[0]; iukp = BR_HEADER; rukp = 0; if ( iam == pkk ) { lk = LBj( k, grid ); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; } else { nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ lusup = Llu->Lval_buf_2[k%2]; } /* Loop through all the row blocks. */ for (b = 0; b < nb; ++b) { gb = usub[iukp]; nsupc = SuperSize( gb ); iukp += UB_DESCRIPTOR; /* Loop through all the segments in the block. */ for (j = 0; j < nsupc; ++j) { segsize = klst - usub[iukp++]; if ( segsize ) { /* Nonzero segment. */ luptr = (knsupc - segsize) * (nsupr + 1); #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #else dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #endif stat->ops[FACT] += segsize * (segsize + 1); rukp += segsize; } } } /* for b ... */ } /* PDGSTRS2 */
static void pdgstrs2 /************************************************************************/ #ifdef _CRAY ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 ) #else ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat ) #endif /* * Purpose * ======= * Perform parallel triangular solves * U(k,:) := A(k,:) \ L(k,k). * Only the process row that owns block row *k* participates * in the work. * * Arguments * ========= * * m (input) int (global) * Number of rows in the matrix. * * k (input) int (global) * The row number of the block row to be factorized. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization; * See SuperLUStat_t structure defined in util.h. * */ { int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ int segsize; int_t nsupc; /* number of columns in the block */ int_t luptr, iukp, rukp; int_t b, gb, j, klst, knsupc, lk, nb; int_t *xsup = Glu_persist->xsup; int_t *usub; double *lusup, *uval; /* Quick return. */ lk = LBi( k, grid ); /* Local block number */ if ( !Llu->Unzval_br_ptr[lk] ) return; /* Initialization. */ iam = grid->iam; pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); klst = FstBlockC( k+1 ); knsupc = SuperSize( k ); usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ uval = Llu->Unzval_br_ptr[lk]; nb = usub[0]; iukp = BR_HEADER; rukp = 0; if ( iam == pkk ) { lk = LBj( k, grid ); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; } else { nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ lusup = Llu->Lval_buf_2[k%2]; } /* Loop through all the row blocks. */ for (b = 0; b < nb; ++b) { gb = usub[iukp]; nsupc = SuperSize( gb ); iukp += UB_DESCRIPTOR; /* Loop through all the segments in the block. */ for (j = 0; j < nsupc; ++j) { segsize = klst - usub[iukp++]; if ( segsize ) { /* Nonzero segment. */ luptr = (knsupc - segsize) * (nsupr + 1); #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #elif defined (USE_VENDOR_BLAS) dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx, 1, 1, 1); #else dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #endif stat->ops[FACT] += segsize * (segsize + 1); rukp += segsize; } } } /* for b ... */ } /* PDGSTRS2 */
void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *x, int nrhs, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PDGSTRS1 solves a system of distributed linear equations * * op( sub(A) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PDGSTRF. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures to store L and U factors, * and the permutation vectors. * See superlu_ddefs.h for the definition of 'LUstruct_t' structure. * * grid (input) gridinfo_t* * The 2D process mesh. * * x (input/output) double* * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * NOTE: the right-hand side matrix is already distributed on * the diagonal processes. * * nrhs (input) int (global) * Number of right-hand sides. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves; * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double alpha = 1.0; double *lsum; /* Local running sum of the updates to B-components */ double *lusup, *dest; double *recvbuf, *tempv; double *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; double **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for L-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -8; if ( *info ) { pxerbla("PDGSTRS1", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ Llu->SolveMsgSent = 0; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrs1()"); #endif /* Save the count to be altered so it can be used by subsequent call to PDGSTRS1. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Compute ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) ) ABORT("Calloc fails for lsum[]."); maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* * Prepended the block number in the header for lsum[]. */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H] = k; } } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( !frecv[lk] && !fmod[lk] ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* * Compute the internal nodes asynchronously by all processes. */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = *recvbuf; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) x[i + ii + j*knsupc] += tempv[i + j*knsupc]; if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel>=2 ) if ( !iam ) printf("\n.. After L-solve: y =\n"); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); } MPI_Barrier( grid->comm ); } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PDGSTRS1. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; } }
int_t dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, gridinfo_t *grid, int_t *colptr[], int_t *rowind[], double *a[]) { /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * Purpose * ======= * Re-distribute A on the 2D process mesh. * * Arguments * ========= * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * grid (input) gridinfo_t* * The 2D process mesh. * * colptr (output) int* * * rowind (output) int* * * a (output) double* * * Return value * ============ * */ NRformat_loc *Astore; int_t *perm_r; /* row permutation vector */ int_t *perm_c; /* column permutation vector */ int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; int_t nnz_loc; /* number of local nonzeros */ int_t nnz_remote; /* number of remote nonzeros to be sent */ int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; int_t *ia, *ja, **ia_send, *index, *itemp; int_t *ptr_to_send; double *aij, **aij_send, *nzval, *dtemp; double *nzval_a; int iam, it, p, procs; MPI_Request *send_req; MPI_Status status; /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter dReDistribute_A()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; nnzToRecv = intCalloc_dist(2*procs); nnzToSend = nnzToRecv + procs; /* ------------------------------------------------------------ COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); ++nnzToSend[p]; } } /* All-to-all communication */ MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, grid->comm); maxnnzToRecv = 0; nnz_loc = SendCnt = RecvCnt = 0; for (p = 0; p < procs; ++p) { if ( p != iam ) { SendCnt += nnzToSend[p]; RecvCnt += nnzToRecv[p]; maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); } else { nnz_loc += nnzToRecv[p]; /*assert(nnzToSend[p] == nnzToRecv[p]);*/ } } k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ /* Allocate space for storing the triplets after redistribution. */ if ( !(ia = intMalloc_dist(2*k)) ) ABORT("Malloc fails for ia[]."); ja = ia + k; if ( !(aij = doubleMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) ABORT("Malloc fails for ia_send[]."); if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) ABORT("Malloc fails for aij_send[]."); if ( !(index = intMalloc_dist(2*SendCnt)) ) ABORT("Malloc fails for index[]."); if ( !(nzval = doubleMalloc_dist(SendCnt)) ) ABORT("Malloc fails for nzval[]."); if ( !(ptr_to_send = intCalloc_dist(procs)) ) ABORT("Malloc fails for ptr_to_send[]."); if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) ABORT("Malloc fails for itemp[]."); if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) ABORT("Malloc fails for dtemp[]."); for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } } /* if procs > 1 */ if ( !(*colptr = intCalloc_dist(n+1)) ) ABORT("Malloc fails for *colptr[]."); /* ------------------------------------------------------------ LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. THIS ACCOUNTS FOR THE SECOND PASS OF A. ------------------------------------------------------------*/ nnz_loc = 0; /* Reset the local nonzero count. */ nzval_a = Astore->nzval; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); if ( p != iam ) { /* remote */ k = ptr_to_send[p]; ia_send[p][k] = irow; ia_send[p][k + nnzToSend[p]] = jcol; aij_send[p][k] = nzval_a[j]; ++ptr_to_send[p]; } else { /* local */ ia[nnz_loc] = irow; ja[nnz_loc] = jcol; aij[nnz_loc] = nzval_a[j]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } /* ------------------------------------------------------------ PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. NOTE: Can possibly use MPI_Alltoallv. ------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); it = nnzToSend[p]; MPI_Isend( aij_send[p], it, MPI_DOUBLE, p, iam+procs, grid->comm, &send_req[procs+p] ); } } for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, grid->comm, &status ); for (i = 0; i < nnzToRecv[p]; ++i) { ia[nnz_loc] = itemp[i]; jcol = itemp[i + nnzToRecv[p]]; /*assert(jcol<n);*/ ja[nnz_loc] = jcol; aij[nnz_loc] = dtemp[i]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } for (p = 0; p < procs; ++p) { if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } } /* ------------------------------------------------------------ DEALLOCATE TEMPORARY STORAGE ------------------------------------------------------------*/ SUPERLU_FREE(nnzToRecv); if ( procs > 1 ) { SUPERLU_FREE(send_req); SUPERLU_FREE(ia_send); SUPERLU_FREE(aij_send); SUPERLU_FREE(index); SUPERLU_FREE(nzval); SUPERLU_FREE(ptr_to_send); SUPERLU_FREE(itemp); SUPERLU_FREE(dtemp); } /* ------------------------------------------------------------ CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. ------------------------------------------------------------*/ if ( !(*rowind = intMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *rowind[]."); if ( !(*a = doubleMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *a[]."); /* Initialize the array of column pointers */ k = 0; jsize = (*colptr)[0]; (*colptr)[0] = 0; for (j = 1; j < n; ++j) { k += jsize; jsize = (*colptr)[j]; (*colptr)[j] = k; } /* Copy the triplets into the column oriented storage */ for (i = 0; i < nnz_loc; ++i) { j = ja[i]; k = (*colptr)[j]; (*rowind)[k] = ia[i]; (*a)[k] = aij[i]; ++(*colptr)[j]; } /* Reset the column pointers to the beginning of each column */ for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; (*colptr)[0] = 0; SUPERLU_FREE(ia); SUPERLU_FREE(aij); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit dReDistribute_A()"); #endif } /* dReDistribute_A */
/** Time domain physical simulation. noisy: - 0: no noise at all; - 1: poisson and read out noise. - 2: only poisson noise. */ dmat *skysim_sim(dmat **mresout, const dmat *mideal, const dmat *mideal_oa, double ngsol, ASTER_S *aster, const POWFS_S *powfs, const PARMS_S *parms, int idtratc, int noisy, int phystart){ int dtratc=0; if(!parms->skyc.multirate){ dtratc=parms->skyc.dtrats->p[idtratc]; } int hasphy; if(phystart>-1 && phystart<aster->nstep){ hasphy=1; }else{ hasphy=0; } const int nmod=mideal->nx; dmat *res=dnew(6,1);/*Results. 1-2: NGS and TT modes., 3-4:On axis NGS and TT modes, 4-6: On axis NGS and TT wihtout considering un-orthogonality.*/ dmat *mreal=NULL;/*modal correction at this step. */ dmat *merr=dnew(nmod,1);/*modal error */ dcell *merrm=dcellnew(1,1);dcell *pmerrm=NULL; const int nstep=aster->nstep?aster->nstep:parms->maos.nstep; dmat *mres=dnew(nmod,nstep); dmat* rnefs=parms->skyc.rnefs; dcell *zgradc=dcellnew3(aster->nwfs, 1, aster->ngs, 0); dcell *gradout=dcellnew3(aster->nwfs, 1, aster->ngs, 0); dmat *gradsave=0; if(parms->skyc.dbg){ gradsave=dnew(aster->tsa*2,nstep); } SERVO_T *st2t=0; kalman_t *kalman=0; dcell *mpsol=0; dmat *pgm=0; dmat *dtrats=0; int multirate=parms->skyc.multirate; if(multirate){ kalman=aster->kalman[0]; dtrats=aster->dtrats; }else{ if(parms->skyc.servo>0){ const double dtngs=parms->maos.dt*dtratc; st2t=servo_new(merrm, NULL, 0, dtngs, aster->gain->p[idtratc]); pgm=aster->pgm->p[idtratc]; }else{ kalman=aster->kalman[idtratc]; } } if(kalman){ kalman_init(kalman); mpsol=dcellnew(aster->nwfs, 1); //for psol grad. } const long nwvl=parms->maos.nwvl; dcell **psf=0, **mtche=0, **ints=0; ccell *wvf=0,*wvfc=0, *otf=0; if(hasphy){ psf=mycalloc(aster->nwfs,dcell*); wvf=ccellnew(aster->nwfs,1); wvfc=ccellnew(aster->nwfs,1); mtche=mycalloc(aster->nwfs,dcell*); ints=mycalloc(aster->nwfs,dcell*); otf=ccellnew(aster->nwfs,1); for(long iwfs=0; iwfs<aster->nwfs; iwfs++){ const int ipowfs=aster->wfs[iwfs].ipowfs; const long ncomp=parms->maos.ncomp[ipowfs]; const long nsa=parms->maos.nsa[ipowfs]; wvf->p[iwfs]=cnew(ncomp,ncomp); wvfc->p[iwfs]=NULL; psf[iwfs]=dcellnew(nsa,nwvl); //cfft2plan(wvf->p[iwfs], -1); if(parms->skyc.multirate){ mtche[iwfs]=aster->wfs[iwfs].pistat->mtche[(int)aster->idtrats->p[iwfs]]; }else{ mtche[iwfs]=aster->wfs[iwfs].pistat->mtche[idtratc]; } otf->p[iwfs]=cnew(ncomp,ncomp); //cfft2plan(otf->p[iwfs],-1); //cfft2plan(otf->p[iwfs],1); ints[iwfs]=dcellnew(nsa,1); int pixpsa=parms->skyc.pixpsa[ipowfs]; for(long isa=0; isa<nsa; isa++){ ints[iwfs]->p[isa]=dnew(pixpsa,pixpsa); } } } for(int irep=0; irep<parms->skyc.navg; irep++){ if(kalman){ kalman_init(kalman); }else{ servo_reset(st2t); } dcellzero(zgradc); dcellzero(gradout); if(ints){ for(int iwfs=0; iwfs<aster->nwfs; iwfs++){ dcellzero(ints[iwfs]); } } for(int istep=0; istep<nstep; istep++){ memcpy(merr->p, PCOL(mideal,istep), nmod*sizeof(double)); dadd(&merr, 1, mreal, -1);/*form NGS mode error; */ memcpy(PCOL(mres,istep),merr->p,sizeof(double)*nmod); if(mpsol){//collect averaged modes for PSOL. for(long iwfs=0; iwfs<aster->nwfs; iwfs++){ dadd(&mpsol->p[iwfs], 1, mreal, 1); } } pmerrm=0; if(istep>=parms->skyc.evlstart){/*performance evaluation*/ double res_ngs=dwdot(merr->p,parms->maos.mcc,merr->p); if(res_ngs>ngsol*100){ dfree(res); res=NULL; break; } { res->p[0]+=res_ngs; res->p[1]+=dwdot2(merr->p,parms->maos.mcc_tt,merr->p); double dot_oa=dwdot(merr->p, parms->maos.mcc_oa, merr->p); double dot_res_ideal=dwdot(merr->p, parms->maos.mcc_oa, PCOL(mideal,istep)); double dot_res_oa=0; for(int imod=0; imod<nmod; imod++){ dot_res_oa+=merr->p[imod]*IND(mideal_oa,imod,istep); } res->p[2]+=dot_oa-2*dot_res_ideal+2*dot_res_oa; res->p[4]+=dot_oa; } { double dot_oa_tt=dwdot2(merr->p, parms->maos.mcc_oa_tt, merr->p); /*Notice that mcc_oa_tt2 is 2x5 marix. */ double dot_res_ideal_tt=dwdot(merr->p, parms->maos.mcc_oa_tt2, PCOL(mideal,istep)); double dot_res_oa_tt=0; for(int imod=0; imod<2; imod++){ dot_res_oa_tt+=merr->p[imod]*IND(mideal_oa,imod,istep); } res->p[3]+=dot_oa_tt-2*dot_res_ideal_tt+2*dot_res_oa_tt; res->p[5]+=dot_oa_tt; } }//if evl if(istep<phystart || phystart<0){ /*Ztilt, noise free simulation for acquisition. */ dmm(&zgradc->m, 1, aster->gm, merr, "nn", 1);/*grad due to residual NGS mode. */ for(int iwfs=0; iwfs<aster->nwfs; iwfs++){ const int ipowfs=aster->wfs[iwfs].ipowfs; const long ng=parms->maos.nsa[ipowfs]*2; for(long ig=0; ig<ng; ig++){ zgradc->p[iwfs]->p[ig]+=aster->wfs[iwfs].ztiltout->p[istep*ng+ig]; } } for(int iwfs=0; iwfs<aster->nwfs; iwfs++){ int dtrati=(multirate?(int)dtrats->p[iwfs]:dtratc); if((istep+1) % dtrati==0){ dadd(&gradout->p[iwfs], 0, zgradc->p[iwfs], 1./dtrati); dzero(zgradc->p[iwfs]); if(noisy){ int idtrati=(multirate?(int)aster->idtrats->p[iwfs]:idtratc); dmat *nea=aster->wfs[iwfs].pistat->sanea->p[idtrati]; for(int i=0; i<nea->nx; i++){ gradout->p[iwfs]->p[i]+=nea->p[i]*randn(&aster->rand); } } pmerrm=merrm;//record output. } } }else{ /*Accumulate PSF intensities*/ for(long iwfs=0; iwfs<aster->nwfs; iwfs++){ const double thetax=aster->wfs[iwfs].thetax; const double thetay=aster->wfs[iwfs].thetay; const int ipowfs=aster->wfs[iwfs].ipowfs; const long nsa=parms->maos.nsa[ipowfs]; ccell* wvfout=aster->wfs[iwfs].wvfout[istep]; for(long iwvl=0; iwvl<nwvl; iwvl++){ double wvl=parms->maos.wvl[iwvl]; for(long isa=0; isa<nsa; isa++){ ccp(&wvfc->p[iwfs], IND(wvfout,isa,iwvl)); /*Apply NGS mode error to PSF. */ ngsmod2wvf(wvfc->p[iwfs], wvl, merr, powfs+ipowfs, isa, thetax, thetay, parms); cembedc(wvf->p[iwfs],wvfc->p[iwfs],0,C_FULL); cfft2(wvf->p[iwfs],-1); /*peak in corner. */ cabs22d(&psf[iwfs]->p[isa+nsa*iwvl], 1., wvf->p[iwfs], 1.); }/*isa */ }/*iwvl */ }/*iwfs */ /*Form detector image from accumulated PSFs*/ double igrad[2]; for(long iwfs=0; iwfs<aster->nwfs; iwfs++){ int dtrati=dtratc, idtrat=idtratc; if(multirate){//multirate idtrat=aster->idtrats->p[iwfs]; dtrati=dtrats->p[iwfs]; } if((istep+1) % dtrati == 0){/*has output */ dcellzero(ints[iwfs]); const int ipowfs=aster->wfs[iwfs].ipowfs; const long nsa=parms->maos.nsa[ipowfs]; for(long isa=0; isa<nsa; isa++){ for(long iwvl=0; iwvl<nwvl; iwvl++){ double siglev=aster->wfs[iwfs].siglev->p[iwvl]; ccpd(&otf->p[iwfs],psf[iwfs]->p[isa+nsa*iwvl]); cfft2i(otf->p[iwfs], 1); /*turn to OTF, peak in corner */ ccwm(otf->p[iwfs], powfs[ipowfs].dtf[iwvl].nominal); cfft2(otf->p[iwfs], -1); dspmulcreal(ints[iwfs]->p[isa]->p, powfs[ipowfs].dtf[iwvl].si, otf->p[iwfs]->p, siglev); } /*Add noise and apply matched filter. */ #if _OPENMP >= 200805 #pragma omp critical #endif switch(noisy){ case 0:/*no noise at all. */ break; case 1:/*both poisson and read out noise. */ { double bkgrnd=aster->wfs[iwfs].bkgrnd*dtrati; addnoise(ints[iwfs]->p[isa], &aster->rand, bkgrnd, bkgrnd, 0,0,IND(rnefs,idtrat,ipowfs)); } break; case 2:/*there is still poisson noise. */ addnoise(ints[iwfs]->p[isa], &aster->rand, 0, 0, 0,0,0); break; default: error("Invalid noisy\n"); } igrad[0]=0; igrad[1]=0; double pixtheta=parms->skyc.pixtheta[ipowfs]; if(parms->skyc.mtch){ dmulvec(igrad, mtche[iwfs]->p[isa], ints[iwfs]->p[isa]->p, 1); } if(!parms->skyc.mtch || fabs(igrad[0])>pixtheta || fabs(igrad[1])>pixtheta){ if(!parms->skyc.mtch){ warning2("fall back to cog\n"); }else{ warning_once("mtch is out of range\n"); } dcog(igrad, ints[iwfs]->p[isa], 0, 0, 0, 3*IND(rnefs,idtrat,ipowfs), 0); igrad[0]*=pixtheta; igrad[1]*=pixtheta; } gradout->p[iwfs]->p[isa]=igrad[0]; gradout->p[iwfs]->p[isa+nsa]=igrad[1]; }/*isa */ pmerrm=merrm; dcellzero(psf[iwfs]);/*reset accumulation.*/ }/*if iwfs has output*/ }/*for wfs*/ }/*if phystart */ //output to mreal after using it to ensure two cycle delay. if(st2t){//Type I or II control. if(st2t->mint->p[0]){//has output. dcp(&mreal, st2t->mint->p[0]->p[0]); } }else{//LQG control kalman_output(kalman, &mreal, 0, 1); } if(kalman){//LQG control int indk=0; //Form PSOL grads and obtain index to LQG M for(int iwfs=0; iwfs<aster->nwfs; iwfs++){ int dtrati=(multirate?(int)dtrats->p[iwfs]:dtratc); if((istep+1) % dtrati==0){ indk|=1<<iwfs; dmm(&gradout->p[iwfs], 1, aster->g->p[iwfs], mpsol->p[iwfs], "nn", 1./dtrati); dzero(mpsol->p[iwfs]); } } if(indk){ kalman_update(kalman, gradout->m, indk-1); } }else if(st2t){ if(pmerrm){ dmm(&merrm->p[0], 0, pgm, gradout->m, "nn", 1); } servo_filter(st2t, pmerrm);//do even if merrm is zero. to simulate additional latency } if(parms->skyc.dbg){ memcpy(PCOL(gradsave, istep), gradout->m->p, sizeof(double)*gradsave->nx); } }/*istep; */ } if(parms->skyc.dbg){ int dtrati=(multirate?(int)dtrats->p[0]:dtratc); writebin(gradsave,"%s/skysim_grads_aster%d_dtrat%d",dirsetup, aster->iaster,dtrati); writebin(mres,"%s/skysim_sim_mres_aster%d_dtrat%d",dirsetup,aster->iaster,dtrati); } dfree(mreal); dcellfree(mpsol); dfree(merr); dcellfree(merrm); dcellfree(zgradc); dcellfree(gradout); dfree(gradsave); if(hasphy){ dcellfreearr(psf, aster->nwfs); dcellfreearr(ints, aster->nwfs); ccellfree(wvf); ccellfree(wvfc); ccellfree(otf); free(mtche); } servo_free(st2t); /*dfree(mres); */ if(mresout) { *mresout=mres; }else{ dfree(mres); } dscale(res, 1./((nstep-parms->skyc.evlstart)*parms->skyc.navg)); return res; }
int_t pddistribute(fact_t fact, int_t n, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * * Purpose * ======= * Distribute the matrix onto the 2D process mesh. * * Arguments * ========= * * fact (input) fact_t * Specifies whether or not the L and U structures will be re-used. * = SamePattern_SameRowPerm: L and U structures are input, and * unchanged on exit. * = DOFACT or SamePattern: L and U structures are computed and output. * * n (input) int * Dimension of the matrix. * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * LUstruct (input) LUstruct_t* * Data structures for L and U factors. * * grid (input) gridinfo_t* * The 2D process mesh. * * Return value * ============ * > 0, working storage required (in bytes). * */ { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; #if 0 NCPformat *Astore; #else /* XSL ==> */ NRformat_loc *Astore; #endif double *a; int_t *asub, *xa; #if 0 int_t *xa_begin, *xa_end; #endif int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int_t *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t mem_use = 0, iword, dword; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = (NRformat_loc *) A->Store; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pddistribute()"); #endif dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, grid, &xa, &asub, &a); if ( fact == SamePattern_SameRowPerm ) { #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); for (lb = 0; lb < nrbu; ++lb) Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Unzval[]. */ for (lb = 0; lb < nrbu; ++lb) { index = Ufstnz_br_ptr[lb]; if ( index && index[Urb_indptr[lb]] == jb ) { uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb] + UB_DESCRIPTOR; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj = 0, dense_col = dense; jj < nsupc; ++jj) { j = index[len+jj]; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow+i]; dense_col[irow+i] = zero; } dense_col += ldaspa; } Urb_indptr[lb] += UB_DESCRIPTOR + nsupc; } /* if index != NULL */ } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* ------------------------------------------------------------ FIRST TIME CREATING THE L AND U DATA STRUCTURES. ------------------------------------------------------------*/ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We first need to set up the L and U data structures and then * propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = intCalloc_dist(nsupers)) ) ABORT("Calloc fails for ToRecv[]."); k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index = intMalloc_dist(j)) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use = k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = intCalloc_dist(k)) ) ABORT("Malloc fails for ToSendD[]."); if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ } /* for lb ... */ SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Ucbs); #if ( PRNTlevel>=1 ) mem_use -= 2*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); /* ------------------------------------------------ */ #if ( PRNTlevel>=1 ) mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; /* -------------------------------------------------------------- */ #if ( PRNTlevel>=1 ) mem_use += 4*k*sizeof(int_t*) + 2*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow ) bsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; len = Urb_indptr[lb]; for (k = 0; k < nsupc; ++k) index[len+k] = FstBlockC( gb+1 ); if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; } /* if myrow == pr ... */ } /* for i ... */ } /* for j ... */ /* Figure out how many nonzeros in each block, and gather the initial values of A from SPA into Uval. */ for (lb = 0; lb < nrbu; ++lb) { if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb]; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) { j = index[len+jj]; /* First nonzero in segment. */ bnnz += k - j; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow + i]; dense_col[irow + i] = zero; } dense_col += ldaspa; } index[len-1] = bnnz; /* Set block length in Descriptor */ Urb_indptr[lb] += nsupc; } } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow ) fsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(len*nsupc))) { fprintf(stderr, "col block %d ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ SUPERLU_FREE(xa); SUPERLU_FREE(asub); SUPERLU_FREE(a); #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit pddistribute()"); #endif return (mem_use); } /* PDDISTRIBUTE */
/*! \brief * * <pre> * Purpose * ======= * Set up the communication pattern for the triangular solution. * * Arguments * ========= * * n (input) int (global) * The dimension of the linear system. * * m_loc (input) int (local) * The local row dimension of the distributed input matrix. * * nrhs (input) int (global) * Number of right-hand sides. * * fst_row (input) int (global) * The row number of matrix B's first row in the global matrix. * * perm_r (input) int* (global) * The row permutation vector. * * perm_c (input) int* (global) * The column permutation vector. * * grid (input) gridinfo_t* * The 2D process mesh. * </pre> */ int_t pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, int_t perm_r[], int_t perm_c[], gridinfo_t *grid, Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct) { int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; int_t *row_to_proc; int_t i, gbi, k, l, num_diag_procs, *diag_procs; int_t irow, lk, q, knsupc, nsupers, *xsup, *supno; int iam, p, pkk, procs; pxgstrs_comm_t *gstrs_comm; procs = grid->nprow * grid->npcol; iam = grid->iam; gstrs_comm = SOLVEstruct->gstrs_comm; xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = Glu_persist->supno[n-1] + 1; row_to_proc = SOLVEstruct->row_to_proc; /* ------------------------------------------------------------ SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. ------------------------------------------------------------*/ if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) ABORT("Malloc fails for B_to_X_itemp[]."); SendCnt = itemp; SendCnt_nrhs = itemp + procs; RecvCnt = itemp + 2*procs; RecvCnt_nrhs = itemp + 3*procs; sdispls = itemp + 4*procs; sdispls_nrhs = itemp + 5*procs; rdispls = itemp + 6*procs; rdispls_nrhs = itemp + 7*procs; /* Count the number of elements to be sent to each diagonal process.*/ for (p = 0; p < procs; ++p) SendCnt[p] = 0; for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ gbi = BlockNum( irow ); p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ ++SendCnt[p]; } /* Set up the displacements for alltoall. */ MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; for (p = 1; p < procs; ++p) { sdispls[p] = sdispls[p-1] + SendCnt[p-1]; rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; } for (p = 0; p < procs; ++p) { SendCnt_nrhs[p] = SendCnt[p] * nrhs; sdispls_nrhs[p] = sdispls[p] * nrhs; RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; rdispls_nrhs[p] = rdispls[p] * nrhs; } /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ gstrs_comm->B_to_X_SendCnt = SendCnt; /* ------------------------------------------------------------ SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. ------------------------------------------------------------*/ /* This is freed in pxgstrs_finalize(). */ if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) ABORT("Malloc fails for X_to_B_itemp[]."); SendCnt = itemp; SendCnt_nrhs = itemp + procs; RecvCnt = itemp + 2*procs; RecvCnt_nrhs = itemp + 3*procs; sdispls = itemp + 4*procs; sdispls_nrhs = itemp + 5*procs; rdispls = itemp + 6*procs; rdispls_nrhs = itemp + 7*procs; /* Count the number of X entries to be sent to each process.*/ for (p = 0; p < procs; ++p) SendCnt[p] = 0; num_diag_procs = SOLVEstruct->num_diag_procs; diag_procs = SOLVEstruct->diag_procs; for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ pkk = diag_procs[p]; if ( iam == pkk ) { for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number */ irow = FstBlockC( k ); for (i = 0; i < knsupc; ++i) { #if 0 q = row_to_proc[inv_perm_c[irow]]; #else q = row_to_proc[irow]; #endif ++SendCnt[q]; ++irow; } } } } MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; sdispls_nrhs[0] = rdispls_nrhs[0] = 0; SendCnt_nrhs[0] = SendCnt[0] * nrhs; RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; for (p = 1; p < procs; ++p) { sdispls[p] = sdispls[p-1] + SendCnt[p-1]; rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; sdispls_nrhs[p] = sdispls[p] * nrhs; rdispls_nrhs[p] = rdispls[p] * nrhs; SendCnt_nrhs[p] = SendCnt[p] * nrhs; RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; } /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ gstrs_comm->X_to_B_SendCnt = SendCnt; if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) ABORT("Malloc fails for ptr_to_ibuf[]."); gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; } /* PXGSTRS_INIT */
/*! \brief * * <pre> * Purpose * ======= * Panel factorization -- block column k * * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the column processes that own block column *k* participate * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * U_diag_blk_send_req (input/output) MPI_Request* * List of send requests to send down the diagonal block of U. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * </pre> */ static void pdgstrf2 ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, SuperLUStat_t *stat, int* info ) { int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst, u_diag_cnt; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup, temp; double *ujrow, *ublk_ptr; /* pointer to the U block */ double alpha = -1, zero = 0.0; int_t Pr; MPI_Status status; MPI_Comm comm = (grid->cscp).comm; /* Quick return. */ *info = 0; /* Initialization. */ iam = grid->iam; Pr = grid->nprow; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ublk_ptr = ujrow = Llu->ujrow; luptr = 0; /* Point to the diagonal entries. */ cols_left = nsupc; /* supernode size */ u_diag_cnt = 0; if ( iam == pkk ) { /* diagonal process */ if ( U_diag_blk_send_req && U_diag_blk_send_req[krow] ) { /* There are pending sends - wait for all Isend to complete */ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Wait(U_diag_blk_send_req + pr, &status); } for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ /* Diagonal pivot */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the new diagonal entry with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */ if ( ujrow[0] == zero ) { /* Test for singularity. */ *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } /* Rank-1 update of the trailing submatrix. */ if ( --cols_left ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else dger_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr + 1; /* move to next column */ } /* for column j ... */ if ( U_diag_blk_send_req && iam == pkk ) { /* Send the U block */ /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Isend(ublk_ptr, u_diag_cnt, MPI_DOUBLE, pr, ((k<<2)+2)%NTAGS, comm, U_diag_blk_send_req + pr); U_diag_blk_send_req[krow] = 1; /* flag outstanding Isend */ } } else { /* non-diagonal process */ /* Receive the diagonal block of U */ MPI_Recv(ublk_ptr, (nsupc*(nsupc+1))>>1, MPI_DOUBLE, krow, ((k<<2)+2)%NTAGS, comm, &status); for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ u_diag_cnt += cols_left; if ( !lusup ) { /* empty block column */ --cols_left; if ( ujrow[0] == zero ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == zero ) { *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } /* Rank-1 update of the trailing submatrix. */ if ( --cols_left ) { #ifdef _CRAY SGER(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else dger_(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr; /* move to next column */ } /* for column j ... */ } /* end if pkk ... */ } /* PDGSTRF2 */
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PZGSTRS solves a system of distributed linear equations * A*X = B with a general N-by-N matrix A using the LU factorization * computed by PZGSTRF. * If the equilibration, and row and column permutations were performed, * the LU factorization was performed for A1 where * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * and the linear system solved is * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and * the permutation to B1 by Pc*Pr is applied internally in this routine. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from PZGSTRF for * the possibly scaled and permuted matrix A. * See superlu_zdefs.h for the definition of 'LUstruct_t'. * A may be scaled and permuted into A1, so that * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_defs.h for the definition of 'gridinfo_t'. * * B (input/output) doublecomplex* * On entry, the distributed right-hand side matrix of the possibly * equilibrated system. That is, B may be overwritten by diag(R)*B. * On exit, the distributed solution matrix Y of the possibly * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, * and X is the solution of the original system. * * m_loc (input) int (local) * The local row dimension of matrix B. * * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * SOLVEstruct (output) SOLVEstruct_t* (global) * Contains the information for the communication during the * solution phase. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
static void pdgstrf2 /************************************************************************/ ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, SuperLUStat_t *stat, int* info ) /* * Purpose * ======= * * Panel factorization -- block column k * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the column processes that owns block column *k* participate * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * U_diag_blk_send_req (input/output) MPI_Request* * List of send requests to send down the diagonal block of U. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst, u_diag_cnt; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; int_t Pr; MPI_Status status; MPI_Comm comm = (grid->cscp).comm; double *lusup, temp; double *ujrow, *ublk_ptr; /* pointer to the U block */ double alpha = -1; *info = 0; /* Quick return. */ /* Initialization. */ iam = grid->iam; Pr = grid->nprow; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ublk_ptr = ujrow = Llu->ujrow; luptr = 0; /* point to the diagonal entries. */ cols_left = nsupc; /* supernode size */ u_diag_cnt = 0; if ( iam == pkk ) { /* diagonal process */ if ( U_diag_blk_send_req && U_diag_blk_send_req[krow] ) { /* There are pending sends - wait for all Isend to complete */ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Wait(U_diag_blk_send_req + pr, &status); } for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ /* Diagonal pivot */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the new diagonal entry with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */ if ( ujrow[0] == 0.0 ) { /* Test for singularity. */ *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } /* Rank-1 update of the trailing submatrix. */ if ( --cols_left ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else dger_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr + 1; /* move to next column */ } /* for column j ... */ if ( U_diag_blk_send_req && iam == pkk ) { /* Send the U block */ /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Isend(ublk_ptr, u_diag_cnt, MPI_DOUBLE, pr, ((k<<2)+2)%NTAGS, comm, U_diag_blk_send_req + pr); U_diag_blk_send_req[krow] = 1; /* flag outstanding Isend */ } } else { /* non-diagonal process */ /* Receive the diagonal block of U */ MPI_Recv(ublk_ptr, (nsupc*(nsupc+1))>>1, MPI_DOUBLE, krow, ((k<<2)+2)%NTAGS, comm, &status); for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ u_diag_cnt += cols_left; if ( !lusup ) { /* empty block column */ --cols_left; if ( ujrow[0] == 0.0 ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == 0.0 ) { *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } /* Rank-1 update of the trailing submatrix. */ if ( --cols_left ) { #ifdef _CRAY SGER(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else dger_(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr; /* move to next column */ } /* for column j ... */ } /* end if pkk ... */ } /* PDGSTRF2 */
static void pdgstrf2 /************************************************************************/ ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, int* info ) /* * Purpose * ======= * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the process column that owns block column *k* participates * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { int c, iam, l, pkk; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup, temp; double *ujrow; double alpha = -1; *info = 0; /* Quick return. */ /* Initialization. */ iam = grid->iam; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ujrow = Llu->ujrow; luptr = 0; /* Point to the diagonal entries. */ c = nsupc; for (j = 0; j < jlst - jfst; ++j) { /* Broadcast the j-th row (nsupc - j) elements to the process column. */ if ( iam == pkk ) { /* Diagonal process. */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the replaced diagonal with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; } #if 0 dbcast_col(ujrow, c, pkk, UjROW, grid, &c); #else MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, grid, COMM_COLUMN, &c);*/ #endif #if ( DEBUGlevel>=2 ) if ( k == 3329 && j == 2 ) { if ( iam == pkk ) { printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); } else { printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); } } #endif if ( !lusup ) { /* Empty block column. */ --c; if ( ujrow[0] == 0.0 ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == 0.0 ) { *info = j+jfst+1; } else { /* Scale the j-th column of the matrix. */ temp = 1.0 / ujrow[0]; if ( iam == pkk ) { for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } else { for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } } /* Rank-1 update of the trailing submatrix. */ if ( --c ) { if ( iam == pkk ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #elif defined (USE_VENDOR_BLAS) dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else hypre_F90_NAME_BLAS(dger,DGER)(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * c; } else { #ifdef _CRAY SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #elif defined (USE_VENDOR_BLAS) dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else hypre_F90_NAME_BLAS(dger,DGER)(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * c; } } /* Move to the next column. */ if ( iam == pkk ) luptr += nsupr + 1; else luptr += nsupr; } /* for j ... */ } /* PDGSTRF2 */
/** Optimize type II servo gains beased on measurement noise and signal PSD. We try to minimize \f[ \sigma^2=\int \textrm{PSD}_{ngs,ws}H_{rej}\textrm{d}\nu + \int_0^{\nu_{nyquist}} \textrm{PSF}\textrm{d}\nu \f] */ static void setup_aster_servo(SIM_S *simu, ASTER_S *aster, const PARMS_S *parms){ int ndtrat=parms->skyc.ndtrat; if(aster->gain){ dcellfree(aster->gain); dfree(aster->res_ws); dfree(aster->res_ngs); } aster->gain=dcellnew(ndtrat,1); aster->res_ws=dnew(ndtrat,1); aster->res_ngs=dnew(ndtrat,3); dmat* pres_ngs=aster->res_ngs; for(int idtrat=0; idtrat<ndtrat; idtrat++){ int dtrat=parms->skyc.dtrats->p[idtrat]; double sigma_ngs= aster->sigman->p[idtrat]->p[0]; double sigma_tt = aster->sigman->p[idtrat]->p[1]; double sigma_ps = sigma_ngs-sigma_tt; double sigma_focus = aster->sigman->p[idtrat]->p[2]; long nmod=parms->maos.nmod; /*gsplit: 0: All modes use the same gain. 1: PS, TT, focus (if nmod>5) use different gains. 2: PS, TT use different gains. focus mode (if nmod>5) use PS gains. */ double res_ngs;/*residual error due to signal after servo rejection. */ double res_ngsn;/*residual error due to noise. */ const int servotype=parms->skyc.servo; const int ng=parms->skyc.ngain; aster->gain->p[idtrat]=dnew(ng,nmod); dmat* pgain=aster->gain->p[idtrat]; if(parms->skyc.gsplit){ double pg_tt[ng+2]; double pg_ps[ng+2]; double pg_focus[ng+2]; if(parms->skyc.interpg){ interp_gain(pg_tt, simu->gain_tt[idtrat], simu->gain_x, sigma_tt); interp_gain(pg_ps, simu->gain_ps[idtrat], simu->gain_x, sigma_ps); interp_gain(pg_focus, simu->gain_focus[idtrat], simu->gain_x, sigma_focus); }else{ dmat *sigma2=dnew(1,1); dcell *tmp; sigma2->p[0]=sigma_tt; tmp=servo_optim(simu->psd_tt, parms->maos.dt, dtrat, parms->skyc.pmargin, sigma2, servotype); memcpy(pg_tt, tmp->p[0]->p, (ng+2)*sizeof(double)); dcellfree(tmp); sigma2->p[0]=sigma_ps; tmp=servo_optim(simu->psd_ps, parms->maos.dt, dtrat, parms->skyc.pmargin, sigma2, servotype); memcpy(pg_ps, tmp->p[0]->p, (ng+2)*sizeof(double)); dcellfree(tmp); if(nmod>5){ sigma2->p[0]=sigma_focus; tmp=servo_optim(simu->psd_focus, parms->maos.dt, dtrat, parms->skyc.pmargin, sigma2, servotype); memcpy(pg_focus, tmp->p[0]->p, (ng+2)*sizeof(double)); dcellfree(tmp); } dfree(sigma2); } res_ngs = pg_tt[ng] + pg_ps[ng] + pg_focus[ng];//residual mode res_ngsn = pg_tt[ng+1] + pg_ps[ng+1] + pg_focus[ng+1];//error due to noise for(int imod=0; imod<MIN(nmod,5); imod++){ memcpy(PCOL(pgain,imod), imod<2?pg_tt:pg_ps, sizeof(double)*ng); } if(nmod>5){ memcpy(PCOL(pgain,5), pg_focus, sizeof(double)*ng); } }else{ double pg_ngs[ng+2]; if(parms->skyc.interpg){ interp_gain(pg_ngs, simu->gain_ngs[idtrat], simu->gain_x, sigma_ngs); }else{ dmat *sigma2=dnew(1,1); sigma2->p[0]=sigma_ngs; dcell *tmp; tmp=servo_optim(simu->psd_ngs, parms->maos.dt, dtrat, parms->skyc.pmargin, sigma2, servotype); memcpy(pg_ngs, tmp->p[0]->p, (ng+2)*sizeof(double)); dcellfree(tmp); } res_ngs=pg_ngs[ng]; res_ngsn=pg_ngs[ng+1]; for(int imod=0; imod<nmod; imod++){ memcpy(PCOL(pgain,imod), pg_ngs, sizeof(double)*ng); } } IND(pres_ngs,idtrat,0)=res_ngs+res_ngsn;/*error due to signal and noise */ IND(pres_ngs,idtrat,1)=res_ngs;/*error due to signal */ IND(pres_ngs,idtrat,2)=res_ngsn;/*error due to noise propagation. */ /*if(parms->skyc.reest){//estiamte error in time domain dmat *sigma2=dnew(nmod,nmod);dmat* psigma2=sigma2; dmat* pmcc=parms->maos.mcc; //convert noise into mode space from WFE space. IND(psigma2,0,0)=IND(psigma2,1,1)=sigma_tt/(2*IND(pmcc,0,0)); IND(psigma2,2,2)=IND(psigma2,3,3)=IND(psigma2,4,4)=sigma_ps/(3*IND(pmcc,2,2)); if(nmod>5){ IND(psigma2,5,5)=sigma_focus/IND(pmcc,5,5); } dmat *res=servo_test(simu->mideal, parms->maos.dt, dtrat, sigma2, aster->gain->p[idtrat]); double rms=calc_rms(res,parms->maos.mcc, parms->skyc.evlstart); IND(pres_ngs,idtrat,0)=rms; dfree(sigma2); dfree(res); }*/ dmat *g_tt=dnew_ref(ng,1,PCOL(pgain,0)); double gain_n; aster->res_ws->p[idtrat]=servo_residual(&gain_n, parms->skyc.psd_ws, parms->maos.dt, dtrat, g_tt, 2); dfree(g_tt); }//for dtrat if(parms->skyc.dbg){ writebin(aster->gain,"%s/aster%d_gain",dirsetup,aster->iaster); writebin(aster->res_ws,"%s/aster%d_res_ws",dirsetup,aster->iaster); writebin(aster->res_ngs,"%s/aster%d_res_ngs",dirsetup,aster->iaster); } }
float ddistribute(fact_t fact, int_t n, SuperMatrix *A, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; NCPformat *Astore; double *a; int_t *asub; int_t *xa_begin, *xa_end; int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ int *index1; /* temporary pointer to array of int */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t nfsendx = 0; /* Number of Xk I will send */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t nbsendx = 0; /* Number of Xk I will send */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t iword, dword; float mem_use = 0.0; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter ddistribute()"); #endif if ( fact == SamePattern_SameRowPerm ) { /* --------------------------------------------------------------- * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. * --------------------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Initialize Uval to zero. */ for (lb = 0; lb < nrbu; ++lb) { Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ index = Ufstnz_br_ptr[lb]; if ( index ) { uval = Unzval_br_ptr[lb]; len = index[1]; for (i = 0; i < len; ++i) uval[i] = zero; } /* if index != NULL */ } /* for lb ... */ for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA (for L), or into U directly. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); if ( gb < jb ) { /* in U */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; while ( (k = index[Urb_indptr[lb]]) < jb ) { /* Skip nonzero values in this block */ Urb_length[lb] += index[Urb_indptr[lb]+1]; /* Move pointer to the next block */ Urb_indptr[lb] += UB_DESCRIPTOR + SuperSize( k ); } /*assert(k == jb);*/ /* start fstnz */ istart = Urb_indptr[lb] + UB_DESCRIPTOR; len = Urb_length[lb]; fsupc1 = FstBlockC( gb+1 ); k = j - fsupc; /* Sum the lengths of the leading columns */ for (jj = 0; jj < k; ++jj) len += fsupc1 - index[istart++]; /*assert(irow>=index[istart]);*/ uval[len + irow - index[istart]] = a[i]; } else { /* in L; put in SPA first */ irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* -------------------------------------------------- * FIRST TIME CREATING THE L AND U DATA STRUCTURE. * -------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* No L and U data structures are available yet. We need to set up the L and U data structures and propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) ABORT("Malloc fails for ToRecv[]."); for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index1[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) ABORT("Malloc fails for ToSendD[]."); for (i = 0; i < k; ++i) ToSendD[i] = NO; if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Urb_fstnz[lb] = BR_HEADER; } /* for lb ... */ SUPERLU_FREE(Ucbs); #if ( PROFlevel>=1 ) t = SuperLU_timer_() - t; if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); #endif #if ( PRNTlevel>=1 ) mem_use -= 2.0*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) *sp_ienv_dist(3))))) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); #if ( PRNTlevel>=1 ) mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; #if ( PRNTlevel>=1 ) mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; dense_col = dense; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; /* NOTE: Only the first nonzero index of the segment is stored in usub[]. */ for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ bsendx_plist[ljb][pr] == EMPTY ) { bsendx_plist[ljb][pr] = YES; ++nbsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; fsupc1 = FstBlockC( gb+1 ); if (rb_marker[lb] <= jb) { /* First time see the block */ rb_marker[lb] = jb + 1; Urb_indptr[lb] = Urb_fstnz[lb];; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; /* Record the first location in index[] of the next block */ Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; len = Urb_indptr[lb];/* Start fstnz in index */ index[len-1] = 0; for (k = 0; k < nsupc; ++k) index[len+k] = fsupc1; if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { /* Already saw the block */ len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; /* Load the numerical values */ k = fsupc1 - irow; /* No. of nonzeros in segment */ index[len-1] += k; /* Increment block length in Descriptor */ irow = ilsum[lb] + irow - FstBlockC( gb ); for (ii = 0; ii < k; ++ii) { uval[Urb_length[lb]++] = dense_col[irow + ii]; dense_col[irow + ii] = zero; } } /* if myrow == pr ... */ } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { fsendx_plist[ljb][pr] = YES; ++nfsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) { fprintf(stderr, "col block " IFMT " ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = 0.0; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->nfsendx = nfsendx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->nbsendx = nbsendx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ if ( !(Llu->mod_bit = intMalloc_dist(k)) ) ABORT("Malloc fails for mod_bit[]."); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time:\n " "\tL\t%.2f\n\tU\t%.2f\n" "\tu_blks %d\tnrbu %d\n--------\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit ddistribute()"); #endif return (mem_use); } /* DDISTRIBUTE */
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; MPI_Request *send_req, recv_req; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol != kcol && fmod[lk] ) mod_bit[lk] = 1; /* contribution from off-diagonal */ } } /*PrintInt10("mod_bit", nlb, mod_bit);*/ #if ( PROFlevel>=2 ) t_reduce_tmp = SuperLU_timer_(); #endif /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); #if ( PROFlevel>=2 ) t_reduce += SuperLU_timer_() - t_reduce_tmp; #endif for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* diagonal process */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } #endif } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); Llu->SolveMsgSent = 0; MPI_Barrier( grid->comm ); /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope */ if ( mycol != kcol && bmod[lk] ) mod_bit[lk] = 1; /* Contribution from off-diagonal */ } } /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope. */ if ( mycol == kcol ) { /* diagonal process */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #endif } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
int_t pdgstrf /************************************************************************/ ( superlu_options_t *options, int m, int n, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info ) /* * Purpose * ======= * * PDGSTRF performs the LU factorization in parallel. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following field should be defined: * o ReplaceTinyPivot (yes_no_t) * Specifies whether to replace the tiny diagonals by * sqrt(epsilon)*norm(A) during LU factorization. * * m (input) int * Number of rows in the matrix. * * n (input) int * Number of columns in the matrix. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * The following fields should be defined: * * o Glu_persist (input) Glu_persist_t* * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (input/output) LocalLU_t* * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { #ifdef _CRAY _fcd ftcs = _cptofcd("N", strlen("N")); _fcd ftcs1 = _cptofcd("L", strlen("L")); _fcd ftcs2 = _cptofcd("N", strlen("N")); _fcd ftcs3 = _cptofcd("U", strlen("U")); #endif double alpha = 1.0, beta = 0.0; int_t *xsup; int_t *lsub, *lsub1, *usub, *Usub_buf, *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ double *lusup, *lusup1, *uval, *Uval_buf, *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, nlb, nub, nsupc, rel, rukp; int_t Pc, Pr; int iam, kcol, krow, mycol, myrow, pi, pj; int j, k, lk, nsupers; int nsupr, nbrow, segsize; int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: * 0 : transferred in Lsub_buf[] * 1 : transferred in Lval_buf[] * 2 : transferred in Usub_buf[] * 3 : transferred in Uval_buf[] */ int_t msg0, msg2; int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; double **Unzval_br_ptr, **Lnzval_bc_ptr; int_t *index; double *nzval; int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ double *ucol; int_t *indirect; double *tempv, *tempv2d; int_t iinfo; int_t *ToRecv, *ToSendD, **ToSendR; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; superlu_scope_t *scp; float s_eps; double thresh; double *tempU2d, *tempu; int full, ldt, ldu, lead_zero, ncols; MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL; MPI_Status status; #if ( DEBUGlevel>=2 ) int_t num_copy=0, num_update=0; #endif #if ( PRNTlevel==3 ) int_t zero_msg = 0, total_msg = 0; #endif #if ( PROFlevel>=1 ) double t1, t2; float msg_vol = 0, msg_cnt = 0; int_t iword = sizeof(int_t), dword = sizeof(double); #endif /* Test the input parameters. */ *info = 0; if ( m < 0 ) *info = -2; else if ( n < 0 ) *info = -3; if ( *info ) { pxerbla("pdgstrf", grid, -*info); return (-1); } /* Quick return if possible. */ if ( m == 0 || n == 0 ) return 0; /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; s_eps = slamch_("Epsilon"); thresh = s_eps * anorm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrf()"); #endif stat->ops[FACT] = 0.0; if ( Pr*Pc > 1 ) { i = Llu->bufmax[0]; if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lsub_buf."); Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; i = Llu->bufmax[1]; if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lval_buf[]."); Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; if ( Llu->bufmax[2] != 0 ) if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) ABORT("Malloc fails for Usub_buf[]."); if ( Llu->bufmax[3] != 0 ) if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) ABORT("Malloc fails for Uval_buf[]."); if ( !(U_diag_blk_send_req = (MPI_Request *) SUPERLU_MALLOC(Pr*sizeof(MPI_Request)))) ABORT("Malloc fails for U_diag_blk_send_req[]."); U_diag_blk_send_req[myrow] = 0; /* flag no outstanding Isend */ if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) ABORT("Malloc fails for send_req[]."); } k = sp_ienv_dist(3); /* max supernode size */ if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) ) ABORT("Malloc fails for ujrow[]."); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", Llu->bufmax[0], Llu->bufmax[1], Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); } #endif Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; Lval_buf_2[0] = Llu->Lval_buf_2[0]; Lval_buf_2[1] = Llu->Lval_buf_2[1]; Usub_buf = Llu->Usub_buf; Uval_buf = Llu->Uval_buf; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; ToRecv = Llu->ToRecv; ToSendD = Llu->ToSendD; ToSendR = Llu->ToSendR; ldt = sp_ienv_dist(3); /* Size of maximum supernode */ if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) ABORT("Calloc fails for tempv2d[]."); tempU2d = tempv2d + ldt*ldt; if ( !(indirect = intMalloc_dist(ldt)) ) ABORT("Malloc fails for indirect[]."); k = CEILING( nsupers, Pr ); /* Number of local block rows */ if ( !(iuip = intMalloc_dist(k)) ) ABORT("Malloc fails for iuip[]."); if ( !(ruip = intMalloc_dist(k)) ) ABORT("Malloc fails for ruip[]."); #if ( VAMPIR>=1 ) VT_symdef(1, "Send-L", "Comm"); VT_symdef(2, "Recv-L", "Comm"); VT_symdef(3, "Send-U", "Comm"); VT_symdef(4, "Recv-U", "Comm"); VT_symdef(5, "TRF2", "Factor"); VT_symdef(100, "Factor", "Factor"); VT_begin(100); VT_traceon(); #endif /* --------------------------------------------------------------- Handle the first block column separately to start the pipeline. --------------------------------------------------------------- */ if ( mycol == 0 ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif scp = &grid->rscp; /* The scope of process row. */ /* Process column *kcol* multicasts numeric values of L(:,k) to process rows. */ lsub = Lrowind_bc_ptr[0]; lusup = Lnzval_bc_ptr[0]; if ( lsub ) { msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; msgcnt[1] = lsub[1] * SuperSize( 0 ); } else { msgcnt[0] = msgcnt[1] = 0; } for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[0][pj] != EMPTY ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, &send_req[pj] ); MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, &send_req[pj+Pc] ); #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, 0, msgcnt[0], msgcnt[1], pj); #endif #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif } } /* for pj ... */ } else { /* Post immediate receives. */ if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0, scp->comm, &recv_req[0] ); MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1, scp->comm, &recv_req[1] ); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); #endif } } /* if mycol == 0 */ /* ------------------------------------------ MAIN LOOP: Loop through all block columns. ------------------------------------------ */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( mycol == kcol ) { lk = LBj( k, grid ); /* Local block number. */ for (pj = 0; pj < Pc; ++pj) { /* Wait for Isend to complete before using lsub/lusup. */ if ( ToSendR[lk][pj] != EMPTY ) { MPI_Wait( &send_req[pj], &status ); MPI_Wait( &send_req[pj+Pc], &status ); } } lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; } else { if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ scp = &grid->rscp; /* The scope of process row. */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(2); #endif /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[0]);*/ /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, (4*k)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[0], &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[1]);*/ /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, (4*k+1)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[1], &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); #if ( VAMPIR>=1 ) VT_end(2); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", iam, k, msgcnt[0], msgcnt[1], kcol); fflush(stdout); #endif lsub = Lsub_buf_2[k%2]; lusup = Lval_buf_2[k%2]; #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[0] ) ++zero_msg; #endif } else msgcnt[0] = 0; } /* if mycol = Pc(k) */ scp = &grid->cscp; /* The scope of process column. */ if ( myrow == krow ) { /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ #ifdef _CRAY pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); #else pdgstrs2(n, k, Glu_persist, grid, Llu, stat); #endif /* Multicasts U(k,:) to process columns. */ lk = LBi( k, grid ); usub = Ufstnz_br_ptr[lk]; uval = Unzval_br_ptr[lk]; if ( usub ) { msgcnt[2] = usub[2]; msgcnt[3] = usub[1]; } else { msgcnt[2] = msgcnt[3] = 0; } if ( ToSendD[lk] == YES ) { for (pi = 0; pi < Pr; ++pi) { if ( pi != myrow ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(3); #endif MPI_Send( usub, msgcnt[2], mpi_int_t, pi, (4*k+2)%NTAGS, scp->comm); MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, (4*k+3)%NTAGS, scp->comm); #if ( VAMPIR>=1 ) VT_end(3); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); #endif } /* if pi ... */ } /* for pi ... */ } /* if ToSendD ... */ } else { /* myrow != krow */ if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(4); #endif /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[2]);*/ MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, (4*k+2)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[3]);*/ MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, (4*k+3)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); #if ( VAMPIR>=1 ) VT_end(4); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif usub = Usub_buf; uval = Uval_buf; #if ( DEBUGlevel>=2 ) printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); #endif #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[2] ) ++zero_msg; #endif } else msgcnt[2] = 0; } /* if myrow == Pr(k) */ /* * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). * for (j = k+1; k < N; ++k) { * for (i = k+1; i < N; ++i) * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) * && L(i,k) != 0 && U(k,j) != 0 ) * A(i,j) = A(i,j) - L(i,k) * U(k,j); */ msg0 = msgcnt[0]; msg2 = msgcnt[2]; if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ nsupr = lsub[1]; /* LDA of lusup. */ if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; luptr0 = knsupc; nlb = lsub[0] - 1; } else { lptr0 = BC_HEADER; luptr0 = 0; nlb = lsub[0]; } lptr = lptr0; for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ ib = lsub[lptr]; lib = LBi( ib, grid ); iuip[lib] = BR_HEADER; ruip[lib] = 0; lptr += LB_DESCRIPTOR + lsub[lptr+1]; } nub = usub[0]; /* Number of blocks in the block row U(k,:) */ iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ rukp = 0; /* Pointer to nzval[] of U(k,:) */ klst = FstBlockC( k+1 ); /* --------------------------------------------------- Update the first block column A(:,k+1). --------------------------------------------------- */ jb = usub[iukp]; /* Global block number of block U(k,j). */ if ( jb == k+1 ) { /* First update (k+1)-th block. */ --nub; lptr = lptr0; luptr = luptr0; ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0, it = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[it++]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (it = 0, i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[it++]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* if jb == k+1 */ } /* if L(:,k) and U(k,:) not empty */ if ( k+1 < nsupers ) { kcol = PCOL( k+1, grid ); if ( mycol == kcol ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif /* Factor diagonal and subdiagonal blocks and test for exact singularity. */ pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif /* Process column *kcol+1* multicasts numeric values of L(:,k+1) to process rows. */ lk = LBj( k+1, grid ); /* Local block number. */ lsub1 = Lrowind_bc_ptr[lk]; if ( lsub1 ) { msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; msgcnt[1] = lsub1[1] * SuperSize( k+1 ); } else { msgcnt[0] = 0; msgcnt[1] = 0; } scp = &grid->rscp; /* The scope of process row. */ for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[lk][pj] != EMPTY ) { lusup1 = Lnzval_bc_ptr[lk]; #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, k+1, msgcnt[0], msgcnt[1], pj); #endif } } /* for pj ... */ } else { /* Post Recv of block column L(:,k+1). */ if ( ToRecv[k+1] >= 1 ) { scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); #endif } } /* if mycol == Pc(k+1) */ } /* if k+1 < nsupers */ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ /* --------------------------------------------------- Update all other blocks using block row U(k,:) --------------------------------------------------- */ for (j = 0; j < nub; ++j) { lptr = lptr0; luptr = luptr0; jb = usub[iukp]; /* Global block number of block U(k,j). */ ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } /* Skip descriptor. Now point to fstnz index of block U(i,j). */ iuip[lib] += UB_DESCRIPTOR; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0 ; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[i]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted for the L blocks. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[i]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* for j ... */ } /* if k L(:,k) and U(k,:) are not empty */ } /* ------------------------------------------ END MAIN LOOP: for k = ... ------------------------------------------ */ #if ( VAMPIR>=1 ) VT_end(100); VT_traceoff(); #endif if ( Pr*Pc > 1 ) { SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); SUPERLU_FREE(send_req); if ( U_diag_blk_send_req[myrow] ) { /* wait for last Isend requests to complete, deallocate objects */ for (krow = 0; krow < Pr; ++krow) if ( krow != myrow ) MPI_Wait(U_diag_blk_send_req + krow, &status); } SUPERLU_FREE(U_diag_blk_send_req); } SUPERLU_FREE(Llu->ujrow); SUPERLU_FREE(tempv2d); SUPERLU_FREE(indirect); SUPERLU_FREE(iuip); SUPERLU_FREE(ruip); /* Prepare error message. */ if ( *info == 0 ) *info = n + 1; #if ( PROFlevel>=1 ) TIC(t1); #endif MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; MPI_Reduce( &msg_cnt, &msg_cnt_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_cnt, &msg_cnt_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); if ( !iam ) { printf("\tPDGSTRF comm stat:" "\tAvg\tMax\t\tAvg\tMax\n" "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", msg_cnt_sum/Pr/Pc, msg_cnt_max, msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); } } #endif if ( iinfo == n + 1 ) *info = 0; else *info = iinfo; #if ( PRNTlevel==3 ) MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # total msg\t%d\n", iinfo); #endif #if ( DEBUGlevel>=2 ) for (i = 0; i < Pr * Pc; ++i) { if ( iam == i ) { dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); printf("(%d)\n", iam); PrintInt10("Recv", nsupers, Llu->ToRecv); } MPI_Barrier( grid->comm ); } #endif #if ( DEBUGlevel>=3 ) printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrf()"); #endif } /* PDGSTRF */
/*! \brief * * <pre> * Purpose * ======= * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the process column that owns block column *k* participates * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * </pre> */ static void pdgstrf2 /************************************************************************/ ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, int* info ) { int c, iam, l, pkk; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup, temp; double *ujrow; double alpha = -1; *info = 0; /* Quick return. */ /* Initialization. */ iam = grid->iam; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ujrow = Llu->ujrow; luptr = 0; /* Point to the diagonal entries. */ c = nsupc; for (j = 0; j < jlst - jfst; ++j) { /* Broadcast the j-th row (nsupc - j) elements to the process column. */ if ( iam == pkk ) { /* Diagonal process. */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the replaced diagonal with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; } #if 0 dbcast_col(ujrow, c, pkk, UjROW, grid, &c); #else MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, grid, COMM_COLUMN, &c);*/ #endif #if ( DEBUGlevel>=2 ) if ( k == 3329 && j == 2 ) { if ( iam == pkk ) { printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); } else { printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); } } #endif if ( !lusup ) { /* Empty block column. */ --c; if ( ujrow[0] == 0.0 ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == 0.0 ) { *info = j+jfst+1; } else { /* Scale the j-th column of the matrix. */ temp = 1.0 / ujrow[0]; if ( iam == pkk ) { for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } else { for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } } /* Rank-1 update of the trailing submatrix. */ if ( --c ) { if ( iam == pkk ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * c; } else { #ifdef _CRAY SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * c; } } /* Move to the next column. */ if ( iam == pkk ) luptr += nsupr + 1; else luptr += nsupr; } /* for j ... */ } /* PDGSTRF2 */
/*! \brief * * <pre> * Purpose * ======= * Perform local block modifications: lsum[i] -= L_i,k * X[k]. * </pre> */ void dlsum_fmod /************************************************************************/ ( double *lsum, /* Sum of local modifications. */ double *x, /* X array (local) */ double *xk, /* X[k]. */ double *rtemp, /* Result of full matrix-vector multiply. */ int nrhs, /* Number of right-hand sides. */ int knsupc, /* Size of supernode k. */ int_t k, /* The k-th component of X. */ int_t *fmod, /* Modification count for L-solve. */ int_t nlb, /* Number of L blocks. */ int_t lptr, /* Starting position in lsub[*]. */ int_t luptr, /* Starting position in lusup[*]. */ int_t *xsup, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request send_req[], SuperLUStat_t *stat ) { double alpha = 1.0, beta = 0.0; double *lusup, *lusup1; double *dest; int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; int_t i, ii, ik, il, ikcol, irow, j, lb, lk, rel; int_t *lsub, *lsub1, nlb1, lptr1, luptr1; int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ int_t *frecv = Llu->frecv; int_t **fsendx_plist = Llu->fsendx_plist; MPI_Status status; int test_flag; iam = grid->iam; myrow = MYROW( iam, grid ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Llu->Lrowind_bc_ptr[lk]; lusup = Llu->Lnzval_bc_ptr[lk]; nsupr = lsub[1]; for (lb = 0; lb < nlb; ++lb) { ik = lsub[lptr]; /* Global block number, row-wise. */ nbrow = lsub[lptr+1]; #ifdef _CRAY SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, &alpha, &lusup[luptr], &nsupr, xk, &knsupc, &beta, rtemp, &nbrow ); #else dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, &alpha, &lusup[luptr], &nsupr, xk, &knsupc, &beta, rtemp, &nbrow ); #endif stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; lk = LBi( ik, grid ); /* Local block number, row-wise. */ iknsupc = SuperSize( ik ); il = LSUM_BLK( lk ); dest = &lsum[il]; lptr += LB_DESCRIPTOR; rel = xsup[ik]; /* Global row index of block ik. */ for (i = 0; i < nbrow; ++i) { irow = lsub[lptr++] - rel; /* Relative row. */ RHS_ITERATE(j) dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; } luptr += nbrow; if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ ikcol = PCOL( ik, grid ); p = PNUM( myrow, ikcol, grid ); if ( iam != p ) { #ifdef ISEND_IRECV MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm ); #else MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); #endif } else { /* Diagonal process: X[i] += lsum[i]. */ ii = X_BLK( lk ); RHS_ITERATE(j) for (i = 0; i < iknsupc; ++i) x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( ik, grid );/* Local block number, column-wise. */ lsub1 = Llu->Lrowind_bc_ptr[lk]; lusup1 = Llu->Lnzval_bc_ptr[lk]; nsupr1 = lsub1[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, lusup1, &nsupr1, &x[ii], &iknsupc); #else dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, lusup1, &nsupr1, &x[ii], &iknsupc); #endif stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < grid->nprow; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, ikcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #else MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nlb1 = lsub1[0] - 1; lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, lptr1, luptr1, xsup, grid, Llu, send_req, stat); } /* if frecv[lk] == 0 */ } /* if iam == p */ } /* if fmod[lk] == 0 */ } /* for lb ... */
/** Create "star" data array from star information. */ static STAR_S *setup_star_create(const PARMS_S *parms, dmat *coord){ if(!coord){ return NULL;/*there are no stars available. */ } int nstar=coord->ny; dmat* pc=coord; int nwvl=parms->maos.nwvl; STAR_S *star=mycalloc(nstar,STAR_S); double ngsgrid=parms->maos.ngsgrid/206265.; double r2=pow(parms->skyc.patfov/206265./2.,2); double keepout=pow(parms->skyc.keepout/206265.,2); double minrad2=pow(parms->skyc.minrad/206265.,2); int jstar=0; assert(nwvl+2==coord->nx); for(int istar=0; istar<nstar; istar++){ if(parms->skyc.ngsalign){ star[jstar].thetax=round(IND(pc,0,istar)/ngsgrid)*ngsgrid; star[jstar].thetay=round(IND(pc,1,istar)/ngsgrid)*ngsgrid; if(pow(star[jstar].thetax,2)+pow(star[jstar].thetay,2)>r2){ star[jstar].thetax=trunc(IND(pc,0,istar)/ngsgrid)*ngsgrid; star[jstar].thetay=round(IND(pc,1,istar)/ngsgrid)*ngsgrid; if(pow(star[jstar].thetax,2)+pow(star[jstar].thetay,2)>r2){ star[jstar].thetax=round(IND(pc,0,istar)/ngsgrid)*ngsgrid; star[jstar].thetay=trunc(IND(pc,1,istar)/ngsgrid)*ngsgrid; if(pow(star[jstar].thetax,2)+pow(star[jstar].thetay,2)>r2){ star[jstar].thetax=trunc(IND(pc,0,istar)/ngsgrid)*ngsgrid; star[jstar].thetay=trunc(IND(pc,1,istar)/ngsgrid)*ngsgrid; if(pow(star[jstar].thetax,2)+pow(star[jstar].thetay,2)>r2){ error("What?\n"); } } } } }else{ star[jstar].thetax=IND(pc,0,istar); star[jstar].thetay=IND(pc,1,istar); } for(int kstar=0; kstar<jstar; kstar++){ if(pow(star[jstar].thetax-star[kstar].thetax,2) +pow(star[jstar].thetay-star[kstar].thetay,2)<keepout){ /*warning("start %d is too close to %d. use J brightest.\n", jstar, kstar); */ if(IND(pc,0,istar)<star[kstar].mags->p[0]){ memcpy(star[kstar].mags->p, PCOL(pc,istar)+2, sizeof(double)*nwvl); star[kstar].thetax=star[jstar].thetax; star[kstar].thetay=star[jstar].thetay; } continue; } } if(pow(star[istar].thetax,2)+pow(star[istar].thetay,2)<minrad2){ info2("Skip star at (%.0f, %.0f) because minrad=%g\n", star[istar].thetax*206265, star[istar].thetay*206265, parms->skyc.minrad); continue; } star[jstar].mags=dnew(nwvl,1); memcpy(star[jstar].mags->p, PCOL(pc,istar)+2, sizeof(double)*nwvl); star[jstar].use=mycalloc(parms->maos.npowfs,int); jstar++; } if(jstar<nstar){ /*warning2("%d stars dropped\n", nstar-jstar); */ coord->ny=jstar; star=myrealloc(star, jstar,STAR_S); } return star; }