void get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) { int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk; int_t *xsup; i = j = *num_diag_procs = pkk = 0; nprow = grid->nprow; npcol = grid->npcol; nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; do { ++(*num_diag_procs); i = (++i) % nprow; j = (++j) % npcol; pkk = PNUM( i, j, grid ); } while ( pkk != 0 ); /* Until wrap back to process 0 */ if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) ) ABORT("Malloc fails for diag_procs[]"); if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) ) ABORT("Calloc fails for diag_len[]"); for (i = j = k = 0; k < *num_diag_procs; ++k) { pkk = PNUM( i, j, grid ); (*diag_procs)[k] = pkk; i = (++i) % nprow; j = (++j) % npcol; } for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); i = k % *num_diag_procs; (*diag_len)[i] += knsupc; } }
/* * Convert a row compressed storage into a column compressed storage. */ void dCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, double *a, int_t *colind, int_t *rowptr, double **at, int_t **rowind, int_t **colptr) { register int i, j, col, relpos; int_t *marker; /* Allocate storage for another copy of the matrix. */ *at = (double *) doubleMalloc_dist(nnz); *rowind = intMalloc_dist(nnz); *colptr = intMalloc_dist(n+1); marker = intCalloc_dist(n); /* Get counts of each column of A, and set up column pointers */ for (i = 0; i < m; ++i) for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; (*colptr)[0] = 0; for (j = 0; j < n; ++j) { (*colptr)[j+1] = (*colptr)[j] + marker[j]; marker[j] = (*colptr)[j]; } /* Transfer the matrix into the compressed column storage. */ for (i = 0; i < m; ++i) { for (j = rowptr[i]; j < rowptr[i+1]; ++j) { col = colind[j]; relpos = marker[col]; (*rowind)[relpos] = i; (*at)[relpos] = a[j]; ++marker[col]; } } SUPERLU_FREE(marker); }
float ddistribute(fact_t fact, int_t n, SuperMatrix *A, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; NCPformat *Astore; double *a; int_t *asub; int_t *xa_begin, *xa_end; int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ int *index1; /* temporary pointer to array of int */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t nfsendx = 0; /* Number of Xk I will send */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t nbsendx = 0; /* Number of Xk I will send */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t iword, dword; float mem_use = 0.0; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter ddistribute()"); #endif if ( fact == SamePattern_SameRowPerm ) { /* --------------------------------------------------------------- * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. * --------------------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Initialize Uval to zero. */ for (lb = 0; lb < nrbu; ++lb) { Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ index = Ufstnz_br_ptr[lb]; if ( index ) { uval = Unzval_br_ptr[lb]; len = index[1]; for (i = 0; i < len; ++i) uval[i] = zero; } /* if index != NULL */ } /* for lb ... */ for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA (for L), or into U directly. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); if ( gb < jb ) { /* in U */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; while ( (k = index[Urb_indptr[lb]]) < jb ) { /* Skip nonzero values in this block */ Urb_length[lb] += index[Urb_indptr[lb]+1]; /* Move pointer to the next block */ Urb_indptr[lb] += UB_DESCRIPTOR + SuperSize( k ); } /*assert(k == jb);*/ /* start fstnz */ istart = Urb_indptr[lb] + UB_DESCRIPTOR; len = Urb_length[lb]; fsupc1 = FstBlockC( gb+1 ); k = j - fsupc; /* Sum the lengths of the leading columns */ for (jj = 0; jj < k; ++jj) len += fsupc1 - index[istart++]; /*assert(irow>=index[istart]);*/ uval[len + irow - index[istart]] = a[i]; } else { /* in L; put in SPA first */ irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* -------------------------------------------------- * FIRST TIME CREATING THE L AND U DATA STRUCTURE. * -------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* No L and U data structures are available yet. We need to set up the L and U data structures and propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) ABORT("Malloc fails for ToRecv[]."); for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index1[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) ABORT("Malloc fails for ToSendD[]."); for (i = 0; i < k; ++i) ToSendD[i] = NO; if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Urb_fstnz[lb] = BR_HEADER; } /* for lb ... */ SUPERLU_FREE(Ucbs); #if ( PROFlevel>=1 ) t = SuperLU_timer_() - t; if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); #endif #if ( PRNTlevel>=1 ) mem_use -= 2.0*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) *sp_ienv_dist(3))))) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); #if ( PRNTlevel>=1 ) mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; #if ( PRNTlevel>=1 ) mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; dense_col = dense; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; /* NOTE: Only the first nonzero index of the segment is stored in usub[]. */ for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ bsendx_plist[ljb][pr] == EMPTY ) { bsendx_plist[ljb][pr] = YES; ++nbsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; fsupc1 = FstBlockC( gb+1 ); if (rb_marker[lb] <= jb) { /* First time see the block */ rb_marker[lb] = jb + 1; Urb_indptr[lb] = Urb_fstnz[lb];; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; /* Record the first location in index[] of the next block */ Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; len = Urb_indptr[lb];/* Start fstnz in index */ index[len-1] = 0; for (k = 0; k < nsupc; ++k) index[len+k] = fsupc1; if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { /* Already saw the block */ len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; /* Load the numerical values */ k = fsupc1 - irow; /* No. of nonzeros in segment */ index[len-1] += k; /* Increment block length in Descriptor */ irow = ilsum[lb] + irow - FstBlockC( gb ); for (ii = 0; ii < k; ++ii) { uval[Urb_length[lb]++] = dense_col[irow + ii]; dense_col[irow + ii] = zero; } } /* if myrow == pr ... */ } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { fsendx_plist[ljb][pr] = YES; ++nfsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) { fprintf(stderr, "col block " IFMT " ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = 0.0; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->nfsendx = nfsendx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->nbsendx = nbsendx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ if ( !(Llu->mod_bit = intMalloc_dist(k)) ) ABORT("Malloc fails for mod_bit[]."); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time:\n " "\tL\t%.2f\n\tU\t%.2f\n" "\tu_blks %d\tnrbu %d\n--------\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit ddistribute()"); #endif return (mem_use); } /* DDISTRIBUTE */
void pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, double *berr, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * pzgsrfs_ABXglobal improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates * for the solution. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * A (input) SuperMatrix* * The original matrix A, or the scaled A if equilibration was done. * A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc * are permutation matrices. The type of A can be: * Stype = NCP; Dtype = Z; Mtype = GE. * * NOTE: Currently, A must reside in all processes when calling * this routine. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from pzgstrf for * the possibly scaled and permuted matrix A. * See superlu_ddefs.h for the definition of 'LUstruct_t'. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * B (input) doublecomplex* (global) * The N-by-NRHS right-hand side matrix of the possibly equilibrated * and row permuted system. * * NOTE: Currently, B must reside on all processes when calling * this routine. * * ldb (input) int (global) * Leading dimension of matrix B. * * X (input/output) doublecomplex* (global) * On entry, the solution matrix X, as computed by pzgstrs. * On exit, the improved solution matrix X. * If DiagScale = COL or BOTH, X should be premultiplied by diag(C) * in order to obtain the solution to the original system. * * NOTE: Currently, X must reside on all processes when calling * this routine. * * ldx (input) int (global) * Leading dimension of matrix X. * * nrhs (input) int * Number of right-hand sides. * * berr (output) double*, dimension (nrhs) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics about the refinement steps. * See util.h for the definition of SuperLUStat_t. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * */ #define ITMAX 20 Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; /* * Data structures used by matrix-vector multiply routine. */ int_t N_update; /* Number of variables updated on this process */ int_t *update; /* vector elements (global index) updated on this processor. */ int_t *bindx; doublecomplex *val; int_t *mv_sup_to_proc; /* Supernode to process mapping in matrix-vector multiply. */ /*-- end data structures for matrix-vector multiply --*/ doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col, *x_trs, *dx_trs; double *rwork; int_t count, ii, j, jj, k, knsupc, lk, lwork, nprow, nsupers, notran, nz, p; int i, iam, pkk; int_t *ilsum, *xsup; double eps, lstres; double s, safmin, safe1, safe2; /* NEW STUFF */ int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ int_t *diag_len; /* Length of the X vector on diagonal processes. */ /*-- Function prototypes --*/ extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *, doublecomplex *, int, SuperLUStat_t *, int *); extern double dlamch_(char *); /* Test the input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; else if ( nrhs < 0 ) *info = -13; if (*info != 0) { i = -(*info); xerbla_("pzgsrfs_ABXglobal", &i); return; } /* Quick return if possible. */ if ( n == 0 || nrhs == 0 ) { return; } /* Initialization. */ iam = grid->iam; nprow = grid->nprow; nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; ilsum = Llu->ilsum; notran = 1; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()"); #endif get_diag_procs(n, Glu_persist, grid, &num_diag_procs, &diag_procs, &diag_len); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. number of diag processes = %d\n", num_diag_procs); PrintInt10("diag_procs", num_diag_procs, diag_procs); PrintInt10("diag_len", num_diag_procs, diag_len); } #endif if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) ABORT("Calloc fails for mv_sup_to_proc[]"); pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, &val, &bindx, mv_sup_to_proc); i = CEILING( nsupers, nprow ); /* Number of local block rows */ ii = Llu->ldalsum + i * XK_H; k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); jj = diag_len[0]; for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); jj = SUPERLU_MAX( jj, N_update ); lwork = N_update /* For ax and R */ + ii /* For dx_trs */ + ii /* For x_trs */ + k /* For b */ + jj; /* for temp */ if ( !(work = doublecomplexMalloc_dist(lwork)) ) ABORT("Malloc fails for work[]"); ax = R = work; dx_trs = work + N_update; x_trs = dx_trs + ii; b = x_trs + ii; temp = b + k; if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) ) ABORT("Malloc fails for rwork[]"); #if ( DEBUGlevel>=2 ) { doublecomplex *dwork = doublecomplexMalloc_dist(n); for (i = 0; i < n; ++i) { if ( i & 1 ) dwork[i].r = 1.; else dwork[i].r = 2.; dwork[i].i = 0.; } /* Check correctness of matrix-vector multiply. */ pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); PrintDouble5("Mult A*x", N_update, ax); SUPERLU_FREE(dwork); } #endif /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dlamch_("Epsilon"); safmin = dlamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; #if ( DEBUGlevel>=1 ) if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", eps, anorm, safe1, safe2); #endif /* Do for each right-hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; /* Copy X into x on the diagonal processes. */ B_col = &B[j*ldb]; X_col = &X[j*ldx]; for (p = 0; p < num_diag_procs; ++p) { pkk = diag_procs[p]; if ( iam == pkk ) { for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = ilsum[lk] + (lk+1)*XK_H; jj = FstBlockC( k ); for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */ } } } /* Copy B into b distributed the same way as matrix-vector product. */ ii = update[0]; for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ /* Matrix-vector multiply. */ pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); /* Compute residual. */ for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]); /* Compute abs(op(A))*abs(X) + abs(B). */ pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork); for (i = 0; i < N_update; ++i) rwork[i] += z_abs1(&b[i]); s = 0.0; for (i = 0; i < N_update; ++i) { if ( rwork[i] > safe2 ) s = SUPERLU_MAX(s, z_abs1(&R[i]) / rwork[i]); else s = SUPERLU_MAX(s, (z_abs1(&R[i])+safe1)/(rwork[i]+safe1)); } MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); #if ( PRNTlevel>= 1 ) if ( !iam ) printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); #endif if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { /* Compute new dx. */ redist_all_to_diag(n, R, Glu_persist, Llu, grid, mv_sup_to_proc, dx_trs); pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); /* Update solution. */ for (p = 0; p < num_diag_procs; ++p) if ( iam == diag_procs[p] ) for (k = p; k < nsupers; k += num_diag_procs) { lk = LBi( k, grid ); ii = ilsum[lk] + (lk+1)*XK_H; knsupc = SuperSize( k ); for (i = 0; i < knsupc; ++i) z_add(&x_trs[i + ii], &x_trs[i + ii], &dx_trs[i + ii]); } lstres = berr[j]; ++count; /* Transfer x_trs (on diagonal processes) into X (on all processes). */ gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, num_diag_procs, diag_procs, diag_len, X_col, temp); } else { break; } } /* end while */ stat->RefineSteps = count; } /* for j ... */ /* Deallocate storage used by matrix-vector multiplication. */ SUPERLU_FREE(diag_procs); SUPERLU_FREE(diag_len); if ( N_update ) { SUPERLU_FREE(update); SUPERLU_FREE(bindx); SUPERLU_FREE(val); } SUPERLU_FREE(mv_sup_to_proc); SUPERLU_FREE(work); SUPERLU_FREE(rwork); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()"); #endif } /* PZGSRFS_ABXGLOBAL */
void pdgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx, int nrhs, double *berr, SuperLUStat_t *stat, int *info) { #define ITMAX 20 Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; /* * Data structures used by matrix-vector multiply routine. */ int_t N_update; /* Number of variables updated on this process */ int_t *update; /* vector elements (global index) updated on this processor. */ int_t *bindx; double *val; int_t *mv_sup_to_proc; /* Supernode to process mapping in matrix-vector multiply. */ /*-- end data structures for matrix-vector multiply --*/ double *b, *ax, *R, *B_col, *temp, *work, *X_col, *x_trs, *dx_trs; int_t count, ii, j, jj, k, knsupc, lk, lwork, nprow, nsupers, nz, p; int i, iam, pkk; int_t *ilsum, *xsup; double eps, lstres; double s, safmin, safe1, safe2; /* NEW STUFF */ int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ int_t *diag_len; /* Length of the X vector on diagonal processes. */ /*-- Function prototypes --*/ extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *, double *, int, SuperLUStat_t *, int *); /* Test the input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NCP || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; else if ( nrhs < 0 ) *info = -13; if (*info != 0) { i = -(*info); pxerr_dist("pdgsrfs_ABXglobal", grid, i); return; } /* Quick return if possible. */ if ( n == 0 || nrhs == 0 ) { return; } /* Initialization. */ iam = grid->iam; nprow = grid->nprow; nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; ilsum = Llu->ilsum; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgsrfs_ABXglobal()"); #endif get_diag_procs(n, Glu_persist, grid, &num_diag_procs, &diag_procs, &diag_len); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. number of diag processes = " IFMT "\n", num_diag_procs); PrintInt10("diag_procs", num_diag_procs, diag_procs); PrintInt10("diag_len", num_diag_procs, diag_len); } #endif if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) ABORT("Calloc fails for mv_sup_to_proc[]"); pdgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, &val, &bindx, mv_sup_to_proc); i = CEILING( nsupers, nprow ); /* Number of local block rows */ ii = Llu->ldalsum + i * XK_H; k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); jj = diag_len[0]; for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); jj = SUPERLU_MAX( jj, N_update ); lwork = N_update /* For ax and R */ + ii /* For dx_trs */ + ii /* For x_trs */ + k /* For b */ + jj; /* for temp */ if ( !(work = doubleMalloc_dist(lwork)) ) ABORT("Malloc fails for work[]"); ax = R = work; dx_trs = work + N_update; x_trs = dx_trs + ii; b = x_trs + ii; temp = b + k; #if ( DEBUGlevel>=2 ) { double *dwork = doubleMalloc_dist(n); for (i = 0; i < n; ++i) { if ( i & 1 ) dwork[i] = 1.; else dwork[i] = 2.; } /* Check correctness of matrix-vector multiply. */ pdgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); PrintDouble5("Mult A*x", N_update, ax); SUPERLU_FREE(dwork); } #endif /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dmach_dist("Epsilon"); safmin = dmach_dist("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. */ safe1 = nz * safmin; safe2 = safe1 / eps; #if ( DEBUGlevel>=1 ) if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", eps, anorm, safe1, safe2); #endif /* Do for each right-hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; /* Copy X into x on the diagonal processes. */ B_col = &B[j*ldb]; X_col = &X[j*ldx]; for (p = 0; p < num_diag_procs; ++p) { pkk = diag_procs[p]; if ( iam == pkk ) { for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = ilsum[lk] + (lk+1)*XK_H; jj = FstBlockC( k ); for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; dx_trs[ii-XK_H] = k;/* Block number prepended in header. */ } } } /* Copy B into b distributed the same way as matrix-vector product. */ if ( N_update ) ii = update[0]; for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ /* Matrix-vector multiply. */ pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); /* Compute residual. */ for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i]; /* Compute abs(op(A))*abs(X) + abs(B). */ pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp); for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]); s = 0.0; for (i = 0; i < N_update; ++i) { if ( temp[i] > safe2 ) { s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); } else if ( temp[i] != 0.0 ) { /* Adding SAFE1 to the numerator guards against spuriously zero residuals (underflow). */ s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]); } /* If temp[i] is exactly 0.0 (computed by PxGSMV), then we know the true residual also must be exactly 0.0. */ } MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); #if ( PRNTlevel>= 1 ) if ( !iam ) printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]); #endif if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { /* Compute new dx. */ redist_all_to_diag(n, R, Glu_persist, Llu, grid, mv_sup_to_proc, dx_trs); pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); /* Update solution. */ for (p = 0; p < num_diag_procs; ++p) if ( iam == diag_procs[p] ) for (k = p; k < nsupers; k += num_diag_procs) { lk = LBi( k, grid ); ii = ilsum[lk] + (lk+1)*XK_H; knsupc = SuperSize( k ); for (i = 0; i < knsupc; ++i) x_trs[i + ii] += dx_trs[i + ii]; } lstres = berr[j]; ++count; /* Transfer x_trs (on diagonal processes) into X (on all processes). */ gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, num_diag_procs, diag_procs, diag_len, X_col, temp); } else { break; } } /* end while */ stat->RefineSteps = count; } /* for j ... */ /* Deallocate storage used by matrix-vector multiplication. */ SUPERLU_FREE(diag_procs); SUPERLU_FREE(diag_len); if ( N_update ) { SUPERLU_FREE(update); SUPERLU_FREE(bindx); SUPERLU_FREE(val); } SUPERLU_FREE(mv_sup_to_proc); SUPERLU_FREE(work); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgsrfs_ABXglobal()"); #endif } /* PDGSRFS_ABXGLOBAL */
/*! \brief * * <pre> * Create the distributed modified sparse row (MSR) matrix: bindx/val. * For a submatrix of size m-by-n, the MSR arrays are as follows: * bindx[0] = m + 1 * bindx[0..m] = pointer to start of each row * bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k, * where, ks = bindx[k], ke = bindx[k+1]-1 * val[k] = A(k,k), k < m, diagonal elements * val[m] = not used * val[ki] = A(k, bindx[ki]), where ks <= ki <= ke * Both arrays are of length nnz + 1. * </pre> */ static void zcreate_msr_matrix ( SuperMatrix *A, /* Matrix A permuted by columns (input). The type of A can be: Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */ int_t update[], /* input (local) */ int_t N_update, /* input (local) */ doublecomplex **val, /* output */ int_t **bindx /* output */ ) { int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag; NCPformat *Astore; doublecomplex *nzval; int_t *rowcnt; doublecomplex zero = {0.0, 0.0}; if ( !N_update ) return; n = A->ncol; Astore = A->Store; nzval = Astore->nzval; /* One pass of original matrix A to count nonzeros of each row. */ if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) ) ABORT("Malloc fails for rowcnt[]"); lo = update[0]; hi = update[N_update-1]; nnz_local = 0; nnz_diag = 0; for (j = 0; j < n; ++j) { for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { irow = Astore->rowind[i]; if ( irow >= lo && irow <= hi ) { if ( irow != j ) /* Exclude diagonal */ ++rowcnt[irow - lo]; else ++nnz_diag; /* Count nonzero diagonal entries */ ++nnz_local; } } } /* Add room for the logical diagonal zeros which are not counted in nnz_local. */ nnz_local += (N_update - nnz_diag); /* Allocate storage for bindx[] and val[]. */ if ( !(*val = (doublecomplex *) doublecomplexMalloc_dist(nnz_local+1)) ) ABORT("Malloc fails for val[]"); for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */ if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) ) ABORT("Malloc fails for bindx[]"); /* Set up row pointers. */ (*bindx)[0] = N_update + 1; for (j = 1; j <= N_update; ++j) { (*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1]; rowcnt[j-1] = (*bindx)[j-1]; } /* One pass of original matrix A to fill in matrix entries. */ for (j = 0; j < n; ++j) { for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { irow = Astore->rowind[i]; if ( irow >= lo && irow <= hi ) { if ( irow == j ) /* Diagonal */ (*val)[irow - lo] = nzval[i]; else { irow -= lo; k = rowcnt[irow]; (*bindx)[k] = j; (*val)[k] = nzval[i]; ++rowcnt[irow]; } } } } SUPERLU_FREE(rowcnt); }
void pdgsmv_init ( SuperMatrix *A, /* Matrix A permuted by columns (input/output). The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. */ int_t *row_to_proc, /* Input. Mapping between rows and processes. */ gridinfo_t *grid, /* Input */ pdgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ ) { NRformat_loc *Astore; int iam, p, procs; int *SendCounts, *RecvCounts; int_t i, j, k, l, m, m_loc, n, fst_row, jcol; int_t TotalIndSend, TotalValSend; int_t *colind, *rowptr; int_t *ind_tosend = NULL, *ind_torecv = NULL; int_t *ptr_ind_tosend, *ptr_ind_torecv; int_t *extern_start, *spa, *itemp; double *nzval, *val_tosend = NULL, *val_torecv = NULL, t; MPI_Request *send_req, *recv_req; MPI_Status status; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pdgsmv_init()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; m = A->nrow; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; colind = Astore->colind; rowptr = Astore->rowptr; nzval = Astore->nzval; if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) ABORT("Malloc fails for SendCounts[]"); /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ RecvCounts = SendCounts + procs; if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) ABORT("Malloc fails for ptr_ind_tosend[]"); ptr_ind_torecv = ptr_ind_tosend + procs + 1; if ( !(extern_start = intMalloc_dist(m_loc)) ) ABORT("Malloc fails for extern_start[]"); for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; /* ------------------------------------------------------------ COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE LOCAL PART OF X. THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. ------------------------------------------------------------*/ if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ ABORT("Malloc fails for spa[]"); for (p = 0; p < procs; ++p) SendCounts[p] = 0; for (i = 0; i < m_loc; ++i) { /* Loop through each row */ k = extern_start[i]; for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ jcol = colind[j]; p = row_to_proc[jcol]; if ( p != iam ) { /* External */ if ( spa[jcol] == 0 ) { /* First time see this index */ ++SendCounts[p]; spa[jcol] = 1; } } else { /* Swap to beginning the part of A corresponding to the local part of X */ l = colind[k]; t = nzval[k]; colind[k] = jcol; nzval[k] = nzval[j]; colind[j] = l; nzval[j] = t; ++k; } } extern_start[i] = k; } /* ------------------------------------------------------------ LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. ------------------------------------------------------------*/ /* Build pointers to ind_tosend[]. */ ptr_ind_tosend[0] = 0; for (p = 0, TotalIndSend = 0; p < procs; ++p) { TotalIndSend += SendCounts[p]; /* Total to send. */ ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; } #if 0 ptr_ind_tosend[iam] = 0; /* Local part of X */ #endif if ( TotalIndSend ) { if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ } /* Build SPA to aid global to local translation. */ for (i = 0; i < n; ++i) spa[i] = EMPTY; for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ for (j = rowptr[i]; j < rowptr[i+1]; ++j) { jcol = colind[j]; if ( spa[jcol] == EMPTY ) { /* First time see this index */ p = row_to_proc[jcol]; if ( p == iam ) { /* Local */ /*assert(jcol>=fst_row);*/ spa[jcol] = jcol - fst_row; /* Relative position in local X */ } else { /* External */ ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ ++ptr_ind_tosend[p]; } } } } /* ------------------------------------------------------------ TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = rowptr[i]; j < rowptr[i+1]; ++j) { jcol = colind[j]; colind[j] = spa[jcol]; } } /* ------------------------------------------------------------ COMMUNICATE THE EXTERNAL INDICES OF X. ------------------------------------------------------------*/ MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, grid->comm); /* Build pointers to ind_torecv[]. */ ptr_ind_torecv[0] = 0; for (p = 0, TotalValSend = 0; p < procs; ++p) { TotalValSend += RecvCounts[p]; /* Total to receive. */ ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; } if ( TotalValSend ) { if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) ABORT("Malloc fails for ind_torecv[]"); } if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) ABORT("Malloc fails for recv_req[]."); recv_req = send_req + procs; for (p = 0; p < procs; ++p) { ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ if ( SendCounts[p] ) { MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], mpi_int_t, p, iam, grid->comm, &send_req[p]); } if ( RecvCounts[p] ) { MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], mpi_int_t, p, p, grid->comm, &recv_req[p]); } } for (p = 0; p < procs; ++p) { if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status); } /* Allocate storage for the X values to to transferred. */ if ( TotalIndSend && !(val_torecv = doubleMalloc_dist(TotalIndSend)) ) ABORT("Malloc fails for val_torecv[]."); if ( TotalValSend && !(val_tosend = doubleMalloc_dist(TotalValSend)) ) ABORT("Malloc fails for val_tosend[]."); gsmv_comm->extern_start = extern_start; gsmv_comm->ind_tosend = ind_tosend; gsmv_comm->ind_torecv = ind_torecv; gsmv_comm->ptr_ind_tosend = ptr_ind_tosend; gsmv_comm->ptr_ind_torecv = ptr_ind_torecv; gsmv_comm->SendCounts = SendCounts; gsmv_comm->RecvCounts = RecvCounts; gsmv_comm->val_tosend = val_tosend; gsmv_comm->val_torecv = val_torecv; gsmv_comm->TotalIndSend = TotalIndSend; gsmv_comm->TotalValSend = TotalValSend; SUPERLU_FREE(spa); SUPERLU_FREE(send_req); #if ( DEBUGlevel>=2 ) PrintInt10("pdgsmv_init::rowptr", m_loc+1, rowptr); PrintInt10("pdgsmv_init::extern_start", m_loc, extern_start); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgsmv_init()"); #endif } /* PDGSMV_INIT */
int_t dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, gridinfo_t *grid, int_t *colptr[], int_t *rowind[], double *a[]) { /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * Purpose * ======= * Re-distribute A on the 2D process mesh. * * Arguments * ========= * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * grid (input) gridinfo_t* * The 2D process mesh. * * colptr (output) int* * * rowind (output) int* * * a (output) double* * * Return value * ============ * */ NRformat_loc *Astore; int_t *perm_r; /* row permutation vector */ int_t *perm_c; /* column permutation vector */ int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; int_t nnz_loc; /* number of local nonzeros */ int_t nnz_remote; /* number of remote nonzeros to be sent */ int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; int_t *ia, *ja, **ia_send, *index, *itemp; int_t *ptr_to_send; double *aij, **aij_send, *nzval, *dtemp; double *nzval_a; int iam, it, p, procs; MPI_Request *send_req; MPI_Status status; /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter dReDistribute_A()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; nnzToRecv = intCalloc_dist(2*procs); nnzToSend = nnzToRecv + procs; /* ------------------------------------------------------------ COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); ++nnzToSend[p]; } } /* All-to-all communication */ MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, grid->comm); maxnnzToRecv = 0; nnz_loc = SendCnt = RecvCnt = 0; for (p = 0; p < procs; ++p) { if ( p != iam ) { SendCnt += nnzToSend[p]; RecvCnt += nnzToRecv[p]; maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); } else { nnz_loc += nnzToRecv[p]; /*assert(nnzToSend[p] == nnzToRecv[p]);*/ } } k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ /* Allocate space for storing the triplets after redistribution. */ if ( !(ia = intMalloc_dist(2*k)) ) ABORT("Malloc fails for ia[]."); ja = ia + k; if ( !(aij = doubleMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) ABORT("Malloc fails for ia_send[]."); if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) ABORT("Malloc fails for aij_send[]."); if ( !(index = intMalloc_dist(2*SendCnt)) ) ABORT("Malloc fails for index[]."); if ( !(nzval = doubleMalloc_dist(SendCnt)) ) ABORT("Malloc fails for nzval[]."); if ( !(ptr_to_send = intCalloc_dist(procs)) ) ABORT("Malloc fails for ptr_to_send[]."); if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) ABORT("Malloc fails for itemp[]."); if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) ABORT("Malloc fails for dtemp[]."); for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } } /* if procs > 1 */ if ( !(*colptr = intCalloc_dist(n+1)) ) ABORT("Malloc fails for *colptr[]."); /* ------------------------------------------------------------ LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. THIS ACCOUNTS FOR THE SECOND PASS OF A. ------------------------------------------------------------*/ nnz_loc = 0; /* Reset the local nonzero count. */ nzval_a = Astore->nzval; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); if ( p != iam ) { /* remote */ k = ptr_to_send[p]; ia_send[p][k] = irow; ia_send[p][k + nnzToSend[p]] = jcol; aij_send[p][k] = nzval_a[j]; ++ptr_to_send[p]; } else { /* local */ ia[nnz_loc] = irow; ja[nnz_loc] = jcol; aij[nnz_loc] = nzval_a[j]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } /* ------------------------------------------------------------ PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. NOTE: Can possibly use MPI_Alltoallv. ------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); it = nnzToSend[p]; MPI_Isend( aij_send[p], it, MPI_DOUBLE, p, iam+procs, grid->comm, &send_req[procs+p] ); } } for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, grid->comm, &status ); for (i = 0; i < nnzToRecv[p]; ++i) { ia[nnz_loc] = itemp[i]; jcol = itemp[i + nnzToRecv[p]]; /*assert(jcol<n);*/ ja[nnz_loc] = jcol; aij[nnz_loc] = dtemp[i]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } for (p = 0; p < procs; ++p) { if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } } /* ------------------------------------------------------------ DEALLOCATE TEMPORARY STORAGE ------------------------------------------------------------*/ SUPERLU_FREE(nnzToRecv); if ( procs > 1 ) { SUPERLU_FREE(send_req); SUPERLU_FREE(ia_send); SUPERLU_FREE(aij_send); SUPERLU_FREE(index); SUPERLU_FREE(nzval); SUPERLU_FREE(ptr_to_send); SUPERLU_FREE(itemp); SUPERLU_FREE(dtemp); } /* ------------------------------------------------------------ CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. ------------------------------------------------------------*/ if ( !(*rowind = intMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *rowind[]."); if ( !(*a = doubleMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *a[]."); /* Initialize the array of column pointers */ k = 0; jsize = (*colptr)[0]; (*colptr)[0] = 0; for (j = 1; j < n; ++j) { k += jsize; jsize = (*colptr)[j]; (*colptr)[j] = k; } /* Copy the triplets into the column oriented storage */ for (i = 0; i < nnz_loc; ++i) { j = ja[i]; k = (*colptr)[j]; (*rowind)[k] = ia[i]; (*a)[k] = aij[i]; ++(*colptr)[j]; } /* Reset the column pointers to the beginning of each column */ for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; (*colptr)[0] = 0; SUPERLU_FREE(ia); SUPERLU_FREE(aij); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit dReDistribute_A()"); #endif } /* dReDistribute_A */
int_t pddistribute(fact_t fact, int_t n, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * * Purpose * ======= * Distribute the matrix onto the 2D process mesh. * * Arguments * ========= * * fact (input) fact_t * Specifies whether or not the L and U structures will be re-used. * = SamePattern_SameRowPerm: L and U structures are input, and * unchanged on exit. * = DOFACT or SamePattern: L and U structures are computed and output. * * n (input) int * Dimension of the matrix. * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * LUstruct (input) LUstruct_t* * Data structures for L and U factors. * * grid (input) gridinfo_t* * The 2D process mesh. * * Return value * ============ * > 0, working storage required (in bytes). * */ { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; #if 0 NCPformat *Astore; #else /* XSL ==> */ NRformat_loc *Astore; #endif double *a; int_t *asub, *xa; #if 0 int_t *xa_begin, *xa_end; #endif int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int_t *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t mem_use = 0, iword, dword; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = (NRformat_loc *) A->Store; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pddistribute()"); #endif dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, grid, &xa, &asub, &a); if ( fact == SamePattern_SameRowPerm ) { #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); for (lb = 0; lb < nrbu; ++lb) Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Unzval[]. */ for (lb = 0; lb < nrbu; ++lb) { index = Ufstnz_br_ptr[lb]; if ( index && index[Urb_indptr[lb]] == jb ) { uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb] + UB_DESCRIPTOR; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj = 0, dense_col = dense; jj < nsupc; ++jj) { j = index[len+jj]; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow+i]; dense_col[irow+i] = zero; } dense_col += ldaspa; } Urb_indptr[lb] += UB_DESCRIPTOR + nsupc; } /* if index != NULL */ } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* ------------------------------------------------------------ FIRST TIME CREATING THE L AND U DATA STRUCTURES. ------------------------------------------------------------*/ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We first need to set up the L and U data structures and then * propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = intCalloc_dist(nsupers)) ) ABORT("Calloc fails for ToRecv[]."); k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index = intMalloc_dist(j)) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use = k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = intCalloc_dist(k)) ) ABORT("Malloc fails for ToSendD[]."); if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ } /* for lb ... */ SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Ucbs); #if ( PRNTlevel>=1 ) mem_use -= 2*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); /* ------------------------------------------------ */ #if ( PRNTlevel>=1 ) mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; /* -------------------------------------------------------------- */ #if ( PRNTlevel>=1 ) mem_use += 4*k*sizeof(int_t*) + 2*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow ) bsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; len = Urb_indptr[lb]; for (k = 0; k < nsupc; ++k) index[len+k] = FstBlockC( gb+1 ); if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; } /* if myrow == pr ... */ } /* for i ... */ } /* for j ... */ /* Figure out how many nonzeros in each block, and gather the initial values of A from SPA into Uval. */ for (lb = 0; lb < nrbu; ++lb) { if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb]; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) { j = index[len+jj]; /* First nonzero in segment. */ bnnz += k - j; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow + i]; dense_col[irow + i] = zero; } dense_col += ldaspa; } index[len-1] = bnnz; /* Set block length in Descriptor */ Urb_indptr[lb] += nsupc; } } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow ) fsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(len*nsupc))) { fprintf(stderr, "col block %d ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ SUPERLU_FREE(xa); SUPERLU_FREE(asub); SUPERLU_FREE(a); #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit pddistribute()"); #endif return (mem_use); } /* PDDISTRIBUTE */
int zcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz, doublecomplex *nzval_g, int_t *rowind_g, int_t *colptr_g, gridinfo_t *grid) { SuperMatrix GA; /* global A */ int_t *rowind, *colptr; /* global */ doublecomplex *nzval; /* global */ doublecomplex *nzval_loc; /* local */ int_t *colind, *rowptr; /* local */ int_t m_loc, fst_row, nnz_loc; int_t m_loc_fst; /* Record m_loc of the first p-1 processors, when mod(m, p) is not zero. */ int_t iam, row, col, i, j, relpos; char trans[1]; int_t *marker; iam = grid->iam; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter zcreate_dist_matrix()"); #endif if ( !iam ) { /* Allocate storage for compressed column representation. */ zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); /* Copy the global matrix. */ #if 0 /* and ADJUST to 0-based indexing which is required by the C routines.*/ #endif for(i=0; i<nnz; i++){ nzval[i]=nzval_g[i]; rowind[i]=rowind_g[i]; /* - 1;*/ } for(i=0; i<n+1; i++) colptr[i]=colptr_g[i]; /* - 1;*/ /* Broadcast matrix A to the other PEs. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); } else { /* Receive matrix A from PE 0. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); /* Allocate storage for compressed column representation. */ zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); } #if 0 nzval[0]=0.1; #endif /* Compute the number of rows to be distributed to local process */ m_loc = m / (grid->nprow * grid->npcol); m_loc_fst = m_loc; /* When m / procs is not an integer */ if ((m_loc * grid->nprow * grid->npcol) != m) { m_loc = m_loc+1; m_loc_fst = m_loc; if (iam == (grid->nprow * grid->npcol - 1)) m_loc = m - m_loc_fst * (grid->nprow * grid->npcol - 1); } /* Create compressed column matrix for GA. */ zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, SLU_NC, SLU_Z, SLU_GE); /************************************************* * Change GA to a local A with NR_loc format * *************************************************/ rowptr = (int_t *) intMalloc_dist(m_loc+1); marker = (int_t *) intCalloc_dist(n); /* Get counts of each row of GA */ for (i = 0; i < n; ++i) for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; /* Set up row pointers */ rowptr[0] = 0; fst_row = iam * m_loc_fst; nnz_loc = 0; for (j = 0; j < m_loc; ++j) { row = fst_row + j; rowptr[j+1] = rowptr[j] + marker[row]; marker[j] = rowptr[j]; } nnz_loc = rowptr[m_loc]; nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); colind = (int_t *) intMalloc_dist(nnz_loc); /* Transfer the matrix into the compressed row storage */ for (i = 0; i < n; ++i) { for (j = colptr[i]; j < colptr[i+1]; ++j) { row = rowind[j]; if ( (row>=fst_row) && (row<fst_row+m_loc) ) { row = row - fst_row; relpos = marker[row]; colind[relpos] = i; nzval_loc[relpos] = nzval[j]; ++marker[row]; } } } #if ( DEBUGlevel>=1 ) if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); #endif /* Destroy GA */ Destroy_CompCol_Matrix_dist(&GA); /******************************************************/ /* Change GA to a local A with NR_loc format */ /******************************************************/ /* Set up the local A in NR_loc format */ zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, nzval_loc, colind, rowptr, SLU_NR_loc, SLU_Z, SLU_GE); SUPERLU_FREE(marker); #if ( DEBUGlevel>=1 ) printf("sizeof(NRforamt_loc) %d\n", sizeof(NRformat_loc)); CHECK_MALLOC(iam, "Exit dcreate_dist_matrix()"); #endif return 0; }