/* * Allocate storage for original matrix A */ void dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa) { *a = (double *) doubleMalloc_dist(nnz); *asub = (int_t *) intMalloc_dist(nnz); *xa = (int_t *) intMalloc_dist(n+1); }
/*! \brief Clone: Allocate memory for a new matrix B, which is of the same type * and shape as A. * The clone operation would copy all the non-pointer structure members like * nrow, ncol, Stype, Dtype, Mtype from A and allocate a new nested Store * structure. It would also copy nnz_loc, m_loc, fst_row from A->Store * into B->Store. It does not copy the matrix entries, row pointers, * or column indices. */ void dClone_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B) { NRformat_loc *Astore, *Bstore; B->Stype = A->Stype; B->Dtype = A->Dtype; B->Mtype = A->Mtype; B->nrow = A->nrow;; B->ncol = A->ncol; Astore = (NRformat_loc *) A->Store; B->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) ); if ( !(B->Store) ) ABORT("SUPERLU_MALLOC fails for B->Store"); Bstore = (NRformat_loc *) B->Store; Bstore->nnz_loc = Astore->nnz_loc; Bstore->m_loc = Astore->m_loc; Bstore->fst_row = Astore->fst_row; if ( !(Bstore->nzval = (double *) doubleMalloc_dist(Bstore->nnz_loc)) ) ABORT("doubleMalloc_dist fails for Bstore->nzval"); if ( !(Bstore->colind = (int_t *) intMalloc_dist(Bstore->nnz_loc)) ) ABORT("intMalloc_dist fails for Bstore->colind"); if ( !(Bstore->rowptr = (int_t *) intMalloc_dist(Bstore->m_loc + 1)) ) ABORT("intMalloc_dist fails for Bstore->rowptr"); return; }
/*! \brief Allocate storage for original matrix A */ void zallocateA_dist(int_t n, int_t nnz, doublecomplex **a, int_t **asub, int_t **xa) { *a = (doublecomplex *) doublecomplexMalloc_dist(nnz); *asub = (int_t *) intMalloc_dist(nnz); *xa = (int_t *) intMalloc_dist(n+1); }
/*! \brief Allocate storage in ScalePermstruct */ void ScalePermstructInit(const int_t m, const int_t n, ScalePermstruct_t *ScalePermstruct) { ScalePermstruct->DiagScale = NOEQUIL; if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) ) ABORT("Malloc fails for perm_r[]."); if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) ) ABORT("Malloc fails for perm_c[]."); }
void get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) { int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk; int_t *xsup; i = j = *num_diag_procs = pkk = 0; nprow = grid->nprow; npcol = grid->npcol; nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; do { ++(*num_diag_procs); i = (++i) % nprow; j = (++j) % npcol; pkk = PNUM( i, j, grid ); } while ( pkk != 0 ); /* Until wrap back to process 0 */ if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) ) ABORT("Malloc fails for diag_procs[]"); if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) ) ABORT("Calloc fails for diag_len[]"); for (i = j = k = 0; k < *num_diag_procs; ++k) { pkk = PNUM( i, j, grid ); (*diag_procs)[k] = pkk; i = (++i) % nprow; j = (++j) % npcol; } for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); i = k % *num_diag_procs; (*diag_len)[i] += knsupc; } }
int zread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, doublecomplex **nzval, int_t **rowind, int_t **colptr) { size_t isize = sizeof(int_t), dsize = sizeof(double); int nnz_read; fread(n, isize, 1, fp); fread(nnz, isize, 1, fp); printf("fread n " IFMT "\tnnz " IFMT "\n", *n, *nnz); *m = *n; *colptr = intMalloc_dist(*n+1); *rowind = intMalloc_dist(*nnz); *nzval = doublecomplexMalloc_dist(*nnz); fread(*colptr, isize, (size_t) (*n + 1), fp); fread(*rowind, isize, (size_t) *nnz, fp); nnz_read = fread(*nzval, dsize, (size_t) (2 * (*nnz)), fp); printf("# of doubles fread: %d\n", nnz_read); fclose(fp); }
/*! \brief Allocate storage in LUstruct */ void LUstructInit(const int_t m, const int_t n, LUstruct_t *LUstruct) { if ( !(LUstruct->etree = intMalloc_dist(n)) ) ABORT("Malloc fails for etree[]."); if ( !(LUstruct->Glu_persist = (Glu_persist_t *) SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) ABORT("Malloc fails for Glu_persist_t."); if ( !(LUstruct->Llu = (LocalLU_t *) SUPERLU_MALLOC(sizeof(LocalLU_t))) ) ABORT("Malloc fails for LocalLU_t."); }
void get_metis( int_t n, /* dimension of matrix B */ int_t bnz, /* number of nonzeros in matrix A. */ int_t *b_colptr, /* column pointer of size n+1 for matrix B. */ int_t *b_rowind, /* row indices of size bnz for matrix B. */ int_t *perm_c /* out - the column permutation vector. */ ) { #define METISOPTIONS 8 int ct, i, j, nm, numflag = 0; /* C-Style ordering */ int metis_options[METISOPTIONS]; int_t *perm, *iperm; metis_options[0] = 0; /* Use Defaults for now */ perm = intMalloc_dist(n); iperm = intMalloc_dist(n); nm = n; #ifdef USE_METIS /* Call metis */ #undef USEEND #ifdef USEEND METIS_EdgeND(&nm, b_colptr, b_rowind, &numflag, metis_options, perm, iperm); #else METIS_NodeND(&nm, b_colptr, b_rowind, &numflag, metis_options, perm, iperm); #endif #endif /* Copy the permutation vector into SuperLU data structure. */ for (i = 0; i < n; ++i) perm_c[i] = iperm[i]; SUPERLU_FREE(b_colptr); SUPERLU_FREE(b_rowind); SUPERLU_FREE(perm); SUPERLU_FREE(iperm); }
/* * Convert a row compressed storage into a column compressed storage. */ void dCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, double *a, int_t *colind, int_t *rowptr, double **at, int_t **rowind, int_t **colptr) { register int i, j, col, relpos; int_t *marker; /* Allocate storage for another copy of the matrix. */ *at = (double *) doubleMalloc_dist(nnz); *rowind = intMalloc_dist(nnz); *colptr = intMalloc_dist(n+1); marker = intCalloc_dist(n); /* Get counts of each column of A, and set up column pointers */ for (i = 0; i < m; ++i) for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; (*colptr)[0] = 0; for (j = 0; j < n; ++j) { (*colptr)[j+1] = (*colptr)[j] + marker[j]; marker[j] = (*colptr)[j]; } /* Transfer the matrix into the compressed column storage. */ for (i = 0; i < m; ++i) { for (j = rowptr[i]; j < rowptr[i+1]; ++j) { col = colind[j]; relpos = marker[col]; (*rowind)[relpos] = i; (*at)[relpos] = a[j]; ++marker[col]; } } SUPERLU_FREE(marker); }
float ddistribute(fact_t fact, int_t n, SuperMatrix *A, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; NCPformat *Astore; double *a; int_t *asub; int_t *xa_begin, *xa_end; int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ int *index1; /* temporary pointer to array of int */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t nfsendx = 0; /* Number of Xk I will send */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t nbsendx = 0; /* Number of Xk I will send */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t iword, dword; float mem_use = 0.0; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter ddistribute()"); #endif if ( fact == SamePattern_SameRowPerm ) { /* --------------------------------------------------------------- * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. * --------------------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Initialize Uval to zero. */ for (lb = 0; lb < nrbu; ++lb) { Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ index = Ufstnz_br_ptr[lb]; if ( index ) { uval = Unzval_br_ptr[lb]; len = index[1]; for (i = 0; i < len; ++i) uval[i] = zero; } /* if index != NULL */ } /* for lb ... */ for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA (for L), or into U directly. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); if ( gb < jb ) { /* in U */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; while ( (k = index[Urb_indptr[lb]]) < jb ) { /* Skip nonzero values in this block */ Urb_length[lb] += index[Urb_indptr[lb]+1]; /* Move pointer to the next block */ Urb_indptr[lb] += UB_DESCRIPTOR + SuperSize( k ); } /*assert(k == jb);*/ /* start fstnz */ istart = Urb_indptr[lb] + UB_DESCRIPTOR; len = Urb_length[lb]; fsupc1 = FstBlockC( gb+1 ); k = j - fsupc; /* Sum the lengths of the leading columns */ for (jj = 0; jj < k; ++jj) len += fsupc1 - index[istart++]; /*assert(irow>=index[istart]);*/ uval[len + irow - index[istart]] = a[i]; } else { /* in L; put in SPA first */ irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* -------------------------------------------------- * FIRST TIME CREATING THE L AND U DATA STRUCTURE. * -------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* No L and U data structures are available yet. We need to set up the L and U data structures and propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) ABORT("Malloc fails for ToRecv[]."); for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index1[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) ABORT("Malloc fails for ToSendD[]."); for (i = 0; i < k; ++i) ToSendD[i] = NO; if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Urb_fstnz[lb] = BR_HEADER; } /* for lb ... */ SUPERLU_FREE(Ucbs); #if ( PROFlevel>=1 ) t = SuperLU_timer_() - t; if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); #endif #if ( PRNTlevel>=1 ) mem_use -= 2.0*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) *sp_ienv_dist(3))))) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); #if ( PRNTlevel>=1 ) mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; #if ( PRNTlevel>=1 ) mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; dense_col = dense; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; /* NOTE: Only the first nonzero index of the segment is stored in usub[]. */ for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ bsendx_plist[ljb][pr] == EMPTY ) { bsendx_plist[ljb][pr] = YES; ++nbsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; fsupc1 = FstBlockC( gb+1 ); if (rb_marker[lb] <= jb) { /* First time see the block */ rb_marker[lb] = jb + 1; Urb_indptr[lb] = Urb_fstnz[lb];; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; /* Record the first location in index[] of the next block */ Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; len = Urb_indptr[lb];/* Start fstnz in index */ index[len-1] = 0; for (k = 0; k < nsupc; ++k) index[len+k] = fsupc1; if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { /* Already saw the block */ len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; /* Load the numerical values */ k = fsupc1 - irow; /* No. of nonzeros in segment */ index[len-1] += k; /* Increment block length in Descriptor */ irow = ilsum[lb] + irow - FstBlockC( gb ); for (ii = 0; ii < k; ++ii) { uval[Urb_length[lb]++] = dense_col[irow + ii]; dense_col[irow + ii] = zero; } } /* if myrow == pr ... */ } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { fsendx_plist[ljb][pr] = YES; ++nfsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) { fprintf(stderr, "col block " IFMT " ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = 0.0; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->nfsendx = nfsendx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->nbsendx = nbsendx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ if ( !(Llu->mod_bit = intMalloc_dist(k)) ) ABORT("Malloc fails for mod_bit[]."); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time:\n " "\tL\t%.2f\n\tU\t%.2f\n" "\tu_blks %d\tnrbu %d\n--------\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit ddistribute()"); #endif return (mem_use); } /* DDISTRIBUTE */
int dldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], double nzval[], int_t *perm, double u[], double v[]) { int_t i, liw, ldw, num; int_t *iw, icntl[10], info[10]; double *dw; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(0, "Enter dldperm_dist()"); #endif liw = 5*n; if ( job == 3 ) liw = 10*n + nnz; if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]"); ldw = 3*n + nnz; if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]"); /* Increment one to get 1-based indexing. */ for (i = 0; i <= n; ++i) ++colptr[i]; for (i = 0; i < nnz; ++i) ++adjncy[i]; #if ( DEBUGlevel>=2 ) printf("LDPERM(): n %d, nnz %d\n", n, nnz); PrintInt10("colptr", n+1, colptr); PrintInt10("adjncy", nnz, adjncy); #endif /* * NOTE: * ===== * * MC64AD assumes that column permutation vector is defined as: * perm(i) = j means column i of permuted A is in column j of original A. * * Since a symmetric permutation preserves the diagonal entries. Then * by the following relation: * P'(A*P')P = P'A * we can apply inverse(perm) to rows of A to get large diagonal entries. * But, since 'perm' defined in MC64AD happens to be the reverse of * SuperLU's definition of permutation vector, therefore, it is already * an inverse for our purpose. We will thus use it directly. * */ mc64id_dist(icntl); /* Suppress error and warning messages. */ icntl[0] = -1; icntl[1] = -1; mc64ad_dist(&job, &n, &nnz, colptr, adjncy, nzval, &num, perm, &liw, iw, &ldw, dw, icntl, info); #if ( DEBUGlevel>=2 ) PrintInt10("perm", n, perm); printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); #endif if ( info[0] == 1 ) { /* Structurally singular */ printf(".. The last " IFMT " permutations:\n", n-num); PrintInt10("perm", n-num, &perm[num]); } /* Restore to 0-based indexing. */ for (i = 0; i <= n; ++i) --colptr[i]; for (i = 0; i < nnz; ++i) --adjncy[i]; for (i = 0; i < n; ++i) --perm[i]; if ( job == 5 ) for (i = 0; i < n; ++i) { u[i] = dw[i]; v[i] = dw[n+i]; } SUPERLU_FREE(iw); SUPERLU_FREE(dw); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(0, "Exit dldperm_dist()"); #endif return (info[0]); }
void pdgsmv_init ( SuperMatrix *A, /* Matrix A permuted by columns (input/output). The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. */ int_t *row_to_proc, /* Input. Mapping between rows and processes. */ gridinfo_t *grid, /* Input */ pdgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ ) { NRformat_loc *Astore; int iam, p, procs; int *SendCounts, *RecvCounts; int_t i, j, k, l, m, m_loc, n, fst_row, jcol; int_t TotalIndSend, TotalValSend; int_t *colind, *rowptr; int_t *ind_tosend = NULL, *ind_torecv = NULL; int_t *ptr_ind_tosend, *ptr_ind_torecv; int_t *extern_start, *spa, *itemp; double *nzval, *val_tosend = NULL, *val_torecv = NULL, t; MPI_Request *send_req, *recv_req; MPI_Status status; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pdgsmv_init()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; m = A->nrow; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; colind = Astore->colind; rowptr = Astore->rowptr; nzval = Astore->nzval; if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) ABORT("Malloc fails for SendCounts[]"); /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ RecvCounts = SendCounts + procs; if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) ABORT("Malloc fails for ptr_ind_tosend[]"); ptr_ind_torecv = ptr_ind_tosend + procs + 1; if ( !(extern_start = intMalloc_dist(m_loc)) ) ABORT("Malloc fails for extern_start[]"); for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; /* ------------------------------------------------------------ COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE LOCAL PART OF X. THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. ------------------------------------------------------------*/ if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ ABORT("Malloc fails for spa[]"); for (p = 0; p < procs; ++p) SendCounts[p] = 0; for (i = 0; i < m_loc; ++i) { /* Loop through each row */ k = extern_start[i]; for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ jcol = colind[j]; p = row_to_proc[jcol]; if ( p != iam ) { /* External */ if ( spa[jcol] == 0 ) { /* First time see this index */ ++SendCounts[p]; spa[jcol] = 1; } } else { /* Swap to beginning the part of A corresponding to the local part of X */ l = colind[k]; t = nzval[k]; colind[k] = jcol; nzval[k] = nzval[j]; colind[j] = l; nzval[j] = t; ++k; } } extern_start[i] = k; } /* ------------------------------------------------------------ LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. ------------------------------------------------------------*/ /* Build pointers to ind_tosend[]. */ ptr_ind_tosend[0] = 0; for (p = 0, TotalIndSend = 0; p < procs; ++p) { TotalIndSend += SendCounts[p]; /* Total to send. */ ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; } #if 0 ptr_ind_tosend[iam] = 0; /* Local part of X */ #endif if ( TotalIndSend ) { if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ } /* Build SPA to aid global to local translation. */ for (i = 0; i < n; ++i) spa[i] = EMPTY; for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ for (j = rowptr[i]; j < rowptr[i+1]; ++j) { jcol = colind[j]; if ( spa[jcol] == EMPTY ) { /* First time see this index */ p = row_to_proc[jcol]; if ( p == iam ) { /* Local */ /*assert(jcol>=fst_row);*/ spa[jcol] = jcol - fst_row; /* Relative position in local X */ } else { /* External */ ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ ++ptr_ind_tosend[p]; } } } } /* ------------------------------------------------------------ TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = rowptr[i]; j < rowptr[i+1]; ++j) { jcol = colind[j]; colind[j] = spa[jcol]; } } /* ------------------------------------------------------------ COMMUNICATE THE EXTERNAL INDICES OF X. ------------------------------------------------------------*/ MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, grid->comm); /* Build pointers to ind_torecv[]. */ ptr_ind_torecv[0] = 0; for (p = 0, TotalValSend = 0; p < procs; ++p) { TotalValSend += RecvCounts[p]; /* Total to receive. */ ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; } if ( TotalValSend ) { if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) ABORT("Malloc fails for ind_torecv[]"); } if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) ABORT("Malloc fails for recv_req[]."); recv_req = send_req + procs; for (p = 0; p < procs; ++p) { ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ if ( SendCounts[p] ) { MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], mpi_int_t, p, iam, grid->comm, &send_req[p]); } if ( RecvCounts[p] ) { MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], mpi_int_t, p, p, grid->comm, &recv_req[p]); } } for (p = 0; p < procs; ++p) { if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status); } /* Allocate storage for the X values to to transferred. */ if ( TotalIndSend && !(val_torecv = doubleMalloc_dist(TotalIndSend)) ) ABORT("Malloc fails for val_torecv[]."); if ( TotalValSend && !(val_tosend = doubleMalloc_dist(TotalValSend)) ) ABORT("Malloc fails for val_tosend[]."); gsmv_comm->extern_start = extern_start; gsmv_comm->ind_tosend = ind_tosend; gsmv_comm->ind_torecv = ind_torecv; gsmv_comm->ptr_ind_tosend = ptr_ind_tosend; gsmv_comm->ptr_ind_torecv = ptr_ind_torecv; gsmv_comm->SendCounts = SendCounts; gsmv_comm->RecvCounts = RecvCounts; gsmv_comm->val_tosend = val_tosend; gsmv_comm->val_torecv = val_torecv; gsmv_comm->TotalIndSend = TotalIndSend; gsmv_comm->TotalValSend = TotalValSend; SUPERLU_FREE(spa); SUPERLU_FREE(send_req); #if ( DEBUGlevel>=2 ) PrintInt10("pdgsmv_init::rowptr", m_loc+1, rowptr); PrintInt10("pdgsmv_init::extern_start", m_loc, extern_start); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgsmv_init()"); #endif } /* PDGSMV_INIT */
void pdgssvx(superlu_options_t *options, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, double B[], int ldb, int nrhs, gridinfo_t *grid, LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, SuperLUStat_t *stat, int *info) { /* * -- Distributed SuperLU routine (version 2.2) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * November 1, 2007 * Feburary 20, 2008 * * * Purpose * ======= * * PDGSSVX solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * * Static pivoting is a technique that combines the numerical stability * of partial pivoting with the scalability of Cholesky (no pivoting), * to run accurately and efficiently on large numbers of processors. * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed * description of the parallel algorithms. * * The input matrices A and B are distributed by block rows. * Here is a graphical illustration (0-based indexing): * * A B * 0 --------------- ------ * | | | | * | | P0 | | * | | | | * --------------- ------ * - fst_row->| | | | * | | | | | * m_loc | | P1 | | * | | | | | * - | | | | * --------------- ------ * | . | |. | * | . | |. | * | . | |. | * --------------- ------ * * where, fst_row is the row number of the first row, * m_loc is the number of rows local to this processor * These are defined in the 'SuperMatrix' structure, see supermatrix.h. * * * Here are the options for using this code: * * 1. Independent of all the other options specified below, the * user must supply * * - B, the matrix of right-hand sides, distributed by block rows, * and its dimensions ldb (local) and nrhs (global) * - grid, a structure describing the 2D processor mesh * - options->IterRefine, which determines whether or not to * improve the accuracy of the computed solution using * iterative refinement * * On output, B is overwritten with the solution X. * * 2. Depending on options->Fact, the user has four options * for solving A*X=B. The standard option is for factoring * A "from scratch". (The other options, described below, * are used when A is sufficiently similar to a previously * solved problem to save time by reusing part or all of * the previous factorization.) * * - options->Fact = DOFACT: A is factored "from scratch" * * In this case the user must also supply * * o A, the input matrix * * as well as the following options to determine what matrix to * factorize. * * o options->Equil, to specify how to scale the rows and columns * of A to "equilibrate" it (to try to reduce its * condition number and so improve the * accuracy of the computed solution) * * o options->RowPerm, to specify how to permute the rows of A * (typically to control numerical stability) * * o options->ColPerm, to specify how to permute the columns of A * (typically to control fill-in and enhance * parallelism during factorization) * * o options->ReplaceTinyPivot, to specify how to deal with tiny * pivots encountered during factorization * (to control numerical stability) * * The outputs returned include * * o ScalePermstruct, modified to describe how the input matrix A * was equilibrated and permuted: * . ScalePermstruct->DiagScale, indicates whether the rows and/or * columns of A were scaled * . ScalePermstruct->R, array of row scale factors * . ScalePermstruct->C, array of column scale factors * . ScalePermstruct->perm_r, row permutation vector * . ScalePermstruct->perm_c, column permutation vector * * (part of ScalePermstruct may also need to be supplied on input, * depending on options->RowPerm and options->ColPerm as described * later). * * o A, the input matrix A overwritten by the scaled and permuted * matrix diag(R)*A*diag(C)*Pc^T, where * Pc is the row permutation matrix determined by * ScalePermstruct->perm_c * diag(R) and diag(C) are diagonal scaling matrices determined * by ScalePermstruct->DiagScale, ScalePermstruct->R and * ScalePermstruct->C * * o LUstruct, which contains the L and U factorization of A1 where * * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored * in A on output.) * * 3. The second value of options->Fact assumes that a matrix with the same * sparsity pattern as A has already been factored: * * - options->Fact = SamePattern: A is factored, assuming that it has * the same nonzero pattern as a previously factored matrix. In * this case the algorithm saves time by reusing the previously * computed column permutation vector stored in * ScalePermstruct->perm_c and the "elimination tree" of A * stored in LUstruct->etree * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->RowPerm * o options->ReplaceTinyPivot * * but not options->ColPerm, whose value is ignored. This is because the * previous column permutation from ScalePermstruct->perm_c is used as * input. The user must also supply * * o A, the input matrix * o ScalePermstruct->perm_c, the column permutation * o LUstruct->etree, the elimination tree * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated and row permuted * o LUstruct, modified to contain the new L and U factors * * 4. The third value of options->Fact assumes that a matrix B with the same * sparsity pattern as A has already been factored, and where the * row permutation of B can be reused for A. This is useful when A and B * have similar numerical values, so that the same row permutation * will make both factorizations numerically stable. This lets us reuse * all of the previously computed structure of L and U. * * - options->Fact = SamePattern_SameRowPerm: A is factored, * assuming not only the same nonzero pattern as the previously * factored matrix B, but reusing B's row permutation. * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->ReplaceTinyPivot * * but not options->RowPerm or options->ColPerm, whose values are * ignored. This is because the permutations from ScalePermstruct->perm_r * and ScalePermstruct->perm_c are used as input. * * The user must also supply * * o A, the input matrix * o ScalePermstruct->DiagScale, how the previous matrix was row * and/or column scaled * o ScalePermstruct->R, the row scalings of the previous matrix, * if any * o ScalePermstruct->C, the columns scalings of the previous matrix, * if any * o ScalePermstruct->perm_r, the row permutation of the previous * matrix * o ScalePermstruct->perm_c, the column permutation of the previous * matrix * o all of LUstruct, the previously computed information about * L and U (the actual numerical values of L and U * stored in LUstruct->Llu are ignored) * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated (thus ScalePermstruct->DiagScale, * R and C may be modified) * o LUstruct, modified to contain the new L and U factors * * 5. The fourth and last value of options->Fact assumes that A is * identical to a matrix that has already been factored on a previous * call, and reuses its entire LU factorization * * - options->Fact = Factored: A is identical to a previously * factorized matrix, so the entire previous factorization * can be reused. * * In this case all the other options mentioned above are ignored * (options->Equil, options->RowPerm, options->ColPerm, * options->ReplaceTinyPivot) * * The user must also supply * * o A, the unfactored matrix, only in the case that iterative * refinment is to be done (specifically A must be the output * A from the previous call, so that it has been scaled and permuted) * o all of ScalePermstruct * o all of LUstruct, including the actual numerical values of * L and U * * all of which are unmodified on output. * * Arguments * ========= * * options (input) superlu_options_t* (global) * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following fields should be defined for this structure: * * o Fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, how the matrix A should * be factorized based on the previous history. * * = DOFACT: The matrix A will be factorized from scratch. * Inputs: A * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * all of ScalePermstruct * all of LUstruct * * = SamePattern: the matrix A will be factorized assuming * that a factorization of a matrix with the same sparsity * pattern was performed prior to this one. Therefore, this * factorization will reuse column permutation vector * ScalePermstruct->perm_c and the elimination tree * LUstruct->etree * Inputs: A * options->Equil, RowPerm, ReplaceTinyPivot * ScalePermstruct->perm_c * LUstruct->etree * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * rest of ScalePermstruct (DiagScale, R, C, perm_r) * rest of LUstruct (GLU_persist, Llu) * * = SamePattern_SameRowPerm: the matrix A will be factorized * assuming that a factorization of a matrix with the same * sparsity pattern and similar numerical values was performed * prior to this one. Therefore, this factorization will reuse * both row and column scaling factors R and C, and the * both row and column permutation vectors perm_r and perm_c, * distributed data structure set up from the previous symbolic * factorization. * Inputs: A * options->Equil, ReplaceTinyPivot * all of ScalePermstruct * all of LUstruct * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * modified LUstruct->Llu * = FACTORED: the matrix A is already factored. * Inputs: all of ScalePermstruct * all of LUstruct * * o Equil (yes_no_t) * Specifies whether to equilibrate the system. * = NO: no equilibration. * = YES: scaling factors are computed to equilibrate the system: * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. * Whether or not the system will be equilibrated depends * on the scaling of the matrix A, but if equilibration is * used, A is overwritten by diag(R)*A*diag(C) and B by * diag(R)*B. * * o RowPerm (rowperm_t) * Specifies how to permute rows of the matrix A. * = NATURAL: use the natural ordering. * = LargeDiag: use the Duff/Koster algorithm to permute rows of * the original matrix to make the diagonal large * relative to the off-diagonal. * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r * input by the user. * * o ColPerm (colperm_t) * Specifies what type of column permutation to use to reduce fill. * = NATURAL: natural ordering. * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. * = MMD_ATA: minimum degree ordering on structure of A'*A. * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. * * o ReplaceTinyPivot (yes_no_t) * = NO: do not modify pivots * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during * LU factorization. * * o IterRefine (IterRefine_t) * Specifies how to perform iterative refinement. * = NO: no iterative refinement. * = DOUBLE: accumulate residual in double precision. * = EXTRA: accumulate residual in extra precision. * * NOTE: all options must be indentical on all processes when * calling this routine. * * A (input/output) SuperMatrix* (local) * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. * That is, A is stored in distributed compressed row format. * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization * routine PDGSTRF can factorize rectangular matrices. * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, * depending on ScalePermstruct->DiagScale and options->ColPerm: * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by * diag(R)*A*diag(C). * if options->ColPerm != NATURAL, A is further overwritten by * diag(R)*A*diag(C)*Pc^T. * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * * ScalePermstruct (input/output) ScalePermstruct_t* (global) * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: * * o DiagScale (DiagScale_t) * Specifies the form of equilibration that was done. * = NOEQUIL: no equilibration. * = ROW: row equilibration, i.e., A was premultiplied by * diag(R). * = COL: Column equilibration, i.e., A was postmultiplied * by diag(C). * = BOTH: both row and column equilibration, i.e., A was * replaced by diag(R)*A*diag(C). * If options->Fact = FACTORED or SamePattern_SameRowPerm, * DiagScale is an input argument; otherwise it is an output * argument. * * o perm_r (int*) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->RowPerm = MY_PERMR, or * options->Fact = SamePattern_SameRowPerm, perm_r is an * input argument; otherwise it is an output argument. * * o perm_c (int*) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * If options->ColPerm = MY_PERMC or options->Fact = SamePattern * or options->Fact = SamePattern_SameRowPerm, perm_c is an * input argument; otherwise, it is an output argument. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * * o R (double*) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). * If DiagScale = NOEQUIL or COL, R is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * * o C (double*) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). * If DiagScale = NOEQUIL or ROW, C is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is * an input argument; otherwise, C is an output argument. * * B (input/output) double* (local) * On entry, the right-hand side matrix of dimension (m_loc, nrhs), * where, m_loc is the number of rows stored locally on my * process and is defined in the data structure of matrix A. * On exit, the solution matrix if info = 0; * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * The number of right-hand sides. * If nrhs = 0, only LU decomposition is performed, the forward * and back substitutions are skipped. * * grid (input) gridinfo_t* (global) * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * It contains the following fields: * * o etree (int*) dimension (A->ncol) (global) * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. * It is computed in sp_colorder() during the first factorization, * and is reused in the subsequent factorizations of the matrices * with the same nonzero pattern. * On exit of sp_colorder(), the columns of A are permuted so that * the etree is in a certain postorder. This postorder is reflected * in ScalePermstruct->perm_c. * NOTE: * Etree is a vector of parent pointers for a forest whose vertices * are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * o Glu_persist (Glu_persist_t*) (global) * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (LocalLU_t*) (local) * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * SOLVEstruct (input/output) SOLVEstruct_t* * The data structure to hold the communication pattern used * in the phases of triangular solution and iterative refinement. * This pattern should be intialized only once for repeated solutions. * If options->SolveInitialized = YES, it is an input argument. * If options->SolveInitialized = NO and nrhs != 0, it is an output * argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'. * * berr (output) double*, dimension (nrhs) (global) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * so the solution could not be computed. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * * See superlu_ddefs.h for the definitions of varioous data types. * */ NRformat_loc *Astore; SuperMatrix GA; /* Global A in NC format */ NCformat *GAstore; double *a_GA; SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ NCPformat *GACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of supernodes in L. (usub, xusub) contains the compressed subscript of nonzero segments in U. If options->Fact != SamePattern_SameRowPerm, they are computed by SYMBFACT routine, and then used by PDDISTRIBUTE routine. They will be freed after PDDISTRIBUTE routine. If options->Fact == SamePattern_SameRowPerm, these structures are not used. */ fact_t Fact; double *a; int_t *colptr, *rowind; int_t *perm_r; /* row permutations from partial pivoting */ int_t *perm_c; /* column permutation vector */ int_t *etree; /* elimination tree */ int_t *rowptr, *colind; /* Local A in NR*/ int_t *rowind_loc, *colptr_loc; int_t colequ, Equil, factored, job, notran, rowequ, need_value; int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; int_t nnz_loc, m_loc, fst_row, icol; int iam; int ldx; /* LDA for matrix X (local). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; double *X, *b_col, *b_work, *x_col; double t; static mem_usage_t num_mem_usage, symb_mem_usage; #if ( PRNTlevel>= 2 ) double dmin, dsum, dprod; #endif int_t procs; /* Structures needed for parallel symbolic factorization */ int_t *sizes, *fstVtxSep, parSymbFact; int noDomains, nprocs_num; MPI_Comm symb_comm; /* communicator for symbolic factorization */ int col, key; /* parameters for creating a new communicator */ Pslu_freeable_t Pslu_freeable; float flinfo; /* Initialization. */ m = A->nrow; n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = (double *) Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; sizes = NULL; fstVtxSep = NULL; symb_comm = MPI_COMM_NULL; /* Test the input parameters. */ *info = 0; Fact = options->Fact; if ( Fact < 0 || Fact > FACTORED ) *info = -1; else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) *info = -1; else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) *info = -1; else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) *info = -1; else if ( options->IterRefine == EXTRA ) { *info = -1; fprintf(stderr, "Extra precise iterative refinement yet to support."); } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < m_loc ) *info = -5; else if ( nrhs < 0 ) *info = -6; if ( *info ) { i = -(*info); pxerbla("pdgssvx", grid, -*info); return; } factored = (Fact == FACTORED); Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); iam = grid->iam; job = 5; if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { rowequ = (ScalePermstruct->DiagScale == ROW) || (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; /* The following arrays are replicated on all processes. */ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; /********/ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgssvx()"); #endif /* Not factored & ask for equilibration */ if ( Equil && Fact != SamePattern_SameRowPerm ) { /* Allocate storage if not done so before. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); #endif t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm ) { /* Reuse R and C. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: break; case ROW: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { a[i] *= R[irow]; /* Scale rows. */ } ++irow; } break; case COL: for (j = 0; j < m_loc; ++j) for (i = rowptr[j]; i < rowptr[j+1]; ++i){ icol = colind[i]; a[i] *= C[icol]; /* Scale columns. */ } break; case BOTH: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ } ++irow; } break; } } else { /* Compute R & C from scratch */ /* Compute the row and column scalings. */ pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); /* Equilibrate matrix A if it is badly-scaled. */ pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); if ( lsame_(equed, "R") ) { ScalePermstruct->DiagScale = rowequ = ROW; } else if ( lsame_(equed, "C") ) { ScalePermstruct->DiagScale = colequ = COL; } else if ( lsame_(equed, "B") ) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. equilibrated? *equed = %c\n", *equed); /*fflush(stdout);*/ } #endif } /* if Fact ... */ stat->utime[EQUIL] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit equil"); #endif } /* if Equil ... */ if ( !factored ) { /* Skip this if already factored. */ /* * Gather A from the distributed compressed row format to * global A in compressed column format. * Numerical values are gathered only when a row permutation * for large diagonal is sought after. */ if ( Fact != SamePattern_SameRowPerm ) { need_value = (options->RowPerm == LargeDiag); pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); GAstore = (NCformat *) GA.Store; colptr = GAstore->colptr; rowind = GAstore->rowind; nnz = GAstore->nnz; if ( need_value ) a_GA = (double *) GAstore->nzval; else assert(GAstore->nzval == NULL); } /* ------------------------------------------------------------ Find the row permutation for A. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { t = SuperLU_timer_(); if ( Fact != SamePattern_SameRowPerm ) { if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ /* Permute the global matrix GA for symbfact() */ for (i = 0; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } else { /* options->RowPerm == LargeDiag */ /* Get a new perm_r[] */ if ( job == 5 ) { /* Allocate storage for scaling factors. */ if ( !(R1 = doubleMalloc_dist(m)) ) ABORT("SUPERLU_MALLOC fails for R1[]"); if ( !(C1 = doubleMalloc_dist(n)) ) ABORT("SUPERLU_MALLOC fails for C1[]"); } if ( !iam ) { /* Process 0 finds a row permutation */ dldperm(job, m, nnz, colptr, rowind, a_GA, perm_r, R1, C1); MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } else { MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } #if ( PRNTlevel>=2 ) dmin = dlamch_("Overflow"); dsum = 0.0; dprod = 1.0; #endif if ( job == 5 ) { if ( Equil ) { for (i = 0; i < n; ++i) { R1[i] = exp(R1[i]); C1[i] = exp(C1[i]); } /* Scale the distributed matrix */ irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R1[irow] * C1[icol]; #if ( PRNTlevel>=2 ) if ( perm_r[irow] == icol ) { /* New diagonal */ if ( job == 2 || job == 3 ) dmin = SUPERLU_MIN(dmin, fabs(a[i])); else if ( job == 4 ) dsum += fabs(a[i]); else if ( job == 5 ) dprod *= fabs(a[i]); } #endif } ++irow; } /* Multiply together the scaling factors. */ if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; else for (i = 0; i < m; ++i) R[i] = R1[i]; if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; else for (i = 0; i < n; ++i) C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; } /* end Equil */ /* Now permute global A to prepare for symbfact() */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } SUPERLU_FREE (R1); SUPERLU_FREE (C1); } else { /* job = 2,3,4 */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } /* end for i ... */ } /* end for j ... */ } /* end else job ... */ #if ( PRNTlevel>=2 ) if ( job == 2 || job == 3 ) { if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); } else if ( job == 4 ) { if ( !iam ) printf("\tsum of diagonal %e\n", dsum); } else if ( job == 5 ) { if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif } /* end if options->RowPerm ... */ t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); #endif } /* end if Fact ... */ } else { /* options->RowPerm == NOROWPERM */ for (i = 0; i < m; ++i) perm_r[i] = i; } #if ( DEBUGlevel>=2 ) if ( !iam ) PrintInt10("perm_r", m, perm_r); #endif } /* end if (!factored) */ if ( !factored || options->IterRefine ) { /* Compute norm(A), which will be used to adjust small diagonal. */ if ( notran ) *(unsigned char *)norm = '1'; else *(unsigned char *)norm = 'I'; anorm = pdlangs(norm, A, grid); #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. anorm %e\n", anorm); #endif } /* ------------------------------------------------------------ Perform the LU factorization. ------------------------------------------------------------*/ if ( !factored ) { t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A * permc_spec = PARMETIS: parallel METIS on structure of A'+A * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; parSymbFact = options->ParSymbFact; #if ( PRNTlevel>=1 ) if ( parSymbFact && permc_spec != PARMETIS ) if ( !iam ) printf(".. Parallel symbolic factorization" " only works wth ParMetis!\n"); #endif if ( parSymbFact == YES || permc_spec == PARMETIS ) { nprocs_num = grid->nprow * grid->npcol; noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num )))); /* create a new communicator for the first noDomains processors in grid->comm */ key = iam; if (iam < noDomains) col = 0; else col = MPI_UNDEFINED; MPI_Comm_split (grid->comm, col, key, &symb_comm ); permc_spec = PARMETIS; /* only works with PARMETIS */ } if ( permc_spec != MY_PERMC && Fact == DOFACT ) { if ( permc_spec == PARMETIS ) { /* Get column permutation vector in perm_c. * * This routine takes as input the distributed input matrix A * * and does not modify it. It also allocates memory for * * sizes[] and fstVtxSep[] arrays, that contain information * * on the separator tree computed by ParMETIS. */ flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, noDomains, &sizes, &fstVtxSep, grid, &symb_comm); if (flinfo > 0) ABORT("ERROR in get perm_c parmetis."); } else { get_perm_c_dist(iam, permc_spec, &GA, perm_c); } } stat->utime[COLPERM] = SuperLU_timer_() - t; /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. Adjust perm_c[] to be consistent with a postorder of etree. Permute columns of A to form A*Pc'. */ if ( Fact != SamePattern_SameRowPerm ) { if ( parSymbFact == NO ) { int_t *GACcolbeg, *GACcolend, *GACrowind; sp_colorder(options, &GA, perm_c, etree, &GAC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ GACstore = (NCPformat *) GAC.Store; GACcolbeg = GACstore->colbeg; GACcolend = GACstore->colend; GACrowind = GACstore->rowind; for (j = 0; j < n; ++j) { for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { irow = GACrowind[i]; GACrowind[i] = perm_c[irow]; } } /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); if ( !(Glu_freeable = (Glu_freeable_t *) SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) ABORT("Malloc fails for Glu_freeable."); /* Every process does this. */ iinfo = symbfact(options, iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if ( iinfo < 0 ) { /* Successful return */ QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) if ( !iam ) { printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); printf("\tint %d, short %d, float %d, double %d\n", sizeof(int_t), sizeof(short), sizeof(float), sizeof(double)); printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); } #endif } else { if ( !iam ) { fprintf(stderr,"symbfact() error returns %d\n",iinfo); exit(-1); } } } /* end if serial symbolic factorization */ else { /* parallel symbolic factorization */ t = SuperLU_timer_(); flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, sizes, fstVtxSep, &Pslu_freeable, &(grid->comm), &symb_comm, &symb_mem_usage); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if (flinfo > 0) ABORT("Insufficient memory for parallel symbolic factorization."); } } /* end if Fact ... */ #if ( PRNTlevel>=1 ) if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]); #endif if (sizes) SUPERLU_FREE (sizes); if (fstVtxSep) SUPERLU_FREE (fstVtxSep); if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { /* Apply column permutation to the original distributed A */ for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ t = SuperLU_timer_(); dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ if ( Fact != SamePattern_SameRowPerm ) { iinfo = symbfact_SubFree(Glu_freeable); SUPERLU_FREE(Glu_freeable); } } else { /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ /* Apply column permutation to the original distributed A */ for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; t = SuperLU_timer_(); dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct, &Pslu_freeable, LUstruct, grid); if (dist_mem_use > 0) ABORT ("Not enough memory available for dist_psymbtonum\n"); stat->utime[DIST] = SuperLU_timer_() - t; } #if ( PRNTlevel>=1 ) if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); #endif /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; #if ( PRNTlevel>=1 ) { int_t TinyPivots; float for_lu, total, max, avg, temp; dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); MPI_Reduce( &num_mem_usage.for_lu, &for_lu, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); temp = SUPERLU_MAX(symb_mem_usage.total, symb_mem_usage.for_lu + (float)dist_mem_use + num_mem_usage.for_lu); if (parSymbFact == TRUE) /* The memory used in the redistribution routine includes the memory used for storing the symbolic structure and the memory allocated for numerical factorization */ temp = SUPERLU_MAX(symb_mem_usage.total, (float)dist_mem_use); temp = SUPERLU_MAX(temp, num_mem_usage.total); MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, grid->comm ); stat->TinyPivots = TinyPivots; if ( !iam ) { printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", for_lu*1e-6, total*1e-6); printf("\tAll space (MB):" "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); } } #endif /* Destroy GA */ if ( Fact != SamePattern_SameRowPerm ) Destroy_CompCol_Matrix_dist(&GA); } /* end if (!factored) */ /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ if ( nrhs ) { if ( !(b_work = doubleMalloc_dist(n)) ) ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------------ Scale the right-hand side if equilibration was performed. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } } else if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } /* Save a copy of the right-hand side. */ ldx = ldb; if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } /* ------------------------------------------------------------ Solve the linear system. ------------------------------------------------------------*/ if ( options->SolveInitialized == NO ) { dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, SOLVEstruct); } pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, fst_row, ldb, nrhs, SOLVEstruct, stat, info); /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ if ( options->IterRefine ) { /* Improve the solution by iterative refinement. */ int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ t = SuperLU_timer_(); if ( options->RefineInitialized == NO || Fact == DOFACT ) { /* All these cases need to re-initialize gsmv structure */ if ( options->RefineInitialized ) pdgsmv_finalize(SOLVEstruct->gsmv_comm); pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, SOLVEstruct->gsmv_comm); /* Save a copy of the transformed local col indices in colind_gsmv[]. */ if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); if ( !(it = intMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for colind_gsmv[]"); colind_gsmv = SOLVEstruct->A_colind_gsmv = it; for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; options->RefineInitialized = YES; } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { double at; int_t k, jcol, p; /* Swap to beginning the part of A corresponding to the local part of X, as was done in pdgsmv_init() */ for (i = 0; i < m_loc; ++i) { /* Loop through each row */ k = rowptr[i]; for (j = rowptr[i]; j < rowptr[i+1]; ++j) { jcol = colind[j]; p = SOLVEstruct->row_to_proc[jcol]; if ( p == iam ) { /* Local */ at = a[k]; a[k] = a[j]; a[j] = at; ++k; } } } /* Re-use the local col indices of A obtained from the previous call to pdgsmv_init() */ for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; } if ( nrhs == 1 ) { /* Use the existing solve structure */ SOLVEstruct1 = SOLVEstruct; } else { /* For nrhs > 1, since refinement is performed for RHS one at a time, the communication structure for pdgstrs is different than the solve with nrhs RHS. So we use SOLVEstruct1 for the refinement step. */ if ( !(SOLVEstruct1 = (SOLVEstruct_t *) SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) ) ABORT("Malloc fails for SOLVEstruct1"); /* Copy the same stuff */ SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; SOLVEstruct1->diag_len = SOLVEstruct->diag_len; SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; /* Initialize the *gstrs_comm for 1 RHS. */ if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, Glu_persist, SOLVEstruct1); } pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); /* Deallocate the storage associated with SOLVEstruct1 */ if ( nrhs > 1 ) { pxgstrs_finalize(SOLVEstruct1->gstrs_comm); SUPERLU_FREE(SOLVEstruct1); } stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix B <= Pc'*X. */ pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, SOLVEstruct->inv_perm_c, X, ldx, B, ldb, nrhs, grid); #if ( DEBUGlevel>=2 ) printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); for (i = 0; i < m_loc; ++i) printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); #endif /* Transform the solution matrix X to a solution of the original system before the equilibration. */ if ( notran ) { if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } } else if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } SUPERLU_FREE(b_work); SUPERLU_FREE(X); } /* end if nrhs != 0 */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate R and/or C if it was not used. */ if ( Equil && Fact != SamePattern_SameRowPerm ) { switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: SUPERLU_FREE(R); SUPERLU_FREE(C); break; case ROW: SUPERLU_FREE(C); break; case COL: SUPERLU_FREE(R); break; } } if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) Destroy_CompCol_Permuted_dist(&GAC); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgssvx()"); #endif }
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PZGSTRS solves a system of distributed linear equations * A*X = B with a general N-by-N matrix A using the LU factorization * computed by PZGSTRF. * If the equilibration, and row and column permutations were performed, * the LU factorization was performed for A1 where * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * and the linear system solved is * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and * the permutation to B1 by Pc*Pr is applied internally in this routine. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from PZGSTRF for * the possibly scaled and permuted matrix A. * See superlu_zdefs.h for the definition of 'LUstruct_t'. * A may be scaled and permuted into A1, so that * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_defs.h for the definition of 'gridinfo_t'. * * B (input/output) doublecomplex* * On entry, the distributed right-hand side matrix of the possibly * equilibrated system. That is, B may be overwritten by diag(R)*B. * On exit, the distributed solution matrix Y of the possibly * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, * and X is the solution of the original system. * * m_loc (input) int (local) * The local row dimension of matrix B. * * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * SOLVEstruct (output) SOLVEstruct_t* (global) * Contains the information for the communication during the * solution phase. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
int static_schedule(superlu_options_t * options, int m, int n, LUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, int_t *perm_c_supno, int_t *iperm_c_supno, int *info) { int_t *xsup; int_t i, ib, jb, lb, nlb, il, iu; int_t Pc, Pr; int iam, krow, yourcol, mycol, myrow; int j, k, nsupers; /* k - current panel to work on */ int_t *index; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int ncb, nrb, p, pr, pc, nblocks; int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, *Lblock, *Lrows, *sf_block, *sf_block_l, *nnodes_l, *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno; float edag_supno_l_bytes; int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; etree_node *head, *tail, *ptr; int *num_child; int iword = sizeof (int_t); /* Test the input parameters. */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; if (*info) { pxerbla ("pdgstrf", grid, -*info); return (-1); } /* Quick return if possible. */ if (m == 0 || n == 0) return 0; /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW (iam, grid); mycol = MYCOL (iam, grid); nsupers = Glu_persist->supno[n - 1] + 1; xsup = Glu_persist->xsup; nblocks = 0; ncb = nsupers / Pc; nrb = nsupers / Pr; #if ( DEBUGlevel >= 1 ) print_memorylog(stat, "before static schedule"); #endif /* ================================================== * * static scheduling of j-th step of LU-factorization * * ================================================== */ if (options->lookahead_etree == YES && /* use e-tree of symmetrized matrix and */ (options->ParSymbFact == NO || /* 1) symmetric fact with serial symbolic, or */ (options->SymPattern == YES && /* 2) symmetric pattern, and */ options->RowPerm == NOROWPERM))) { /* no rowperm to destroy symmetry */ /* if symmetric pattern or using e-tree of |A^T|+|A|, then we can use a simple tree structure for static schduling */ if (options->ParSymbFact == NO) { /* Use the etree computed from serial symb. fact., and turn it into supernodal tree. */ int_t *etree = LUstruct->etree; #if ( PRNTlevel>=1 ) if (grid->iam == 0) printf (" === using column e-tree ===\n"); #endif /* look for the first off-diagonal blocks */ etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); log_memory(nsupers * iword, stat); for (i = 0; i < nsupers; i++) etree_supno[i] = nsupers; for (j = 0, lb = 0; lb < nsupers; lb++) { for (k = 0; k < SuperSize (lb); k++) { jb = Glu_persist->supno[etree[j + k]]; if (jb != lb) etree_supno[lb] = SUPERLU_MIN (etree_supno[lb], jb); } j += SuperSize (lb); } } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */ /* Compute an "etree" based on struct(L), assuming struct(U) = struct(L'). */ #if ( PRNTlevel>=1 ) if (grid->iam == 0) printf (" === using supernodal e-tree ===\n"); #endif /* find the first block in each supernodal-column of local L-factor */ etree_supno_l = SUPERLU_MALLOC (nsupers * sizeof (int_t)); log_memory(nsupers * iword, stat); for (i = 0; i < nsupers; i++) etree_supno_l[i] = nsupers; for (lb = 0; lb < ncb; lb++) { jb = lb * grid->npcol + mycol; index = Llu->Lrowind_bc_ptr[lb]; if (index) { /* Not an empty column */ i = index[0]; k = BC_HEADER; krow = PROW (jb, grid); if (krow == myrow) { /* skip the diagonal block */ k += LB_DESCRIPTOR + index[k + 1]; i--; } if (i > 0) { etree_supno_l[jb] = index[k]; k += LB_DESCRIPTOR + index[k + 1]; i--; } for (j = 0; j < i; j++) { etree_supno_l[jb] = SUPERLU_MIN (etree_supno_l[jb], index[k]); k += LB_DESCRIPTOR + index[k + 1]; } } } if (mycol < nsupers % grid->npcol) { jb = ncb * grid->npcol + mycol; index = Llu->Lrowind_bc_ptr[ncb]; if (index) { /* Not an empty column */ i = index[0]; k = BC_HEADER; krow = PROW (jb, grid); if (krow == myrow) { /* skip the diagonal block */ k += LB_DESCRIPTOR + index[k + 1]; i--; } if (i > 0) { etree_supno_l[jb] = index[k]; k += LB_DESCRIPTOR + index[k + 1]; i--; } for (j = 0; j < i; j++) { etree_supno_l[jb] = SUPERLU_MIN (etree_supno_l[jb], index[k]); k += LB_DESCRIPTOR + index[k + 1]; } } } /* form global e-tree */ etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); MPI_Allreduce (etree_supno_l, etree_supno, nsupers, mpi_int_t, MPI_MIN, grid->comm); SUPERLU_FREE (etree_supno_l); } /* initialize number of children for each node */ num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); for (i = 0; i < nsupers; i++) num_child[i] = 0; for (i = 0; i < nsupers; i++) if (etree_supno[i] != nsupers) num_child[etree_supno[i]]++; /* push initial leaves to the fifo queue */ nnodes = 0; for (i = 0; i < nsupers; i++) { if (num_child[i] == 0) { ptr = SUPERLU_MALLOC (sizeof (etree_node)); ptr->id = i; ptr->next = NULL; /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ nnodes++; if (nnodes == 1) { head = ptr; tail = ptr; } else { tail->next = ptr; tail = ptr; } } } /* process fifo queue, and compute the ordering */ i = 0; while (nnodes > 0) { ptr = head; j = ptr->id; head = ptr->next; perm_c_supno[i] = j; SUPERLU_FREE (ptr); i++; nnodes--; if (etree_supno[j] != nsupers) { num_child[etree_supno[j]]--; if (num_child[etree_supno[j]] == 0) { nnodes++; ptr = SUPERLU_MALLOC (sizeof (etree_node)); ptr->id = etree_supno[j]; ptr->next = NULL; /*printf( "=== push %d ===\n",ptr->id ); */ if (nnodes == 1) { head = ptr; tail = ptr; } else { tail->next = ptr; tail = ptr; } } } /*printf( "\n" ); */ } SUPERLU_FREE (num_child); SUPERLU_FREE (etree_supno); log_memory(-2 * nsupers * iword, stat); } else { /* Unsymmetric pattern */ /* Need to process both L- and U-factors, use the symmetrically pruned graph of L & U instead of tree (very naive implementation) */ int nrbp1 = nrb + 1; float Ublock_bytes, Urows_bytes, Lblock_bytes, Lrows_bytes; /* allocate some workspace */ if (! (sendcnts = SUPERLU_MALLOC ((4 + 2 * nrbp1) * Pr * Pc * sizeof (int)))) ABORT ("Malloc fails for sendcnts[]."); log_memory((4 + 2 * nrbp1) * Pr * Pc * sizeof (int), stat); sdispls = &sendcnts[Pr * Pc]; recvcnts = &sdispls[Pr * Pc]; rdispls = &recvcnts[Pr * Pc]; srows = &rdispls[Pr * Pc]; rrows = &srows[Pr * Pc * nrbp1]; myrow = MYROW (iam, grid); #if ( PRNTlevel>=1 ) if (grid->iam == 0) printf (" === using DAG ===\n"); #endif /* send supno block of local U-factor to a processor * * who owns the corresponding block of L-factor */ /* srows : # of block to send to a processor from each supno row */ /* sendcnts: total # of blocks to send to a processor */ for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0; for (p = 0; p < Pr * Pc; p++) sendcnts[p] = 0; /* sending blocks of U-factors corresponding to L-factors */ /* count the number of blocks to send */ for (lb = 0; lb < nrb; ++lb) { jb = lb * Pr + myrow; pc = jb % Pc; index = Llu->Ufstnz_br_ptr[lb]; if (index) { /* Not an empty row */ k = BR_HEADER; nblocks += index[0]; for (j = 0; j < index[0]; ++j) { ib = index[k]; pr = ib % Pr; p = pr * Pc + pc; sendcnts[p]++; srows[p * nrbp1 + lb]++; k += UB_DESCRIPTOR + SuperSize (index[k]); } } } if (myrow < nsupers % grid->nprow) { jb = nrb * Pr + myrow; pc = jb % Pc; index = Llu->Ufstnz_br_ptr[nrb]; if (index) { /* Not an empty row */ k = BR_HEADER; nblocks += index[0]; for (j = 0; j < index[0]; ++j) { ib = index[k]; pr = ib % Pr; p = pr * Pc + pc; sendcnts[p]++; srows[p * nrbp1 + nrb]++; k += UB_DESCRIPTOR + SuperSize (index[k]); } } } /* insert blocks to send */ sdispls[0] = 0; for (p = 1; p < Pr * Pc; p++) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; if (!(blocks = intMalloc_dist (nblocks))) ABORT ("Malloc fails for blocks[]."); log_memory( nblocks * iword, stat ); for (lb = 0; lb < nrb; ++lb) { jb = lb * Pr + myrow; pc = jb % Pc; index = Llu->Ufstnz_br_ptr[lb]; if (index) { /* Not an empty row */ k = BR_HEADER; for (j = 0; j < index[0]; ++j) { ib = index[k]; pr = ib % Pr; p = pr * Pc + pc; blocks[sdispls[p]] = ib; sdispls[p]++; k += UB_DESCRIPTOR + SuperSize (index[k]); } } } if (myrow < nsupers % grid->nprow) { jb = nrb * Pr + myrow; pc = jb % Pc; index = Llu->Ufstnz_br_ptr[nrb]; if (index) { /* Not an empty row */ k = BR_HEADER; for (j = 0; j < index[0]; ++j) { ib = index[k]; pr = ib % Pr; p = pr * Pc + pc; blocks[sdispls[p]] = ib; sdispls[p]++; k += UB_DESCRIPTOR + SuperSize (index[k]); } } } /* communication */ MPI_Alltoall (sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); MPI_Alltoall (srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm); log_memory( -(nblocks * iword), stat ); /* blocks[] to be freed soon */ nblocks = recvcnts[0]; rdispls[0] = sdispls[0] = 0; for (p = 1; p < Pr * Pc; p++) { rdispls[p] = rdispls[p - 1] + recvcnts[p - 1]; sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; nblocks += recvcnts[p]; } if (!(blockr = intMalloc_dist (nblocks))) ABORT ("Malloc fails for blockr[]."); log_memory( nblocks * iword, stat ); MPI_Alltoallv (blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts, rdispls, mpi_int_t, grid->comm); SUPERLU_FREE (blocks); /* memory logged before */ /* store the received U-blocks by rows */ nlb = nsupers / Pc; if (!(Ublock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Ublock[]."); if (!(Urows = intMalloc_dist (1 + nlb))) ABORT ("Malloc fails for Urows[]."); Ublock_bytes = nblocks * iword; Urows_bytes = (1 + nlb) * iword; log_memory( Ublock_bytes + Urows_bytes, stat ); k = 0; for (jb = 0; jb < nlb; jb++) { j = jb * Pc + mycol; pr = j % Pr; lb = j / Pr; Urows[jb] = 0; for (pc = 0; pc < Pc; pc++) { p = pr * Pc + pc; /* the processor owning this block of U-factor */ for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) { Ublock[k] = blockr[i]; k++; Urows[jb]++; } rdispls[p] += rrows[p * nrbp1 + lb]; } /* sort by the column indices to make things easier for later on */ #ifdef ISORT isort1 (Urows[jb], &(Ublock[k - Urows[jb]])); #else qsort (&(Ublock[k - Urows[jb]]), (size_t) (Urows[jb]), sizeof (int_t), &superlu_sort_perm); #endif } if (mycol < nsupers % grid->npcol) { j = nlb * Pc + mycol; pr = j % Pr; lb = j / Pr; Urows[nlb] = 0; for (pc = 0; pc < Pc; pc++) { p = pr * Pc + pc; for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) { Ublock[k] = blockr[i]; k++; Urows[nlb]++; } rdispls[p] += rrows[p * nrb + lb]; } #ifdef ISORT isort1 (Urows[nlb], &(Ublock[k - Urows[nlb]])); #else qsort (&(Ublock[k - Urows[nlb]]), (size_t) (Urows[nlb]), sizeof (int_t), &superlu_sort_perm); #endif } SUPERLU_FREE (blockr); log_memory( -nblocks * iword, stat ); /* sort the block in L-factor */ nblocks = 0; for (lb = 0; lb < ncb; lb++) { jb = lb * Pc + mycol; index = Llu->Lrowind_bc_ptr[lb]; if (index) { /* Not an empty column */ nblocks += index[0]; } } if (mycol < nsupers % grid->npcol) { jb = ncb * Pc + mycol; index = Llu->Lrowind_bc_ptr[ncb]; if (index) { /* Not an empty column */ nblocks += index[0]; } } if (!(Lblock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Lblock[]."); if (!(Lrows = intMalloc_dist (1 + ncb))) ABORT ("Malloc fails for Lrows[]."); Lblock_bytes = nblocks * iword; Lrows_bytes = (1 + ncb) * iword; log_memory(Lblock_bytes + Lrows_bytes, stat); for (lb = 0; lb <= ncb; lb++) Lrows[lb] = 0; nblocks = 0; for (lb = 0; lb < ncb; lb++) { Lrows[lb] = 0; jb = lb * Pc + mycol; index = Llu->Lrowind_bc_ptr[lb]; if (index) { /* Not an empty column */ i = index[0]; k = BC_HEADER; krow = PROW (jb, grid); if (krow == myrow) /* skip the diagonal block */ { k += LB_DESCRIPTOR + index[k + 1]; i--; } for (j = 0; j < i; j++) { Lblock[nblocks] = index[k]; Lrows[lb]++; nblocks++; k += LB_DESCRIPTOR + index[k + 1]; } } #ifdef ISORT isort1 (Lrows[lb], &(Lblock[nblocks - Lrows[lb]])); #else qsort (&(Lblock[nblocks - Lrows[lb]]), (size_t) (Lrows[lb]), sizeof (int_t), &superlu_sort_perm); #endif } if (mycol < nsupers % grid->npcol) { Lrows[ncb] = 0; jb = ncb * Pc + mycol; index = Llu->Lrowind_bc_ptr[ncb]; if (index) { /* Not an empty column */ i = index[0]; k = BC_HEADER; krow = PROW (jb, grid); if (krow == myrow) { /* skip the diagonal block */ k += LB_DESCRIPTOR + index[k + 1]; i--; } for (j = 0; j < i; j++) { Lblock[nblocks] = index[k]; Lrows[ncb]++; nblocks++; k += LB_DESCRIPTOR + index[k + 1]; } #ifdef ISORT isort1 (Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]])); #else qsort (&(Lblock[nblocks - Lrows[ncb]]), (size_t) (Lrows[ncb]), sizeof (int_t), &superlu_sort_perm); #endif } } /* look for the first local symmetric nonzero block match */ if (!(sf_block = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block[]."); if (!(sf_block_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block_l[]."); log_memory( 2 * nsupers * iword, stat ); for (lb = 0; lb < nsupers; lb++) sf_block_l[lb] = nsupers; i = 0; j = 0; for (jb = 0; jb < nlb; jb++) { if (Urows[jb] > 0) { ib = i + Urows[jb]; lb = jb * Pc + mycol; for (k = 0; k < Lrows[jb]; k++) { while (Ublock[i] < Lblock[j] && i + 1 < ib) i++; if (Ublock[i] == Lblock[j]) { sf_block_l[lb] = Lblock[j]; j += (Lrows[jb] - k); k = Lrows[jb]; } else { j++; } } i = ib; } else { j += Lrows[jb]; } } if (mycol < nsupers % grid->npcol) { if (Urows[nlb] > 0) { ib = i + Urows[nlb]; lb = nlb * Pc + mycol; for (k = 0; k < Lrows[nlb]; k++) { while (Ublock[i] < Lblock[j] && i + 1 < ib) i++; if (Ublock[i] == Lblock[j]) { sf_block_l[lb] = Lblock[j]; j += (Lrows[nlb] - k); k = Lrows[nlb]; } else { j++; } } i = ib; } else { j += Lrows[nlb]; } } /* compute the first global symmetric matchs */ MPI_Allreduce (sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN, grid->comm); SUPERLU_FREE (sf_block_l); log_memory( -nsupers * iword, stat ); /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */ if (!(nnodes_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_l[]."); if (!(nnodes_u = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_u[]."); log_memory( 2 * nsupers * iword, stat ); for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; for (lb = 0; lb < nsupers; lb++) nnodes_u[lb] = 0; nblocks = 0; /* from U-factor */ for (i = 0, jb = 0; jb < nlb; jb++) { lb = jb * Pc + mycol; ib = i + Urows[jb]; while (i < ib) { if (Ublock[i] <= sf_block[lb]) { nnodes_u[lb]++; i++; nblocks++; } else { /* get out */ i = ib; } } i = ib; } if (mycol < nsupers % grid->npcol) { lb = nlb * Pc + mycol; ib = i + Urows[nlb]; while (i < ib) { if (Ublock[i] <= sf_block[lb]) { nnodes_u[lb]++; i++; nblocks++; } else { /* get out */ i = ib; } } i = ib; } /* from L-factor */ for (i = 0, jb = 0; jb < nlb; jb++) { lb = jb * Pc + mycol; ib = i + Lrows[jb]; while (i < ib) { if (Lblock[i] < sf_block[lb]) { nnodes_l[lb]++; i++; nblocks++; } else { i = ib; } } i = ib; } if (mycol < nsupers % grid->npcol) { lb = nlb * Pc + mycol; ib = i + Lrows[nlb]; while (i < ib) { if (Lblock[i] < sf_block[lb]) { nnodes_l[lb]++; i++; nblocks++; } else { i = ib; } } i = ib; } #ifdef USE_ALLGATHER /* insert local nodes in DAG */ if (!(edag_supno_l = intMalloc_dist (nsupers + nblocks))) ABORT ("Malloc fails for edag_supno_l[]."); edag_supno_l_bytes = (nsupers + nblocks) * iword; log_memory(edag_supno_l_bytes, stat); iu = il = nblocks = 0; for (lb = 0; lb < nsupers; lb++) { j = lb / Pc; pc = lb % Pc; edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; nblocks++; if (mycol == pc) { /* from U-factor */ ib = iu + Urows[j]; for (jb = 0; jb < nnodes_u[lb]; jb++) { edag_supno_l[nblocks] = Ublock[iu]; iu++; nblocks++; } iu = ib; /* from L-factor */ ib = il + Lrows[j]; for (jb = 0; jb < nnodes_l[lb]; jb++) { edag_supno_l[nblocks] = Lblock[il]; il++; nblocks++; } il = ib; } } SUPERLU_FREE (nnodes_u); log_memory(-nsupers * iword, stat); /* form global DAG on each processor */ MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); nblocks = recvcnts[0]; rdispls[0] = 0; for (lb = 1; lb < Pc * Pr; lb++) { rdispls[lb] = nblocks; nblocks += recvcnts[lb]; } if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); log_memory(nblocks * iword, stat); MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); SUPERLU_FREE (edag_supno_l); log_memory(-edag_supno_l_bytes, stat); if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) ABORT ("Malloc fails for edag_supno[]."); log_memory(nsupers * iword, stat); k = 0; for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; for (p = 0; p < Pc * Pr; p++) { for (lb = 0; lb < nsupers; lb++) { nnodes_l[lb] += recvbuf[k]; k += (1 + recvbuf[k]); } } for (lb = 0; lb < nsupers; lb++) { if (nnodes_l[lb] > 0) if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) ABORT ("Malloc fails for edag_supno[lb]."); nnodes_l[lb] = 0; } k = 0; for (p = 0; p < Pc * Pr; p++) { for (lb = 0; lb < nsupers; lb++) { jb = k + recvbuf[k] + 1; k++; for (; k < jb; k++) { edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; nnodes_l[lb]++; } } } SUPERLU_FREE (recvbuf); log_memory(-nblocks * iword, stat); #else /* not USE_ALLGATHER */ int nlsupers = nsupers / Pc; if (mycol < nsupers % Pc) nlsupers++; /* insert local nodes in DAG */ if (!(edag_supno_l = intMalloc_dist (nlsupers + nblocks))) ABORT ("Malloc fails for edag_supno_l[]."); edag_supno_l_bytes = (nlsupers + nblocks) * iword; log_memory(edag_supno_l_bytes, stat); iu = il = nblocks = 0; for (lb = 0; lb < nsupers; lb++) { j = lb / Pc; pc = lb % Pc; if (mycol == pc) { edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; nblocks++; /* from U-factor */ ib = iu + Urows[j]; for (jb = 0; jb < nnodes_u[lb]; jb++) { edag_supno_l[nblocks] = Ublock[iu]; iu++; nblocks++; } iu = ib; /* from L-factor */ ib = il + Lrows[j]; for (jb = 0; jb < nnodes_l[lb]; jb++) { edag_supno_l[nblocks] = Lblock[il]; il++; nblocks++; } il = ib; } else if (nnodes_l[lb] + nnodes_u[lb] != 0) printf (" # %d: nnodes[%d]=%d+%d\n", grid->iam, lb, nnodes_l[lb], nnodes_u[lb]); } SUPERLU_FREE (nnodes_u); log_memory(-nsupers * iword, stat); /* form global DAG on each processor */ MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); nblocks = recvcnts[0]; rdispls[0] = 0; for (lb = 1; lb < Pc * Pr; lb++) { rdispls[lb] = nblocks; nblocks += recvcnts[lb]; } if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); log_memory(nblocks * iword, stat); MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); SUPERLU_FREE (edag_supno_l); log_memory(-edag_supno_l_bytes, stat); if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) ABORT ("Malloc fails for edag_supno[]."); log_memory(nsupers * sizeof(int_t *), stat); k = 0; for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; for (p = 0; p < Pc * Pr; p++) { yourcol = MYCOL (p, grid); for (lb = 0; lb < nsupers; lb++) { j = lb / Pc; pc = lb % Pc; if (yourcol == pc) { nnodes_l[lb] += recvbuf[k]; k += (1 + recvbuf[k]); } } } for (lb = 0; lb < nsupers; lb++) { if (nnodes_l[lb] > 0) if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) ABORT ("Malloc fails for edag_supno[lb]."); nnodes_l[lb] = 0; } k = 0; for (p = 0; p < Pc * Pr; p++) { yourcol = MYCOL (p, grid); for (lb = 0; lb < nsupers; lb++) { j = lb / Pc; pc = lb % Pc; if (yourcol == pc) { jb = k + recvbuf[k] + 1; k++; for (; k < jb; k++) { edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; nnodes_l[lb]++; } } } } SUPERLU_FREE (recvbuf); log_memory( -nblocks * iword , stat); #endif /* end USE_ALL_GATHER */ /* initialize the num of child for each node */ num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); for (i = 0; i < nsupers; i++) num_child[i] = 0; for (i = 0; i < nsupers; i++) { for (jb = 0; jb < nnodes_l[i]; jb++) { num_child[edag_supno[i][jb]]++; } } /* push initial leaves to the fifo queue */ nnodes = 0; for (i = 0; i < nsupers; i++) { if (num_child[i] == 0) { ptr = SUPERLU_MALLOC (sizeof (etree_node)); ptr->id = i; ptr->next = NULL; /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ nnodes++; if (nnodes == 1) { head = ptr; tail = ptr; } else { tail->next = ptr; tail = ptr; } } } /* process fifo queue, and compute the ordering */ i = 0; while (nnodes > 0) { /*printf( "=== pop %d (%d) ===\n",head->id,i ); */ ptr = head; j = ptr->id; head = ptr->next; perm_c_supno[i] = j; SUPERLU_FREE (ptr); i++; nnodes--; for (jb = 0; jb < nnodes_l[j]; jb++) { num_child[edag_supno[j][jb]]--; if (num_child[edag_supno[j][jb]] == 0) { nnodes++; ptr = SUPERLU_MALLOC (sizeof (etree_node)); ptr->id = edag_supno[j][jb]; ptr->next = NULL; /*printf( "=== push %d ===\n",ptr->id ); */ if (nnodes == 1) { head = ptr; tail = ptr; } else { tail->next = ptr; tail = ptr; } } } /*printf( "\n" ); */ } for (lb = 0; lb < nsupers; lb++) if (nnodes_l[lb] > 0) SUPERLU_FREE (edag_supno[lb]); SUPERLU_FREE (num_child); SUPERLU_FREE (edag_supno); SUPERLU_FREE (nnodes_l); SUPERLU_FREE (sf_block); SUPERLU_FREE (sendcnts); log_memory(-(4 * nsupers + (4 + 2 * nrbp1)*Pr*Pc) * iword, stat); SUPERLU_FREE (Ublock); SUPERLU_FREE (Urows); SUPERLU_FREE (Lblock); SUPERLU_FREE (Lrows); log_memory(-(Ublock_bytes + Urows_bytes + Lblock_bytes + Lrows_bytes), stat); } /* ======================== * * end of static scheduling * * ======================== */ for (lb = 0; lb < nsupers; lb++) iperm_c_supno[perm_c_supno[lb]] = lb; #if ( DEBUGlevel >= 1 ) print_memorylog(stat, "after static schedule"); #endif return 0; } /* STATIC_SCHEDULE */
int_t pddistribute(fact_t fact, int_t n, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * * Purpose * ======= * Distribute the matrix onto the 2D process mesh. * * Arguments * ========= * * fact (input) fact_t * Specifies whether or not the L and U structures will be re-used. * = SamePattern_SameRowPerm: L and U structures are input, and * unchanged on exit. * = DOFACT or SamePattern: L and U structures are computed and output. * * n (input) int * Dimension of the matrix. * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * LUstruct (input) LUstruct_t* * Data structures for L and U factors. * * grid (input) gridinfo_t* * The 2D process mesh. * * Return value * ============ * > 0, working storage required (in bytes). * */ { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; #if 0 NCPformat *Astore; #else /* XSL ==> */ NRformat_loc *Astore; #endif double *a; int_t *asub, *xa; #if 0 int_t *xa_begin, *xa_end; #endif int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int_t *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t mem_use = 0, iword, dword; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = (NRformat_loc *) A->Store; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pddistribute()"); #endif dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, grid, &xa, &asub, &a); if ( fact == SamePattern_SameRowPerm ) { #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); for (lb = 0; lb < nrbu; ++lb) Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Unzval[]. */ for (lb = 0; lb < nrbu; ++lb) { index = Ufstnz_br_ptr[lb]; if ( index && index[Urb_indptr[lb]] == jb ) { uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb] + UB_DESCRIPTOR; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj = 0, dense_col = dense; jj < nsupc; ++jj) { j = index[len+jj]; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow+i]; dense_col[irow+i] = zero; } dense_col += ldaspa; } Urb_indptr[lb] += UB_DESCRIPTOR + nsupc; } /* if index != NULL */ } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* ------------------------------------------------------------ FIRST TIME CREATING THE L AND U DATA STRUCTURES. ------------------------------------------------------------*/ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We first need to set up the L and U data structures and then * propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = intCalloc_dist(nsupers)) ) ABORT("Calloc fails for ToRecv[]."); k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index = intMalloc_dist(j)) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use = k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = intCalloc_dist(k)) ) ABORT("Malloc fails for ToSendD[]."); if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ } /* for lb ... */ SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Ucbs); #if ( PRNTlevel>=1 ) mem_use -= 2*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); /* ------------------------------------------------ */ #if ( PRNTlevel>=1 ) mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; /* -------------------------------------------------------------- */ #if ( PRNTlevel>=1 ) mem_use += 4*k*sizeof(int_t*) + 2*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow ) bsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; len = Urb_indptr[lb]; for (k = 0; k < nsupc; ++k) index[len+k] = FstBlockC( gb+1 ); if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; } /* if myrow == pr ... */ } /* for i ... */ } /* for j ... */ /* Figure out how many nonzeros in each block, and gather the initial values of A from SPA into Uval. */ for (lb = 0; lb < nrbu; ++lb) { if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb]; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) { j = index[len+jj]; /* First nonzero in segment. */ bnnz += k - j; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow + i]; dense_col[irow + i] = zero; } dense_col += ldaspa; } index[len-1] = bnnz; /* Set block length in Descriptor */ Urb_indptr[lb] += nsupc; } } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow ) fsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(len*nsupc))) { fprintf(stderr, "col block %d ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ SUPERLU_FREE(xa); SUPERLU_FREE(asub); SUPERLU_FREE(a); #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit pddistribute()"); #endif return (mem_use); } /* PDDISTRIBUTE */
int_t pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, int_t fst_row, int_t *ilsum, doublecomplex *x, ScalePermstruct_t *ScalePermstruct, Glu_persist_t *Glu_persist, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { /* * Purpose * ======= * Re-distribute B on the diagonal processes of the 2D process mesh. * * Note * ==== * This routine can only be called after the routine pxgstrs_init(), * in which the structures of the send and receive buffers are set up. * * Arguments * ========= * * B (input) doublecomplex* * The distributed right-hand side matrix of the possibly * equilibrated system. * * m_loc (input) int (local) * The local row dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * ldb (input) int (local) * Leading dimension of matrix B. * * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ilsum (input) int* (global) * Starting position of each supernode in a full array. * * x (output) doublecomplex* * The solution vector. It is valid only on the diagonal processes. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * grid (input) gridinfo_t* * The 2D process mesh. * * SOLVEstruct (input) SOLVEstruct_t* * Contains the information for the communication during the * solution phase. * * Return value * ============ * */ int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *perm_r, *perm_c; /* row and column permutation vectors */ int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; int_t *xsup, *supno; int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; int p, procs; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; xsup = Glu_persist->xsup; supno = Glu_persist->supno; SendCnt = gstrs_comm->B_to_X_SendCnt; SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; /* ------------------------------------------------------------ NOW COMMUNICATE THE ACTUAL DATA. ------------------------------------------------------------*/ k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; ptr_to_dbuf[p] = sdispls[p] * nrhs; } /* Copy the row indices and values to the send buffer. */ for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ gbi = BlockNum( irow ); p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ k = ptr_to_ibuf[p]; send_ibuf[k] = irow; k = ptr_to_dbuf[p]; RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ send_dbuf[k++] = B[i + j*ldb]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Communicate the (permuted) row indices. */ MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); /* Communicate the numerical values. */ MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* ------------------------------------------------------------ Copy buffer into X on the diagonal processes. ------------------------------------------------------------*/ ii = 0; for (p = 0; p < procs; ++p) { jj = rdispls_nrhs[p]; for (i = 0; i < RecvCnt[p]; ++i) { /* Only the diagonal processes do this; the off-diagonal processes have 0 RecvCnt. */ irow = recv_ibuf[ii]; /* The permuted row index. */ k = BlockNum( irow ); knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number. */ l = X_BLK( lk ); x[l - XK_H].r = k; /* Block number prepended in the header. */ x[l - XK_H].i = 0; irow = irow - FstBlockC(k); /* Relative row number in X-block */ RHS_ITERATE(j) { x[l + irow + j*knsupc] = recv_dbuf[jj++]; } ++ii; } } SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()"); #endif return 0; } /* pzReDistribute_B_to_X */
/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B. */ int pzPermute_Dense_Matrix ( int_t fst_row, int_t m_loc, int_t row_to_proc[], int_t perm[], doublecomplex X[], int ldx, doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid ) { int_t i, j, k, l; int p, procs; int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzPermute_Dense_Matrix()"); #endif procs = grid->nprow * grid->npcol; if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) ABORT("Malloc fails for sendcnts[]."); sendcnts_nrhs = sendcnts + procs; recvcnts = sendcnts_nrhs + procs; recvcnts_nrhs = recvcnts + procs; sdispls = recvcnts_nrhs + procs; sdispls_nrhs = sdispls + procs; rdispls = sdispls_nrhs + procs; rdispls_nrhs = rdispls + procs; ptr_to_ibuf = rdispls_nrhs + procs; ptr_to_dbuf = ptr_to_ibuf + procs; for (i = 0; i < procs; ++i) sendcnts[i] = 0; /* Count the number of X entries to be sent to each process.*/ for (i = fst_row; i < fst_row + m_loc; ++i) { p = row_to_proc[perm[i]]; ++sendcnts[p]; } MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; sdispls_nrhs[0] = rdispls_nrhs[0] = 0; sendcnts_nrhs[0] = sendcnts[0] * nrhs; recvcnts_nrhs[0] = recvcnts[0] * nrhs; for (i = 1; i < procs; ++i) { sdispls[i] = sdispls[i-1] + sendcnts[i-1]; sdispls_nrhs[i] = sdispls[i] * nrhs; rdispls[i] = rdispls[i-1] + recvcnts[i-1]; rdispls_nrhs[i] = rdispls[i] * nrhs; sendcnts_nrhs[i] = sendcnts[i] * nrhs; recvcnts_nrhs[i] = recvcnts[i] * nrhs; } k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ /*assert(k == m_loc);*/ /*assert(l == m_loc);*/ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (i = 0; i < procs; ++i) { ptr_to_ibuf[i] = sdispls[i]; ptr_to_dbuf[i] = sdispls_nrhs[i]; } /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ for (i = fst_row; i < fst_row + m_loc; ++i) { j = perm[i]; p = row_to_proc[j]; send_ibuf[ptr_to_ibuf[p]] = j; j = ptr_to_dbuf[p]; RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ send_dbuf[j++] = X[i-fst_row + k*ldx]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Transfer the (permuted) row indices and numerical values. */ MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, recvcnts_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* Copy the buffer into b. */ for (i = 0, l = 0; i < m_loc; ++i) { j = recv_ibuf[i] - fst_row; /* Relative row number */ RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ B[j + k*ldb] = recv_dbuf[l++]; } } SUPERLU_FREE(sendcnts); SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzPermute_Dense_Matrix()"); #endif return 0; } /* pzPermute_Dense_Matrix */
int_t pdgstrf /************************************************************************/ ( superlu_options_t *options, int m, int n, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info ) /* * Purpose * ======= * * PDGSTRF performs the LU factorization in parallel. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following field should be defined: * o ReplaceTinyPivot (yes_no_t) * Specifies whether to replace the tiny diagonals by * sqrt(epsilon)*norm(A) during LU factorization. * * m (input) int * Number of rows in the matrix. * * n (input) int * Number of columns in the matrix. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * The following fields should be defined: * * o Glu_persist (input) Glu_persist_t* * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (input/output) LocalLU_t* * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { #ifdef _CRAY _fcd ftcs = _cptofcd("N", strlen("N")); _fcd ftcs1 = _cptofcd("L", strlen("L")); _fcd ftcs2 = _cptofcd("N", strlen("N")); _fcd ftcs3 = _cptofcd("U", strlen("U")); #endif double alpha = 1.0, beta = 0.0; int_t *xsup; int_t *lsub, *lsub1, *usub, *Usub_buf, *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ double *lusup, *lusup1, *uval, *Uval_buf, *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, nlb, nub, nsupc, rel, rukp; int_t Pc, Pr; int iam, kcol, krow, mycol, myrow, pi, pj; int j, k, lk, nsupers; int nsupr, nbrow, segsize; int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: * 0 : transferred in Lsub_buf[] * 1 : transferred in Lval_buf[] * 2 : transferred in Usub_buf[] * 3 : transferred in Uval_buf[] */ int_t msg0, msg2; int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; double **Unzval_br_ptr, **Lnzval_bc_ptr; int_t *index; double *nzval; int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ double *ucol; int_t *indirect; double *tempv, *tempv2d; int_t iinfo; int_t *ToRecv, *ToSendD, **ToSendR; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; superlu_scope_t *scp; float s_eps; double thresh; double *tempU2d, *tempu; int full, ldt, ldu, lead_zero, ncols; MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL; MPI_Status status; #if ( DEBUGlevel>=2 ) int_t num_copy=0, num_update=0; #endif #if ( PRNTlevel==3 ) int_t zero_msg = 0, total_msg = 0; #endif #if ( PROFlevel>=1 ) double t1, t2; float msg_vol = 0, msg_cnt = 0; int_t iword = sizeof(int_t), dword = sizeof(double); #endif /* Test the input parameters. */ *info = 0; if ( m < 0 ) *info = -2; else if ( n < 0 ) *info = -3; if ( *info ) { pxerbla("pdgstrf", grid, -*info); return (-1); } /* Quick return if possible. */ if ( m == 0 || n == 0 ) return 0; /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; s_eps = slamch_("Epsilon"); thresh = s_eps * anorm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrf()"); #endif stat->ops[FACT] = 0.0; if ( Pr*Pc > 1 ) { i = Llu->bufmax[0]; if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lsub_buf."); Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; i = Llu->bufmax[1]; if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lval_buf[]."); Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; if ( Llu->bufmax[2] != 0 ) if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) ABORT("Malloc fails for Usub_buf[]."); if ( Llu->bufmax[3] != 0 ) if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) ABORT("Malloc fails for Uval_buf[]."); if ( !(U_diag_blk_send_req = (MPI_Request *) SUPERLU_MALLOC(Pr*sizeof(MPI_Request)))) ABORT("Malloc fails for U_diag_blk_send_req[]."); U_diag_blk_send_req[myrow] = 0; /* flag no outstanding Isend */ if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) ABORT("Malloc fails for send_req[]."); } k = sp_ienv_dist(3); /* max supernode size */ if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) ) ABORT("Malloc fails for ujrow[]."); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", Llu->bufmax[0], Llu->bufmax[1], Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); } #endif Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; Lval_buf_2[0] = Llu->Lval_buf_2[0]; Lval_buf_2[1] = Llu->Lval_buf_2[1]; Usub_buf = Llu->Usub_buf; Uval_buf = Llu->Uval_buf; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; ToRecv = Llu->ToRecv; ToSendD = Llu->ToSendD; ToSendR = Llu->ToSendR; ldt = sp_ienv_dist(3); /* Size of maximum supernode */ if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) ABORT("Calloc fails for tempv2d[]."); tempU2d = tempv2d + ldt*ldt; if ( !(indirect = intMalloc_dist(ldt)) ) ABORT("Malloc fails for indirect[]."); k = CEILING( nsupers, Pr ); /* Number of local block rows */ if ( !(iuip = intMalloc_dist(k)) ) ABORT("Malloc fails for iuip[]."); if ( !(ruip = intMalloc_dist(k)) ) ABORT("Malloc fails for ruip[]."); #if ( VAMPIR>=1 ) VT_symdef(1, "Send-L", "Comm"); VT_symdef(2, "Recv-L", "Comm"); VT_symdef(3, "Send-U", "Comm"); VT_symdef(4, "Recv-U", "Comm"); VT_symdef(5, "TRF2", "Factor"); VT_symdef(100, "Factor", "Factor"); VT_begin(100); VT_traceon(); #endif /* --------------------------------------------------------------- Handle the first block column separately to start the pipeline. --------------------------------------------------------------- */ if ( mycol == 0 ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif scp = &grid->rscp; /* The scope of process row. */ /* Process column *kcol* multicasts numeric values of L(:,k) to process rows. */ lsub = Lrowind_bc_ptr[0]; lusup = Lnzval_bc_ptr[0]; if ( lsub ) { msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; msgcnt[1] = lsub[1] * SuperSize( 0 ); } else { msgcnt[0] = msgcnt[1] = 0; } for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[0][pj] != EMPTY ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, &send_req[pj] ); MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, &send_req[pj+Pc] ); #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, 0, msgcnt[0], msgcnt[1], pj); #endif #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif } } /* for pj ... */ } else { /* Post immediate receives. */ if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0, scp->comm, &recv_req[0] ); MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1, scp->comm, &recv_req[1] ); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); #endif } } /* if mycol == 0 */ /* ------------------------------------------ MAIN LOOP: Loop through all block columns. ------------------------------------------ */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( mycol == kcol ) { lk = LBj( k, grid ); /* Local block number. */ for (pj = 0; pj < Pc; ++pj) { /* Wait for Isend to complete before using lsub/lusup. */ if ( ToSendR[lk][pj] != EMPTY ) { MPI_Wait( &send_req[pj], &status ); MPI_Wait( &send_req[pj+Pc], &status ); } } lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; } else { if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ scp = &grid->rscp; /* The scope of process row. */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(2); #endif /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[0]);*/ /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, (4*k)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[0], &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[1]);*/ /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, (4*k+1)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[1], &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); #if ( VAMPIR>=1 ) VT_end(2); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", iam, k, msgcnt[0], msgcnt[1], kcol); fflush(stdout); #endif lsub = Lsub_buf_2[k%2]; lusup = Lval_buf_2[k%2]; #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[0] ) ++zero_msg; #endif } else msgcnt[0] = 0; } /* if mycol = Pc(k) */ scp = &grid->cscp; /* The scope of process column. */ if ( myrow == krow ) { /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ #ifdef _CRAY pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); #else pdgstrs2(n, k, Glu_persist, grid, Llu, stat); #endif /* Multicasts U(k,:) to process columns. */ lk = LBi( k, grid ); usub = Ufstnz_br_ptr[lk]; uval = Unzval_br_ptr[lk]; if ( usub ) { msgcnt[2] = usub[2]; msgcnt[3] = usub[1]; } else { msgcnt[2] = msgcnt[3] = 0; } if ( ToSendD[lk] == YES ) { for (pi = 0; pi < Pr; ++pi) { if ( pi != myrow ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(3); #endif MPI_Send( usub, msgcnt[2], mpi_int_t, pi, (4*k+2)%NTAGS, scp->comm); MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, (4*k+3)%NTAGS, scp->comm); #if ( VAMPIR>=1 ) VT_end(3); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); #endif } /* if pi ... */ } /* for pi ... */ } /* if ToSendD ... */ } else { /* myrow != krow */ if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(4); #endif /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[2]);*/ MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, (4*k+2)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[3]);*/ MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, (4*k+3)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); #if ( VAMPIR>=1 ) VT_end(4); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif usub = Usub_buf; uval = Uval_buf; #if ( DEBUGlevel>=2 ) printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); #endif #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[2] ) ++zero_msg; #endif } else msgcnt[2] = 0; } /* if myrow == Pr(k) */ /* * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). * for (j = k+1; k < N; ++k) { * for (i = k+1; i < N; ++i) * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) * && L(i,k) != 0 && U(k,j) != 0 ) * A(i,j) = A(i,j) - L(i,k) * U(k,j); */ msg0 = msgcnt[0]; msg2 = msgcnt[2]; if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ nsupr = lsub[1]; /* LDA of lusup. */ if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; luptr0 = knsupc; nlb = lsub[0] - 1; } else { lptr0 = BC_HEADER; luptr0 = 0; nlb = lsub[0]; } lptr = lptr0; for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ ib = lsub[lptr]; lib = LBi( ib, grid ); iuip[lib] = BR_HEADER; ruip[lib] = 0; lptr += LB_DESCRIPTOR + lsub[lptr+1]; } nub = usub[0]; /* Number of blocks in the block row U(k,:) */ iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ rukp = 0; /* Pointer to nzval[] of U(k,:) */ klst = FstBlockC( k+1 ); /* --------------------------------------------------- Update the first block column A(:,k+1). --------------------------------------------------- */ jb = usub[iukp]; /* Global block number of block U(k,j). */ if ( jb == k+1 ) { /* First update (k+1)-th block. */ --nub; lptr = lptr0; luptr = luptr0; ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0, it = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[it++]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (it = 0, i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[it++]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* if jb == k+1 */ } /* if L(:,k) and U(k,:) not empty */ if ( k+1 < nsupers ) { kcol = PCOL( k+1, grid ); if ( mycol == kcol ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif /* Factor diagonal and subdiagonal blocks and test for exact singularity. */ pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif /* Process column *kcol+1* multicasts numeric values of L(:,k+1) to process rows. */ lk = LBj( k+1, grid ); /* Local block number. */ lsub1 = Lrowind_bc_ptr[lk]; if ( lsub1 ) { msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; msgcnt[1] = lsub1[1] * SuperSize( k+1 ); } else { msgcnt[0] = 0; msgcnt[1] = 0; } scp = &grid->rscp; /* The scope of process row. */ for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[lk][pj] != EMPTY ) { lusup1 = Lnzval_bc_ptr[lk]; #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, k+1, msgcnt[0], msgcnt[1], pj); #endif } } /* for pj ... */ } else { /* Post Recv of block column L(:,k+1). */ if ( ToRecv[k+1] >= 1 ) { scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); #endif } } /* if mycol == Pc(k+1) */ } /* if k+1 < nsupers */ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ /* --------------------------------------------------- Update all other blocks using block row U(k,:) --------------------------------------------------- */ for (j = 0; j < nub; ++j) { lptr = lptr0; luptr = luptr0; jb = usub[iukp]; /* Global block number of block U(k,j). */ ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } /* Skip descriptor. Now point to fstnz index of block U(i,j). */ iuip[lib] += UB_DESCRIPTOR; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0 ; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[i]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted for the L blocks. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[i]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* for j ... */ } /* if k L(:,k) and U(k,:) are not empty */ } /* ------------------------------------------ END MAIN LOOP: for k = ... ------------------------------------------ */ #if ( VAMPIR>=1 ) VT_end(100); VT_traceoff(); #endif if ( Pr*Pc > 1 ) { SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); SUPERLU_FREE(send_req); if ( U_diag_blk_send_req[myrow] ) { /* wait for last Isend requests to complete, deallocate objects */ for (krow = 0; krow < Pr; ++krow) if ( krow != myrow ) MPI_Wait(U_diag_blk_send_req + krow, &status); } SUPERLU_FREE(U_diag_blk_send_req); } SUPERLU_FREE(Llu->ujrow); SUPERLU_FREE(tempv2d); SUPERLU_FREE(indirect); SUPERLU_FREE(iuip); SUPERLU_FREE(ruip); /* Prepare error message. */ if ( *info == 0 ) *info = n + 1; #if ( PROFlevel>=1 ) TIC(t1); #endif MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; MPI_Reduce( &msg_cnt, &msg_cnt_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_cnt, &msg_cnt_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); if ( !iam ) { printf("\tPDGSTRF comm stat:" "\tAvg\tMax\t\tAvg\tMax\n" "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", msg_cnt_sum/Pr/Pc, msg_cnt_max, msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); } } #endif if ( iinfo == n + 1 ) *info = 0; else *info = iinfo; #if ( PRNTlevel==3 ) MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # total msg\t%d\n", iinfo); #endif #if ( DEBUGlevel>=2 ) for (i = 0; i < Pr * Pc; ++i) { if ( iam == i ) { dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); printf("(%d)\n", iam); PrintInt10("Recv", nsupers, Llu->ToRecv); } MPI_Barrier( grid->comm ); } #endif #if ( DEBUGlevel>=3 ) printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrf()"); #endif } /* PDGSTRF */
int main(int argc, char *argv[]) { superlu_options_t options; SuperLUStat_t stat; SuperMatrix A; NRformat_loc *Astore; ScalePermstruct_t ScalePermstruct; LUstruct_t LUstruct; SOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; double *b, *b1, *xtrue, *nzval, *nzval1; int_t *colind, *colind1, *rowptr, *rowptr1; int_t i, j, m, n, nnz_loc, m_loc, fst_row; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c; FILE *fp, *fopen(); nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ nrhs = 1; /* Number of right-hand side. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { c = *(*cpp+1); ++cpp; switch (c) { case 'h': printf("Options:\n"); printf("\t-r <int>: process rows (default %d)\n", nprow); printf("\t-c <int>: process columns (default %d)\n", npcol); exit(0); break; case 'r': nprow = atoi(*cpp); break; case 'c': npcol = atoi(*cpp); break; } } else { /* Last arg is considered a filename */ if ( !(fp = fopen(*cpp, "r")) ) { ABORT("File does not exist"); } break; } } /* ------------------------------------------------------------ INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; if ( !iam ) { printf("Input matrix file: %s\n", *cpp); printf("\tProcess grid\t%d X %d\n", grid.nprow, grid.npcol); } #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); #endif /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid); if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); for (j = 0; j < nrhs; ++j) for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); m = A.nrow; n = A.ncol; /* Save a copy of the matrix A. */ Astore = (NRformat_loc *) A.Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; nzval = Astore->nzval; colind = Astore->colind; rowptr = Astore->rowptr; nzval1 = doubleMalloc_dist(nnz_loc); colind1 = intMalloc_dist(nnz_loc); rowptr1 = intMalloc_dist(m_loc+1); for (i = 0; i < nnz_loc; ++i) { nzval1[i] = nzval[i]; colind1[i] = colind[i]; } for (i = 0; i < m_loc+1; ++i) rowptr1[i] = rowptr[i]; /* ------------------------------------------------------------ WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. ------------------------------------------------------------*/ /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag; options.ReplaceTinyPivot = YES; options.Trans = NOTRANS; options.IterRefine = DOUBLE; options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; */ set_default_options_dist(&options); if (!iam) { print_sp_ienv_dist(&options); print_options_dist(&options); } /* Initialize ScalePermstruct and LUstruct. */ ScalePermstructInit(m, n, &ScalePermstruct); LUstructInit(n, &LUstruct); /* Initialize the statistics variables. */ PStatInit(&stat); /* Call the linear equation solver: factorize and solve. */ pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ SUPERLU_FREE(b); /* Free storage of right-hand side. */ /* ------------------------------------------------------------ NOW WE SOLVE ANOTHER LINEAR SYSTEM. THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM. ------------------------------------------------------------*/ options.Fact = SamePattern_SameRowPerm; PStatInit(&stat); /* Initialize the statistics variables. */ /* Set up the local A in NR_loc format */ dCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, nzval1, colind1, rowptr1, SLU_NR_loc, SLU_D, SLU_GE); /* Solve the linear system. */ pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid); /* Print the statistics. */ PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with the L and U matrices. */ ScalePermstructFree(&ScalePermstruct); LUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ if ( options.SolveInitialized ) { dSolveFinalize(&options, &SOLVEstruct); } SUPERLU_FREE(b1); /* Free storage of right-hand side. */ SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ SUPERLU_FREE(berr); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ out: superlu_gridexit(&grid); /* ------------------------------------------------------------ TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ MPI_Finalize(); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit main()"); #endif }
void GenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist, gridinfo_t *grid, double **xact, int *ldx, double **b, int *ldb) { int_t gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows, nsupr, nsupers, rel; int_t *supno, *xsup, *lxsup; double *x, *bb; NCformat *Astore; double *Aval; n = A->ncol; *ldb = 0; supno = Glu_persist->supno; xsup = Glu_persist->xsup; nsupers = supno[n-1] + 1; iam = grid->iam; myrow = MYROW( iam, grid ); Astore = A->Store; Aval = Astore->nzval; lb = CEILING( nsupers, grid->nprow ) + 1; if ( !(lxsup = intMalloc_dist(lb)) ) ABORT("Malloc fails for lxsup[]."); lsup = 0; nlrows = 0; for (j = 0; j < nsupers; ++j) { i = PROW( j, grid ); if ( myrow == i ) { nsupr = SuperSize( j ); *ldb += nsupr; lxsup[lsup++] = nlrows; nlrows += nsupr; } } *ldx = n; if ( !(x = doubleMalloc_dist(((size_t)*ldx) * nrhs)) ) ABORT("Malloc fails for x[]."); if ( !(bb = doubleCalloc_dist(*ldb * nrhs)) ) ABORT("Calloc fails for bb[]."); for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0; /* Form b = A*x. */ for (j = 0; j < n; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; gb = supno[irow]; gbrow = PROW( gb, grid ); if ( myrow == gbrow ) { rel = irow - xsup[gb]; lb = LBi( gb, grid ); bb[lxsup[lb] + rel] += Aval[i] * x[j]; } } /* Memory allocated but not freed: xact, b */ *xact = x; *b = bb; SUPERLU_FREE(lxsup); #if ( PRNTlevel>=2 ) for (i = 0; i < grid->nprow*grid->npcol; ++i) { if ( iam == i ) { printf("\n(%d)\n", iam); PrintDouble5("rhs", *ldb, *b); } MPI_Barrier( grid->comm ); } #endif } /* GENXTRUERHS */
int pzgsmv_AXglobal_setup ( SuperMatrix *A, /* Matrix A permuted by columns (input). The type of A can be: Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */ Glu_persist_t *Glu_persist, /* input */ gridinfo_t *grid, /* input */ int_t *m, /* output */ int_t *update[], /* output */ doublecomplex *val[], /* output */ int_t *bindx[], /* output */ int_t *mv_sup_to_proc /* output */ ) { int n; int input_option; int N_update; /* Number of variables updated on this process (output) */ int iam = grid->iam; int nprocs = grid->nprow * grid->npcol; int_t *xsup = Glu_persist->xsup; int_t *supno = Glu_persist->supno; int_t nsupers; int i, nsup, p, t1, t2, t3; /* Initialize the list of global indices. * NOTE: the list of global indices must be in ascending order. */ n = A->nrow; input_option = SUPER_LINEAR; nsupers = supno[n-1] + 1; #if ( DEBUGlevel>=2 ) if ( !iam ) { PrintInt10("xsup", supno[n-1]+1, xsup); PrintInt10("supno", n, supno); } #endif if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on individual rows. */ /* Figure out mv_sup_to_proc[] on all processes. */ for (p = 0; p < nprocs; ++p) { t1 = n / nprocs; /* Number of rows */ t2 = n - t1 * nprocs; /* left-over, which will be assigned to the first t2 processes. */ if ( p >= t2 ) t2 += (p * t1); /* Starting row number */ else { /* First t2 processes will get one more row. */ ++t1; /* Number of rows. */ t2 = p * t1; /* Starting row. */ } /* Make sure the starting and ending rows are at the supernode boundaries. */ t3 = t2 + t1; /* Ending row. */ nsup = supno[t2]; if ( t2 > xsup[nsup] ) { /* Round up the starting row. */ t1 -= xsup[nsup+1] - t2; t2 = xsup[nsup+1]; } nsup = supno[t3]; if ( t3 > xsup[nsup] ) /* Round up the ending row. */ t1 += xsup[nsup+1] - t3; t3 = t2 + t1 - 1; if ( t1 ) { for (i = supno[t2]; i <= supno[t3]; ++i) { mv_sup_to_proc[i] = p; #if ( DEBUGlevel>=3 ) if ( mv_sup_to_proc[i] == p-1 ) { fprintf(stderr, "mv_sup_to_proc conflicts at supno %d\n", i); exit(-1); } #endif } } if ( iam == p ) { N_update = t1; if ( N_update ) { if ( !(*update = intMalloc_dist(N_update)) ) ABORT("Malloc fails for update[]"); } for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i; #if ( DEBUGlevel>=3 ) printf("(%2d) N_update = %4d\t" "supers %4d to %4d\trows %4d to %4d\n", iam, N_update, supno[t2], supno[t3], t2, t3); #endif } } /* for p ... */ } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on individual supernodes. */ /* This may cause bad load balance, because the blocks are usually small in the beginning and large toward the end. */ t1 = nsupers / nprocs; t2 = nsupers - t1 * nprocs; /* left-over */ if ( iam >= t2 ) t2 += (iam * t1); else { ++t1; /* Number of blocks. */ t2 = iam * t1; /* Starting block. */ } N_update = xsup[t2+t1] - xsup[t2]; if ( !(*update = intMalloc_dist(N_update)) ) ABORT("Malloc fails for update[]"); for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i; } /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */ zcreate_msr_matrix(A, *update, N_update, val, bindx); #if ( DEBUGlevel>=2 ) PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc); zPrintMSRmatrix(N_update, *val, *bindx, grid); #endif *m = N_update; return 0; } /* PZGSMV_AXglobal_SETUP */
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; MPI_Request *send_req, recv_req; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol != kcol && fmod[lk] ) mod_bit[lk] = 1; /* contribution from off-diagonal */ } } /*PrintInt10("mod_bit", nlb, mod_bit);*/ #if ( PROFlevel>=2 ) t_reduce_tmp = SuperLU_timer_(); #endif /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); #if ( PROFlevel>=2 ) t_reduce += SuperLU_timer_() - t_reduce_tmp; #endif for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* diagonal process */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } #endif } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); Llu->SolveMsgSent = 0; MPI_Barrier( grid->comm ); /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope */ if ( mycol != kcol && bmod[lk] ) mod_bit[lk] = 1; /* Contribution from off-diagonal */ } } /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope. */ if ( mycol == kcol ) { /* diagonal process */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #endif } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
/*! \brief * * <pre> * Purpose * ======= * symbfact() performs a symbolic factorization on matrix A and sets up * the nonzero data structures which are suitable for supernodal Gaussian * elimination with no pivoting (GENP). This routine features: * o depth-first search (DFS) * o supernodes * o symmetric structure pruning * * Return value * ============ * < 0, number of bytes needed for LSUB. * = 0, matrix dimension is 1. * > 0, number of bytes allocated when out of memory. * </pre> */ int_t symbfact /************************************************************************/ ( superlu_options_t *options, /* input options */ int pnum, /* process number */ SuperMatrix *A, /* original matrix A permuted by columns (input) */ int_t *perm_c, /* column permutation vector (input) */ int_t *etree, /* column elimination tree (input) */ Glu_persist_t *Glu_persist, /* output */ Glu_freeable_t *Glu_freeable /* output */ ) { int_t m, n, min_mn, j, i, k, irep, nseg, pivrow, info; int_t *iwork, *perm_r, *segrep, *repfnz; int_t *xprune, *marker, *parent, *xplore; int_t relax, *desc, *relax_end; int_t nnzL, nnzU; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(pnum, "Enter symbfact()"); #endif m = A->nrow; n = A->ncol; min_mn = SUPERLU_MIN(m, n); /* Allocate storage common to the symbolic factor routines */ info = symbfact_SubInit(DOFACT, NULL, 0, m, n, ((NCPformat*)A->Store)->nnz, Glu_persist, Glu_freeable); iwork = (int_t *) intMalloc_dist(6*m+2*n); perm_r = iwork; segrep = iwork + m; repfnz = segrep + m; marker = repfnz + m; parent = marker + m; xplore = parent + m; xprune = xplore + m; relax_end = xprune + n; relax = sp_ienv_dist(2); ifill_dist(perm_r, m, EMPTY); ifill_dist(repfnz, m, EMPTY); ifill_dist(marker, m, EMPTY); Glu_persist->supno[0] = -1; Glu_persist->xsup[0] = 0; Glu_freeable->xlsub[0] = 0; Glu_freeable->xusub[0] = 0; /*for (j = 0; j < n; ++j) iperm_c[perm_c[j]] = j;*/ /* Identify relaxed supernodes. */ if ( !(desc = intMalloc_dist(n+1)) ) ABORT("Malloc fails for desc[]");; relax_snode(n, etree, relax, desc, relax_end); SUPERLU_FREE(desc); for (j = 0; j < min_mn; ) { if ( relax_end[j] != EMPTY ) { /* beginning of a relaxed snode */ k = relax_end[j]; /* end of the relaxed snode */ /* Determine union of the row structure of supernode (j:k). */ if ( (info = snode_dfs(A, j, k, xprune, marker, Glu_persist, Glu_freeable)) != 0 ) return info; for (i = j; i <= k; ++i) pivotL(i, perm_r, &pivrow, Glu_persist, Glu_freeable); j = k+1; } else { /* Perform a symbolic factorization on column j, and detects whether column j starts a new supernode. */ if ((info = column_dfs(A, j, perm_r, &nseg, segrep, repfnz, xprune, marker, parent, xplore, Glu_persist, Glu_freeable)) != 0) return info; /* Copy the U-segments to usub[*]. */ if ((info = set_usub(min_mn, j, nseg, segrep, repfnz, Glu_persist, Glu_freeable)) != 0) return info; pivotL(j, perm_r, &pivrow, Glu_persist, Glu_freeable); /* Prune columns [0:j-1] using column j. */ pruneL(j, perm_r, pivrow, nseg, segrep, repfnz, xprune, Glu_persist, Glu_freeable); /* Reset repfnz[*] to prepare for the next column. */ for (i = 0; i < nseg; i++) { irep = segrep[i]; repfnz[irep] = EMPTY; } ++j; } /* else */ } /* for j ... */ countnz_dist(min_mn, xprune, &nnzL, &nnzU, Glu_persist, Glu_freeable); /* Apply perm_r to L; Compress LSUB array. */ i = fixupL_dist(min_mn, perm_r, Glu_persist, Glu_freeable); if ( !pnum && (options->PrintStat == YES)) { printf("\tNonzeros in L %ld\n", nnzL); printf("\tNonzeros in U %ld\n", nnzU); printf("\tnonzeros in L+U %ld\n", nnzL + nnzU - min_mn); printf("\tnonzeros in LSUB %ld\n", i); } SUPERLU_FREE(iwork); #if ( PRNTlevel>=3 ) PrintInt10("lsub", Glu_freeable->xlsub[n], Glu_freeable->lsub); PrintInt10("xlsub", n+1, Glu_freeable->xlsub); PrintInt10("xprune", n, xprune); PrintInt10("usub", Glu_freeable->xusub[n], Glu_freeable->usub); PrintInt10("xusub", n+1, Glu_freeable->xusub); PrintInt10("supno", n, Glu_persist->supno); PrintInt10("xsup", (Glu_persist->supno[n])+2, Glu_persist->xsup); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(pnum, "Exit symbfact()"); #endif return (-i); } /* SYMBFACT */
int_t pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t fst_row, int_t nrhs, doublecomplex *x, int_t *ilsum, ScalePermstruct_t *ScalePermstruct, Glu_persist_t *Glu_persist, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { /* * Purpose * ======= * Re-distribute X on the diagonal processes to B distributed on all * the processes. * * Note * ==== * This routine can only be called after the routine pxgstrs_init(), * in which the structures of the send and receive buffers are set up. * */ int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; int_t *xsup, *supno; int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; int iam, p, q, pkk, procs; int_t num_diag_procs, *diag_procs; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_X_to_B()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = Glu_persist->supno[n-1] + 1; iam = grid->iam; procs = grid->nprow * grid->npcol; SendCnt = gstrs_comm->X_to_B_SendCnt; SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; ptr_to_dbuf[p] = sdispls_nrhs[p]; } num_diag_procs = SOLVEstruct->num_diag_procs; diag_procs = SOLVEstruct->diag_procs; for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */ pkk = diag_procs[p]; if ( iam == pkk ) { for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number */ irow = FstBlockC( k ); l = X_BLK( lk ); for (i = 0; i < knsupc; ++i) { #if 0 ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */ #else ii = irow; #endif q = row_to_proc[ii]; jj = ptr_to_ibuf[q]; send_ibuf[jj] = ii; jj = ptr_to_dbuf[q]; RHS_ITERATE(j) { /* RHS stored in row major in buffer. */ send_dbuf[jj++] = x[l + i + j*knsupc]; } ++ptr_to_ibuf[q]; ptr_to_dbuf[q] += nrhs; ++irow; } } } } /* ------------------------------------------------------------ COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. ------------------------------------------------------------*/ MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* ------------------------------------------------------------ COPY THE BUFFER INTO B. ------------------------------------------------------------*/ for (i = 0, k = 0; i < m_loc; ++i) { irow = recv_ibuf[i]; irow -= fst_row; /* Relative row number */ RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ B[irow + j*ldb] = recv_dbuf[k++]; } } SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_X_to_B()"); #endif return 0; } /* pzReDistribute_X_to_B */
int_t dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, gridinfo_t *grid, int_t *colptr[], int_t *rowind[], double *a[]) { /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * Purpose * ======= * Re-distribute A on the 2D process mesh. * * Arguments * ========= * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * grid (input) gridinfo_t* * The 2D process mesh. * * colptr (output) int* * * rowind (output) int* * * a (output) double* * * Return value * ============ * */ NRformat_loc *Astore; int_t *perm_r; /* row permutation vector */ int_t *perm_c; /* column permutation vector */ int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; int_t nnz_loc; /* number of local nonzeros */ int_t nnz_remote; /* number of remote nonzeros to be sent */ int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; int_t *ia, *ja, **ia_send, *index, *itemp; int_t *ptr_to_send; double *aij, **aij_send, *nzval, *dtemp; double *nzval_a; int iam, it, p, procs; MPI_Request *send_req; MPI_Status status; /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter dReDistribute_A()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; nnzToRecv = intCalloc_dist(2*procs); nnzToSend = nnzToRecv + procs; /* ------------------------------------------------------------ COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); ++nnzToSend[p]; } } /* All-to-all communication */ MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, grid->comm); maxnnzToRecv = 0; nnz_loc = SendCnt = RecvCnt = 0; for (p = 0; p < procs; ++p) { if ( p != iam ) { SendCnt += nnzToSend[p]; RecvCnt += nnzToRecv[p]; maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); } else { nnz_loc += nnzToRecv[p]; /*assert(nnzToSend[p] == nnzToRecv[p]);*/ } } k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ /* Allocate space for storing the triplets after redistribution. */ if ( !(ia = intMalloc_dist(2*k)) ) ABORT("Malloc fails for ia[]."); ja = ia + k; if ( !(aij = doubleMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) ABORT("Malloc fails for ia_send[]."); if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) ABORT("Malloc fails for aij_send[]."); if ( !(index = intMalloc_dist(2*SendCnt)) ) ABORT("Malloc fails for index[]."); if ( !(nzval = doubleMalloc_dist(SendCnt)) ) ABORT("Malloc fails for nzval[]."); if ( !(ptr_to_send = intCalloc_dist(procs)) ) ABORT("Malloc fails for ptr_to_send[]."); if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) ABORT("Malloc fails for itemp[]."); if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) ABORT("Malloc fails for dtemp[]."); for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } } /* if procs > 1 */ if ( !(*colptr = intCalloc_dist(n+1)) ) ABORT("Malloc fails for *colptr[]."); /* ------------------------------------------------------------ LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. THIS ACCOUNTS FOR THE SECOND PASS OF A. ------------------------------------------------------------*/ nnz_loc = 0; /* Reset the local nonzero count. */ nzval_a = Astore->nzval; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); if ( p != iam ) { /* remote */ k = ptr_to_send[p]; ia_send[p][k] = irow; ia_send[p][k + nnzToSend[p]] = jcol; aij_send[p][k] = nzval_a[j]; ++ptr_to_send[p]; } else { /* local */ ia[nnz_loc] = irow; ja[nnz_loc] = jcol; aij[nnz_loc] = nzval_a[j]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } /* ------------------------------------------------------------ PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. NOTE: Can possibly use MPI_Alltoallv. ------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); it = nnzToSend[p]; MPI_Isend( aij_send[p], it, MPI_DOUBLE, p, iam+procs, grid->comm, &send_req[procs+p] ); } } for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, grid->comm, &status ); for (i = 0; i < nnzToRecv[p]; ++i) { ia[nnz_loc] = itemp[i]; jcol = itemp[i + nnzToRecv[p]]; /*assert(jcol<n);*/ ja[nnz_loc] = jcol; aij[nnz_loc] = dtemp[i]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } for (p = 0; p < procs; ++p) { if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } } /* ------------------------------------------------------------ DEALLOCATE TEMPORARY STORAGE ------------------------------------------------------------*/ SUPERLU_FREE(nnzToRecv); if ( procs > 1 ) { SUPERLU_FREE(send_req); SUPERLU_FREE(ia_send); SUPERLU_FREE(aij_send); SUPERLU_FREE(index); SUPERLU_FREE(nzval); SUPERLU_FREE(ptr_to_send); SUPERLU_FREE(itemp); SUPERLU_FREE(dtemp); } /* ------------------------------------------------------------ CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. ------------------------------------------------------------*/ if ( !(*rowind = intMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *rowind[]."); if ( !(*a = doubleMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *a[]."); /* Initialize the array of column pointers */ k = 0; jsize = (*colptr)[0]; (*colptr)[0] = 0; for (j = 1; j < n; ++j) { k += jsize; jsize = (*colptr)[j]; (*colptr)[j] = k; } /* Copy the triplets into the column oriented storage */ for (i = 0; i < nnz_loc; ++i) { j = ja[i]; k = (*colptr)[j]; (*rowind)[k] = ia[i]; (*a)[k] = aij[i]; ++(*colptr)[j]; } /* Reset the column pointers to the beginning of each column */ for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; (*colptr)[0] = 0; SUPERLU_FREE(ia); SUPERLU_FREE(aij); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit dReDistribute_A()"); #endif } /* dReDistribute_A */
/*! \brief Gather A from the distributed compressed row format to global A in compressed column format. */ int pzCompRow_loc_to_CompCol_global ( int_t need_value, /* Input. Whether need to gather numerical values */ SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ gridinfo_t *grid, /* Input */ SuperMatrix *GA /* Output */ ) { NRformat_loc *Astore; NCformat *GAstore; doublecomplex *a, *a_loc; int_t *colind, *rowptr; int_t *colptr_loc, *rowind_loc; int_t m_loc, n, i, j, k, l; int_t colnnz, fst_row, nnz_loc, nnz; doublecomplex *a_recv; /* Buffer to receive the blocks of values. */ doublecomplex *a_buf; /* Buffer to merge blocks into block columns. */ int_t *itemp; int_t *colptr_send; /* Buffer to redistribute the column pointers of the local block rows. Use n_loc+1 pointers for each block. */ int_t *colptr_blk; /* The column pointers for each block, after redistribution to the local block columns. Use n_loc+1 pointers for each block. */ int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ int_t *fst_rows, *n_locs; int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; int it, n_loc, procs; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzCompRow_loc_to_CompCol_global"); #endif /* Initialization. */ n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ /* ------------------------------------------------------------ FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. ------------------------------------------------------------*/ zCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, &rowind_loc, &colptr_loc); /* Change local row index numbers to global numbers. */ for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; #if ( DEBUGlevel>=2 ) printf("Proc %d\n", grid->iam); PrintInt10("rowind_loc", nnz_loc, rowind_loc); PrintInt10("colptr_loc", n+1, colptr_loc); #endif procs = grid->nprow * grid->npcol; if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) ABORT("Malloc fails for fst_rows[]"); n_locs = fst_rows + procs; MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, grid->comm); for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; n_locs[procs-1] = n - fst_rows[procs-1]; if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) ABORT("Malloc fails for recvcnts[]"); sendcnts = recvcnts + procs; rdispls = sendcnts + procs; sdispls = rdispls + procs; itemp_32 = sdispls + procs; /* All-to-all transfer column pointers of each block. Now the matrix view is P-by-P block-partition. */ /* n column starts for each column, and procs column ends for each block */ if ( !(colptr_send = intMalloc_dist(n + procs)) ) ABORT("Malloc fails for colptr_send[]"); if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) ABORT("Malloc fails for colptr_blk[]"); for (i = 0, j = 0; i < procs; ++i) { for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ sendcnts[i] = n_locs[i] + 1; #if ( DEBUGlevel>=1 ) assert(j == fst_rows[i]); #endif sdispls[i] = j + i; recvcnts[i] = n_loc + 1; rdispls[i] = i * (n_loc + 1); j += n_locs[i]; /* First column of next block in colptr_loc[] */ } MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); /* Adjust colptr_blk[] so that they contain the local indices of the column pointers in the receive buffer. */ nnz = 0; /* The running sum of the nonzeros counted by far */ k = 0; for (i = 0; i < procs; ++i) { for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { colnnz = colptr_blk[j+1] - colptr_blk[j]; /*assert(k<=j);*/ colptr_blk[k] = nnz; nnz += colnnz; /* Start of the next column */ ++k; } colptr_blk[k++] = nnz; /* Add an END marker for each block */ } /*assert(k == (n_loc+1)*procs);*/ /* Now prepare to transfer row indices and values. */ sdispls[0] = 0; for (i = 0; i < procs-1; ++i) { sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; sdispls[i+1] = sdispls[i] + sendcnts[i]; } sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; for (i = 0; i < procs; ++i) { j = rdispls[i]; /* Point to this block in colptr_blk[]. */ recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; } rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); rowind_buf = rowind_recv + k; MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(a_recv = (doublecomplex *) doublecomplexMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); a_buf = a_recv + k; MPI_Alltoallv(a_loc, sendcnts, sdispls, SuperLU_MPI_DOUBLE_COMPLEX, a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); } /* Reset colptr_loc[] to point to the n_loc global columns. */ colptr_loc[0] = 0; itemp = colptr_send; for (j = 0; j < n_loc; ++j) { colnnz = 0; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1) + j; /* j-th column in i-th block */ colnnz += colptr_blk[k+1] - colptr_blk[k]; } colptr_loc[j+1] = colptr_loc[j] + colnnz; itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ } itemp[n_loc] = colptr_loc[n_loc]; /* Merge blocks of row indices into columns of row indices. */ for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { rowind_buf[itemp[j]] = rowind_recv[l]; ++itemp[j]; } } } if ( need_value ) { for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { a_buf[itemp[j]] = a_recv[l]; ++itemp[j]; } } } } /* ------------------------------------------------------------ SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. ------------------------------------------------------------*/ GA->nrow = A->nrow; GA->ncol = A->ncol; GA->Stype = SLU_NC; GA->Dtype = A->Dtype; GA->Mtype = A->Mtype; GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); /* First gather the size of each piece. */ nnz_loc = colptr_loc[n_loc]; MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; GAstore->nnz = nnz; if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); /* Allgatherv for row indices. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + itemp[i]; itemp_32[i] = itemp[i]; } itemp_32[procs-1] = itemp[procs-1]; it = nnz_loc; MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, itemp_32, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); } else GAstore->nzval = NULL; /* Now gather the column pointers. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + n_locs[i]; itemp_32[i] = n_locs[i]; } itemp_32[procs-1] = n_locs[procs-1]; MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, itemp_32, rdispls, mpi_int_t, grid->comm); /* Recompute column pointers. */ for (i = 1; i < procs; ++i) { k = rdispls[i]; for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; itemp[i] += itemp[i-1]; /* prefix sum */ } GAstore->colptr[n] = nnz; #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("After pdCompRow_loc_to_CompCol_global()\n"); zPrint_CompCol_Matrix_dist(GA); } #endif SUPERLU_FREE(a_loc); SUPERLU_FREE(rowind_loc); SUPERLU_FREE(colptr_loc); SUPERLU_FREE(fst_rows); SUPERLU_FREE(recvcnts); SUPERLU_FREE(colptr_send); SUPERLU_FREE(colptr_blk); SUPERLU_FREE(rowind_recv); if ( need_value) SUPERLU_FREE(a_recv); #if ( DEBUGlevel>=1 ) if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat)); CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global"); #endif return 0; } /* pzCompRow_loc_to_CompCol_global */
int_t pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, int_t fst_row, int_t *ilsum, doublecomplex *x, ScalePermstruct_t *ScalePermstruct, Glu_persist_t *Glu_persist, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *perm_r, *perm_c; /* row and column permutation vectors */ int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; int_t *xsup, *supno; int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; int p, procs; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; xsup = Glu_persist->xsup; supno = Glu_persist->supno; SendCnt = gstrs_comm->B_to_X_SendCnt; SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; /* ------------------------------------------------------------ NOW COMMUNICATE THE ACTUAL DATA. ------------------------------------------------------------*/ k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; ptr_to_dbuf[p] = sdispls[p] * nrhs; } /* Copy the row indices and values to the send buffer. */ for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ gbi = BlockNum( irow ); p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ k = ptr_to_ibuf[p]; send_ibuf[k] = irow; k = ptr_to_dbuf[p]; RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ send_dbuf[k++] = B[i + j*ldb]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Communicate the (permuted) row indices. */ MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); /* Communicate the numerical values. */ MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* ------------------------------------------------------------ Copy buffer into X on the diagonal processes. ------------------------------------------------------------*/ ii = 0; for (p = 0; p < procs; ++p) { jj = rdispls_nrhs[p]; for (i = 0; i < RecvCnt[p]; ++i) { /* Only the diagonal processes do this; the off-diagonal processes have 0 RecvCnt. */ irow = recv_ibuf[ii]; /* The permuted row index. */ k = BlockNum( irow ); knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number. */ l = X_BLK( lk ); x[l - XK_H].r = k; /* Block number prepended in the header. */ x[l - XK_H].i = 0; irow = irow - FstBlockC(k); /* Relative row number in X-block */ RHS_ITERATE(j) { x[l + irow + j*knsupc] = recv_dbuf[jj++]; } ++ii; } } SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()"); #endif return 0; } /* pzReDistribute_B_to_X */
/*! \brief Initialize the data structure for the solution phase. */ int zSolveInit(superlu_options_t *options, SuperMatrix *A, int_t perm_r[], int_t perm_c[], int_t nrhs, LUstruct_t *LUstruct, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { int_t *row_to_proc, *inv_perm_c, *itemp; NRformat_loc *Astore; int_t i, fst_row, m_loc, p; int procs; /* prototypes */ extern int_t pxgstrs_init(int_t, int_t, int_t, int_t, int_t [], int_t [], gridinfo_t *grid, Glu_persist_t *, SOLVEstruct_t *); Astore = (NRformat_loc *) A->Store; fst_row = Astore->fst_row; m_loc = Astore->m_loc; procs = grid->nprow * grid->npcol; if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) ABORT("Malloc fails for row_to_proc[]"); SOLVEstruct->row_to_proc = row_to_proc; if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) ABORT("Malloc fails for inv_perm_c[]."); for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; SOLVEstruct->inv_perm_c = inv_perm_c; /* ------------------------------------------------------------ EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. NOTE: For those processes that do not own any row, it must must be set so that fst_row == A->nrow. ------------------------------------------------------------*/ if ( !(itemp = intMalloc_dist(procs+1)) ) ABORT("Malloc fails for itemp[]"); MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); itemp[procs] = A->nrow; for (p = 0; p < procs; ++p) { for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; } #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("fst_row = %d\n", fst_row); PrintInt10("row_to_proc", A->nrow, row_to_proc); PrintInt10("inv_perm_c", A->ncol, inv_perm_c); } #endif SUPERLU_FREE(itemp); #if 0 /* Compute the mapping between rows and processes. */ /* XSL NOTE: What happens if # of mapped processes is smaller than total Procs? For the processes without any row, let fst_row be EMPTY (-1). Make sure this case works! */ MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); itemp[procs] = n; for (p = 0; p < procs; ++p) { j = itemp[p]; if ( j != EMPTY ) { k = itemp[p+1]; if ( k == EMPTY ) k = n; for (i = j ; i < k; ++i) row_to_proc[i] = p; } } #endif get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, &SOLVEstruct->num_diag_procs, &SOLVEstruct->diag_procs, &SOLVEstruct->diag_len); if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, LUstruct->Glu_persist, SOLVEstruct); if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *) SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) ) ABORT("Malloc fails for gsmv_comm[]"); SOLVEstruct->A_colind_gsmv = NULL; options->SolveInitialized = YES; return 0; } /* zSolveInit */
void pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, doublecomplex *B, int_t ldb, int nrhs, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int Pc, Pr, iam; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #if defined (ISEND_IRECV) || defined (BSEND) MPI_Request *send_req, recv_req; #endif /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for L-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerr_dist("PZGSTRS_BGLOBAL", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs_Bglobal()"); #endif /* Save the count to be altered so it can be used by subsequent call to PDGSTRS_BGLOBAL. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #if defined (ISEND_IRECV) || defined (BSEND) k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* * Copy B into X on the diagonal processes. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header. */ lsum[il - LSUM_H].i = 0; kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* Diagonal process. */ jj = X_BLK( lk ); x[jj - XK_H].r = k; /* Block number prepended in the header. */ x[jj - XK_H].i = 0; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ x[i + jj + j*knsupc] = B[i + ii + j*ldb]; } }