int_t pdgstrf /************************************************************************/ ( superlu_options_t *options, int m, int n, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info ) /* * Purpose * ======= * * PDGSTRF performs the LU factorization in parallel. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following field should be defined: * o ReplaceTinyPivot (yes_no_t) * Specifies whether to replace the tiny diagonals by * sqrt(epsilon)*norm(A) during LU factorization. * * m (input) int * Number of rows in the matrix. * * n (input) int * Number of columns in the matrix. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * The following fields should be defined: * * o Glu_persist (input) Glu_persist_t* * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (input/output) LocalLU_t* * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { #ifdef _CRAY _fcd ftcs = _cptofcd("N", strlen("N")); _fcd ftcs1 = _cptofcd("L", strlen("L")); _fcd ftcs2 = _cptofcd("N", strlen("N")); _fcd ftcs3 = _cptofcd("U", strlen("U")); #endif double alpha = 1.0, beta = 0.0; int_t *xsup; int_t *lsub, *lsub1, *usub, *Usub_buf, *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ double *lusup, *lusup1, *uval, *Uval_buf, *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, nlb, nub, nsupc, rel, rukp; int_t Pc, Pr; int iam, kcol, krow, mycol, myrow, pi, pj; int j, k, lk, nsupers; int nsupr, nbrow, segsize; int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: * 0 : transferred in Lsub_buf[] * 1 : transferred in Lval_buf[] * 2 : transferred in Usub_buf[] * 3 : transferred in Uval_buf[] */ int_t msg0, msg2; int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; double **Unzval_br_ptr, **Lnzval_bc_ptr; int_t *index; double *nzval; int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ double *ucol; int_t *indirect; double *tempv, *tempv2d; int_t iinfo; int_t *ToRecv, *ToSendD, **ToSendR; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; superlu_scope_t *scp; float s_eps; double thresh; double *tempU2d, *tempu; int full, ldt, ldu, lead_zero, ncols; MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL; MPI_Status status; #if ( DEBUGlevel>=2 ) int_t num_copy=0, num_update=0; #endif #if ( PRNTlevel==3 ) int_t zero_msg = 0, total_msg = 0; #endif #if ( PROFlevel>=1 ) double t1, t2; float msg_vol = 0, msg_cnt = 0; int_t iword = sizeof(int_t), dword = sizeof(double); #endif /* Test the input parameters. */ *info = 0; if ( m < 0 ) *info = -2; else if ( n < 0 ) *info = -3; if ( *info ) { pxerbla("pdgstrf", grid, -*info); return (-1); } /* Quick return if possible. */ if ( m == 0 || n == 0 ) return 0; /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; s_eps = slamch_("Epsilon"); thresh = s_eps * anorm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrf()"); #endif stat->ops[FACT] = 0.0; if ( Pr*Pc > 1 ) { i = Llu->bufmax[0]; if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lsub_buf."); Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; i = Llu->bufmax[1]; if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lval_buf[]."); Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; if ( Llu->bufmax[2] != 0 ) if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) ABORT("Malloc fails for Usub_buf[]."); if ( Llu->bufmax[3] != 0 ) if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) ABORT("Malloc fails for Uval_buf[]."); if ( !(U_diag_blk_send_req = (MPI_Request *) SUPERLU_MALLOC(Pr*sizeof(MPI_Request)))) ABORT("Malloc fails for U_diag_blk_send_req[]."); U_diag_blk_send_req[myrow] = 0; /* flag no outstanding Isend */ if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) ABORT("Malloc fails for send_req[]."); } k = sp_ienv_dist(3); /* max supernode size */ if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) ) ABORT("Malloc fails for ujrow[]."); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", Llu->bufmax[0], Llu->bufmax[1], Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); } #endif Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; Lval_buf_2[0] = Llu->Lval_buf_2[0]; Lval_buf_2[1] = Llu->Lval_buf_2[1]; Usub_buf = Llu->Usub_buf; Uval_buf = Llu->Uval_buf; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; ToRecv = Llu->ToRecv; ToSendD = Llu->ToSendD; ToSendR = Llu->ToSendR; ldt = sp_ienv_dist(3); /* Size of maximum supernode */ if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) ABORT("Calloc fails for tempv2d[]."); tempU2d = tempv2d + ldt*ldt; if ( !(indirect = intMalloc_dist(ldt)) ) ABORT("Malloc fails for indirect[]."); k = CEILING( nsupers, Pr ); /* Number of local block rows */ if ( !(iuip = intMalloc_dist(k)) ) ABORT("Malloc fails for iuip[]."); if ( !(ruip = intMalloc_dist(k)) ) ABORT("Malloc fails for ruip[]."); #if ( VAMPIR>=1 ) VT_symdef(1, "Send-L", "Comm"); VT_symdef(2, "Recv-L", "Comm"); VT_symdef(3, "Send-U", "Comm"); VT_symdef(4, "Recv-U", "Comm"); VT_symdef(5, "TRF2", "Factor"); VT_symdef(100, "Factor", "Factor"); VT_begin(100); VT_traceon(); #endif /* --------------------------------------------------------------- Handle the first block column separately to start the pipeline. --------------------------------------------------------------- */ if ( mycol == 0 ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif scp = &grid->rscp; /* The scope of process row. */ /* Process column *kcol* multicasts numeric values of L(:,k) to process rows. */ lsub = Lrowind_bc_ptr[0]; lusup = Lnzval_bc_ptr[0]; if ( lsub ) { msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; msgcnt[1] = lsub[1] * SuperSize( 0 ); } else { msgcnt[0] = msgcnt[1] = 0; } for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[0][pj] != EMPTY ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, &send_req[pj] ); MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, &send_req[pj+Pc] ); #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, 0, msgcnt[0], msgcnt[1], pj); #endif #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif } } /* for pj ... */ } else { /* Post immediate receives. */ if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0, scp->comm, &recv_req[0] ); MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1, scp->comm, &recv_req[1] ); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); #endif } } /* if mycol == 0 */ /* ------------------------------------------ MAIN LOOP: Loop through all block columns. ------------------------------------------ */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( mycol == kcol ) { lk = LBj( k, grid ); /* Local block number. */ for (pj = 0; pj < Pc; ++pj) { /* Wait for Isend to complete before using lsub/lusup. */ if ( ToSendR[lk][pj] != EMPTY ) { MPI_Wait( &send_req[pj], &status ); MPI_Wait( &send_req[pj+Pc], &status ); } } lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; } else { if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ scp = &grid->rscp; /* The scope of process row. */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(2); #endif /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[0]);*/ /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, (4*k)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[0], &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[1]);*/ /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, (4*k+1)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[1], &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); #if ( VAMPIR>=1 ) VT_end(2); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", iam, k, msgcnt[0], msgcnt[1], kcol); fflush(stdout); #endif lsub = Lsub_buf_2[k%2]; lusup = Lval_buf_2[k%2]; #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[0] ) ++zero_msg; #endif } else msgcnt[0] = 0; } /* if mycol = Pc(k) */ scp = &grid->cscp; /* The scope of process column. */ if ( myrow == krow ) { /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ #ifdef _CRAY pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); #else pdgstrs2(n, k, Glu_persist, grid, Llu, stat); #endif /* Multicasts U(k,:) to process columns. */ lk = LBi( k, grid ); usub = Ufstnz_br_ptr[lk]; uval = Unzval_br_ptr[lk]; if ( usub ) { msgcnt[2] = usub[2]; msgcnt[3] = usub[1]; } else { msgcnt[2] = msgcnt[3] = 0; } if ( ToSendD[lk] == YES ) { for (pi = 0; pi < Pr; ++pi) { if ( pi != myrow ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(3); #endif MPI_Send( usub, msgcnt[2], mpi_int_t, pi, (4*k+2)%NTAGS, scp->comm); MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, (4*k+3)%NTAGS, scp->comm); #if ( VAMPIR>=1 ) VT_end(3); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); #endif } /* if pi ... */ } /* for pi ... */ } /* if ToSendD ... */ } else { /* myrow != krow */ if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(4); #endif /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[2]);*/ MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, (4*k+2)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[3]);*/ MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, (4*k+3)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); #if ( VAMPIR>=1 ) VT_end(4); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif usub = Usub_buf; uval = Uval_buf; #if ( DEBUGlevel>=2 ) printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); #endif #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[2] ) ++zero_msg; #endif } else msgcnt[2] = 0; } /* if myrow == Pr(k) */ /* * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). * for (j = k+1; k < N; ++k) { * for (i = k+1; i < N; ++i) * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) * && L(i,k) != 0 && U(k,j) != 0 ) * A(i,j) = A(i,j) - L(i,k) * U(k,j); */ msg0 = msgcnt[0]; msg2 = msgcnt[2]; if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ nsupr = lsub[1]; /* LDA of lusup. */ if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; luptr0 = knsupc; nlb = lsub[0] - 1; } else { lptr0 = BC_HEADER; luptr0 = 0; nlb = lsub[0]; } lptr = lptr0; for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ ib = lsub[lptr]; lib = LBi( ib, grid ); iuip[lib] = BR_HEADER; ruip[lib] = 0; lptr += LB_DESCRIPTOR + lsub[lptr+1]; } nub = usub[0]; /* Number of blocks in the block row U(k,:) */ iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ rukp = 0; /* Pointer to nzval[] of U(k,:) */ klst = FstBlockC( k+1 ); /* --------------------------------------------------- Update the first block column A(:,k+1). --------------------------------------------------- */ jb = usub[iukp]; /* Global block number of block U(k,j). */ if ( jb == k+1 ) { /* First update (k+1)-th block. */ --nub; lptr = lptr0; luptr = luptr0; ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0, it = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[it++]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (it = 0, i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[it++]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* if jb == k+1 */ } /* if L(:,k) and U(k,:) not empty */ if ( k+1 < nsupers ) { kcol = PCOL( k+1, grid ); if ( mycol == kcol ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif /* Factor diagonal and subdiagonal blocks and test for exact singularity. */ pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif /* Process column *kcol+1* multicasts numeric values of L(:,k+1) to process rows. */ lk = LBj( k+1, grid ); /* Local block number. */ lsub1 = Lrowind_bc_ptr[lk]; if ( lsub1 ) { msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; msgcnt[1] = lsub1[1] * SuperSize( k+1 ); } else { msgcnt[0] = 0; msgcnt[1] = 0; } scp = &grid->rscp; /* The scope of process row. */ for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[lk][pj] != EMPTY ) { lusup1 = Lnzval_bc_ptr[lk]; #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, k+1, msgcnt[0], msgcnt[1], pj); #endif } } /* for pj ... */ } else { /* Post Recv of block column L(:,k+1). */ if ( ToRecv[k+1] >= 1 ) { scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); #endif } } /* if mycol == Pc(k+1) */ } /* if k+1 < nsupers */ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ /* --------------------------------------------------- Update all other blocks using block row U(k,:) --------------------------------------------------- */ for (j = 0; j < nub; ++j) { lptr = lptr0; luptr = luptr0; jb = usub[iukp]; /* Global block number of block U(k,j). */ ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } /* Skip descriptor. Now point to fstnz index of block U(i,j). */ iuip[lib] += UB_DESCRIPTOR; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0 ; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[i]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted for the L blocks. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[i]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* for j ... */ } /* if k L(:,k) and U(k,:) are not empty */ } /* ------------------------------------------ END MAIN LOOP: for k = ... ------------------------------------------ */ #if ( VAMPIR>=1 ) VT_end(100); VT_traceoff(); #endif if ( Pr*Pc > 1 ) { SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); SUPERLU_FREE(send_req); if ( U_diag_blk_send_req[myrow] ) { /* wait for last Isend requests to complete, deallocate objects */ for (krow = 0; krow < Pr; ++krow) if ( krow != myrow ) MPI_Wait(U_diag_blk_send_req + krow, &status); } SUPERLU_FREE(U_diag_blk_send_req); } SUPERLU_FREE(Llu->ujrow); SUPERLU_FREE(tempv2d); SUPERLU_FREE(indirect); SUPERLU_FREE(iuip); SUPERLU_FREE(ruip); /* Prepare error message. */ if ( *info == 0 ) *info = n + 1; #if ( PROFlevel>=1 ) TIC(t1); #endif MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; MPI_Reduce( &msg_cnt, &msg_cnt_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_cnt, &msg_cnt_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); if ( !iam ) { printf("\tPDGSTRF comm stat:" "\tAvg\tMax\t\tAvg\tMax\n" "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", msg_cnt_sum/Pr/Pc, msg_cnt_max, msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); } } #endif if ( iinfo == n + 1 ) *info = 0; else *info = iinfo; #if ( PRNTlevel==3 ) MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # total msg\t%d\n", iinfo); #endif #if ( DEBUGlevel>=2 ) for (i = 0; i < Pr * Pc; ++i) { if ( iam == i ) { dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); printf("(%d)\n", iam); PrintInt10("Recv", nsupers, Llu->ToRecv); } MPI_Barrier( grid->comm ); } #endif #if ( DEBUGlevel>=3 ) printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrf()"); #endif } /* PDGSTRF */
int sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, double *x, int *info) { /* * Purpose * ======= * * sp_dtrsv_dist() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A'*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = D, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = D, Mtype = TRU. * * x - (input/output) double* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * */ #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3; #endif SCformat *Lstore; NCformat *Ustore; double *Lval, *Uval; int incx = 1, incy = 1; double alpha = 1.0, beta = 1.0; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; double *work; flops_t solve_ops; extern SuperLUStat_t SuperLUStat; /* Test the input parameters */ *info = 0; if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2; else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); xerbla_("sp_dtrsv_dist", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = doubleCalloc_dist(L->nrow)) ) ABORT("Malloc fails for work in sp_dtrsv_dist()."); if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1); solve_ops += 2 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; x[irow] -= x[fsupc] * Lval[luptr]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif /* _CRAY */ #else dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); x[irow] -= work[i]; /* Scatter */ work[i] = 0.0; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); x[irow] -= x[fsupc] * Uval[i]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("N", strlen("N")); STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[irow] -= x[jcol] * Uval[i]; } } } } /* for k ... */ } } else { /* Form x := inv(A')*x */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 2 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); x[jcol] -= x[irow] * Lval[i]; iptr++; } } if ( nsupc > 1 ) { solve_ops += nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[jcol] -= x[irow] * Uval[i]; } } solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } SuperLUStat.ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }
float ddistribute(fact_t fact, int_t n, SuperMatrix *A, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; NCPformat *Astore; double *a; int_t *asub; int_t *xa_begin, *xa_end; int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ int *index1; /* temporary pointer to array of int */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t nfsendx = 0; /* Number of Xk I will send */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t nbsendx = 0; /* Number of Xk I will send */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t iword, dword; float mem_use = 0.0; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter ddistribute()"); #endif if ( fact == SamePattern_SameRowPerm ) { /* --------------------------------------------------------------- * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. * --------------------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Initialize Uval to zero. */ for (lb = 0; lb < nrbu; ++lb) { Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ index = Ufstnz_br_ptr[lb]; if ( index ) { uval = Unzval_br_ptr[lb]; len = index[1]; for (i = 0; i < len; ++i) uval[i] = zero; } /* if index != NULL */ } /* for lb ... */ for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA (for L), or into U directly. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); if ( gb < jb ) { /* in U */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; while ( (k = index[Urb_indptr[lb]]) < jb ) { /* Skip nonzero values in this block */ Urb_length[lb] += index[Urb_indptr[lb]+1]; /* Move pointer to the next block */ Urb_indptr[lb] += UB_DESCRIPTOR + SuperSize( k ); } /*assert(k == jb);*/ /* start fstnz */ istart = Urb_indptr[lb] + UB_DESCRIPTOR; len = Urb_length[lb]; fsupc1 = FstBlockC( gb+1 ); k = j - fsupc; /* Sum the lengths of the leading columns */ for (jj = 0; jj < k; ++jj) len += fsupc1 - index[istart++]; /*assert(irow>=index[istart]);*/ uval[len + irow - index[istart]] = a[i]; } else { /* in L; put in SPA first */ irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* -------------------------------------------------- * FIRST TIME CREATING THE L AND U DATA STRUCTURE. * -------------------------------------------------- */ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* No L and U data structures are available yet. We need to set up the L and U data structures and propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) ABORT("Malloc fails for ToRecv[]."); for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index1[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) ABORT("Malloc fails for ToSendD[]."); for (i = 0; i < k; ++i) ToSendD[i] = NO; if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Urb_fstnz[lb] = BR_HEADER; } /* for lb ... */ SUPERLU_FREE(Ucbs); #if ( PROFlevel>=1 ) t = SuperLU_timer_() - t; if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); #endif #if ( PRNTlevel>=1 ) mem_use -= 2.0*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) *sp_ienv_dist(3))))) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); #if ( PRNTlevel>=1 ) mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; #if ( PRNTlevel>=1 ) mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ for (i = xa_begin[j]; i < xa_end[j]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; dense_col = dense; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; /* NOTE: Only the first nonzero index of the segment is stored in usub[]. */ for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ bsendx_plist[ljb][pr] == EMPTY ) { bsendx_plist[ljb][pr] = YES; ++nbsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; fsupc1 = FstBlockC( gb+1 ); if (rb_marker[lb] <= jb) { /* First time see the block */ rb_marker[lb] = jb + 1; Urb_indptr[lb] = Urb_fstnz[lb];; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; /* Record the first location in index[] of the next block */ Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; len = Urb_indptr[lb];/* Start fstnz in index */ index[len-1] = 0; for (k = 0; k < nsupc; ++k) index[len+k] = fsupc1; if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { /* Already saw the block */ len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; /* Load the numerical values */ k = fsupc1 - irow; /* No. of nonzeros in segment */ index[len-1] += k; /* Increment block length in Descriptor */ irow = ilsum[lb] + irow - FstBlockC( gb ); for (ii = 0; ii < k; ++ii) { uval[Urb_length[lb]++] = dense_col[irow + ii]; dense_col[irow + ii] = zero; } } /* if myrow == pr ... */ } /* for i ... */ dense_col += ldaspa; } /* for j ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow && myrow == jbrow && /* diag. proc. owning jb */ fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { fsendx_plist[ljb][pr] = YES; ++nfsendx; } if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) { fprintf(stderr, "col block " IFMT " ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = 0.0; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->nfsendx = nfsendx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->nbsendx = nbsendx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ if ( !(Llu->mod_bit = intMalloc_dist(k)) ) ABORT("Malloc fails for mod_bit[]."); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time:\n " "\tL\t%.2f\n\tU\t%.2f\n" "\tu_blks %d\tnrbu %d\n--------\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit ddistribute()"); #endif return (mem_use); } /* DDISTRIBUTE */
void GenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist, gridinfo_t *grid, double **xact, int *ldx, double **b, int *ldb) { int_t gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows, nsupr, nsupers, rel; int_t *supno, *xsup, *lxsup; double *x, *bb; NCformat *Astore; double *Aval; n = A->ncol; *ldb = 0; supno = Glu_persist->supno; xsup = Glu_persist->xsup; nsupers = supno[n-1] + 1; iam = grid->iam; myrow = MYROW( iam, grid ); Astore = A->Store; Aval = Astore->nzval; lb = CEILING( nsupers, grid->nprow ) + 1; if ( !(lxsup = intMalloc_dist(lb)) ) ABORT("Malloc fails for lxsup[]."); lsup = 0; nlrows = 0; for (j = 0; j < nsupers; ++j) { i = PROW( j, grid ); if ( myrow == i ) { nsupr = SuperSize( j ); *ldb += nsupr; lxsup[lsup++] = nlrows; nlrows += nsupr; } } *ldx = n; if ( !(x = doubleMalloc_dist(((size_t)*ldx) * nrhs)) ) ABORT("Malloc fails for x[]."); if ( !(bb = doubleCalloc_dist(*ldb * nrhs)) ) ABORT("Calloc fails for bb[]."); for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0; /* Form b = A*x. */ for (j = 0; j < n; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; gb = supno[irow]; gbrow = PROW( gb, grid ); if ( myrow == gbrow ) { rel = irow - xsup[gb]; lb = LBi( gb, grid ); bb[lxsup[lb] + rel] += Aval[i] * x[j]; } } /* Memory allocated but not freed: xact, b */ *xact = x; *b = bb; SUPERLU_FREE(lxsup); #if ( PRNTlevel>=2 ) for (i = 0; i < grid->nprow*grid->npcol; ++i) { if ( iam == i ) { printf("\n(%d)\n", iam); PrintDouble5("rhs", *ldb, *b); } MPI_Barrier( grid->comm ); } #endif } /* GENXTRUERHS */
int_t pddistribute(fact_t fact, int_t n, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid) /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * * Purpose * ======= * Distribute the matrix onto the 2D process mesh. * * Arguments * ========= * * fact (input) fact_t * Specifies whether or not the L and U structures will be re-used. * = SamePattern_SameRowPerm: L and U structures are input, and * unchanged on exit. * = DOFACT or SamePattern: L and U structures are computed and output. * * n (input) int * Dimension of the matrix. * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * LUstruct (input) LUstruct_t* * Data structures for L and U factors. * * grid (input) gridinfo_t* * The 2D process mesh. * * Return value * ============ * > 0, working storage required (in bytes). * */ { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc; int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ int iam, jbrow, kcol, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; #if 0 NCPformat *Astore; #else /* XSL ==> */ NRformat_loc *Astore; #endif double *a; int_t *asub, *xa; #if 0 int_t *xa_begin, *xa_end; #endif int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; int_t *lsub, *xlsub, *usub, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ double *lusup, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ /*-- Counts to be used in factorization. --*/ int_t *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist; /* Column process list to send down Xk. */ int_t nfrecvx = 0; /* Number of Xk I will receive. */ int_t kseen; /*-- Counts to be used in upper triangular solve. --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist; /* Column process list to send down Xk. */ int_t nbrecvx = 0; /* Number of Xk I will receive. */ int_t *ilsum; /* starting position of each supernode in the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ int_t *Ucbs; /* number of column blocks in a block row */ int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t mem_use = 0, iword, dword; #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) double t, t_u, t_l; int_t u_blks; #endif /* Initialization. */ iam = grid->iam; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; nsupers = supno[n-1] + 1; Astore = (NRformat_loc *) A->Store; #if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(double); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pddistribute()"); #endif dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, grid, &xa, &asub, &a); if ( fact == SamePattern_SameRowPerm ) { #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We can propagate the new values of A into the existing L and U data structures. */ ilsum = Llu->ilsum; ldaspa = Llu->ldalsum; if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ if ( !(Urb_length = intCalloc_dist(nrbu)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); for (lb = 0; lb < nrbu; ++lb) Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Unzval[]. */ for (lb = 0; lb < nrbu; ++lb) { index = Ufstnz_br_ptr[lb]; if ( index && index[Urb_indptr[lb]] == jb ) { uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb] + UB_DESCRIPTOR; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj = 0, dense_col = dense; jj < nsupc; ++jj) { j = index[len+jj]; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow+i]; dense_col[irow+i] = zero; } dense_col += ldaspa; } Urb_indptr[lb] += UB_DESCRIPTOR + nsupc; } /* if index != NULL */ } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /* Gather the values of A from SPA into Lnzval[]. */ ljb = LBj( jb, grid ); /* Local block number */ index = Lrowind_bc_ptr[ljb]; if ( index ) { nrbl = index[0]; /* Number of row blocks. */ len = index[1]; /* LDA of lusup[]. */ lusup = Lnzval_bc_ptr[ljb]; next_lind = BC_HEADER; next_lval = 0; for (jj = 0; jj < nrbl; ++jj) { gb = index[next_lind++]; len1 = index[next_lind++]; /* Rows in the block. */ lb = LBi( gb, grid ); for (bnnz = 0; bnnz < len1; ++bnnz) { irow = index[next_lind++]; /* Global index. */ irow = ilsum[lb] + irow - FstBlockC( gb ); k = next_lval++; for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } /* for bnnz ... */ } /* for jj ... */ } /* if index ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ SUPERLU_FREE(dense); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } else { /* ------------------------------------------------------------ FIRST TIME CREATING THE L AND U DATA STRUCTURES. ------------------------------------------------------------*/ #if ( PROFlevel>=1 ) t_l = t_u = 0; u_blks = 0; #endif /* We first need to set up the L and U data structures and then * propagate the values of A into them. */ lsub = Glu_freeable->lsub; /* compressed L subscripts */ xlsub = Glu_freeable->xlsub; usub = Glu_freeable->usub; /* compressed U subscripts */ xusub = Glu_freeable->xusub; if ( !(ToRecv = intCalloc_dist(nsupers)) ) ABORT("Calloc fails for ToRecv[]."); k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index = intMalloc_dist(j)) ) ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) mem_use = k*sizeof(int_t*) + (j + nsupers)*iword; #endif for (i = 0; i < j; ++i) index[i] = EMPTY; for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ /* Pointers to the beginning of each block row of U. */ if ( !(Unzval_br_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Unzval_br_ptr[]."); if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Ufstnz_br_ptr[]."); if ( !(ToSendD = intCalloc_dist(k)) ) ABORT("Malloc fails for ToSendD[]."); if ( !(ilsum = intMalloc_dist(k+1)) ) ABORT("Malloc fails for ilsum[]."); /* Auxiliary arrays used to set up U block data structures. They are freed on return. */ if ( !(rb_marker = intCalloc_dist(k)) ) ABORT("Calloc fails for rb_marker[]."); if ( !(Urb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_length[]."); if ( !(Urb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Urb_indptr[]."); if ( !(Urb_fstnz = intCalloc_dist(k)) ) ABORT("Calloc fails for Urb_fstnz[]."); if ( !(Ucbs = intCalloc_dist(k)) ) ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword; #endif /* Compute ldaspa and ilsum[]. */ ldaspa = 0; ilsum[0] = 0; for (gb = 0; gb < nsupers; ++gb) { if ( myrow == PROW( gb, grid ) ) { i = SuperSize( gb ); ldaspa += i; lb = LBi( gb, grid ); ilsum[lb + 1] = ilsum[lb] + i; } } /* ------------------------------------------------------------ COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). ------------------------------------------------------------*/ /* Loop through each supernode column. */ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); /* Loop through each column in the block. */ for (j = fsupc; j < fsupc + nsupc; ++j) { /* usub[*] contains only "first nonzero" in each segment. */ for (i = xusub[j]; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero of the segment. */ gb = BlockNum( irow ); kcol = PCOL( gb, grid ); ljb = LBj( gb, grid ); if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; pr = PROW( gb, grid ); lb = LBi( gb, grid ); if ( mycol == pc ) { if ( myrow == pr ) { ToSendD[lb] = YES; /* Count nonzeros in entire block row. */ Urb_length[lb] += FstBlockC( gb+1 ) - irow; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; Urb_fstnz[lb] += nsupc; ++Ucbs[lb]; /* Number of column blocks in block row lb. */ #if ( PRNTlevel>=1 ) ++nUblocks; #endif } ToRecv[gb] = 1; } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ } } /* for i ... */ } /* for j ... */ } /* for jb ... */ /* Set up the initial pointers for each block row in U. */ nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ for (lb = 0; lb < nrbu; ++lb) { len = Urb_length[lb]; rb_marker[lb] = 0; /* Reset block marker. */ if ( len ) { /* Add room for descriptors */ len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1+1)) ) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) ABORT("Malloc fails for Unzval_br_ptr[*][]."); mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); index[0] = Ucbs[lb]; /* Number of column blocks */ index[1] = len; /* Total length of nzval[] */ index[2] = len1; /* Total length of index[] */ index[len1] = -1; /* End marker */ } else { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ } /* for lb ... */ SUPERLU_FREE(Urb_fstnz); SUPERLU_FREE(Ucbs); #if ( PRNTlevel>=1 ) mem_use -= 2*k * iword; #endif /* Auxiliary arrays used to set up L block data structures. They are freed on return. k is the number of local row blocks. */ if ( !(Lrb_length = intCalloc_dist(k)) ) ABORT("Calloc fails for Lrb_length[]."); if ( !(Lrb_number = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_number[]."); if ( !(Lrb_indptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_indptr[]."); if ( !(Lrb_valptr = intMalloc_dist(k)) ) ABORT("Malloc fails for Lrb_valptr[]."); if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ if ( !(fmod = intCalloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); if ( !(bmod = intCalloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); /* ------------------------------------------------ */ #if ( PRNTlevel>=1 ) mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ /* Pointers to the beginning of each block column of L. */ if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) ABORT("Malloc fails for Lnzval_bc_ptr[]."); if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) fsendx_plist[i] = &index[j]; if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for bsendx_plist[]."); if ( !(index = intMalloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); for (i = 0; i < len; ++i) index[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) bsendx_plist[i] = &index[j]; /* -------------------------------------------------------------- */ #if ( PRNTlevel>=1 ) mem_use += 4*k*sizeof(int_t*) + 2*len*iword; #endif /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ for (jb = 0; jb < nsupers; ++jb) { pc = PCOL( jb, grid ); if ( mycol == pc ) { /* Block column jb in my process column */ fsupc = FstBlockC( jb ); nsupc = SuperSize( jb ); ljb = LBj( jb, grid ); /* Local block number */ /* Scatter A into SPA. */ for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { for (i = xa[j]; i < xa[j+1]; ++i) { irow = asub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); irow = ilsum[lb] + irow - FstBlockC( gb ); dense_col[irow] = a[i]; } } dense_col += ldaspa; } jbrow = PROW( jb, grid ); #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP U BLOCKS. *------------------------------------------------*/ kseen = 0; /* Loop through each column in the block column. */ for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { istart = xusub[j]; for (i = istart; i < xusub[j+1]; ++i) { irow = usub[i]; /* First nonzero in the segment. */ gb = BlockNum( irow ); pr = PROW( gb, grid ); if ( pr != jbrow ) bsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ index = Ufstnz_br_ptr[lb]; if (rb_marker[lb] <= jb) {/* First see the block */ rb_marker[lb] = jb + 1; index[Urb_indptr[lb]] = jb; /* Descriptor */ Urb_indptr[lb] += UB_DESCRIPTOR; len = Urb_indptr[lb]; for (k = 0; k < nsupc; ++k) index[len+k] = FstBlockC( gb+1 ); if ( gb != jb )/* Exclude diagonal block. */ ++bmod[lb];/* Mod. count for back solve */ if ( kseen == 0 && myrow != jbrow ) { ++nbrecvx; kseen = 1; } } else { len = Urb_indptr[lb];/* Start fstnz in index */ } jj = j - fsupc; index[len+jj] = irow; } /* if myrow == pr ... */ } /* for i ... */ } /* for j ... */ /* Figure out how many nonzeros in each block, and gather the initial values of A from SPA into Uval. */ for (lb = 0; lb < nrbu; ++lb) { if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */ index = Ufstnz_br_ptr[lb]; uval = Unzval_br_ptr[lb]; len = Urb_indptr[lb]; gb = lb * grid->nprow + myrow;/* Global block number */ k = FstBlockC( gb+1 ); irow = ilsum[lb] - FstBlockC( gb ); for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) { j = index[len+jj]; /* First nonzero in segment. */ bnnz += k - j; for (i = j; i < k; ++i) { uval[Urb_length[lb]++] = dense_col[irow + i]; dense_col[irow + i] = zero; } dense_col += ldaspa; } index[len-1] = bnnz; /* Set block length in Descriptor */ Urb_indptr[lb] += nsupc; } } /* for lb ... */ #if ( PROFlevel>=1 ) t_u += SuperLU_timer_() - t; t = SuperLU_timer_(); #endif /*------------------------------------------------ * SET UP L BLOCKS. *------------------------------------------------*/ /* Count number of blocks and length of each block. */ nrbl = 0; len = 0; /* Number of row subscripts I own. */ kseen = 0; istart = xlsub[fsupc]; for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); /* Global block number */ pr = PROW( gb, grid ); /* Process row owning this block */ if ( pr != jbrow ) fsendx_plist[ljb][pr] = YES; if ( myrow == pr ) { lb = LBi( gb, grid ); /* Local block number */ if (rb_marker[lb] <= jb) { /* First see this block */ rb_marker[lb] = jb + 1; Lrb_length[lb] = 1; Lrb_number[nrbl++] = gb; if ( gb != jb ) /* Exclude diagonal block. */ ++fmod[lb]; /* Mod. count for forward solve */ if ( kseen == 0 && myrow != jbrow ) { ++nfrecvx; kseen = 1; } #if ( PRNTlevel>=1 ) ++nLblocks; #endif } else { ++Lrb_length[lb]; } ++len; } } /* for i ... */ if ( nrbl ) { /* Do not ensure the blocks are sorted! */ /* Set up the initial pointers for each block in index[] and nzval[]. */ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if ( !(index = intMalloc_dist(len1)) ) ABORT("Malloc fails for index[]"); Lrowind_bc_ptr[ljb] = index; if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(len*nsupc))) { fprintf(stderr, "col block %d ", jb); ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); } mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); index[0] = nrbl; /* Number of row blocks */ index[1] = len; /* LDA of the nzval[] */ next_lind = BC_HEADER; next_lval = 0; for (k = 0; k < nrbl; ++k) { gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; Lrb_indptr[lb] = next_lind; Lrb_valptr[lb] = next_lval; next_lind += len; next_lval += len; } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; gb = BlockNum( irow ); if ( myrow == PROW( gb, grid ) ) { lb = LBi( gb, grid ); k = Lrb_indptr[lb]++; /* Random access a block */ index[k] = irow; k = Lrb_valptr[lb]++; irow = ilsum[lb] + irow - FstBlockC( gb ); for (j = 0, dense_col = dense; j < nsupc; ++j) { lusup[k] = dense_col[irow]; dense_col[irow] = zero; k += len; dense_col += ldaspa; } } } /* for i ... */ } else { Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; } /* if nrbl ... */ #if ( PROFlevel>=1 ) t_l += SuperLU_timer_() - t; #endif } /* if mycol == pc */ } /* for jb ... */ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; Llu->ToSendD = ToSendD; Llu->ToSendR = ToSendR; Llu->fmod = fmod; Llu->fsendx_plist = fsendx_plist; Llu->nfrecvx = nfrecvx; Llu->bmod = bmod; Llu->bsendx_plist = bsendx_plist; Llu->nbrecvx = nbrecvx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", nLblocks, nUblocks); #endif SUPERLU_FREE(rb_marker); SUPERLU_FREE(Urb_length); SUPERLU_FREE(Urb_indptr); SUPERLU_FREE(Lrb_length); SUPERLU_FREE(Lrb_number); SUPERLU_FREE(Lrb_indptr); SUPERLU_FREE(Lrb_valptr); SUPERLU_FREE(dense); /* Find the maximum buffer size. */ MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); #if ( PROFlevel>=1 ) if ( !iam ) printf(".. 1st distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ SUPERLU_FREE(xa); SUPERLU_FREE(asub); SUPERLU_FREE(a); #if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ CHECK_MALLOC(iam, "Exit pddistribute()"); #endif return (mem_use); } /* PDDISTRIBUTE */
void pdgstrsL(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, double *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double alpha = 1.0; double zero = 0.0; double *lsum; /* Local running sum of the updates to B-components */ double *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ double *lusup, *dest; double *recvbuf, *tempv; double *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; double **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PDGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrsL()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol != kcol && fmod[lk] ) mod_bit[lk] = 1; /* contribution from off-diagonal */ } } /*PrintInt10("mod_bit", nlb, mod_bit);*/ #if ( PROFlevel>=2 ) t_reduce_tmp = SuperLU_timer_(); #endif /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); #if ( PROFlevel>=2 ) t_reduce += SuperLU_timer_() - t_reduce_tmp; #endif for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* diagonal process */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = *recvbuf; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) x[i + ii + j*knsupc] += tempv[i + j*knsupc]; } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /* Re-distribute X on the diagonal processes to B distributed on all the processes. */ pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Deallocate storage. */ SUPERLU_FREE(lsum); SUPERLU_FREE(x); SUPERLU_FREE(recvbuf); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); SUPERLU_FREE(send_req); #endif stat->utime[SOLVE] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrsL()"); #endif } /* PDGSTRS */
void pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *B, int_t ldb, int nrhs, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double alpha = 1.0; double *lsum; /* Local running sum of the updates to B-components */ double *x; /* X component at step k. */ double *lusup, *dest; double *recvbuf, *tempv; double *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; double **Lnzval_bc_ptr; MPI_Status status; #if defined (ISEND_IRECV) || defined (BSEND) MPI_Request *send_req, recv_req; #endif /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for L-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PDGSTRS_BGLOBAL", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()"); #endif /* Save the count to be altered so it can be used by subsequent call to PDGSTRS_BGLOBAL. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #if defined (ISEND_IRECV) || defined (BSEND) k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* * Copy B into X on the diagonal processes. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* Diagonal process. */ jj = X_BLK( lk ); x[jj - XK_H] = k; /* Block number prepended in the header. */ RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ x[i + jj + j*knsupc] = B[i + ii + j*ldb]; } }
void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *x, int nrhs, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PDGSTRS1 solves a system of distributed linear equations * * op( sub(A) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PDGSTRF. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures to store L and U factors, * and the permutation vectors. * See superlu_ddefs.h for the definition of 'LUstruct_t' structure. * * grid (input) gridinfo_t* * The 2D process mesh. * * x (input/output) double* * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * NOTE: the right-hand side matrix is already distributed on * the diagonal processes. * * nrhs (input) int (global) * Number of right-hand sides. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves; * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double alpha = 1.0; double *lsum; /* Local running sum of the updates to B-components */ double *lusup, *dest; double *recvbuf, *tempv; double *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; double **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for L-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -8; if ( *info ) { pxerbla("PDGSTRS1", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ Llu->SolveMsgSent = 0; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrs1()"); #endif /* Save the count to be altered so it can be used by subsequent call to PDGSTRS1. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Compute ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) ) ABORT("Calloc fails for lsum[]."); maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* * Prepended the block number in the header for lsum[]. */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H] = k; } } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( !frecv[lk] && !fmod[lk] ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* * Compute the internal nodes asynchronously by all processes. */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = *recvbuf; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) x[i + ii + j*knsupc] += tempv[i + j*knsupc]; if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel>=2 ) if ( !iam ) printf("\n.. After L-solve: y =\n"); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); } MPI_Barrier( grid->comm ); } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PDGSTRS1. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; } }