void p2p(MPI_Comm comm)
{
    int id = 0, numprocs, value = 42, i, dest;
    MPI_Request req;
    MPI_Status stat;

    VT_begin(3);
    MPI_Comm_size(comm, &numprocs);
    MPI_Comm_rank(comm, &id);
    MPI_Barrier(comm);
    for (i = 0; i < 5; i++) {
        if ( id % 2 == 0 ) {
            value = 42;
            dest = (id + 1 + numprocs) % numprocs;
            if ( dest != id && dest < numprocs ) {
                MPI_Isend(&value, 1, MPI_INT, dest, 42, comm, &req);
                MPI_Wait(&req, &stat);
            }
        } else {
            value = 0;
            MPI_Irecv(&value, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
                      comm, &req);
            MPI_Wait(&req, &stat);
        }
    }
    MPI_Barrier(comm);
    VT_end(3);
}
int main(int argc, char* argv[])
{
    int id = 0, numprocs;
    int color;
    MPI_Comm local;
    double t1, t2;

    VT_initialize(&argc, &argv);
    VT_symdef(1, "step", "USR");
    VT_symdef(2, "sequential", "USR");
    VT_symdef(3, "p2p", "USR");
    VT_symdef(4, "parallel", "USR");
    VT_symdef(5, "main", "USR");
    VT_begin(5);

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    printf("%03d: ctest-vt start\n", id);

    color = (id >= numprocs/2);
    MPI_Comm_split(MPI_COMM_WORLD, color, id, &local);

    t1 = MPI_Wtime();
    parallel(MPI_COMM_WORLD);
    parallel(local);
    parallel(MPI_COMM_WORLD);
    t2 = MPI_Wtime();

    MPI_Comm_free(&local);
    MPI_Finalize();
    printf("%03d: ctest-vt end (%12.9f)\n", id, (t2-t1));

    VT_end(5);
    VT_finalize();
    return 0;
}
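/*
 * main() above splits MPI_COMM_WORLD with color = (id >= numprocs/2), i.e. the
 * lower and upper halves of the ranks each get their own sub-communicator,
 * ordered by the original rank (the key argument). The sketch below shows the
 * same split/use/free pattern in isolation; split_halves is an illustrative
 * name and not part of this test program.
 */
static void split_halves(int id, int numprocs)
{
    MPI_Comm half;
    int color = (id >= numprocs / 2);   /* 0 = lower half, 1 = upper half */

    /* Ranks with the same color end up in the same new communicator. */
    MPI_Comm_split(MPI_COMM_WORLD, color, id, &half);
    /* ... collective or point-to-point work on the half-sized group ... */
    MPI_Comm_free(&half);
}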
double step(int id, int tid, int i)
{
    double r;

    VT_begin(1);
    /* printf("%d:%d: step %d\n", id, tid, i); */
    r = do_work(100);
    VT_end(1);
    return r;
}
void parallel(MPI_Comm comm)
{
    int id = 0, numprocs, value = 42;
    double sum = 0.0, gsum = 0.0;

    VT_begin(4);
    MPI_Comm_size(comm, &numprocs);
    MPI_Comm_rank(comm, &id);
    MPI_Bcast(&value, 1, MPI_INT, numprocs-1, comm);
    sum += sequential(id);
    MPI_Barrier(comm);
    sum += sequential(id);
    MPI_Barrier(comm);
    sum += sequential(id);
    MPI_Allreduce(&sum, &gsum, 1, MPI_DOUBLE, MPI_SUM, comm);
    p2p(comm);
    VT_end(4);
}
double sequential(int id)
{
    int i, tid = 0;
    double sum;

    VT_begin(2);
    sum = do_work(100);
#pragma omp parallel private(tid)
    {
#ifdef OPENMP
        tid = omp_get_thread_num();
#endif
#pragma omp for reduction(+:sum)
        for (i = 0; i < 12; i++) {
            sum += step(id, tid, i);
        }
    }
    sum += do_work(100);
    VT_end(2);
    return sum;
}
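/*
 * sequential() above nests a worksharing loop with a sum reduction inside an
 * explicit parallel region. A minimal stand-alone sketch of the same OpenMP
 * reduction pattern follows; reduce_example is an illustrative name, not part
 * of the test, and it assumes the file is compiled with OpenMP enabled.
 */
static double reduce_example(void)
{
    double sum = 0.0;
    int i;

    /* Each thread accumulates into a private copy of sum; the copies are
       combined with + when the loop ends. */
#pragma omp parallel for reduction(+:sum)
    for (i = 0; i < 12; i++) {
        sum += (double)i;
    }
    return sum;
}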
int_t
pdgstrf
/************************************************************************/
(
 superlu_options_t *options, int m, int n, double anorm,
 LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
 )
/*
 * Purpose
 * =======
 *
 * PDGSTRF performs the LU factorization in parallel.
 *
 * Arguments
 * =========
 *
 * options (input) superlu_options_t*
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following field should be defined:
 *         o ReplaceTinyPivot (yes_no_t)
 *           Specifies whether to replace the tiny diagonals by
 *           sqrt(epsilon)*norm(A) during LU factorization.
 *
 * m       (input) int
 *         Number of rows in the matrix.
 *
 * n       (input) int
 *         Number of columns in the matrix.
 *
 * anorm   (input) double
 *         The norm of the original matrix A, or the scaled A if
 *         equilibration was done.
 *
 * LUstruct (input/output) LUstruct_t*
 *         The data structures to store the distributed L and U factors.
 *         The following fields should be defined:
 *
 *         o Glu_persist (input) Glu_persist_t*
 *           Global data structure (xsup, supno) replicated on all processes,
 *           describing the supernode partition in the factored matrices
 *           L and U:
 *             xsup[s] is the leading column of the s-th supernode,
 *             supno[i] is the supernode number to which column i belongs.
 *
 *         o Llu (input/output) LocalLU_t*
 *           The distributed data structures to store L and U factors.
 *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * grid    (input) gridinfo_t*
 *         The 2D process mesh. It contains the MPI communicator, the number
 *         of process rows (NPROW), the number of process columns (NPCOL),
 *         and my process rank. It is an input argument to all the
 *         parallel routines.
 *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * stat    (output) SuperLUStat_t*
 *         Record the statistics on runtime and floating-point operation count.
 *         See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *         = 0: successful exit
 *         < 0: if info = -i, the i-th argument had an illegal value
 *         > 0: if info = i, U(i,i) is exactly zero. The factorization has
 *              been completed, but the factor U is exactly singular,
 *              and division by zero will occur if it is used to solve a
 *              system of equations.
 */
{
#ifdef _CRAY
    _fcd ftcs  = _cptofcd("N", strlen("N"));
    _fcd ftcs1 = _cptofcd("L", strlen("L"));
    _fcd ftcs2 = _cptofcd("N", strlen("N"));
    _fcd ftcs3 = _cptofcd("U", strlen("U"));
#endif
    double alpha = 1.0, beta = 0.0;
    int_t  *xsup;
    int_t  *lsub, *lsub1, *usub, *Usub_buf,
           *Lsub_buf_2[2];   /* Need 2 buffers to implement Irecv. */
    double *lusup, *lusup1, *uval, *Uval_buf,
           *Lval_buf_2[2];   /* Need 2 buffers to implement Irecv. */
    int_t  fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
           lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
           nlb, nub, nsupc, rel, rukp;
    int_t  Pc, Pr;
    int    iam, kcol, krow, mycol, myrow, pi, pj;
    int    j, k, lk, nsupers;
    int    nsupr, nbrow, segsize;
    int    msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
                       *     0 : transferred in Lsub_buf[]
                       *     1 : transferred in Lval_buf[]
                       *     2 : transferred in Usub_buf[]
                       *     3 : transferred in Uval_buf[]
                       */
    int_t  msg0, msg2;
    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
    double **Unzval_br_ptr, **Lnzval_bc_ptr;
    int_t  *index;
    double *nzval;
    int_t  *iuip, *ruip;  /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
    double *ucol;
    int_t  *indirect;
    double *tempv, *tempv2d;
    int_t  iinfo;
    int_t  *ToRecv, *ToSendD, **ToSendR;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    superlu_scope_t *scp;
    float  s_eps;
    double thresh;
    double *tempU2d, *tempu;
    int    full, ldt, ldu, lead_zero, ncols;
    MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL;
    MPI_Status status;
#if ( DEBUGlevel>=2 )
    int_t num_copy = 0, num_update = 0;
#endif
#if ( PRNTlevel==3 )
    int_t zero_msg = 0, total_msg = 0;
#endif
#if ( PROFlevel>=1 )
    double t1, t2;
    float msg_vol = 0, msg_cnt = 0;
    int_t iword = sizeof(int_t), dword = sizeof(double);
#endif

    /* Test the input parameters. */
    *info = 0;
    if ( m < 0 ) *info = -2;
    else if ( n < 0 ) *info = -3;
    if ( *info ) {
        pxerbla("pdgstrf", grid, -*info);
        return (-1);
    }

    /* Quick return if possible. */
    if ( m == 0 || n == 0 ) return 0;

    /*
     * Initialization.
     */
    iam = grid->iam;
    Pc = grid->npcol;
    Pr = grid->nprow;
    myrow = MYROW( iam, grid );
    mycol = MYCOL( iam, grid );
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    s_eps = slamch_("Epsilon");
    thresh = s_eps * anorm;
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgstrf()");
#endif

    stat->ops[FACT] = 0.0;

    if ( Pr*Pc > 1 ) {
        i = Llu->bufmax[0];
        if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
            ABORT("Malloc fails for Lsub_buf.");
        Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
        i = Llu->bufmax[1];
        if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
            ABORT("Malloc fails for Lval_buf[].");
        Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
        if ( Llu->bufmax[2] != 0 )
            if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
                ABORT("Malloc fails for Usub_buf[].");
        if ( Llu->bufmax[3] != 0 )
            if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
                ABORT("Malloc fails for Uval_buf[].");
        if ( !(U_diag_blk_send_req = (MPI_Request *)
               SUPERLU_MALLOC(Pr*sizeof(MPI_Request))) )
            ABORT("Malloc fails for U_diag_blk_send_req[].");
        U_diag_blk_send_req[myrow] = 0;  /* flag no outstanding Isend */
        if ( !(send_req = (MPI_Request *)
               SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))) )
            ABORT("Malloc fails for send_req[].");
    }
    k = sp_ienv_dist(3);  /* max supernode size */
    if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) )
        ABORT("Malloc fails for ujrow[].");

#if ( PRNTlevel>=1 )
    if ( !iam ) {
        printf(".. thresh = s_eps %e * anorm %e = %e\n",
               s_eps, anorm, thresh);
        printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
               Llu->bufmax[0], Llu->bufmax[1], Llu->bufmax[2],
               Llu->bufmax[3], Llu->bufmax[4]);
    }
#endif

    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
    Lval_buf_2[0] = Llu->Lval_buf_2[0];
    Lval_buf_2[1] = Llu->Lval_buf_2[1];
    Usub_buf = Llu->Usub_buf;
    Uval_buf = Llu->Uval_buf;
    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    Unzval_br_ptr = Llu->Unzval_br_ptr;
    ToRecv = Llu->ToRecv;
    ToSendD = Llu->ToSendD;
    ToSendR = Llu->ToSendR;

    ldt = sp_ienv_dist(3);  /* Size of maximum supernode */
    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
        ABORT("Calloc fails for tempv2d[].");
    tempU2d = tempv2d + ldt*ldt;
    if ( !(indirect = intMalloc_dist(ldt)) )
        ABORT("Malloc fails for indirect[].");
    k = CEILING( nsupers, Pr );  /* Number of local block rows */
    if ( !(iuip = intMalloc_dist(k)) ) ABORT("Malloc fails for iuip[].");
    if ( !(ruip = intMalloc_dist(k)) ) ABORT("Malloc fails for ruip[].");

#if ( VAMPIR>=1 )
    VT_symdef(1, "Send-L", "Comm");
    VT_symdef(2, "Recv-L", "Comm");
    VT_symdef(3, "Send-U", "Comm");
    VT_symdef(4, "Recv-U", "Comm");
    VT_symdef(5, "TRF2", "Factor");
    VT_symdef(100, "Factor", "Factor");
    VT_begin(100);
    VT_traceon();
#endif

    /* ---------------------------------------------------------------
       Handle the first block column separately to start the pipeline.
       --------------------------------------------------------------- */
    if ( mycol == 0 ) {
#if ( VAMPIR>=1 )
        VT_begin(5);
#endif
        pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu,
                 U_diag_blk_send_req, stat, info);
#if ( VAMPIR>=1 )
        VT_end(5);
#endif

        scp = &grid->rscp;  /* The scope of process row. */

        /* Process column *kcol* multicasts numeric values of L(:,k)
           to process rows. */
        lsub = Lrowind_bc_ptr[0];
        lusup = Lnzval_bc_ptr[0];
        if ( lsub ) {
            msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
            msgcnt[1] = lsub[1] * SuperSize( 0 );
        } else {
            msgcnt[0] = msgcnt[1] = 0;
        }

        for (pj = 0; pj < Pc; ++pj) {
            if ( ToSendR[0][pj] != EMPTY ) {
#if ( PROFlevel>=1 )
                TIC(t1);
#endif
#if ( VAMPIR>=1 )
                VT_begin(1);
#endif
                MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0,
                           scp->comm, &send_req[pj] );
                MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1,
                           scp->comm, &send_req[pj+Pc] );
#if ( DEBUGlevel>=2 )
                printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
                       iam, 0, msgcnt[0], msgcnt[1], pj);
#endif
#if ( VAMPIR>=1 )
                VT_end(1);
#endif
#if ( PROFlevel>=1 )
                TOC(t2, t1);
                stat->utime[COMM] += t2;
                msg_cnt += 2;
                msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
            }
        } /* for pj ... */
    } else {  /* Post immediate receives. */
        if ( ToRecv[0] >= 1 ) {  /* Recv block column L(:,0). */
            scp = &grid->rscp;  /* The scope of process row. */
            MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0,
                       scp->comm, &recv_req[0] );
            MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1,
                       scp->comm, &recv_req[1] );
#if ( DEBUGlevel>=2 )
            printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
#endif
        }
    } /* if mycol == 0 */

    /* ------------------------------------------
       MAIN LOOP: Loop through all block columns.
       ------------------------------------------ */
    for (k = 0; k < nsupers; ++k) {

        knsupc = SuperSize( k );
        krow = PROW( k, grid );
        kcol = PCOL( k, grid );

        if ( mycol == kcol ) {
            lk = LBj( k, grid );  /* Local block number. */

            for (pj = 0; pj < Pc; ++pj) {
                /* Wait for Isend to complete before using lsub/lusup. */
                if ( ToSendR[lk][pj] != EMPTY ) {
                    MPI_Wait( &send_req[pj], &status );
                    MPI_Wait( &send_req[pj+Pc], &status );
                }
            }
            lsub = Lrowind_bc_ptr[lk];
            lusup = Lnzval_bc_ptr[lk];
        } else {
            if ( ToRecv[k] >= 1 ) {  /* Recv block column L(:,k). */
                scp = &grid->rscp;  /* The scope of process row. */
#if ( PROFlevel>=1 )
                TIC(t1);
#endif
#if ( VAMPIR>=1 )
                VT_begin(2);
#endif
                /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm,
                             Llu->bufmax[0]);*/
                /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
                            (4*k)%NTAGS, scp->comm, &status );*/
                MPI_Wait( &recv_req[0], &status );
                MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
                /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm,
                             Llu->bufmax[1]);*/
                /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol,
                            (4*k+1)%NTAGS, scp->comm, &status );*/
                MPI_Wait( &recv_req[1], &status );
                MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
#if ( VAMPIR>=1 )
                VT_end(2);
#endif
#if ( PROFlevel>=1 )
                TOC(t2, t1);
                stat->utime[COMM] += t2;
#endif
#if ( DEBUGlevel>=2 )
                printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
                       iam, k, msgcnt[0], msgcnt[1], kcol);
                fflush(stdout);
#endif
                lsub = Lsub_buf_2[k%2];
                lusup = Lval_buf_2[k%2];
#if ( PRNTlevel==3 )
                ++total_msg;
                if ( !msgcnt[0] ) ++zero_msg;
#endif
            } else msgcnt[0] = 0;
        } /* if mycol = Pc(k) */

        scp = &grid->cscp;  /* The scope of process column. */

        if ( myrow == krow ) {
            /* Parallel triangular solve across process row *krow* --
               U(k,j) = L(k,k) \ A(k,j).  */
#ifdef _CRAY
            pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
#else
            pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
#endif

            /* Multicasts U(k,:) to process columns. */
            lk = LBi( k, grid );
            usub = Ufstnz_br_ptr[lk];
            uval = Unzval_br_ptr[lk];
            if ( usub ) {
                msgcnt[2] = usub[2];
                msgcnt[3] = usub[1];
            } else {
                msgcnt[2] = msgcnt[3] = 0;
            }

            if ( ToSendD[lk] == YES ) {
                for (pi = 0; pi < Pr; ++pi) {
                    if ( pi != myrow ) {
#if ( PROFlevel>=1 )
                        TIC(t1);
#endif
#if ( VAMPIR>=1 )
                        VT_begin(3);
#endif
                        MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
                                  (4*k+2)%NTAGS, scp->comm);
                        MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
                                  (4*k+3)%NTAGS, scp->comm);
#if ( VAMPIR>=1 )
                        VT_end(3);
#endif
#if ( PROFlevel>=1 )
                        TOC(t2, t1);
                        stat->utime[COMM] += t2;
                        msg_cnt += 2;
                        msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
#endif
#if ( DEBUGlevel>=2 )
                        printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
#endif
                    } /* if pi ... */
                } /* for pi ... */
            } /* if ToSendD ... */
        } else {  /* myrow != krow */
            if ( ToRecv[k] == 2 ) {  /* Recv block row U(k,:). */
#if ( PROFlevel>=1 )
                TIC(t1);
#endif
#if ( VAMPIR>=1 )
                VT_begin(4);
#endif
                /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm,
                             Llu->bufmax[2]);*/
                MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
                          (4*k+2)%NTAGS, scp->comm, &status );
                MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
                /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm,
                             Llu->bufmax[3]);*/
                MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
                          (4*k+3)%NTAGS, scp->comm, &status );
                MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
#if ( VAMPIR>=1 )
                VT_end(4);
#endif
#if ( PROFlevel>=1 )
                TOC(t2, t1);
                stat->utime[COMM] += t2;
#endif
                usub = Usub_buf;
                uval = Uval_buf;
#if ( DEBUGlevel>=2 )
                printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
#endif
#if ( PRNTlevel==3 )
                ++total_msg;
                if ( !msgcnt[2] ) ++zero_msg;
#endif
            } else msgcnt[2] = 0;
        } /* if myrow == Pr(k) */
        /*
         * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
         *  for (j = k+1; j < N; ++j) {
         *      for (i = k+1; i < N; ++i)
         *          if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
         *               && L(i,k) != 0 && U(k,j) != 0 )
         *              A(i,j) = A(i,j) - L(i,k) * U(k,j);
         */
        msg0 = msgcnt[0];
        msg2 = msgcnt[2];
        if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
            nsupr = lsub[1];   /* LDA of lusup. */
            if ( myrow == krow ) {  /* Skip diagonal block L(k,k). */
                lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
                luptr0 = knsupc;
                nlb = lsub[0] - 1;
            } else {
                lptr0 = BC_HEADER;
                luptr0 = 0;
                nlb = lsub[0];
            }
            lptr = lptr0;
            for (lb = 0; lb < nlb; ++lb) {  /* Initialize block row pointers. */
                ib = lsub[lptr];
                lib = LBi( ib, grid );
                iuip[lib] = BR_HEADER;
                ruip[lib] = 0;
                lptr += LB_DESCRIPTOR + lsub[lptr+1];
            }
            nub = usub[0];      /* Number of blocks in the block row U(k,:) */
            iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
            rukp = 0;           /* Pointer to nzval[] of U(k,:) */
            klst = FstBlockC( k+1 );

            /* ---------------------------------------------------
               Update the first block column A(:,k+1).
               --------------------------------------------------- */
            jb = usub[iukp];    /* Global block number of block U(k,j). */
            if ( jb == k+1 ) {  /* First update (k+1)-th block. */
                --nub;
                lptr = lptr0;
                luptr = luptr0;
                ljb = LBj( jb, grid );   /* Local block number of U(k,j). */
                nsupc = SuperSize( jb );
                iukp += UB_DESCRIPTOR;   /* Start fstnz of block U(k,j). */

                /* Prepare to call DGEMM. */
                jj = iukp;
                while ( usub[jj] == klst ) ++jj;
                ldu = klst - usub[jj++];
                ncols = 1;
                full = 1;
                for (; jj < iukp+nsupc; ++jj) {
                    segsize = klst - usub[jj];
                    if ( segsize ) {
                        ++ncols;
                        if ( segsize != ldu ) full = 0;
                        if ( segsize > ldu ) ldu = segsize;
                    }
                }
#if ( DEBUGlevel>=3 )
                ++num_update;
#endif
                if ( full ) {
                    tempu = &uval[rukp];
                } else {  /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
                    printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
                           iam, full, k, jb, ldu, ncols, nsupc);
                    ++num_copy;
#endif
                    tempu = tempU2d;
                    for (jj = iukp; jj < iukp+nsupc; ++jj) {
                        segsize = klst - usub[jj];
                        if ( segsize ) {
                            lead_zero = ldu - segsize;
                            for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
                            tempu += lead_zero;
                            for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
                            rukp += segsize;
                            tempu += segsize;
                        }
                    }
                    tempu = tempU2d;
                    rukp -= usub[iukp - 1];  /* Return to start of U(k,j). */
                } /* if full ... */

                for (lb = 0; lb < nlb; ++lb) {
                    ib = lsub[lptr];        /* Row block L(i,k). */
                    nbrow = lsub[lptr+1];   /* Number of full rows. */
                    lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
                    tempv = tempv2d;
#ifdef _CRAY
                    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
                          &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                          tempu, &ldu, &beta, tempv, &ldt);
#elif defined (USE_VENDOR_BLAS)
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt, 1, 1);
#else
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt);
#endif
                    stat->ops[FACT] += 2 * nbrow * ldu * ncols;

                    /* Now gather the result into the destination block. */
                    if ( ib < jb ) {  /* A(i,j) is in U. */
                        ilst = FstBlockC( ib+1 );
                        lib = LBi( ib, grid );
                        index = Ufstnz_br_ptr[lib];
                        ijb = index[iuip[lib]];
                        while ( ijb < jb ) {  /* Search for dest block. */
                            ruip[lib] += index[iuip[lib]+1];
                            iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
                            ijb = index[iuip[lib]];
                        }
                        iuip[lib] += UB_DESCRIPTOR;  /* Skip descriptor. */
                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            fnz = index[iuip[lib]++];
                            if ( segsize ) {  /* Nonzero segment in U(k,j). */
                                ucol = &Unzval_br_ptr[lib][ruip[lib]];
                                for (i = 0, it = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    ucol[rel] -= tempv[it++];
                                }
                                tempv += ldt;
                            }
                            ruip[lib] += ilst - fnz;
                        }
                    } else {  /* A(i,j) is in L. */
                        index = Lrowind_bc_ptr[ljb];
                        ldv = index[1];  /* LDA of the dest lusup. */
                        lptrj = BC_HEADER;
                        luptrj = 0;
                        ijb = index[lptrj];
                        while ( ijb != ib ) {
                            /* Search for dest block -- blocks are not ordered! */
                            luptrj += index[lptrj+1];
                            lptrj += LB_DESCRIPTOR + index[lptrj+1];
                            ijb = index[lptrj];
                        }
                        /*
                         * Build indirect table. This is needed because the
                         * indices are not sorted.
                         */
                        fnz = FstBlockC( ib );
                        lptrj += LB_DESCRIPTOR;
                        for (i = 0; i < index[lptrj-1]; ++i) {
                            rel = index[lptrj + i] - fnz;
                            indirect[rel] = i;
                        }
                        nzval = Lnzval_bc_ptr[ljb] + luptrj;
                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            if ( segsize ) {
                                /*#pragma _CRI cache_bypass nzval,tempv*/
                                for (it = 0, i = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    nzval[indirect[rel]] -= tempv[it++];
                                }
                                tempv += ldt;
                            }
                            nzval += ldv;
                        }
                    } /* if ib < jb ... */

                    lptr += nbrow;
                    luptr += nbrow;
                } /* for lb ... */

                rukp += usub[iukp - 1];  /* Move to block U(k,j+1) */
                iukp += nsupc;
            } /* if jb == k+1 */
        } /* if L(:,k) and U(k,:) not empty */

        if ( k+1 < nsupers ) {
            kcol = PCOL( k+1, grid );
            if ( mycol == kcol ) {
#if ( VAMPIR>=1 )
                VT_begin(5);
#endif
                /* Factor diagonal and subdiagonal blocks and test for exact
                   singularity.  */
                pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu,
                         U_diag_blk_send_req, stat, info);
#if ( VAMPIR>=1 )
                VT_end(5);
#endif

                /* Process column *kcol+1* multicasts numeric values of
                   L(:,k+1) to process rows. */
                lk = LBj( k+1, grid );  /* Local block number. */
                lsub1 = Lrowind_bc_ptr[lk];
                if ( lsub1 ) {
                    msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
                    msgcnt[1] = lsub1[1] * SuperSize( k+1 );
                } else {
                    msgcnt[0] = 0;
                    msgcnt[1] = 0;
                }
                scp = &grid->rscp;  /* The scope of process row. */
                for (pj = 0; pj < Pc; ++pj) {
                    if ( ToSendR[lk][pj] != EMPTY ) {
                        lusup1 = Lnzval_bc_ptr[lk];
#if ( PROFlevel>=1 )
                        TIC(t1);
#endif
#if ( VAMPIR>=1 )
                        VT_begin(1);
#endif
                        MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
                                   (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
                        MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
                                   (4*(k+1)+1)%NTAGS, scp->comm,
                                   &send_req[pj+Pc] );
#if ( VAMPIR>=1 )
                        VT_end(1);
#endif
#if ( PROFlevel>=1 )
                        TOC(t2, t1);
                        stat->utime[COMM] += t2;
                        msg_cnt += 2;
                        msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
#if ( DEBUGlevel>=2 )
                        printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
                               iam, k+1, msgcnt[0], msgcnt[1], pj);
#endif
                    }
                } /* for pj ... */
            } else {  /* Post Recv of block column L(:,k+1). */
                if ( ToRecv[k+1] >= 1 ) {
                    scp = &grid->rscp;  /* The scope of process row. */
                    MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t,
                              kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
                    MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE,
                              kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
#if ( DEBUGlevel>=2 )
                    printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
#endif
                }
            } /* if mycol == Pc(k+1) */
        } /* if k+1 < nsupers */

        if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
            /* ---------------------------------------------------
               Update all other blocks using block row U(k,:)
               --------------------------------------------------- */
            for (j = 0; j < nub; ++j) {
                lptr = lptr0;
                luptr = luptr0;
                jb = usub[iukp];         /* Global block number of U(k,j). */
                ljb = LBj( jb, grid );   /* Local block number of U(k,j). */
                nsupc = SuperSize( jb );
                iukp += UB_DESCRIPTOR;   /* Start fstnz of block U(k,j). */
                /* Prepare to call DGEMM. */
                jj = iukp;
                while ( usub[jj] == klst ) ++jj;
                ldu = klst - usub[jj++];
                ncols = 1;
                full = 1;
                for (; jj < iukp+nsupc; ++jj) {
                    segsize = klst - usub[jj];
                    if ( segsize ) {
                        ++ncols;
                        if ( segsize != ldu ) full = 0;
                        if ( segsize > ldu ) ldu = segsize;
                    }
                }
#if ( DEBUGlevel>=3 )
                printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
                       iam, full, k, jb, ldu, ncols, nsupc);
                ++num_update;
#endif
                if ( full ) {
                    tempu = &uval[rukp];
                } else {  /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
                    ++num_copy;
#endif
                    tempu = tempU2d;
                    for (jj = iukp; jj < iukp+nsupc; ++jj) {
                        segsize = klst - usub[jj];
                        if ( segsize ) {
                            lead_zero = ldu - segsize;
                            for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
                            tempu += lead_zero;
                            for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
                            rukp += segsize;
                            tempu += segsize;
                        }
                    }
                    tempu = tempU2d;
                    rukp -= usub[iukp - 1];  /* Return to start of U(k,j). */
                } /* if full ... */

                for (lb = 0; lb < nlb; ++lb) {
                    ib = lsub[lptr];        /* Row block L(i,k). */
                    nbrow = lsub[lptr+1];   /* Number of full rows. */
                    lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
                    tempv = tempv2d;
#ifdef _CRAY
                    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
                          &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                          tempu, &ldu, &beta, tempv, &ldt);
#elif defined (USE_VENDOR_BLAS)
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt, 1, 1);
#else
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt);
#endif
                    stat->ops[FACT] += 2 * nbrow * ldu * ncols;

                    /* Now gather the result into the destination block. */
                    if ( ib < jb ) {  /* A(i,j) is in U. */
                        ilst = FstBlockC( ib+1 );
                        lib = LBi( ib, grid );
                        index = Ufstnz_br_ptr[lib];
                        ijb = index[iuip[lib]];
                        while ( ijb < jb ) {  /* Search for dest block. */
                            ruip[lib] += index[iuip[lib]+1];
                            iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
                            ijb = index[iuip[lib]];
                        }
                        /* Skip descriptor.  Now point to fstnz index of
                           block U(i,j). */
                        iuip[lib] += UB_DESCRIPTOR;
                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            fnz = index[iuip[lib]++];
                            if ( segsize ) {  /* Nonzero segment in U(k,j). */
                                ucol = &Unzval_br_ptr[lib][ruip[lib]];
                                for (i = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    ucol[rel] -= tempv[i];
                                }
                                tempv += ldt;
                            }
                            ruip[lib] += ilst - fnz;
                        }
                    } else {  /* A(i,j) is in L. */
                        index = Lrowind_bc_ptr[ljb];
                        ldv = index[1];  /* LDA of the dest lusup. */
                        lptrj = BC_HEADER;
                        luptrj = 0;
                        ijb = index[lptrj];
                        while ( ijb != ib ) {
                            /* Search for dest block -- blocks are not ordered! */
                            luptrj += index[lptrj+1];
                            lptrj += LB_DESCRIPTOR + index[lptrj+1];
                            ijb = index[lptrj];
                        }
                        /*
                         * Build indirect table. This is needed because the
                         * indices are not sorted for the L blocks.
                         */
                        fnz = FstBlockC( ib );
                        lptrj += LB_DESCRIPTOR;
                        for (i = 0; i < index[lptrj-1]; ++i) {
                            rel = index[lptrj + i] - fnz;
                            indirect[rel] = i;
                        }
                        nzval = Lnzval_bc_ptr[ljb] + luptrj;
                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            if ( segsize ) {
                                /*#pragma _CRI cache_bypass nzval,tempv*/
                                for (i = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    nzval[indirect[rel]] -= tempv[i];
                                }
                                tempv += ldt;
                            }
                            nzval += ldv;
                        }
                    } /* if ib < jb ... */

                    lptr += nbrow;
                    luptr += nbrow;
                } /* for lb ... */

                rukp += usub[iukp - 1];  /* Move to block U(k,j+1) */
                iukp += nsupc;
            } /* for j ... */
        } /* if L(:,k) and U(k,:) are not empty */

    } /* ------------------------------------------
         END MAIN LOOP: for k = ...
         ------------------------------------------ */

#if ( VAMPIR>=1 )
    VT_end(100);
    VT_traceoff();
#endif

    if ( Pr*Pc > 1 ) {
        SUPERLU_FREE(Lsub_buf_2[0]);  /* also free Lsub_buf_2[1] */
        SUPERLU_FREE(Lval_buf_2[0]);  /* also free Lval_buf_2[1] */
        if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
        if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
        SUPERLU_FREE(send_req);
        if ( U_diag_blk_send_req[myrow] ) {
            /* wait for last Isend requests to complete, deallocate objects */
            for (krow = 0; krow < Pr; ++krow)
                if ( krow != myrow )
                    MPI_Wait(U_diag_blk_send_req + krow, &status);
        }
        SUPERLU_FREE(U_diag_blk_send_req);
    }
    SUPERLU_FREE(Llu->ujrow);
    SUPERLU_FREE(tempv2d);
    SUPERLU_FREE(indirect);
    SUPERLU_FREE(iuip);
    SUPERLU_FREE(ruip);

    /* Prepare error message. */
    if ( *info == 0 ) *info = n + 1;
#if ( PROFlevel>=1 )
    TIC(t1);
#endif
    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
#if ( PROFlevel>=1 )
    TOC(t2, t1);
    stat->utime[COMM] += t2;
    {
        float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
        MPI_Reduce( &msg_cnt, &msg_cnt_sum, 1, MPI_FLOAT, MPI_SUM,
                    0, grid->comm );
        MPI_Reduce( &msg_cnt, &msg_cnt_max, 1, MPI_FLOAT, MPI_MAX,
                    0, grid->comm );
        MPI_Reduce( &msg_vol, &msg_vol_sum, 1, MPI_FLOAT, MPI_SUM,
                    0, grid->comm );
        MPI_Reduce( &msg_vol, &msg_vol_max, 1, MPI_FLOAT, MPI_MAX,
                    0, grid->comm );
        if ( !iam ) {
            printf("\tPDGSTRF comm stat:"
                   "\tAvg\tMax\t\tAvg\tMax\n"
                   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
                   msg_cnt_sum/Pr/Pc, msg_cnt_max,
                   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
        }
    }
#endif
    if ( iinfo == n + 1 ) *info = 0;
    else *info = iinfo;

#if ( PRNTlevel==3 )
    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
#endif

#if ( DEBUGlevel>=2 )
    for (i = 0; i < Pr * Pc; ++i) {
        if ( iam == i ) {
            dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
            dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
            printf("(%d)\n", iam);
            PrintInt10("Recv", nsupers, Llu->ToRecv);
        }
        MPI_Barrier( grid->comm );
    }
#endif

#if ( DEBUGlevel>=3 )
    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
#endif
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgstrf()");
#endif
} /* PDGSTRF */
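/*
 * pdgstrf above pipelines the factorization: while block column k is being
 * consumed, the receive for block column k+1 is posted into the other half of
 * the two-entry buffers (Lsub_buf_2[(k+1)%2], Lval_buf_2[(k+1)%2]), and the
 * matching MPI_Wait happens at the top of the next iteration. The sketch below
 * shows only that receive-side double-buffering idea in isolation;
 * pipeline_sketch, NSTEPS and BUFLEN are illustrative names and not part of
 * SuperLU_DIST.
 */
static void pipeline_sketch(MPI_Comm comm, int src)
{
    enum { NSTEPS = 8, BUFLEN = 1024 };
    int buf[2][BUFLEN];
    MPI_Request req[2];
    MPI_Status status;
    int k;

    /* Pre-post the receive for step 0 into buffer 0, mirroring the "first
       block column" code that starts the pipeline before the main loop. */
    MPI_Irecv(buf[0], BUFLEN, MPI_INT, src, 0, comm, &req[0]);
    for (k = 0; k < NSTEPS; ++k) {
        /* Wait for step k's message, which was posted one iteration earlier. */
        MPI_Wait(&req[k%2], &status);
        /* Post the receive for step k+1 into the other buffer, so it can
           arrive while buf[k%2] is still being consumed below. */
        if (k + 1 < NSTEPS)
            MPI_Irecv(buf[(k+1)%2], BUFLEN, MPI_INT, src, k+1, comm,
                      &req[(k+1)%2]);
        /* ... consume buf[k%2] here ... */
    }
}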