int_t pdgstrf /************************************************************************/ ( superlu_options_t *options, int m, int n, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info ) /* * Purpose * ======= * * PDGSTRF performs the LU factorization in parallel. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following field should be defined: * o ReplaceTinyPivot (yes_no_t) * Specifies whether to replace the tiny diagonals by * sqrt(epsilon)*norm(A) during LU factorization. * * m (input) int * Number of rows in the matrix. * * n (input) int * Number of columns in the matrix. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * The following fields should be defined: * * o Glu_persist (input) Glu_persist_t* * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (input/output) LocalLU_t* * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { #ifdef _CRAY _fcd ftcs = _cptofcd("N", strlen("N")); _fcd ftcs1 = _cptofcd("L", strlen("L")); _fcd ftcs2 = _cptofcd("N", strlen("N")); _fcd ftcs3 = _cptofcd("U", strlen("U")); #endif double alpha = 1.0, beta = 0.0; int_t *xsup; int_t *lsub, *lsub1, *usub, *Usub_buf, *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ double *lusup, *lusup1, *uval, *Uval_buf, *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, nlb, nub, nsupc, rel, rukp; int_t Pc, Pr; int iam, kcol, krow, mycol, myrow, pi, pj; int j, k, lk, nsupers; int nsupr, nbrow, segsize; int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: * 0 : transferred in Lsub_buf[] * 1 : transferred in Lval_buf[] * 2 : transferred in Usub_buf[] * 3 : transferred in Uval_buf[] */ int_t msg0, msg2; int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; double **Unzval_br_ptr, **Lnzval_bc_ptr; int_t *index; double *nzval; int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). 
*/ double *ucol; int_t *indirect; double *tempv, *tempv2d; int_t iinfo; int_t *ToRecv, *ToSendD, **ToSendR; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; superlu_scope_t *scp; float s_eps; double thresh; double *tempU2d, *tempu; int full, ldt, ldu, lead_zero, ncols; MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL; MPI_Status status; #if ( DEBUGlevel>=2 ) int_t num_copy=0, num_update=0; #endif #if ( PRNTlevel==3 ) int_t zero_msg = 0, total_msg = 0; #endif #if ( PROFlevel>=1 ) double t1, t2; float msg_vol = 0, msg_cnt = 0; int_t iword = sizeof(int_t), dword = sizeof(double); #endif /* Test the input parameters. */ *info = 0; if ( m < 0 ) *info = -2; else if ( n < 0 ) *info = -3; if ( *info ) { pxerbla("pdgstrf", grid, -*info); return (-1); } /* Quick return if possible. */ if ( m == 0 || n == 0 ) return 0; /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; s_eps = slamch_("Epsilon"); thresh = s_eps * anorm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrf()"); #endif stat->ops[FACT] = 0.0; if ( Pr*Pc > 1 ) { i = Llu->bufmax[0]; if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lsub_buf."); Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; i = Llu->bufmax[1]; if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lval_buf[]."); Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; if ( Llu->bufmax[2] != 0 ) if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) ABORT("Malloc fails for Usub_buf[]."); if ( Llu->bufmax[3] != 0 ) if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) ABORT("Malloc fails for Uval_buf[]."); if ( !(U_diag_blk_send_req = (MPI_Request *) SUPERLU_MALLOC(Pr*sizeof(MPI_Request)))) ABORT("Malloc fails for U_diag_blk_send_req[]."); U_diag_blk_send_req[myrow] = 0; /* flag no outstanding Isend */ if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) ABORT("Malloc fails for send_req[]."); } k = sp_ienv_dist(3); /* max supernode size */ if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) ) ABORT("Malloc fails for ujrow[]."); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); printf(".. 
Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", Llu->bufmax[0], Llu->bufmax[1], Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); } #endif Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; Lval_buf_2[0] = Llu->Lval_buf_2[0]; Lval_buf_2[1] = Llu->Lval_buf_2[1]; Usub_buf = Llu->Usub_buf; Uval_buf = Llu->Uval_buf; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; ToRecv = Llu->ToRecv; ToSendD = Llu->ToSendD; ToSendR = Llu->ToSendR; ldt = sp_ienv_dist(3); /* Size of maximum supernode */ if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) ABORT("Calloc fails for tempv2d[]."); tempU2d = tempv2d + ldt*ldt; if ( !(indirect = intMalloc_dist(ldt)) ) ABORT("Malloc fails for indirect[]."); k = CEILING( nsupers, Pr ); /* Number of local block rows */ if ( !(iuip = intMalloc_dist(k)) ) ABORT("Malloc fails for iuip[]."); if ( !(ruip = intMalloc_dist(k)) ) ABORT("Malloc fails for ruip[]."); #if ( VAMPIR>=1 ) VT_symdef(1, "Send-L", "Comm"); VT_symdef(2, "Recv-L", "Comm"); VT_symdef(3, "Send-U", "Comm"); VT_symdef(4, "Recv-U", "Comm"); VT_symdef(5, "TRF2", "Factor"); VT_symdef(100, "Factor", "Factor"); VT_begin(100); VT_traceon(); #endif /* --------------------------------------------------------------- Handle the first block column separately to start the pipeline. --------------------------------------------------------------- */ if ( mycol == 0 ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif scp = &grid->rscp; /* The scope of process row. */ /* Process column *kcol* multicasts numeric values of L(:,k) to process rows. */ lsub = Lrowind_bc_ptr[0]; lusup = Lnzval_bc_ptr[0]; if ( lsub ) { msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; msgcnt[1] = lsub[1] * SuperSize( 0 ); } else { msgcnt[0] = msgcnt[1] = 0; } for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[0][pj] != EMPTY ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, &send_req[pj] ); MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, &send_req[pj+Pc] ); #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, 0, msgcnt[0], msgcnt[1], pj); #endif #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif } } /* for pj ... */ } else { /* Post immediate receives. */ if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0, scp->comm, &recv_req[0] ); MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1, scp->comm, &recv_req[1] ); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); #endif } } /* if mycol == 0 */ /* ------------------------------------------ MAIN LOOP: Loop through all block columns. ------------------------------------------ */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( mycol == kcol ) { lk = LBj( k, grid ); /* Local block number. */ for (pj = 0; pj < Pc; ++pj) { /* Wait for Isend to complete before using lsub/lusup. 
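                 (The Isends of L(:,k) were posted one iteration earlier, in
                 the look-ahead step; the same send_req[] slots are reused
                 below for L(:,k+1), so those sends must have completed
                 before the request handles and buffers are reused.)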
*/ if ( ToSendR[lk][pj] != EMPTY ) { MPI_Wait( &send_req[pj], &status ); MPI_Wait( &send_req[pj+Pc], &status ); } } lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; } else { if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ scp = &grid->rscp; /* The scope of process row. */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(2); #endif /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[0]);*/ /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, (4*k)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[0], &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[1]);*/ /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, (4*k+1)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[1], &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); #if ( VAMPIR>=1 ) VT_end(2); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", iam, k, msgcnt[0], msgcnt[1], kcol); fflush(stdout); #endif lsub = Lsub_buf_2[k%2]; lusup = Lval_buf_2[k%2]; #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[0] ) ++zero_msg; #endif } else msgcnt[0] = 0; } /* if mycol = Pc(k) */ scp = &grid->cscp; /* The scope of process column. */ if ( myrow == krow ) { /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ #ifdef _CRAY pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); #else pdgstrs2(n, k, Glu_persist, grid, Llu, stat); #endif /* Multicasts U(k,:) to process columns. */ lk = LBi( k, grid ); usub = Ufstnz_br_ptr[lk]; uval = Unzval_br_ptr[lk]; if ( usub ) { msgcnt[2] = usub[2]; msgcnt[3] = usub[1]; } else { msgcnt[2] = msgcnt[3] = 0; } if ( ToSendD[lk] == YES ) { for (pi = 0; pi < Pr; ++pi) { if ( pi != myrow ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(3); #endif MPI_Send( usub, msgcnt[2], mpi_int_t, pi, (4*k+2)%NTAGS, scp->comm); MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, (4*k+3)%NTAGS, scp->comm); #if ( VAMPIR>=1 ) VT_end(3); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); #endif } /* if pi ... */ } /* for pi ... */ } /* if ToSendD ... */ } else { /* myrow != krow */ if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(4); #endif /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[2]);*/ MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, (4*k+2)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[3]);*/ MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, (4*k+3)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); #if ( VAMPIR>=1 ) VT_end(4); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif usub = Usub_buf; uval = Uval_buf; #if ( DEBUGlevel>=2 ) printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); #endif #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[2] ) ++zero_msg; #endif } else msgcnt[2] = 0; } /* if myrow == Pr(k) */ /* * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). 
* for (j = k+1; k < N; ++k) { * for (i = k+1; i < N; ++i) * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) * && L(i,k) != 0 && U(k,j) != 0 ) * A(i,j) = A(i,j) - L(i,k) * U(k,j); */ msg0 = msgcnt[0]; msg2 = msgcnt[2]; if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ nsupr = lsub[1]; /* LDA of lusup. */ if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; luptr0 = knsupc; nlb = lsub[0] - 1; } else { lptr0 = BC_HEADER; luptr0 = 0; nlb = lsub[0]; } lptr = lptr0; for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ ib = lsub[lptr]; lib = LBi( ib, grid ); iuip[lib] = BR_HEADER; ruip[lib] = 0; lptr += LB_DESCRIPTOR + lsub[lptr+1]; } nub = usub[0]; /* Number of blocks in the block row U(k,:) */ iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ rukp = 0; /* Pointer to nzval[] of U(k,:) */ klst = FstBlockC( k+1 ); /* --------------------------------------------------- Update the first block column A(:,k+1). --------------------------------------------------- */ jb = usub[iukp]; /* Global block number of block U(k,j). */ if ( jb == k+1 ) { /* First update (k+1)-th block. */ --nub; lptr = lptr0; luptr = luptr0; ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). 
*/ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0, it = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[it++]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (it = 0, i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[it++]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* if jb == k+1 */ } /* if L(:,k) and U(k,:) not empty */ if ( k+1 < nsupers ) { kcol = PCOL( k+1, grid ); if ( mycol == kcol ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif /* Factor diagonal and subdiagonal blocks and test for exact singularity. */ pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, U_diag_blk_send_req, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif /* Process column *kcol+1* multicasts numeric values of L(:,k+1) to process rows. */ lk = LBj( k+1, grid ); /* Local block number. */ lsub1 = Lrowind_bc_ptr[lk]; if ( lsub1 ) { msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; msgcnt[1] = lsub1[1] * SuperSize( k+1 ); } else { msgcnt[0] = 0; msgcnt[1] = 0; } scp = &grid->rscp; /* The scope of process row. */ for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[lk][pj] != EMPTY ) { lusup1 = Lnzval_bc_ptr[lk]; #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, k+1, msgcnt[0], msgcnt[1], pj); #endif } } /* for pj ... */ } else { /* Post Recv of block column L(:,k+1). */ if ( ToRecv[k+1] >= 1 ) { scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); #endif } } /* if mycol == Pc(k+1) */ } /* if k+1 < nsupers */ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ /* --------------------------------------------------- Update all other blocks using block row U(k,:) --------------------------------------------------- */ for (j = 0; j < nub; ++j) { lptr = lptr0; luptr = luptr0; jb = usub[iukp]; /* Global block number of block U(k,j). */ ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). 
*/ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #else dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } /* Skip descriptor. Now point to fstnz index of block U(i,j). */ iuip[lib] += UB_DESCRIPTOR; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0 ; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[i]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted for the L blocks. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[i]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* for j ... */ } /* if k L(:,k) and U(k,:) are not empty */ } /* ------------------------------------------ END MAIN LOOP: for k = ... 
------------------------------------------ */ #if ( VAMPIR>=1 ) VT_end(100); VT_traceoff(); #endif if ( Pr*Pc > 1 ) { SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); SUPERLU_FREE(send_req); if ( U_diag_blk_send_req[myrow] ) { /* wait for last Isend requests to complete, deallocate objects */ for (krow = 0; krow < Pr; ++krow) if ( krow != myrow ) MPI_Wait(U_diag_blk_send_req + krow, &status); } SUPERLU_FREE(U_diag_blk_send_req); } SUPERLU_FREE(Llu->ujrow); SUPERLU_FREE(tempv2d); SUPERLU_FREE(indirect); SUPERLU_FREE(iuip); SUPERLU_FREE(ruip); /* Prepare error message. */ if ( *info == 0 ) *info = n + 1; #if ( PROFlevel>=1 ) TIC(t1); #endif MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; MPI_Reduce( &msg_cnt, &msg_cnt_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_cnt, &msg_cnt_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); if ( !iam ) { printf("\tPDGSTRF comm stat:" "\tAvg\tMax\t\tAvg\tMax\n" "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", msg_cnt_sum/Pr/Pc, msg_cnt_max, msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); } } #endif if ( iinfo == n + 1 ) *info = 0; else *info = iinfo; #if ( PRNTlevel==3 ) MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # total msg\t%d\n", iinfo); #endif #if ( DEBUGlevel>=2 ) for (i = 0; i < Pr * Pc; ++i) { if ( iam == i ) { dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); printf("(%d)\n", iam); PrintInt10("Recv", nsupers, Llu->ToRecv); } MPI_Barrier( grid->comm ); } #endif #if ( DEBUGlevel>=3 ) printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrf()"); #endif } /* PDGSTRF */
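/*
 * Illustration only (not part of SuperLU_DIST): a minimal sketch of the
 * 2D block-cyclic mapping and the two-buffer pipelining that pdgstrf above
 * relies on.  Block row/column k is owned by process row k mod NPROW and
 * process column k mod NPCOL, and its local index is k / NPROW (resp.
 * k / NPCOL); see the PROW/PCOL/LBi/LBj macros in superlu_defs.h for the
 * authoritative definitions.  The helper names below are hypothetical
 * stand-ins, not the library's API.  Receiving processes alternate between
 * Lsub_buf_2[0] and Lsub_buf_2[1] (and the matching value buffers) using
 * the index k % 2, so the Irecv for panel k+1 can be posted while panel k
 * is still being consumed.
 */
#include <stdio.h>

static int owner_row(int k, int nprow)       { return k % nprow; }  /* cf. PROW(k,grid) */
static int owner_col(int k, int npcol)       { return k % npcol; }  /* cf. PCOL(k,grid) */
static int local_block_row(int k, int nprow) { return k / nprow; }  /* cf. LBi(k,grid)  */
static int local_block_col(int k, int npcol) { return k / npcol; }  /* cf. LBj(k,grid)  */

static void show_block_cyclic_layout(int nprow, int npcol, int nsupers)
{
    for (int k = 0; k < nsupers; ++k) {
        printf("panel %d: owner process (row %d, col %d), "
               "local block indices (%d, %d), Irecv buffer %d\n",
               k, owner_row(k, nprow), owner_col(k, npcol),
               local_block_row(k, nprow), local_block_col(k, npcol),
               k % 2 /* which of the two look-ahead buffers panel k lands in */);
    }
}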
int main(int argc, char **argv) { double sum_total_timer, total_timer = 0.0; double sum_gather_timer, gather_timer = 0.0; double sum_mpi_timer, mpi_timer = 0.0; double curr_time; double output_time; double dt = 0.0; double local_max_norm = 0.1; double max_norm = 0; int steps; int* fish_off; int* n_fish_split; MPI_Init (&argc, &argv); #ifdef TRACE_WITH_VAMPIR VT_symdef(TRACE_LOCAL_COMP, "Local computation", "Computation"); VT_symdef(TRACE_FISH_GATHER, "Gathering to 0", "Communication"); VT_symdef(TRACE_MAX_NORM, "Collecting max norm", "Communication"); VT_symdef(TRACE_OUTPUT, "Output", "Output"); #endif MPI_Comm_size (comm, &n_proc); MPI_Comm_rank (comm, &rank); make_fishtype (&fishtype); get_options(argc, argv); srand48(clock()); //MPI_Allreduce (&local_max_norm, &max_norm, 1, MPI_DOUBLE, MPI_MAX, comm); //printf("local_max_norm = %g, max_norm = %g\n", local_max_norm, max_norm); #ifdef TRACE_WITH_VAMPIR VT_traceoff(); #endif if (output_filename) { outputp = 1; if (0 == rank) { output_fp = fopen(output_filename, "w"); if (output_fp == NULL) { printf("Could not open %s for output\n", output_filename); exit(1); } fprintf(output_fp, "n_fish: %d\n", n_fish); } } fish_off = malloc ( (n_proc+1) * sizeof(int) ); n_fish_split = malloc ( (n_proc) * sizeof(int) ); //split each fish to different processors. //fish_off: offset index of the fish in that processor //n_fish_split is the # of fish in each processor //ALL FUNCTIONALITY OF split_fish SHOULD BE DONE AFTER init_fish //split_fish (n_proc, fish_off, n_fish_split); //n_local_fish = n_fish_split[rank]; /* All fish are generated on proc 0 to ensure same random numbers. (Yes, the circle case could be parallelized. Feel free to do it.) */ //split physical box sizes row = (int)sqrt((double)n_proc); column = n_proc/row; double rowSep = WALL_SEP/row; double columnSep = WALL_SEP/column; int rowIndex = rank / column; int columnIndex = rank % column; topBound = rowSep * rowIndex; bottomBound = topBound + rowSep; leftBound = columnSep * columnIndex; rightBound = leftBound + columnSep; assert(n_proc % row == 0); // Add n_proc # of arrays each holding ID of local fishes fish_t fishProc[n_proc][n_fish]; int n_fish_proc[n_proc]; int k; for (k = 0; k < n_proc; k++) n_fish_proc[k] = 0; ////////////////////////////////// init_fish (rank, fish_off, n_fish_split, row, column, fishProc, n_fish_proc); // distribute initial conditions to all processes if (rank == 0) { local_fish = fishProc[0]; n_local_fish = n_fish_proc[0]; // Functionality of MPI_Scatterv is done here with Isends //MPI_Request request[n_proc-1]; int mesTag = 0; MPI_Request *req; for (k = 1; k < n_proc; ++k) { //printf("n_fish_proc[%d], %d\n", k, n_fish_proc[k]); MPI_Isend(fishProc[k], n_fish_proc[k], fishtype, k, mesTag, comm, req); } } else { MPI_Status status; // Processors of rank != 0 receives. 
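              // Each non-root rank posts a single receive sized for the
              // worst case (n_fish elements of fishtype); the count that
              // rank 0 actually sent is recovered from the status via
              // MPI_Get_count below and stored in n_local_fish.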
MPI_Recv( local_fish, n_fish, fishtype, 0, MPI_ANY_TAG, comm, &status); MPI_Get_count(&status, fishtype, &n_local_fish); } printf("rank[%d], n_local_fish = %d\n", rank, n_local_fish); ///* //MPI_Scatterv (fish, n_fish_split, fish_off, fishtype, // local_fish, n_local_fish, fishtype, // 0, comm); //*/ #ifdef TRACE_WITH_VAMPIR tracingp = 1; VT_traceon(); #endif start_mpi_timer(&total_timer); for (output_time = 0.0, curr_time = 0.0, steps = 0; curr_time <= end_time && steps < max_steps; curr_time += dt, ++steps) { #ifdef TRACE_WITH_VAMPIR if (steps >= STEPS_TO_TRACE) { tracingp = 0; VT_traceoff(); } #endif trace_begin(TRACE_FISH_GATHER); start_mpi_timer (&gather_timer); start_mpi_timer (&mpi_timer); /* Pull in all the fish. Obviously, this is not a good idea. You will be greatly expanding this one line... However, feel free to waste memory when producing output. If you're dumping fish to a file, go ahead and do an Allgatherv _in the output steps_ if you want. Or you could pipeline dumping the fish. MPI_Allgatherv (local_fish, n_local_fish, fishtype, fish, n_fish_split, fish_off, fishtype, comm); */ //MPI_Request* sendReq, recvReq; // Set aside buffer for fish received from other processes. /* for (j = 0; j < NUM_NEIGHBOR; ++j) { //FIXME: which neighbors does not exist? if (rankNeighbor[j] >= 0) { MPI_Isend(local_fish, n_local_fish, fishtype, rankNeighbor[j], MPI_ANY_TAG, comm, &sendReqArray); MPI_Irecv(impact_fish, n_fish, fishtype, rankNeighbor[NUM_NEIGHBOR - j], MPI_ANY_TAG, comm, &sendReqArray); MPI_Wait(recvReq, MPI_STATUS_IGNORE); interact_fish_mpi(local_fish, n_local_fish, impact_fish, sizeof(impact_fish)); } } */ // get migrate fish // send migrate fish // receive migrate fish // update local fish // get impact fish // send impact fish // receive impact fish // interact impact fish // interact local fish // move MPI_Request sendReqArray[NUM_NEIGHBOR]; MPI_Request recvReqArray[NUM_NEIGHBOR]; fish_t receive_impact_fish[NUM_NEIGHBOR][n_fish]; int n_receive_impact_fish[NUM_NEIGHBOR]; fish_t receive_migrate_fish[NUM_NEIGHBOR][n_fish]; int n_receive_migrate_fish[NUM_NEIGHBOR]; int n_send_impact_fish[NUM_NEIGHBOR]; fish_t* send_impact_fish[NUM_NEIGHBOR]; int n_send_migrate_fish[NUM_NEIGHBOR]; fish_t* send_migrate_fish[NUM_NEIGHBOR]; get_interacting_fish( local_fish, n_local_fish, send_migrate_fish, n_send_migrate_fish, 1); int tmp; for (tmp = 0; tmp < NUM_NEIGHBOR; tmp++) { printf("rank[%d], iter[%d] ------- get [%d] migrate fish for neig[%d]. \n", rank, iter, n_send_migrate_fish[tmp], tmp); } Isend_receive_fish(send_migrate_fish, n_send_migrate_fish, receive_migrate_fish, n_fish, sendReqArray, recvReqArray); wait_for_fish(recvReqArray, n_receive_migrate_fish); // FIXME: Have not implement update on local fish. //update_local_fish(); get_interacting_fish(local_fish, n_local_fish, send_impact_fish, n_send_impact_fish, 0); for (tmp = 0; tmp < NUM_NEIGHBOR; tmp++) { printf("rank[%d], iter[%d] ------- get [%d] impact fish for neig[%d]. \n", rank, iter, n_send_impact_fish[tmp], tmp); } Isend_receive_fish(send_impact_fish, n_send_impact_fish, receive_impact_fish, n_fish, sendReqArray, recvReqArray); wait_for_fish(recvReqArray, n_receive_impact_fish); int index; for (index = 0; index < NUM_NEIGHBOR; index++) { if (n_receive_impact_fish[index] > 0) { interact_fish_mpi(local_fish, n_local_fish, receive_impact_fish[index], n_receive_impact_fish[index]); } } //*/ // make sure we are sending and receiving the same # msg. 
//assert(dbg == 0); // While waiting, interact with fish in its own pocket first printf("rank[%d], iter[%d] ------- interact [%d] local fishes\n", rank, iter, n_local_fish); interact_fish_mpi(local_fish, n_local_fish, local_fish, n_local_fish); printf("rank[%d], iter[%d] ------- finished interact local fish\n", rank, iter); stop_mpi_timer (&gather_timer); stop_mpi_timer (&mpi_timer); trace_end(TRACE_FISH_GATHER); /* We only output once every output_interval time unit, at most. Without that restriction, we can easily create a huge output file. Printing a record for ten fish takes about 300 bytes, so for every 1000 steps, we could dump 300K of info. Now scale the number of fish by 1000... */ trace_begin(TRACE_OUTPUT); if (outputp && curr_time >= output_time) { if (0 == rank) output_fish (output_fp, curr_time, dt, fish, n_fish); output_time = curr_time + output_interval; } trace_end(TRACE_OUTPUT); trace_begin (TRACE_LOCAL_COMP); //interact_fish (local_fish, n_local_fish, fish, n_fish); local_max_norm = compute_norm (local_fish, n_local_fish); trace_end (TRACE_LOCAL_COMP); trace_begin (TRACE_MAX_NORM); start_mpi_timer (&mpi_timer); printf("rank[%d], iter[%d] ------- Allreduce max_norm, \n", rank, iter); MPI_Allreduce (&local_max_norm, &max_norm, 1, MPI_DOUBLE, MPI_MAX, comm); printf("rank[%d], iter[%d] ------- local_max_norm: %g, max_norm: %g\n", local_max_norm, max_norm); stop_mpi_timer (&mpi_timer); trace_end (TRACE_MAX_NORM); trace_begin (TRACE_LOCAL_COMP); dt = max_norm_change / max_norm; dt = f_max(dt, min_dt); dt = f_min(dt, max_dt); printf("rank[%d], iter[%d] ------- moving [%d] local_fish, \n", rank, iter, n_local_fish); move_fish(local_fish, n_local_fish, dt); printf("rank[%d], iter[%d] ------- finished moving.\n", rank, iter); trace_end (TRACE_LOCAL_COMP); iter++; } stop_mpi_timer(&total_timer); #ifdef TRACE_WITH_VAMPIR VT_traceoff(); #endif if (outputp) { MPI_Allgatherv (local_fish, n_local_fish, fishtype, fish, n_fish_split, fish_off, fishtype, comm); if (0 == rank) { output_fish (output_fp, curr_time, dt, fish, n_fish); printf("\tEnded at %g (%g), %d (%d) steps\n", curr_time, end_time, steps, max_steps); } } printf("rank[%d], ------- 39, \n", rank); MPI_Reduce (&total_timer, &sum_total_timer, 1, MPI_DOUBLE, MPI_SUM, 0, comm); printf("rank[%d], ------- 40, \n", rank); MPI_Reduce (&gather_timer, &sum_gather_timer, 1, MPI_DOUBLE, MPI_SUM, 0, comm); printf("rank[%d], ------- 41, \n", rank); MPI_Reduce (&mpi_timer, &sum_mpi_timer, 1, MPI_DOUBLE, MPI_SUM, 0, comm); printf("rank[%d], ------- 42, \n", rank); if (0 == rank) { printf("Number of PEs: %d\n" "Time taken on 0: %g (avg. %g)\n" "Time in gathers on 0: %g (avg %g)\n" "Time in MPI on 0: %g (avg %g)\n", n_proc, total_timer, sum_total_timer / n_proc, gather_timer, sum_gather_timer / n_proc, mpi_timer, sum_mpi_timer / n_proc); } printf("rank[%d], ------- 43, \n", rank); MPI_Barrier (comm); printf("rank[%d], ------- 44, \n", rank); MPI_Finalize (); printf("rank[%d], ------- done!!, \n", rank); return 0; }
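/*
 * Illustration only: one plausible shape for the Isend_receive_fish /
 * wait_for_fish helpers called in the main loop above -- their real
 * definitions are not shown in this file, so everything below is a hedged
 * sketch, not the program's actual code.  It assumes the surrounding
 * program's globals and types (comm, fishtype, fish_t, NUM_NEIGHBOR) plus
 * a rankNeighbor[NUM_NEIGHBOR] table of neighbor ranks (negative entries
 * meaning "no neighbor on that side") and a single fixed message tag.
 * The send requests should also be completed eventually (e.g. with
 * MPI_Waitall) before the send buffers are reused.
 */
static const int FISH_TAG = 42;   /* assumed tag; any fixed value matching both sides works */

void Isend_receive_fish(fish_t *send_fish[], const int n_send_fish[],
                        void *recv_buf, int n_fish_max,
                        MPI_Request send_req[], MPI_Request recv_req[])
{
    /* recv_buf is the 2D buffer fish_t[NUM_NEIGHBOR][n_fish_max] passed by the caller. */
    fish_t (*recv_fish)[n_fish_max] = recv_buf;

    for (int j = 0; j < NUM_NEIGHBOR; ++j) {
        if (rankNeighbor[j] < 0) {            /* no neighbor on this side */
            send_req[j] = MPI_REQUEST_NULL;
            recv_req[j] = MPI_REQUEST_NULL;
            continue;
        }
        /* Post the receive first so the matching Isend cannot be delayed. */
        MPI_Irecv(recv_fish[j], n_fish_max, fishtype, rankNeighbor[j],
                  FISH_TAG, comm, &recv_req[j]);
        MPI_Isend(send_fish[j], n_send_fish[j], fishtype, rankNeighbor[j],
                  FISH_TAG, comm, &send_req[j]);
    }
}

void wait_for_fish(MPI_Request recv_req[], int n_received[])
{
    for (int j = 0; j < NUM_NEIGHBOR; ++j) {
        MPI_Status status;
        if (recv_req[j] == MPI_REQUEST_NULL) { n_received[j] = 0; continue; }
        MPI_Wait(&recv_req[j], &status);
        /* Recover how many fish this neighbor actually sent. */
        MPI_Get_count(&status, fishtype, &n_received[j]);
    }
}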
/*! \brief * * <pre> * Purpose * ======= * * The driver program PDDRIVE1. * * This example illustrates how to use PDGSSVX to * solve systems with the same A but different right-hand side. * In this case, we factorize A only once in the first call to * PDGSSVX, and reuse the following data structures * in the subsequent call to PDGSSVX: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : Glu_persist, Llu * * With MPICH, program may be run by typing: * mpiexec -n <np> pddrive1 -r <proc rows> -c <proc columns> big.rua * </pre> */ int main(int argc, char *argv[]) { superlu_dist_options_t options; SuperLUStat_t stat; SuperMatrix A; ScalePermstruct_t ScalePermstruct; LUstruct_t LUstruct; SOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; double *b, *xtrue, *b1; int i, j, m, n; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c, *postfix; int ii, omp_mpi_level; FILE *fp, *fopen(); int cpp_defs(); nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ nrhs = 1; /* Number of right-hand side. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { c = *(*cpp+1); ++cpp; switch (c) { case 'h': printf("Options:\n"); printf("\t-r <int>: process rows (default %d)\n", nprow); printf("\t-c <int>: process columns (default %d)\n", npcol); exit(0); break; case 'r': nprow = atoi(*cpp); break; case 'c': npcol = atoi(*cpp); break; } } else { /* Last arg is considered a filename */ if ( !(fp = fopen(*cpp, "r")) ) { ABORT("File does not exist"); } break; } } /* ------------------------------------------------------------ INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER printf("__INTEL_COMPILER is defined\n"); #endif printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); printf("Input matrix file:\t%s\n", *cpp); printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); fflush(stdout); } #if ( VAMPIR>=1 ) VT_traceoff(); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); #endif for(ii = 0;ii<strlen(*cpp);ii++){ if((*cpp)[ii]=='.'){ postfix = &((*cpp)[ii+1]); } } // printf("%s\n", postfix); /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); for (j = 0; j < nrhs; ++j) for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); /* ------------------------------------------------------------ WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
------------------------------------------------------------*/ /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag_MC64; options.ReplaceTinyPivot = NO; options.Trans = NOTRANS; options.IterRefine = DOUBLE; options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; */ set_default_options_dist(&options); if (!iam) { print_sp_ienv_dist(&options); print_options_dist(&options); fflush(stdout); } m = A.nrow; n = A.ncol; /* Initialize ScalePermstruct and LUstruct. */ ScalePermstructInit(m, n, &ScalePermstruct); LUstructInit(n, &LUstruct); /* Initialize the statistics variables. */ PStatInit(&stat); /* Call the linear equation solver. */ pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) printf("\tSolve the first system:\n"); pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, nrhs, b, ldb, xtrue, ldx, &grid); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) printf("\tSolve the system with a different B:\n"); pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, nrhs, b1, ldb, xtrue, ldx, &grid); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); Destroy_LU(n, &grid, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { dSolveFinalize(&options, &SOLVEstruct); } SUPERLU_FREE(b); SUPERLU_FREE(b1); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ out: superlu_gridexit(&grid); /* ------------------------------------------------------------ TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ MPI_Finalize(); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit main()"); #endif }
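/*
 * Illustration only: the reuse pattern of pddrive1 above, extended to any
 * number of additional right-hand sides.  This is a hedged sketch built
 * from the calls already made in the driver (pdgssvx with
 * options.Fact = FACTORED, PStatInit/PStatPrint/PStatFree); the helper
 * name, the buffer bk, and the count nextra are hypothetical.
 */
static void solve_extra_rhs(int nextra, superlu_dist_options_t *options,
                            SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
                            LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct,
                            gridinfo_t *grid, double *berr, int ldb, int nrhs)
{
    SuperLUStat_t stat;
    int info;

    options->Fact = FACTORED;   /* reuse L, U, perm_r, perm_c, and the scalings */
    for (int k = 0; k < nextra; ++k) {
        double *bk = doubleMalloc_dist(ldb * nrhs);
        if ( !bk ) ABORT("Malloc fails for bk[]");
        /* ... fill bk with the k-th right-hand side ... */
        PStatInit(&stat);
        pdgssvx(options, A, ScalePermstruct, bk, ldb, nrhs,
                grid, LUstruct, SOLVEstruct, berr, &stat, &info);
        /* On return bk holds the solution; info > 0 would signal a singular U. */
        PStatPrint(options, &stat, grid);
        PStatFree(&stat);
        SUPERLU_FREE(bk);
    }
}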
int main(int argc, char *argv[]) { superlu_options_t options; SuperLUStat_t stat; SuperMatrix A; ScalePermstruct_t ScalePermstruct; LUstruct_t LUstruct; gridinfo_t grid; double *berr; double *a, *b, *xtrue; int_t *asub, *xa; int_t m, n, nnz; int_t nprow, npcol; int iam, info, ldb, ldx, nrhs; char trans[1]; char **cpp, c; FILE *fp, *fopen(); extern int cpp_defs(); /* prototypes */ extern void LUstructInit(const int_t, LUstruct_t *); extern void LUstructFree(LUstruct_t *); extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *); nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ nrhs = 1; /* Number of right-hand side. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { c = *(*cpp+1); ++cpp; switch (c) { case 'h': printf("Options:\n"); printf("\t-r <int>: process rows (default " IFMT ")\n", nprow); printf("\t-c <int>: process columns (default " IFMT ")\n", npcol); exit(0); break; case 'r': nprow = atoi(*cpp); break; case 'c': npcol = atoi(*cpp); break; } } else { /* Last arg is considered a filename */ if ( !(fp = fopen(*cpp, "r")) ) { ABORT("File does not exist"); } break; } } /* ------------------------------------------------------------ INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; #if ( VAMPIR>=1 ) VT_traceoff(); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); #endif /* ------------------------------------------------------------ PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL THE OTHER PROCESSES. ------------------------------------------------------------*/ if ( !iam ) { /* Print the CPP definitions. */ cpp_defs(); #if 1 /* Read the matrix stored on disk in Harwell-Boeing format. */ dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); #else /* Read the matrix stored on disk in Harwell-Boeing format. */ printf(".. reading triplet file\n"); dreadtriple(fp, &m, &n, &nnz, &a, &asub, &xa); #endif printf("Input matrix file: %s\n", *cpp); printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); /* Broadcast matrix A to the other PEs. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( a, nnz, MPI_DOUBLE, 0, grid.comm ); MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); } else { /* Receive matrix A from PE 0. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); /* Allocate storage for compressed column representation. */ dallocateA_dist(n, nnz, &a, &asub, &xa); MPI_Bcast( a, nnz, MPI_DOUBLE, 0, grid.comm ); MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); } /* Create compressed column matrix for A. */ dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); /* Generate the exact solution and compute the right-hand side. 
*/ if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); *trans = 'N'; ldx = n; ldb = m; dGenXtrue_dist(n, nrhs, xtrue, ldx); dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); /* ------------------------------------------------------------ NOW WE SOLVE THE LINEAR SYSTEM. ------------------------------------------------------------*/ /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag; options.Trans = NOTRANS; options.IterRefine = DOUBLE; options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; */ set_default_options_dist(&options); if (!iam) { print_sp_ienv_dist(&options); print_options_dist(&options); } /* Initialize ScalePermstruct and LUstruct. */ ScalePermstructInit(m, n, &ScalePermstruct); LUstructInit(n, &LUstruct); /* Initialize the statistics variables. */ PStatInit(&stat); /* Call the linear equation solver. */ pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) { dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompCol_Matrix_dist(&A); Destroy_LU(n, &grid, &LUstruct); ScalePermstructFree(&ScalePermstruct); LUstructFree(&LUstruct); SUPERLU_FREE(b); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ out: superlu_gridexit(&grid); /* ------------------------------------------------------------ TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ MPI_Finalize(); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit main()"); #endif }
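/*
 * Illustration only: the driver above never inspects `info` after
 * pdgssvx_ABglobal returns.  Below is a minimal, hedged check following
 * the `info` convention documented for pdgstrf at the top of this file
 * (0 = success, -i = the i-th argument was illegal, i > 0 = U(i,i) is
 * exactly zero).  pdgssvx_ABglobal's own header documents further values
 * (e.g. memory-allocation failures), which are not decoded here.
 * A call such as report_info(iam, info, n) right after the solver call
 * would be one way to use it.
 */
static void report_info(int iam, int info, int_t n)
{
    if ( info == 0 ) return;                       /* successful exit */
    if ( !iam ) {
        if ( info < 0 )
            printf("ERROR: argument %d to the solver had an illegal value\n", -info);
        else if ( (int_t) info <= n )
            printf("WARNING: U(%d,%d) is exactly zero; the factor U is singular\n",
                   info, info);
        else
            printf("ERROR: solver returned info = %d "
                   "(see the pdgssvx_ABglobal header for its meaning)\n", info);
    }
}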
int main(int argc, char *argv[]) { superlu_options_t options; SuperLUStat_t stat; SuperMatrix A; ScalePermstruct_t ScalePermstruct; LUstruct_t LUstruct; SOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; doublecomplex *b, *xtrue; int_t m, n; int_t nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c; FILE *fp, *fopen(); extern int cpp_defs(); nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ nrhs = 1; /* Number of right-hand side. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { c = *(*cpp+1); ++cpp; switch (c) { case 'h': printf("Options:\n"); printf("\t-r <int>: process rows (default %d)\n", nprow); printf("\t-c <int>: process columns (default %d)\n", npcol); exit(0); break; case 'r': nprow = atoi(*cpp); break; case 'c': npcol = atoi(*cpp); break; } } else { /* Last arg is considered a filename */ if ( !(fp = fopen(*cpp, "r")) ) { ABORT("File does not exist"); } break; } } /* ------------------------------------------------------------ INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; if ( !iam ) printf("\tProcess grid\t%d X %d\n", grid.nprow, grid.npcol); #if ( VAMPIR>=1 ) VT_traceoff(); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); #endif /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid); if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); /* ------------------------------------------------------------ NOW WE SOLVE THE LINEAR SYSTEM. ------------------------------------------------------------*/ /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ParSymbFact = NO; options.ColPerm = MMD_AT_PLUS_A; options.RowPerm = LargeDiag; options.ReplaceTinyPivot = YES; options.IterRefine = DOUBLE; options.Trans = NOTRANS; options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; */ set_default_options_dist(&options); #if 0 options.RowPerm = NOROWPERM; options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; options.ReplaceTinyPivot = NO; #endif m = A.nrow; n = A.ncol; /* Initialize ScalePermstruct and LUstruct. */ ScalePermstructInit(m, n, &ScalePermstruct); LUstructInit(m, n, &LUstruct); /* Initialize the statistics variables. */ PStatInit(&stat); /* Call the linear equation solver. */ pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, nrhs, b, ldb, xtrue, ldx, &grid); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ /* ------------------------------------------------------------ DEALLOCATE STORAGE. 
------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); Destroy_LU(n, &grid, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { zSolveFinalize(&options, &SOLVEstruct); } SUPERLU_FREE(b); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ out: superlu_gridexit(&grid); /* ------------------------------------------------------------ TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ MPI_Finalize(); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit main()"); #endif }
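/*
 * Illustration only: in this complex-valued driver the right-hand side is
 * built by zcreate_matrix.  The hedged sketch below shows how a local
 * right-hand side could be filled by hand instead.  It assumes
 * SuperLU_DIST's doublecomplex type with fields .r and .i and the
 * doublecomplexMalloc_dist allocator; treat both as assumptions to verify
 * against dcomplex.h and the z memory routines.
 */
static doublecomplex *make_unit_rhs(int_t m_loc, int nrhs)
{
    doublecomplex *rhs = doublecomplexMalloc_dist(m_loc * nrhs);
    if ( !rhs ) ABORT("Malloc fails for rhs[]");
    for (int_t i = 0; i < m_loc * nrhs; ++i) {
        rhs[i].r = 1.0;     /* real part      */
        rhs[i].i = 0.0;     /* imaginary part */
    }
    return rhs;
}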