SPOOLESSolverMT::SPOOLESSolverMT(const SparseMatrix * A, int numThreads, int verbose) { n = A->Getn(); this->verbose = verbose; msgFile = fopen("SPOOLES.message","w"); // prepare SPOOLES input matrix if (verbose >= 1) printf("Converting matrix to SPOOLES format...\n"); InpMtx * mtxA = InpMtx_new(); InpMtx_init(mtxA, INPMTX_BY_ROWS, SPOOLES_REAL, A->GetNumEntries(), n); for(int row=0; row<n; row++) { int rowLength = A->GetRowLength(row); for(int j=0; j< rowLength; j++) { if (A->GetColumnIndex(row,j) >= row) InpMtx_inputRealEntry(mtxA, row, A->GetColumnIndex(row, j), A->GetEntry(row, j) ); } } InpMtx_changeStorageMode(mtxA, INPMTX_BY_VECTORS); //InpMtx_writeForHumanEye(mtxA, msgFile); // compute the factorization if (verbose >= 1) printf("Factoring the %d x %d matrix...\n",n,n); BridgeMT * bridgeMT = BridgeMT_new(); BridgeMT_setMatrixParams(bridgeMT, n, SPOOLES_REAL, SPOOLES_SYMMETRIC); BridgeMT_setMessageInfo(bridgeMT, 1, msgFile); int rc = BridgeMT_setup(bridgeMT, mtxA); if (rc != 1) { printf("Error: BridgeMT setup returned exit code %d.\n", rc); throw 1; } int type = 1; // real entries int nfront, nfind, nfent; double nfactorops; rc = BridgeMT_factorStats(bridgeMT, type, SPOOLES_SYMMETRIC, &nfront, &nfind, &nfent, &nsolveops, &nfactorops); if ( rc != 1 ) { printf("Error: BridgeMT_factorStats returned exit code %d.\n", rc); throw 1; } fprintf(msgFile, "\n\n factor matrix statistics" "\n %d fronts, %d indices, %d entries" "\n %d solve operations, %12.4e factor operations", nfront, nfind, nfent, nsolveops, nfactorops) ; fflush(msgFile) ; // setup the parallel factorization rc = BridgeMT_factorSetup(bridgeMT, numThreads, 0, 0.0) ; fprintf(msgFile, "\n\n ----- PARALLEL FACTOR SETUP -----\n") ; fprintf(msgFile, "\n CPU %8.3f : time to setup parallel factorization", bridgeMT->cpus[5]) ; fprintf(msgFile, "\n total factor operations = %.0f", DV_sum(bridgeMT->cumopsDV)) ; fprintf(msgFile, "\n upper bound on speedup due to load balance = %.2f", DV_sum(bridgeMT->cumopsDV)/DV_max(bridgeMT->cumopsDV)) ; fprintf(msgFile, "\n operations distributions over threads") ; DV_writeForHumanEye(bridgeMT->cumopsDV, msgFile) ; fflush(msgFile) ; // factor the matrix int permuteflag = 1 ; int error; rc = BridgeMT_factor(bridgeMT, mtxA, permuteflag, &error); if ( rc == 1 ) { fprintf(msgFile, "\n\n factorization completed successfully\n") ; } else { printf("Error: factorization returned exit code %d (error %d).\n", rc, error); throw 1; } fprintf(msgFile, "\n\n ----- FACTORIZATION -----\n") ; fprintf(msgFile, "\n CPU %8.3f : time to permute original matrix" "\n CPU %8.3f : time to initialize factor matrix" "\n CPU %8.3f : time to compute factorization" "\n CPU %8.3f : time to post-process factorization" "\n CPU %8.3f : total factorization time\n", bridgeMT->cpus[6], bridgeMT->cpus[7], bridgeMT->cpus[8], bridgeMT->cpus[9], bridgeMT->cpus[10]) ; fprintf(msgFile, "\n\n factorization statistics" "\n %d pivots, %d pivot tests, %d delayed vertices" "\n %d entries in D, %d entries in L, %d entries in U", bridgeMT->stats[0], bridgeMT->stats[1], bridgeMT->stats[2], bridgeMT->stats[3], bridgeMT->stats[4], bridgeMT->stats[5]) ; fprintf(msgFile, "\n\n factorization: raw mflops %8.3f, overall mflops %8.3f", 1.e-6*nfactorops/bridgeMT->cpus[8], 1.e-6*nfactorops/bridgeMT->cpus[10]) ; fflush(msgFile) ; // construct dense SPOOLES matrix for rhs and x DenseMtx *mtx_rhs = DenseMtx_new(); DenseMtx_init(mtx_rhs, SPOOLES_REAL, 0, 0, n, 1, 1, n); mtx_rhsPointer = (void*) mtx_rhs; DenseMtx *mtx_x = DenseMtx_new(); DenseMtx_init(mtx_x, SPOOLES_REAL, 0, 0, n, 1, 1, n); mtx_xPointer = (void*) mtx_x; bridgeMTPointer = (void*) bridgeMT; APointer = (void*) mtxA; if (verbose >= 1) printf("Factorization completed.\n"); }
/* ---------------------------------------------------------- purpose -- to construct the map from fronts to processors, and compute operations for each processor. maptype -- type of map for parallel factorization maptype = 1 --> wrap map maptype = 2 --> balanced map maptype = 3 --> subtree-subset map maptype = 4 --> domain decomposition map cutoff -- used when maptype = 4 as upper bound on relative domain size return value -- 1 -- success -1 -- bridge is NULL -2 -- front tree is NULL created -- 98sep25, cca ---------------------------------------------------------- */ int BridgeMPI_factorSetup ( BridgeMPI *bridge, int maptype, double cutoff ) { double t1, t2 ; DV *cumopsDV ; ETree *frontETree ; FILE *msgFile ; int msglvl, nproc ; /* --------------- check the input --------------- */ MARKTIME(t1) ; if ( bridge == NULL ) { fprintf(stderr, "\n error in BridgeMPI_factorSetup()" "\n bridge is NULL") ; return(-1) ; } if ( (frontETree = bridge->frontETree) == NULL ) { fprintf(stderr, "\n error in BridgeMPI_factorSetup()" "\n frontETree is NULL") ; return(-2) ; } nproc = bridge->nproc ; msglvl = bridge->msglvl ; msgFile = bridge->msgFile ; /* ------------------------------------------- allocate and initialize the cumopsDV object ------------------------------------------- */ if ( (cumopsDV = bridge->cumopsDV) == NULL ) { cumopsDV = bridge->cumopsDV = DV_new() ; } DV_setSize(cumopsDV, nproc) ; DV_zero(cumopsDV) ; /* ---------------------------- create the owners map object ---------------------------- */ switch ( maptype ) { case 1 : bridge->ownersIV = ETree_wrapMap(frontETree, bridge->type, bridge->symmetryflag, cumopsDV) ; break ; case 2 : bridge->ownersIV = ETree_balancedMap(frontETree, bridge->type, bridge->symmetryflag, cumopsDV) ; break ; case 3 : bridge->ownersIV = ETree_subtreeSubsetMap(frontETree, bridge->type, bridge->symmetryflag, cumopsDV) ; break ; case 4 : bridge->ownersIV = ETree_ddMap(frontETree, bridge->type, bridge->symmetryflag, cumopsDV, cutoff) ; break ; default : bridge->ownersIV = ETree_ddMap(frontETree, bridge->type, bridge->symmetryflag, cumopsDV, 1./(2*nproc)) ; break ; } MARKTIME(t2) ; bridge->cpus[7] = t2 - t1 ; if ( msglvl > 1 ) { fprintf(msgFile, "\n\n parallel factor setup") ; fprintf(msgFile, "\n type = %d, symmetryflag = %d", bridge->type, bridge->symmetryflag) ; fprintf(msgFile, "\n total factor operations = %.0f", DV_sum(cumopsDV)) ; fprintf(msgFile, "\n upper bound on speedup due to load balance = %.2f", DV_max(cumopsDV)/DV_sum(cumopsDV)) ; fprintf(msgFile, "\n operations distributions over threads") ; DV_writeForHumanEye(cumopsDV, msgFile) ; fflush(msgFile) ; } if ( msglvl > 2 ) { fprintf(msgFile, "\n\n owners map IV object") ; IV_writeForHumanEye(bridge->ownersIV, msgFile) ; fflush(msgFile) ; } /* ---------------------------- create the vertex map object ---------------------------- */ bridge->vtxmapIV = IV_new() ; IV_init(bridge->vtxmapIV, bridge->neqns, NULL) ; IVgather(bridge->neqns, IV_entries(bridge->vtxmapIV), IV_entries(bridge->ownersIV), ETree_vtxToFront(bridge->frontETree)) ; if ( msglvl > 2 ) { fprintf(msgFile, "\n\n vertex map IV object") ; IV_writeForHumanEye(bridge->vtxmapIV, msgFile) ; fflush(msgFile) ; } return(1) ; }
/* ------------------------------------------------------------------ to fill xDV and yDV with a log10 profile of the magnitudes of the entries in the DV object. tausmall and tau big provide cutoffs within which to examine the entries. pnzero, pnsmall and pnbig are addresses to hold the number of entries zero, smaller than tausmall and larger than taubig, respectively. created -- 97feb14, cca ------------------------------------------------------------------ */ void DV_log10profile ( DV *dv, int npts, DV *xDV, DV *yDV, double tausmall, double taubig, int *pnzero, int *pnsmall, int *pnbig ) { double deltaVal, maxval, minval, val ; double *dvec, *sums, *x, *y ; int ii, ipt, nbig, nsmall, nzero, size ; /* --------------- check the input --------------- */ if ( dv == NULL || npts <= 0 || xDV == NULL || yDV == NULL || tausmall < 0.0 || taubig < 0.0 || tausmall > taubig || pnzero == NULL || pnsmall == NULL || pnbig == NULL ) { fprintf(stderr, "\n fatal error in DV_log10profile(%p,%d,%p,%p,%f,%f,%p,%p,%p)" "\n bad input\n", dv, npts, xDV, yDV, tausmall, taubig, pnzero, pnsmall, pnbig) ; exit(-1) ; } /* ------------------------------------- find the largest and smallest entries in the range [tausmall, taubig] ------------------------------------- */ nbig = nsmall = nzero = 0 ; minval = maxval = 0.0 ; DV_sizeAndEntries(dv, &size, &dvec) ; for ( ii = 0 ; ii < size ; ii++ ) { val = fabs(dvec[ii]) ; if ( val == 0.0 ) { nzero++ ; } else if ( val <= tausmall ) { nsmall++ ; } else if ( val >= taubig ) { nbig++ ; } else { if ( minval == 0.0 || minval > val ) { minval = val ; } if ( maxval < val ) { maxval = val ; } } } *pnzero = nzero ; *pnsmall = nsmall ; *pnbig = nbig ; #if MYDEBUG > 0 fprintf(stdout, "\n nzero = %d, minval = %e, nsmall = %d, maxval = %e, nbig = %d", nzero, minval, nsmall, maxval, nbig) ; #endif /* ------------------ set up the buckets ------------------ */ DV_setSize(xDV, npts) ; DV_setSize(yDV, npts) ; x = DV_entries(xDV) ; y = DV_entries(yDV) ; sums = DVinit(npts, 0.0) ; minval = log10(minval) ; maxval = log10(maxval) ; /* minval = log10(tausmall) ; maxval = log10(taubig) ; */ deltaVal = (maxval - minval)/(npts - 1) ; DVfill(npts, x, 0.0) ; DVfill(npts, y, 0.0) ; /* -------------------------------- fill the sums and counts vectors -------------------------------- */ for ( ii = 0 ; ii < size ; ii++ ) { val = fabs(dvec[ii]) ; if ( tausmall < val && val < taubig ) { ipt = (log10(val) - minval) / deltaVal ; sums[ipt] += val ; y[ipt]++ ; } } #if MYDEBUG > 0 fprintf(stdout, "\n sum(y) = %.0f", DV_sum(yDV)) ; #endif /* --------------------------- set the x-coordinate vector --------------------------- */ for ( ipt = 0 ; ipt < npts ; ipt++ ) { if ( sums[ipt] == 0.0 ) { x[ipt] = minval + ipt*deltaVal ; } else { x[ipt] = log10(sums[ipt]/y[ipt]) ; } } /* ------------------------ free the working storage ------------------------ */ DVfree(sums) ; return ; }