void runJadMatrixTests(Epetra_JadMatrix * A, Epetra_MultiVector * b, Epetra_MultiVector * bt, Epetra_MultiVector * xexact, bool StaticProfile, bool verbose, bool summary) { Epetra_MultiVector z(*b); Epetra_MultiVector r(*b); Epetra_SerialDenseVector resvec(b->NumVectors()); //Timings Epetra_Flops flopcounter; A->SetFlopCounter(flopcounter); Epetra_Time timer(A->Comm()); for (int j=0; j<2; j++) { // j = 0 is notrans, j = 1 is trans bool TransA = (j==1); A->SetUseTranspose(TransA); flopcounter.ResetFlops(); timer.ResetStartTime(); //10 matvecs for( int i = 0; i < 10; ++i ) A->Apply(*xexact, z); // Compute z = A*xexact or z = A'*xexact double elapsed_time = timer.ElapsedTime(); double total_flops = A->Flops(); // Compute residual if (TransA) r.Update(-1.0, z, 1.0, *bt, 0.0); // r = bt - z else r.Update(-1.0, z, 1.0, *b, 0.0); // r = b - z r.Norm2(resvec.Values()); if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": "; double MFLOPs = total_flops/elapsed_time/1000000.0; if (verbose) cout << "Total MFLOPs for 10 " << " Jagged Diagonal MatVec's with (Trans = " << TransA << ") " << MFLOPs << " (" << elapsed_time << " s)" <<endl; if (summary) { if (A->Comm().NumProc()==1) { if (TransA) cout << "TransMv" << '\t'; else cout << "NoTransMv" << '\t'; } cout << MFLOPs << endl; } } return; }
//M+ void mp( int MinCoreSize, int MaxCoreSize, int SamplingFreq, int NumReplicates, char* OutFilePath, std::string Kernel, vector<int> KernelAccessionIndex, vector<int> AccessionNameList, vector<vector<vector<int> > > ActiveAlleleByPopList, vector<vector<vector<int> > > TargetAlleleByPopList, vector<int> ActiveMaxAllelesList, vector<int> TargetMaxAllelesList, vector<std::string> FullAccessionNameList ) { //PERFORM INITIAL MPI STUFF MPI_Status status; //this struct contains three fields which will contain info about the sender of a received message // MPI_SOURCE, MPI_TAG, MPI_ERROR //MPI::Init (); //Initialize MPI. int nproc = MPI::COMM_WORLD.Get_size ( ); //Get the number of processes. int procid = MPI::COMM_WORLD.Get_rank ( ); //Get the individual process ID. //set up vectors to fill with results //below is a stupid way to calculate the number of rows in the output file, value l (which = V1) //used to monitor progress and as the maximum vector index for shared output vectors int l=0; for (int i=MinCoreSize;i<MaxCoreSize+1;i=i+SamplingFreq) { for (int j=0;j<NumReplicates;j++) { l++; } } double V1 = (double)l; //(MaxCoreSize - MinCoreSize + 1)*NumReplicates; //number of rows in output vectors vector<vector<double> > Results(V1, vector<double>(9)); //will contain numerical results vector<vector<string> > Members(V1); //will contain core set members //***MPI: RECEIVE RESULTS AT MASTER 0 //receive values from any slave, in any order, exiting when the number of 'receives' = the top vector size if ( procid == 0 ) { //set up variables for monitoring progress int percent; //percent of analysis completed int progindex = 0; //index to monitor progress, percent = 100*(progindex/l) //receive and process results from slave processors unsigned int i = 0; while (i<2*(Results.size())) //two receives per row { //probe the incoming message to determine its tag int nchar; //will contain the length of the char array passed with tag=1 int vchar; //will contain the length of the vector passed with tag=0 int tag; //tag of message from sender int source; //procid of sender MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); //MPI_Get_count(&status, MPI_CHAR, &nchar); //probes the length of the message, saves it in nchar tag = status.MPI_TAG; //the tag defines which kind of comm it is, a vector of stats (0=resvec()) //or a char array describing the members of the core (1=cc) source = status.MPI_SOURCE; //determine the source of the message so that you can define which sender to Recv from. This will avoid an intervening message coming in after the MPI_Probe with a different length, causing a message truncated error. 
if (tag == 0) { //determine the length of the message tagged 0 MPI_Get_count(&status, MPI_DOUBLE, &vchar); //cout <<" vchar="<<vchar<<" tag="<<tag<<" MPI_SOURCE="<<status.MPI_SOURCE<<" MPI_ERROR="<<status.MPI_ERROR<<"\n"; //receive the vector of results, tagged 0, from: //MPI_Send(&resvec[0], resvec.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); vector<double> t(10); MPI_Recv(&t[0], vchar, MPI_DOUBLE, source, 0, MPI_COMM_WORLD, &status); //load data from vector received onto Results, row number is last item t[9] for (int j=0;j<9;++j) { Results[ t[9] ][j] = t[j]; } t.clear(); } else if (tag == 1) { //determine the length of the message tagged 1 MPI_Get_count(&status, MPI_CHAR, &nchar); //probes the length of the message, saves it in nchar //cout <<" nchar="<<nchar<<" tag="<<tag<<" MPI_SOURCE="<<status.MPI_SOURCE<<" MPI_ERROR="<<status.MPI_ERROR<<"\n"; //receive the vector<string> of the core set, tagged 1, from: //MPI_Send(&m[0], nchar, MPI_CHAR, 0, 1, MPI_COMM_WORLD); //vector<string> m(nchar); char m[nchar]; MPI_Recv(&m[0], nchar, MPI_CHAR, source, 1, MPI_COMM_WORLD, &status); //load core set onto Members //1. convert char array into a string string mstr(m); //2. split string on delimiter ',<!>,' string delim = ",<!>,"; vector<string> mvec( countSubstring(mstr, delim) ); unsigned int st = 0; std::size_t en = mstr.find(delim); int k = 0; while (en != std::string::npos) { mvec[k] = mstr.substr(st, en-st); st = en + delim.length(); en = mstr.find(delim,st); ++k; } string z = mstr.substr(st); //get row number as last item in mstr int zz = atoi(z.c_str()); //convert string to c-string then to int //3. load onto Members Members[zz] = mvec; //4. clean up memset(m, 0, nchar);; mstr=""; mvec.clear(); } ++i; //display progress progindex = progindex + 1; percent = 100*( progindex/(V1*2) ); //number of rows X 2 repeats needed to complete search printProgBar(percent); } }//***MPI: END MASTER RECEIVE***/ /***MPI: SEND RESULTS FROM SLAVE PROCESSES***/ else if ( procid != 0 ) { unsigned int r; //r = core size, //int nr, RandAcc, b, bsc, plateau; //nr = controller to repeat NumReplicates times int RandAcc, b, bsc, plateau; //nr = controller to repeat NumReplicates times //row = result vector row number, bsc = holds best sub core member, and other indexed accessions //plateau = index of the number of reps in optimization loop with same diversity value double RandomActiveDiversity; double AltRandomActiveDiversity; double StartingRandomActiveDiversity; double StartingAltRandomActiveDiversity; double RandomTargetDiversity; double AltRandomTargetDiversity; double StartingDiversity; double TempAltOptimizedActiveDiversity; double AltOptimizedActiveDiversity; double OptimizedTargetDiversity; double AltOptimizedTargetDiversity; double best; double nnew; vector<vector<vector<int> > > AlleleList; vector<vector<vector<int> > > CoreAlleles; vector<vector<vector<int> > > TdTempList; vector<vector<vector<int> > > BestSubCoreAlleles; std::string Standardize = "yes"; //a run that mimics the MSTRAT approach can be accomplished by setting Standardize="no", and setting up the var file so that each column in the .dat file is treated as a single locus, rather than two (or more) adjacent columns being treated as a single codominant locus. 
vector<int> AccessionsInCore; vector<int> AccessionsInSubCore; vector<int> BestSubCore; vector<int> BestSubCoreRevSorted; vector<int> TempList; vector<int> TempList2; vector<int> bestcore; vector<std::string> TempListStr; //seed the random number generator for each processor int tt; tt = (time(NULL)); srand ( abs(((tt*181)*((procid-83)*359))%104729) ); //do parallelization so that each rep by core size combo can be //handled by a distinct thread. this involves figuring out the total //number of reps*coresizes taking into account the SamplingFreq int rsteps = 1 + floor( (MaxCoreSize - MinCoreSize) / SamplingFreq ); //number of steps from MinCoreSize to MaxCoreSize //***MPI: figure out where to start and stop loop for each processor int nreps = rsteps*NumReplicates; int count = nreps/(nproc-1); //p-1 assumes a master, i.e. one less processor than total int start = (procid-1) * count; //procid-1 makes you start at 0, assumes master is p0 int stop; if (nreps % (nproc-1) > (procid-1)) { start += procid - 1; stop = start + (count + 1); } else { start += nreps % (nproc-1); stop = start + count; } //iterate thru the relevant rows for (int rnr=start;rnr<stop;++rnr) { r = MinCoreSize + ((rnr / NumReplicates) * SamplingFreq); //int rounds to floor //develop random starting core set //clear AccessionsInCore and set size AccessionsInCore.clear(); AccessionsInCore.resize(r); //add kernel accessions to core, if necessary if (Kernel == "yes") { for (unsigned int i=0;i<KernelAccessionIndex.size();i++) { AccessionsInCore[i] = KernelAccessionIndex[i]; } } //clear TempList and set size TempList.clear(); TempList.resize( AccessionNameList.size() ); //set list of available accessions in TempList, by erasing those already in the core TempList = AccessionNameList; //expunge the kernel accessions, so they are not available for random addition below //KernelAccessionIndex has been reverse sorted so you don't go outside range after automatic resize by .erase for (unsigned int i=0;i<KernelAccessionIndex.size();i++) { b = KernelAccessionIndex[i]; TempList.erase(TempList.begin()+b); } //randomly add accessions until r accessions are in the core. if there is a kernel, include those (done above) //plus additional, randomly selected accessions, until you get r accessions for (unsigned int i=KernelAccessionIndex.size();i<r;i++) { //choose an accession randomly from those available RandAcc = rand() % TempList.size(); //add it to the list AccessionsInCore[i] = TempList[RandAcc]; //remove it from the list of available accessions TempList.erase(TempList.begin()+RandAcc); } //assemble genotypes for random core and calculate diversity //1. put together initial list of active alleles CoreAlleles.clear(); CoreAlleles.resize( AccessionsInCore.size() ); for (unsigned int i=0;i<AccessionsInCore.size();i++) { b = AccessionsInCore[i]; CoreAlleles[i] = ActiveAlleleByPopList[b]; } //2. calculate diversity from random selection at active loci AlleleList.clear(); AlleleList = CoreAlleles; MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity); //in MyCalculateDiversity, latter two variables are updated as references //save them away in non-updated variables StartingRandomActiveDiversity = RandomActiveDiversity; StartingAltRandomActiveDiversity = AltRandomActiveDiversity; //3. 
calculate diversity from random selection at target loci AlleleList.clear(); AlleleList.resize( AccessionsInCore.size() ); for (unsigned int j=0;j<AccessionsInCore.size();j++) { b = AccessionsInCore[j]; AlleleList[j] = TargetAlleleByPopList[b]; } MyCalculateDiversity(AlleleList, TargetMaxAllelesList, Standardize, RandomTargetDiversity, AltRandomTargetDiversity); //BEGIN OPTIMIZATION StartingDiversity = 0; //this is the diversity recovered during the prior iteration. plateau = 0; //count of the number of times you have found the best value, evaluates when you are //stuck on a plateau, assuming acceptance criterion allows downhill steps //this is the iterations step, now an indefinite loop that is broken when //no improvement is made during the course of the optimization algorithm //If r = kernel size = MinCoreSize then do no optimization but still calculate all variables. if (KernelAccessionIndex.size() == r) { //assemble genotypes for core //1. put together initial list CoreAlleles.clear(); CoreAlleles.resize(r); for (unsigned int i=0;i<r;i++) { b = AccessionsInCore[i]; CoreAlleles[i] = ActiveAlleleByPopList[b]; } AlleleList = CoreAlleles; MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity); best = RandomActiveDiversity; //best is equivalent to OptimizedActiveDiversity AltOptimizedActiveDiversity = AltRandomActiveDiversity; } else { //do optimization while ( true ) { //assemble genotypes for core //1. put together initial list CoreAlleles.clear(); CoreAlleles.resize(r); for (unsigned int i=0;i<r;i++) { b = AccessionsInCore[i]; CoreAlleles[i] = ActiveAlleleByPopList[b]; } //2. go through all possible subsets of size r-1, one at a time, noting which is best. //If there is a kernel, do not swap out any of those accessions (they are retained as the //first KernelAccessionIndex.size() items in CoreAlleles). Accomplished by starting for loop //at KernelAccessionIndex.size(). best=0; for (unsigned int i=KernelAccessionIndex.size();i<CoreAlleles.size();i++) { //remove each item consecutively from the list of all populations in the core AlleleList.clear(); TdTempList.clear(); TdTempList = CoreAlleles; //swap to temporary vector TdTempList.erase( TdTempList.begin() + i); AlleleList = TdTempList; TempList2.clear(); TempList2 = AccessionsInCore; TempList2.erase(TempList2.begin() + i); AccessionsInSubCore = TempList2; /*Data structure for SubCoreAlleles: SubCore 1..r Population 1..(r-1) AlleleArray 1..NumLoci --3. fuse alleles from the same locus into a single array, for all accessions, for the current subcore --4. assemble a list of diversity (M) for each locus separately --5. standardize the M values to the maximum possible number of alleles at that locus, and add them up to get final estimate of standardized allelic diversity in the core. then divide by the number of loci to get a number that is comparable across data sets. --5.5. 
simultaneous to the calculation, keep track of which subcore is best */ MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity); nnew = RandomActiveDiversity; if (nnew >= best) // >= allows sideways movement during hill climbing { best = nnew; BestSubCore.clear(); BestSubCore = AccessionsInSubCore; BestSubCoreAlleles.clear(); BestSubCoreAlleles = AlleleList; } } //for loop cycles thru all subcores //reverse sort BestSubCore to support easy assembly of pared TempList below BestSubCoreRevSorted = BestSubCore; std::sort(BestSubCoreRevSorted.begin(), BestSubCoreRevSorted.end(), std::greater<int>()); /* 6. take the subcore with greatest diversity and consecutively add each possible additional accession from the base collection. find the core of size r (not r-1 subcore) that has the greatest diversity. suppress the IDs of those accessions found in the BestSubCore from the list of all accessions to get a list of remaining accessions.*/ TempList = AccessionNameList; for (unsigned int k=0;k<BestSubCoreRevSorted.size();k++) { bsc = BestSubCoreRevSorted[k]; TempList.erase( TempList.begin() + bsc ); } //shuffle the list of remaining accessions, so addition order is not predictable std::random_shuffle (TempList.begin(), TempList.end()); //add each remaining accession consecutively, calculate diversity, test //whether it is better than the prior one best = 0; for (unsigned int k=0;k<TempList.size();k++) { bsc = TempList[k]; //define the core TempList2 = BestSubCore; TempList2.resize( TempList2.size() + 1 ); //TempList2.push_back(i); TempList2[TempList2.size()-1] = bsc; //add new accession to last vector element AccessionsInCore = TempList2; //assemble the allelelist for the core TdTempList = BestSubCoreAlleles; TdTempList.resize( TdTempList.size() + 1 ); //TdTempList.push_back( ActiveAlleleByPopList[i] ); TdTempList[TdTempList.size()-1] = ActiveAlleleByPopList[bsc]; AlleleList = TdTempList; //calculate diversity MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, nnew, TempAltOptimizedActiveDiversity); //test whether current diversity is higher than the best diversity found so far if (nnew >= best) // >= allows sideways movement during hill climbing { best = nnew; bestcore = AccessionsInCore; //save the alternative diversity value for the best core AltOptimizedActiveDiversity = TempAltOptimizedActiveDiversity; } } AccessionsInCore = bestcore; //define starting variable for next MSTRAT iteration //if there has been no improvement from the prior iteration, you have reached // the plateau and should exit the repeat if (best == StartingDiversity) { plateau++; if (plateau > 0) break; } //update starting value and repeat else if (best > StartingDiversity) StartingDiversity = best; } //while(true) endless loop } //7. Calculate diversity at target loci //assemble the target loci allelelist for the accessions in the best core AlleleList.clear(); AlleleList.resize( AccessionsInCore.size() ); for (unsigned int j=0;j<AccessionsInCore.size();j++) { b = AccessionsInCore[j]; AlleleList[j] = TargetAlleleByPopList[b]; } //calculate diversity at target loci based upon the optimized core selection MyCalculateDiversity(AlleleList, TargetMaxAllelesList, Standardize, OptimizedTargetDiversity, AltOptimizedTargetDiversity); //8. 
Assemble stats for optimized core and add to output vectors //create a list of accession names from the list of accession ID's in AccessionsInCore sort( AccessionsInCore.begin(), AccessionsInCore.end() ); TempListStr.clear(); TempListStr.resize(r); for (unsigned int i=0;i<AccessionsInCore.size();i++) { b = AccessionsInCore[i]; TempListStr[i] = FullAccessionNameList[b]; } /***MPI: BUILD & SEND RESULTS VECTOR***/ //load the variables onto the results vectors //no need to calculate row number, it is the same as rnr, formula saved because it might be useful later //row = ((r - MinCoreSize)*NumReplicates) + nr - ( (NumReplicates*(SamplingFreq-1))*( (r-MinCoreSize)/SamplingFreq ) ); // (r - MinCoreSize)*NumReplicates) + nr specifies row number if SamplingFreq=1 // (NumReplicates*(SamplingFreq-1)) specifies a step value to correct when SamplingFreq>1 // ( (r-MinCoreSize)/SamplingFreq ) specifies the replicate on core size, accounting for SamplingFreq // see file Calculation of row value.xlsx for development of the 'row' index //put results 0-8 into a vector, resvec, return row as last item vector<double> resvec(10); resvec[0] = double(r); resvec[1] = StartingRandomActiveDiversity;//RandomActiveDiversity; resvec[2] = best; //equivalent to OptimizedActiveDiversity resvec[3] = RandomTargetDiversity; resvec[4] = OptimizedTargetDiversity; resvec[5] = StartingAltRandomActiveDiversity;//AltRandomActiveDiversity; resvec[6] = AltOptimizedActiveDiversity; resvec[7] = AltRandomTargetDiversity; resvec[8] = AltOptimizedTargetDiversity; resvec[9] = double(rnr); //cout<<"MPI_Rank="<<MPI_Rank<<" //send result vector to master 0, send row number, rnr, as last element. //message is tagged as 0 //here you are pointing to the first element, then returning resvec.size() doubles- //worth of memory from that starting location. 
MPI_Send(&resvec[0], resvec.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); /***MPI: END BUILD & SEND RESULTS VECTOR***/ /***MPI: BUILD & SEND MEMBERS VECTOR***/ //add row number as last item in TempListStr TempListStr.resize(TempListStr.size()+1); stringstream ss; ss << rnr; //convert int to stringstream to string TempListStr[ TempListStr.size() - 1 ] = ss.str(); //convert vector<string> to a single, ',<!>,' delimited, string string concat; for (unsigned int i=0;i<TempListStr.size();++i) { concat += TempListStr[i]; //add vector element if (i<TempListStr.size()-1) concat += ",<!>,"; //add delimiter, except for last item } //convert the string to a char array char cc[concat.size()+1]; strcpy(cc, concat.c_str()); //send the char array to master0 tagged as 1 //tagged as 1 to distinguish from result vector send MPI_Send(&cc, sizeof(cc), MPI_CHAR, 0, 1, MPI_COMM_WORLD); } //end for loop over rows } //***MPI: END SEND /*MPI: MASTER 0 WRITES OUTPUT*/ if ( procid == 0 ) { //set up file stream for output file ofstream output; output.open(OutFilePath); output.close(); //quick open close done to clear any existing file each time program is run output.open(OutFilePath, ios::out | ios::app); //open file in append mode output << "core size random reference diversity optimized reference diversity random target diversity optimized target diversity alt random reference diversity alt optimized reference diversity alt random target diversity alt optimized target diversity core members" << "\n"; //write out results row by row for (int i=0;i<V1;i++) { //write variables output << Results[i][0] << " " << Results[i][1] << " " << Results[i][2] << " " << Results[i][3] << " " << Results[i][4] << " " << Results[i][5] << " " << Results[i][6] << " " << Results[i][7] << " " << Results[i][8] << " " << "("; //write Accessions retained for (unsigned int j=0;j<Members[i].size();j++) { if ( j==(Members[i].size() - 1) ) { //add trailing parentheses and move to next row output << Members[i][j] << ")\n"; } else { output << Members[i][j] << ","; } } } //wrap up write step output.close(); } /***MPI: END MASTER WRITE***/ //Terminate MPI. //MPI::Finalize ( ); }
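/*
 * A minimal, self-contained sketch of the probe-then-receive pattern used by the master
 * loop in mp() above: MPI_Probe pins down the sender and tag first, MPI_Get_count sizes
 * the buffer, and MPI_Recv is then issued against that exact source/tag so an intervening
 * message of a different length cannot cause a "message truncated" error. This is an
 * illustrative example, not part of the program; the tag convention (0 = vector of
 * doubles, 1 = delimited char array) mirrors the one used in mp().
 */
#include <mpi.h>
#include <vector>
#include <string>

static void probe_then_receive_once()
{
  MPI_Status status;
  // Block until any message from any sender is available, but do not receive it yet.
  MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
  const int source = status.MPI_SOURCE; // lock onto this sender ...
  const int tag = status.MPI_TAG;       // ... and this tag for the matching Recv

  if (tag == 0) {                       // numeric results: a vector of doubles
    int count = 0;
    MPI_Get_count(&status, MPI_DOUBLE, &count);
    std::vector<double> values(count);
    MPI_Recv(values.data(), count, MPI_DOUBLE, source, 0, MPI_COMM_WORLD, &status);
  } else if (tag == 1) {                // core-set members: a delimited char array
    int count = 0;
    MPI_Get_count(&status, MPI_CHAR, &count);
    std::vector<char> text(count);
    MPI_Recv(text.data(), count, MPI_CHAR, source, 1, MPI_COMM_WORLD, &status);
    std::string members(text.begin(), text.end()); // split on the ",<!>," delimiter as in mp()
  }
}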
//========================================================================================= void runLUMatrixTests(Epetra_CrsMatrix * L, Epetra_MultiVector * bL, Epetra_MultiVector * btL, Epetra_MultiVector * xexactL, Epetra_CrsMatrix * U, Epetra_MultiVector * bU, Epetra_MultiVector * btU, Epetra_MultiVector * xexactU, bool StaticProfile, bool verbose, bool summary) { if (L->NoDiagonal()) { bL->Update(1.0, *xexactL, 1.0); // Add contribution of a unit diagonal to bL btL->Update(1.0, *xexactL, 1.0); // Add contribution of a unit diagonal to btL } if (U->NoDiagonal()) { bU->Update(1.0, *xexactU, 1.0); // Add contribution of a unit diagonal to bU btU->Update(1.0, *xexactU, 1.0); // Add contribution of a unit diagonal to btU } Epetra_MultiVector z(*bL); Epetra_MultiVector r(*bL); Epetra_SerialDenseVector resvec(bL->NumVectors()); //Timings Epetra_Flops flopcounter; L->SetFlopCounter(flopcounter); U->SetFlopCounter(flopcounter); Epetra_Time timer(L->Comm()); std::string statdyn = "dynamic"; if (StaticProfile) statdyn = "static "; for (int j=0; j<4; j++) { // j = 0/2 is notrans, j = 1/3 is trans bool TransA = (j==1 || j==3); std::string contig = "without"; if (j>1) contig = "with "; if (j==2) { L->OptimizeStorage(); U->OptimizeStorage(); } flopcounter.ResetFlops(); timer.ResetStartTime(); //10 lower solves bool Upper = false; bool UnitDiagonal = L->NoDiagonal(); // If no diagonal, then unit must be used Epetra_MultiVector * b = TransA ? btL : bL; // solve with the appropriate b vector for( int i = 0; i < 10; ++i ) L->Solve(Upper, TransA, UnitDiagonal, *b, z); // Solve Lz = bL or L'z = bLt double elapsed_time = timer.ElapsedTime(); double total_flops = L->Flops(); // Compute residual r.Update(-1.0, z, 1.0, *xexactL, 0.0); // r = bt - z r.Norm2(resvec.Values()); if (resvec.NormInf()>0.000001) { cout << "resvec = " << resvec << endl; cout << "z = " << z << endl; cout << "xexactL = " << *xexactL << endl; cout << "r = " << r << endl; } if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": "; double MFLOPs = total_flops/elapsed_time/1000000.0; if (verbose) cout << "Total MFLOPs for 10 " << " Lower solves " << statdyn << " Profile (Trans = " << TransA << ") and " << contig << " opt storage = " << MFLOPs << " (" << elapsed_time << " s)" <<endl; if (summary) { if (L->Comm().NumProc()==1) { if (TransA) cout << "TransLSv" << statdyn<< "Prof" << contig << "OptStor" << '\t'; else cout << "NoTransLSv" << statdyn << "Prof" << contig << "OptStor" << '\t'; } cout << MFLOPs << endl; } flopcounter.ResetFlops(); timer.ResetStartTime(); //10 upper solves Upper = true; UnitDiagonal = U->NoDiagonal(); // If no diagonal, then unit must be used b = TransA ? 
btU : bU; // solve with the appropriate b vector for( int i = 0; i < 10; ++i ) U->Solve(Upper, TransA, UnitDiagonal, *b, z); // Solve Lz = bL or L'z = bLt elapsed_time = timer.ElapsedTime(); total_flops = U->Flops(); // Compute residual r.Update(-1.0, z, 1.0, *xexactU, 0.0); // r = bt - z r.Norm2(resvec.Values()); if (resvec.NormInf()>0.001) { cout << "U = " << *U << endl; //cout << "resvec = " << resvec << endl; cout << "z = " << z << endl; cout << "xexactU = " << *xexactU << endl; //cout << "r = " << r << endl; cout << "b = " << *b << endl; } if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": "; MFLOPs = total_flops/elapsed_time/1000000.0; if (verbose) cout << "Total MFLOPs for 10 " << " Upper solves " << statdyn << " Profile (Trans = " << TransA << ") and " << contig << " opt storage = " << MFLOPs <<endl; if (summary) { if (L->Comm().NumProc()==1) { if (TransA) cout << "TransUSv" << statdyn<< "Prof" << contig << "OptStor" << '\t'; else cout << "NoTransUSv" << statdyn << "Prof" << contig << "OptStor" << '\t'; } cout << MFLOPs << endl; } } return; }
void runMatrixTests(Epetra_CrsMatrix * A, Epetra_MultiVector * b, Epetra_MultiVector * bt, Epetra_MultiVector * xexact, bool StaticProfile, bool verbose, bool summary) { Epetra_MultiVector z(*b); Epetra_MultiVector r(*b); Epetra_SerialDenseVector resvec(b->NumVectors()); //Timings Epetra_Flops flopcounter; A->SetFlopCounter(flopcounter); Epetra_Time timer(A->Comm()); std::string statdyn = "dynamic"; if (StaticProfile) statdyn = "static "; for (int j=0; j<4; j++) { // j = 0/2 is notrans, j = 1/3 is trans bool TransA = (j==1 || j==3); std::string contig = "without"; if (j>1) contig = "with "; #ifdef EPETRA_SHORT_PERFTEST int kstart = 1; #else int kstart = 0; #endif for (int k=kstart; k<2; k++) { // Loop over old multiply vs. new multiply std::string oldnew = "old"; if (k>0) oldnew = "new"; if (j==2) A->OptimizeStorage(); flopcounter.ResetFlops(); timer.ResetStartTime(); if (k==0) { //10 matvecs #ifndef EPETRA_SHORT_PERFTEST for( int i = 0; i < 10; ++i ) A->Multiply1(TransA, *xexact, z); // Compute z = A*xexact or z = A'*xexact using old Multiply method #endif } else { //10 matvecs for( int i = 0; i < 10; ++i ) A->Multiply(TransA, *xexact, z); // Compute z = A*xexact or z = A'*xexact } double elapsed_time = timer.ElapsedTime(); double total_flops = A->Flops(); // Compute residual if (TransA) r.Update(-1.0, z, 1.0, *bt, 0.0); // r = bt - z else r.Update(-1.0, z, 1.0, *b, 0.0); // r = b - z r.Norm2(resvec.Values()); if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": "; double MFLOPs = total_flops/elapsed_time/1000000.0; if (verbose) cout << "Total MFLOPs for 10 " << oldnew << " MatVec's with " << statdyn << " Profile (Trans = " << TransA << ") and " << contig << " optimized storage = " << MFLOPs << " (" << elapsed_time << " s)" <<endl; if (summary) { if (A->Comm().NumProc()==1) { if (TransA) cout << "TransMv" << statdyn<< "Prof" << contig << "OptStor" << '\t'; else cout << "NoTransMv" << statdyn << "Prof" << contig << "OptStor" << '\t'; } cout << MFLOPs << endl; } } } return; }
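/*
 * The timing idiom shared by runMatrixTests, runJadMatrixTests and runLUMatrixTests,
 * pulled out as a small reference sketch: attach an Epetra_Flops counter to the operator,
 * reset the counter and an Epetra_Time stopwatch, run the kernel several times, then
 * report MFLOPs = flops / seconds / 1.0e6. Illustrative only; it assumes an
 * already-constructed matrix A and compatible vectors x, y as in the tests above.
 */
static double timedMatvecMFLOPs(Epetra_CrsMatrix & A, const Epetra_MultiVector & x,
                                Epetra_MultiVector & y, int ntrials = 10)
{
  Epetra_Flops flopcounter;          // accumulates floating-point operation counts
  A.SetFlopCounter(flopcounter);     // every Multiply on A now contributes to the counter
  Epetra_Time timer(A.Comm());       // wall-clock timer tied to the communicator

  flopcounter.ResetFlops();
  timer.ResetStartTime();
  for (int i = 0; i < ntrials; ++i)
    A.Multiply(false, x, y);         // y = A*x, repeated to smooth out timing noise

  double elapsed_time = timer.ElapsedTime();
  double total_flops = A.Flops();    // flops accumulated since the last reset
  return total_flops / elapsed_time / 1000000.0;
}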
int main(int argc, char *argv[]) { int ierr = 0; double elapsed_time; double total_flops; double MFLOPs; #ifdef EPETRA_MPI // Initialize MPI MPI_Init(&argc,&argv); Epetra_MpiComm comm( MPI_COMM_WORLD ); #else Epetra_SerialComm comm; #endif bool verbose = false; bool summary = false; // Check if we should print verbose results to standard out if (argc>6) if (argv[6][0]=='-' && argv[6][1]=='v') verbose = true; // Check if we should print verbose results to standard out if (argc>6) if (argv[6][0]=='-' && argv[6][1]=='s') summary = true; if(argc < 6) { cerr << "Usage: " << argv[0] << " NumNodesX NumNodesY NumProcX NumProcY NumPoints [-v|-s]" << endl << "where:" << endl << "NumNodesX - Number of mesh nodes in X direction per processor" << endl << "NumNodesY - Number of mesh nodes in Y direction per processor" << endl << "NumProcX - Number of processors to use in X direction" << endl << "NumProcY - Number of processors to use in Y direction" << endl << "NumPoints - Number of points to use in stencil (5, 9 or 25 only)" << endl << "-v|-s - (Optional) Run in verbose mode if -v present or summary mode if -s present" << endl << " NOTES: NumProcX*NumProcY must equal the number of processors used to run the problem." << endl << endl << " Serial example:" << endl << argv[0] << " 16 12 1 1 25 -v" << endl << " Run this program in verbose mode on 1 processor using a 16 X 12 grid with a 25 point stencil."<< endl <<endl << " MPI example:" << endl << "mpirun -np 32 " << argv[0] << " 10 12 4 8 9 -v" << endl << " Run this program in verbose mode on 32 processors putting a 10 X 12 subgrid on each processor using 4 processors "<< endl << " in the X direction and 8 in the Y direction. Total grid size is 40 points in X and 96 in Y with a 9 point stencil."<< endl << endl; return(1); } //char tmp; //if (comm.MyPID()==0) cout << "Press any key to continue..."<< endl; //if (comm.MyPID()==0) cin >> tmp; //comm.Barrier(); comm.SetTracebackMode(0); // This should shut down any error traceback reporting if (verbose && comm.MyPID()==0) cout << Epetra_Version() << endl << endl; if (summary && comm.MyPID()==0) { if (comm.NumProc()==1) cout << Epetra_Version() << endl << endl; else cout << endl << endl; // Print two blank line to keep output columns lined up } if (verbose) cout << comm <<endl; // Redefine verbose to only print on PE 0 if (verbose && comm.MyPID()!=0) verbose = false; if (summary && comm.MyPID()!=0) summary = false; int numNodesX = atoi(argv[1]); int numNodesY = atoi(argv[2]); int numProcsX = atoi(argv[3]); int numProcsY = atoi(argv[4]); int numPoints = atoi(argv[5]); if (verbose || (summary && comm.NumProc()==1)) { cout << " Number of local nodes in X direction = " << numNodesX << endl << " Number of local nodes in Y direction = " << numNodesY << endl << " Number of global nodes in X direction = " << numNodesX*numProcsX << endl << " Number of global nodes in Y direction = " << numNodesY*numProcsY << endl << " Number of local nonzero entries = " << numNodesX*numNodesY*numPoints << endl << " Number of global nonzero entries = " << numNodesX*numNodesY*numPoints*numProcsX*numProcsY << endl << " Number of Processors in X direction = " << numProcsX << endl << " Number of Processors in Y direction = " << numProcsY << endl << " Number of Points in stencil = " << numPoints << endl << endl; } // Print blank line to keep output columns lined up if (summary && comm.NumProc()>1) cout << endl << endl << endl << endl << endl << endl << endl << endl<< endl << endl; if (numProcsX*numProcsY!=comm.NumProc()) { cerr << "Number of 
processors = " << comm.NumProc() << endl << " is not the product of " << numProcsX << " and " << numProcsY << endl << endl; return(1); } if (numPoints!=5 && numPoints!=9 && numPoints!=25) { cerr << "Number of points specified = " << numPoints << endl << " is not 5, 9, 25" << endl << endl; return(1); } if (numNodesX*numNodesY<=0) { cerr << "Product of number of nodes is <= zero" << endl << endl; return(1); } Epetra_IntSerialDenseVector Xoff, XLoff, XUoff; Epetra_IntSerialDenseVector Yoff, YLoff, YUoff; if (numPoints==5) { // Generate a 5-point 2D Finite Difference matrix Xoff.Size(5); Yoff.Size(5); Xoff[0] = -1; Xoff[1] = 1; Xoff[2] = 0; Xoff[3] = 0; Xoff[4] = 0; Yoff[0] = 0; Yoff[1] = 0; Yoff[2] = 0; Yoff[3] = -1; Yoff[4] = 1; // Generate a 2-point 2D Lower triangular Finite Difference matrix XLoff.Size(2); YLoff.Size(2); XLoff[0] = -1; XLoff[1] = 0; YLoff[0] = 0; YLoff[1] = -1; // Generate a 3-point 2D upper triangular Finite Difference matrix XUoff.Size(3); YUoff.Size(3); XUoff[0] = 0; XUoff[1] = 1; XUoff[2] = 0; YUoff[0] = 0; YUoff[1] = 0; YUoff[2] = 1; } else if (numPoints==9) { // Generate a 9-point 2D Finite Difference matrix Xoff.Size(9); Yoff.Size(9); Xoff[0] = -1; Xoff[1] = 0; Xoff[2] = 1; Yoff[0] = -1; Yoff[1] = -1; Yoff[2] = -1; Xoff[3] = -1; Xoff[4] = 0; Xoff[5] = 1; Yoff[3] = 0; Yoff[4] = 0; Yoff[5] = 0; Xoff[6] = -1; Xoff[7] = 0; Xoff[8] = 1; Yoff[6] = 1; Yoff[7] = 1; Yoff[8] = 1; // Generate a 5-point lower triangular 2D Finite Difference matrix XLoff.Size(5); YLoff.Size(5); XLoff[0] = -1; XLoff[1] = 0; Xoff[2] = 1; YLoff[0] = -1; YLoff[1] = -1; Yoff[2] = -1; XLoff[3] = -1; XLoff[4] = 0; YLoff[3] = 0; YLoff[4] = 0; // Generate a 4-point upper triangular 2D Finite Difference matrix XUoff.Size(4); YUoff.Size(4); XUoff[0] = 1; YUoff[0] = 0; XUoff[1] = -1; XUoff[2] = 0; XUoff[3] = 1; YUoff[1] = 1; YUoff[2] = 1; YUoff[3] = 1; } else { // Generate a 25-point 2D Finite Difference matrix Xoff.Size(25); Yoff.Size(25); int xi = 0, yi = 0; int xo = -2, yo = -2; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; xo = -2, yo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; xo = -2, yo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; xo = -2, yo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; xo = -2, yo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; Yoff[yi++] = yo ; // Generate a 13-point lower triangular 2D Finite Difference matrix XLoff.Size(13); YLoff.Size(13); xi = 0, yi = 0; xo = -2, yo = -2; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; YLoff[yi++] = yo ; YLoff[yi++] = yo ; YLoff[yi++] = yo ; YLoff[yi++] = yo ; YLoff[yi++] = yo ; xo = -2, yo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; YLoff[yi++] = yo ; YLoff[yi++] = yo ; YLoff[yi++] = yo ; YLoff[yi++] = yo ; YLoff[yi++] = yo ; xo = -2, yo++; XLoff[xi++] = 
xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; YLoff[yi++] = yo ; YLoff[yi++] = yo ; YLoff[yi++] = yo ; // Generate a 13-point upper triangular 2D Finite Difference matrix XUoff.Size(13); YUoff.Size(13); xi = 0, yi = 0; xo = 0, yo = 0; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; YUoff[yi++] = yo ; YUoff[yi++] = yo ; YUoff[yi++] = yo ; xo = -2, yo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; YUoff[yi++] = yo ; YUoff[yi++] = yo ; YUoff[yi++] = yo ; YUoff[yi++] = yo ; YUoff[yi++] = yo ; xo = -2, yo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; YUoff[yi++] = yo ; YUoff[yi++] = yo ; YUoff[yi++] = yo ; YUoff[yi++] = yo ; YUoff[yi++] = yo ; } Epetra_Map * map; Epetra_Map * mapL; Epetra_Map * mapU; Epetra_CrsMatrix * A; Epetra_CrsMatrix * L; Epetra_CrsMatrix * U; Epetra_MultiVector * b; Epetra_MultiVector * bt; Epetra_MultiVector * xexact; Epetra_MultiVector * bL; Epetra_MultiVector * btL; Epetra_MultiVector * xexactL; Epetra_MultiVector * bU; Epetra_MultiVector * btU; Epetra_MultiVector * xexactU; Epetra_SerialDenseVector resvec(0); //Timings Epetra_Flops flopcounter; Epetra_Time timer(comm); #ifdef EPETRA_VERY_SHORT_PERFTEST int jstop = 1; #elif EPETRA_SHORT_PERFTEST int jstop = 1; #else int jstop = 2; #endif for (int j=0; j<jstop; j++) { for (int k=1; k<17; k++) { #ifdef EPETRA_VERY_SHORT_PERFTEST if (k<3 || (k%4==0 && k<9)) { #elif EPETRA_SHORT_PERFTEST if (k<6 || k%4==0) { #else if (k<7 || k%2==0) { #endif int nrhs=k; if (verbose) cout << "\n*************** Results for " << nrhs << " RHS with "; bool StaticProfile = (j!=0); if (verbose) { if (StaticProfile) cout << " static profile\n"; else cout << " dynamic profile\n"; } GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, numPoints, Xoff.Values(), Yoff.Values(), nrhs, comm, verbose, summary, map, A, b, bt, xexact, StaticProfile, false); #ifdef EPETRA_HAVE_JADMATRIX timer.ResetStartTime(); Epetra_JadMatrix JA(*A); elapsed_time = timer.ElapsedTime(); if (verbose) cout << "Time to create Jagged diagonal matrix = " << elapsed_time << endl; //cout << "A = " << *A << endl; //cout << "JA = " << JA << endl; runJadMatrixTests(&JA, b, bt, xexact, StaticProfile, verbose, summary); #endif runMatrixTests(A, b, bt, xexact, StaticProfile, verbose, summary); delete A; delete b; delete bt; delete xexact; GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, XLoff.Length(), XLoff.Values(), YLoff.Values(), nrhs, comm, verbose, summary, mapL, L, bL, btL, xexactL, StaticProfile, true); GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, XUoff.Length(), XUoff.Values(), YUoff.Values(), nrhs, comm, verbose, summary, mapU, U, bU, btU, xexactU, StaticProfile, true); runLUMatrixTests(L, bL, btL, xexactL, U, bU, btU, xexactU, StaticProfile, verbose, summary); delete L; delete bL; delete btL; delete xexactL; delete mapL; delete U; delete bU; delete btU; delete xexactU; delete mapU; Epetra_MultiVector q(*map, nrhs); Epetra_MultiVector z(q); Epetra_MultiVector r(q); delete map; q.SetFlopCounter(flopcounter); z.SetFlopCounter(q); r.SetFlopCounter(q); resvec.Resize(nrhs); flopcounter.ResetFlops(); timer.ResetStartTime(); //10 norms for( int i = 0; i < 10; ++i ) q.Norm2( resvec.Values() ); elapsed_time = timer.ElapsedTime(); total_flops = q.Flops(); MFLOPs = total_flops/elapsed_time/1000000.0; if (verbose) cout << "\nTotal MFLOPs for 10 Norm2's= " << MFLOPs << endl; if (summary) { if (comm.NumProc()==1) cout << 
"Norm2" << '\t'; cout << MFLOPs << endl; } flopcounter.ResetFlops(); timer.ResetStartTime(); //10 dot's for( int i = 0; i < 10; ++i ) q.Dot(z, resvec.Values()); elapsed_time = timer.ElapsedTime(); total_flops = q.Flops(); MFLOPs = total_flops/elapsed_time/1000000.0; if (verbose) cout << "Total MFLOPs for 10 Dot's = " << MFLOPs << endl; if (summary) { if (comm.NumProc()==1) cout << "DotProd" << '\t'; cout << MFLOPs << endl; } flopcounter.ResetFlops(); timer.ResetStartTime(); //10 dot's for( int i = 0; i < 10; ++i ) q.Update(1.0, z, 1.0, r, 0.0); elapsed_time = timer.ElapsedTime(); total_flops = q.Flops(); MFLOPs = total_flops/elapsed_time/1000000.0; if (verbose) cout << "Total MFLOPs for 10 Updates= " << MFLOPs << endl; if (summary) { if (comm.NumProc()==1) cout << "Update" << '\t'; cout << MFLOPs << endl; } } } } #ifdef EPETRA_MPI MPI_Finalize() ; #endif return ierr ; } // Constructs a 2D PDE finite difference matrix using the list of x and y offsets. // // nx (In) - number of grid points in x direction // ny (In) - number of grid points in y direction // The total number of equations will be nx*ny ordered such that the x direction changes // most rapidly: // First equation is at point (0,0) // Second at (1,0) // ... // nx equation at (nx-1,0) // nx+1st equation at (0,1) // numPoints (In) - number of points in finite difference stencil // xoff (In) - stencil offsets in x direction (of length numPoints) // yoff (In) - stencil offsets in y direction (of length numPoints) // A standard 5-point finite difference stencil would be described as: // numPoints = 5 // xoff = [-1, 1, 0, 0, 0] // yoff = [ 0, 0, 0, -1, 1] // nrhs - Number of rhs to generate. (First interface produces vectors, so nrhs is not needed // comm (In) - an Epetra_Comm object describing the parallel machine (numProcs and my proc ID) // map (Out) - Epetra_Map describing distribution of matrix and vectors/multivectors // A (Out) - Epetra_CrsMatrix constructed for nx by ny grid using prescribed stencil // Off-diagonal values are random between 0 and 1. If diagonal is part of stencil, // diagonal will be slightly diag dominant. // b (Out) - Generated RHS. Values satisfy b = A*xexact // bt (Out) - Generated RHS. Values satisfy b = A'*xexact // xexact (Out) - Generated exact solution to Ax = b and b' = A'xexact // Note: Caller of this function is responsible for deleting all output objects. 
void GenerateCrsProblem(int numNodesX, int numNodesY, int numProcsX, int numProcsY, int numPoints, int * xoff, int * yoff, const Epetra_Comm &comm, bool verbose, bool summary, Epetra_Map *& map, Epetra_CrsMatrix *& A, Epetra_Vector *& b, Epetra_Vector *& bt, Epetra_Vector *&xexact, bool StaticProfile, bool MakeLocalOnly) { Epetra_MultiVector * b1, * bt1, * xexact1; GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, numPoints, xoff, yoff, 1, comm, verbose, summary, map, A, b1, bt1, xexact1, StaticProfile, MakeLocalOnly); b = dynamic_cast<Epetra_Vector *>(b1); bt = dynamic_cast<Epetra_Vector *>(bt1); xexact = dynamic_cast<Epetra_Vector *>(xexact1); return; } void GenerateCrsProblem(int numNodesX, int numNodesY, int numProcsX, int numProcsY, int numPoints, int * xoff, int * yoff, int nrhs, const Epetra_Comm &comm, bool verbose, bool summary, Epetra_Map *& map, Epetra_CrsMatrix *& A, Epetra_MultiVector *& b, Epetra_MultiVector *& bt, Epetra_MultiVector *&xexact, bool StaticProfile, bool MakeLocalOnly) { Epetra_Time timer(comm); // Determine my global IDs long long * myGlobalElements; GenerateMyGlobalElements(numNodesX, numNodesY, numProcsX, numProcsY, comm.MyPID(), myGlobalElements); int numMyEquations = numNodesX*numNodesY; map = new Epetra_Map((long long)-1, numMyEquations, myGlobalElements, 0, comm); // Create map with 2D block partitioning. delete [] myGlobalElements; long long numGlobalEquations = map->NumGlobalElements64(); int profile = 0; if (StaticProfile) profile = numPoints; #ifdef EPETRA_HAVE_STATICPROFILE if (MakeLocalOnly) A = new Epetra_CrsMatrix(Copy, *map, *map, profile, StaticProfile); // Construct matrix with rowmap=colmap else A = new Epetra_CrsMatrix(Copy, *map, profile, StaticProfile); // Construct matrix #else if (MakeLocalOnly) A = new Epetra_CrsMatrix(Copy, *map, *map, profile); // Construct matrix with rowmap=colmap else A = new Epetra_CrsMatrix(Copy, *map, profile); // Construct matrix #endif long long * indices = new long long[numPoints]; double * values = new double[numPoints]; double dnumPoints = (double) numPoints; int nx = numNodesX*numProcsX; for (int i=0; i<numMyEquations; i++) { long long rowID = map->GID64(i); int numIndices = 0; for (int j=0; j<numPoints; j++) { long long colID = rowID + xoff[j] + nx*yoff[j]; // Compute column ID based on stencil offsets if (colID>-1 && colID<numGlobalEquations) { indices[numIndices] = colID; double value = - ((double) rand())/ ((double) RAND_MAX); if (colID==rowID) values[numIndices++] = dnumPoints - value; // Make diagonal dominant else values[numIndices++] = value; } } //cout << "Building row " << rowID << endl; A->InsertGlobalValues(rowID, numIndices, values, indices); } delete [] indices; delete [] values; double insertTime = timer.ElapsedTime(); timer.ResetStartTime(); A->FillComplete(false); double fillCompleteTime = timer.ElapsedTime(); if (verbose) cout << "Time to insert matrix values = " << insertTime << endl << "Time to complete fill = " << fillCompleteTime << endl; if (summary) { if (comm.NumProc()==1) cout << "InsertTime" << '\t'; cout << insertTime << endl; if (comm.NumProc()==1) cout << "FillCompleteTime" << '\t'; cout << fillCompleteTime << endl; } if (nrhs<=1) { b = new Epetra_Vector(*map); bt = new Epetra_Vector(*map); xexact = new Epetra_Vector(*map); } else { b = new Epetra_MultiVector(*map, nrhs); bt = new Epetra_MultiVector(*map, nrhs); xexact = new Epetra_MultiVector(*map, nrhs); } xexact->Random(); // Fill xexact with random values A->Multiply(false, *xexact, *b); A->Multiply(true, 
*xexact, *bt); return; }
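/*
 * A small sketch of how the stencil offsets documented above turn into matrix column
 * indices, following the colID = rowID + xoff[j] + nx*yoff[j] rule used inside
 * GenerateCrsProblem. Column IDs that fall outside [0, numGlobalEquations) are simply
 * skipped, exactly as in the generator. Illustrative only; nx plays the role of
 * numNodesX*numProcsX.
 */
#include <vector>

static std::vector<long long> stencilColumnIDs(long long rowID, int nx,
                                               long long numGlobalEquations,
                                               const int * xoff, const int * yoff,
                                               int numPoints)
{
  std::vector<long long> cols;
  for (int j = 0; j < numPoints; ++j) {
    long long colID = rowID + xoff[j] + (long long)nx * yoff[j]; // same rule as the generator
    if (colID > -1 && colID < numGlobalEquations)                // drop neighbours outside the grid
      cols.push_back(colID);
  }
  return cols;
}

// Example use with the standard 5-point stencil from the documentation comment:
//   int xoff[5] = {-1, 1, 0,  0, 0};
//   int yoff[5] = { 0, 0, 0, -1, 1};
//   stencilColumnIDs(rowID, nx, (long long)nx*ny, xoff, yoff, 5);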
/* OPWEIGHTEDL2: Solves weighted L2 regularized inverse problems.
 *
 * Minimizes the cost function
 *   X* = argmin_X ||A(X) - b_FC||_2^2 + lambda * || W |D(X)| ||_2^2
 * where
 *   X*     = recovered image
 *   A      = linear measurement operator
 *   b_FC   = (noisy) measurements
 *   W      = diagonal weight matrix built from the edge mask
 *   |D(X)| = gradient magnitude at each pixel
 *
 * Inputs:
 *   A      = function handle representing the forward model/measurement operator
 *   At     = function handle representing the backward model/the transpose of the
 *            measurement operator (e.g. if A is a downsampling, At is an upsampling)
 *   b_FC   = a vector of measurements; should match the dimensions of A(X)
 *   lambda = regularization parameter that balances data fidelity and smoothness;
 *            set lambda high for more smoothing
 *   siz    = output image size, e.g. siz = [512,512]
 *   Niter  = number of iterations; should be ~100-500
 *
 * Outputs:
 *   X    = high-resolution output image
 *   cost = array of cost function value vs. iteration
 */
FloatImageType::Pointer OpWeightedL2(FloatImageType::Pointer norm01_lowres, FloatImageType::Pointer edgemask)
{
  const PrecisionType lambda = 1e-3F;
  constexpr int Niter = 100;
  const PrecisionType tol = 1e-8F;
  const PrecisionType gam = 1.0F;

  //typedef itk::VectorMagnitudeImageFilter<CVImageType, FloatImageType> GMType;

  // The optimal filter for modeling the measurement operator is a low-pass filter in this case.
  // NOTE: The A operator is a projection operator, so A^{T}A = A; that is, applying A^{T} to A results in A.
  FloatImageType::Pointer p_image = GetDiracDeltaImage(edgemask);

  // Precompute the high-res coefficients.
  const HalfHermetianImageType::Pointer b_FC = GetAFP_of_b(norm01_lowres, edgemask);

  //TODO: too many copies of Atb here.
  FloatImageType::Pointer Atb = At_fhp(b_FC,
                                       edgemask->GetLargestPossibleRegion().GetSize()[0] % 2 == 1,
                                       edgemask.GetPointer());
  FloatImageType::Pointer TwoAtb = MakeTwoAtb(Atb);
  FloatImageType::Pointer X = DeepImageCopy<FloatImageType>(Atb);
  Atb = nullptr; // Save memory here

  CVImageType::Pointer DX = GetGradient(X);
  CVImageType::Pointer L = CreateEmptyImage<CVImageType>(DX);
  CVImageType::Pointer Y = CreateEmptyImage<CVImageType>(DX);
  //CVImageType::Pointer WDX = CreateEmptyImage<CVImageType>(DX);
  CVImageType::Pointer residue = CreateEmptyImage<CVImageType>(DX);
  CVImageType::Pointer YminusL = CreateEmptyImage<CVImageType>(DX);
  FloatImageType::Pointer tempValue = CreateEmptyImage<FloatImageType>(DX);

  std::vector<PrecisionType> resvec(Niter, 0);
  std::vector<PrecisionType> cost(Niter, 0);

#ifdef USE_WRITE_DEGUBBING
  itk::ComplexToModulusImageFilter<HalfHermetianImageType, FloatImageType>::Pointer cpx2abs =
    itk::ComplexToModulusImageFilter<HalfHermetianImageType, FloatImageType>::New();
#endif

  CVImageType::Pointer gradIm = GetGradient(p_image);
  FloatImageType::Pointer divIm = GetDivergence(gradIm);
  HalfHermetianImageType::Pointer DtDhat = GetForwardFFT(divIm);

  // TODO: ALL SAME TO HERE!
  typedef HalfHermetianImageType::PixelType FCType;

  HalfHermetianImageType::Pointer TwoTimesAtAhatPlusLamGamDtDhat = CreateEmptyImage<HalfHermetianImageType>(DtDhat);
  {
    HalfHermetianImageType::Pointer TwoTimesAtAhat = GetLowpassOperator(norm01_lowres, p_image, 2.0F);
    TwoTimesAtAhatPlusLamGamDtDhat = opIC(TwoTimesAtAhatPlusLamGamDtDhat, FCType(lambda*gam), '*', DtDhat);
    //TODO: Make this an inverse!
    TwoTimesAtAhatPlusLamGamDtDhat = opII(TwoTimesAtAhatPlusLamGamDtDhat, TwoTimesAtAhat, '+', TwoTimesAtAhatPlusLamGamDtDhat);
  }
  p_image = nullptr; // Save memory

  const bool edgemask_ActualXDimensionIsOdd = edgemask->GetLargestPossibleRegion().GetSize()[0] % 2 == 1;
  CVImageType::Pointer InvTwoMuPlusGamma = ComputeInvTwoMuPlusGamma(edgemask, gam);
  //FloatImageType::Pointer SqrtMu = ComputeSqrtMu(edgemask);

#define USE_BLAS_WRAPPERS
#ifdef USE_BLAS_WRAPPERS
#else
  typedef itk::AddImageFilter<CVImageType, CVImageType> CVImageAdder;
  CVImageAdder::Pointer dxPlusL = CVImageAdder::New();
#endif

  itk::TimeProbe tp;
  tp.Start();
  HalfHermetianImageType::Pointer tempRatioFC = CreateEmptyImage<HalfHermetianImageType>(DtDhat);

  for (size_t i = 0; i < Niter; ++i)
  {
    std::cout << "Iteration : " << i << std::endl;
#ifdef USE_BLAS_WRAPPERS
    // Z = 1.0*L + DX
    AddAllElements(DX, 1.0F, L, DX, gam); // DX destroyed
    CVImageType::Pointer & Z = DX;
#else
    //Z = opII(Z,DX,'+',L);
    dxPlusL->SetInput1(DX);
    dxPlusL->SetInput2(L);
    dxPlusL->SetInPlace(true);
    dxPlusL->Update();
    CVImageType::Pointer Z = dxPlusL->GetOutput();
    MultiplyCVByScalar(Z, gam);
#endif

#ifdef USE_BLAS_WRAPPERS
    // Y = InvTwoMuPlusGamma .* Z
    MultiplyVectors(Y, InvTwoMuPlusGamma, Z);
#else
    Y = opII(Y, Z, '*', InvTwoMuPlusGamma);
#endif

    // X subproblem
    // Numerator = 2*Atb + lambda*gam*SRdiv(Y-L)
#ifdef USE_BLAS_WRAPPERS
    // YminusL = 1.0F * SRdiv( -1.0F*L + Y )
    Duplicate(Y, YminusL);
    AddAllElements(YminusL, -1.0F, L, YminusL, 1.0F);
#else
    YminusL = opII(YminusL, Y, '-', L);
#endif
    FloatImageType::Pointer tempNumerator = GetDivergence(YminusL);
#ifdef USE_BLAS_WRAPPERS
    // lambda*gam*tempNumerator + TwoAtb
    Duplicate(TwoAtb, tempValue);
    AddAllElements(tempValue, lambda*gam, tempNumerator, tempValue, 1.0F);
    HalfHermetianImageType::Pointer tempNumeratorFC = GetForwardFFT(tempValue);
#else
    tempNumerator = opIC(tempNumerator, lambda*gam, '*', tempNumerator);
    tempNumerator = opII(tempNumerator, TwoAtb, '+', tempNumerator);
    HalfHermetianImageType::Pointer tempNumeratorFC = GetForwardFFT(tempNumerator);
#endif
    //KEEP
    tempRatioFC = opII_scalar(tempRatioFC, tempNumeratorFC, '/', TwoTimesAtAhatPlusLamGamDtDhat);
    X = GetInverseFFT(tempRatioFC, edgemask_ActualXDimensionIsOdd, 1.0);
    //TODO: Determine scale factor here. X should be on the same dynamic range as b.

    DX = GetGradient(X);
    residue = opII(residue, DX, '-', Y);
    //TODO: Implement math graph output here
    L = opII(L, L, '+', residue);
    // resvec[i] = 0; //TODO: Figure out the math for here

    if (i > 900000)
    {
      tp.Stop();
      std::cout << " Only iterations " << tp.GetTotal() << tp.GetUnit() << std::endl;
      return X;
    }
    if (i > 99) //HACK: Cutting this function short
    {
      return X;
    }
    //WDX = opII_CVmult(WDX,SqrtMu,'*',DX);
    //diff = opII(diff,A_fhp(X,norm01_lowres.GetPointer()),'-',b_FC);
    //cost[i] = 0; //TODO: Need to figure out math for here
  }
  return X;
}
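/*
 * The cost[i] accumulation in OpWeightedL2 is left as a TODO. Per the documentation
 * comment, the quantity being minimized is ||A(X)-b||_2^2 + lambda*|| W |D(X)| ||_2^2.
 * Below is a plain, container-level sketch of that evaluation, assuming the forward
 * projection A(X), the measurements b, the per-pixel weights W and the gradient
 * magnitudes |D(X)| have been flattened into equally indexed vectors. It is
 * illustrative only and is not wired into the ITK pipeline above.
 */
#include <vector>
#include <cstddef>

static double weightedL2Cost(const std::vector<double> & AX,       // A(X), forward-projected image
                             const std::vector<double> & b,        // measurements b_FC
                             const std::vector<double> & W,        // diagonal weights from the edge mask
                             const std::vector<double> & gradMagX, // |D(X)| per pixel
                             double lambda)
{
  double fidelity = 0.0;
  for (std::size_t k = 0; k < AX.size(); ++k) {
    const double d = AX[k] - b[k];
    fidelity += d * d;                 // ||A(X)-b||_2^2
  }
  double smoothness = 0.0;
  for (std::size_t k = 0; k < gradMagX.size(); ++k) {
    const double wg = W[k] * gradMagX[k];
    smoothness += wg * wg;             // || W |D(X)| ||_2^2
  }
  return fidelity + lambda * smoothness;
}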