Example #1
0
void runJadMatrixTests(Epetra_JadMatrix * A,  Epetra_MultiVector * b, Epetra_MultiVector * bt,
		    Epetra_MultiVector * xexact, bool StaticProfile, bool verbose, bool summary) {

  Epetra_MultiVector z(*b);
  Epetra_MultiVector r(*b);
  Epetra_SerialDenseVector resvec(b->NumVectors());

  //Timings
  Epetra_Flops flopcounter;
  A->SetFlopCounter(flopcounter);
  Epetra_Time timer(A->Comm());

  for (int j=0; j<2; j++) { // j = 0 is notrans, j = 1 is trans

    bool TransA = (j==1);
    A->SetUseTranspose(TransA);
    flopcounter.ResetFlops();
    timer.ResetStartTime();

    //10 matvecs
    for( int i = 0; i < 10; ++i )
      A->Apply(*xexact, z); // Compute z = A*xexact or z = A'*xexact

    double elapsed_time = timer.ElapsedTime();
    double total_flops = A->Flops();

    // Compute residual
    if (TransA)
      r.Update(-1.0, z, 1.0, *bt, 0.0); // r = bt - z
    else
      r.Update(-1.0, z, 1.0, *b, 0.0); // r = b - z

    r.Norm2(resvec.Values());

    if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": ";
    double MFLOPs = total_flops/elapsed_time/1000000.0;
    if (verbose) cout << "Total MFLOPs for 10 " << " Jagged Diagonal MatVec's with (Trans = " << TransA
		      << ") " << MFLOPs << " (" << elapsed_time << " s)" <<endl;
    if (summary) {
      if (A->Comm().NumProc()==1) {
	if (TransA) cout << "TransMv" << '\t';
	else cout << "NoTransMv" << '\t';
      }
      cout << MFLOPs << endl;
    }
  }
  return;
}
Example #2
0
//M+	
void mp(
	int MinCoreSize,
	int MaxCoreSize,
	int SamplingFreq,
	int NumReplicates,
	char* OutFilePath,
	std::string Kernel,
	vector<int> KernelAccessionIndex,
	vector<int> AccessionNameList,
	vector<vector<vector<int> > > ActiveAlleleByPopList,
	vector<vector<vector<int> > > TargetAlleleByPopList,
	vector<int> ActiveMaxAllelesList,
	vector<int> TargetMaxAllelesList,
	vector<std::string> FullAccessionNameList
	)	
{

	//PERFORM INITIAL MPI STUFF
	MPI_Status status; //this struct contains three fields which will contain info about the sender of a received message
						 // MPI_SOURCE, MPI_TAG, MPI_ERROR
	
	//MPI::Init ();  //Initialize MPI.
	int nproc = MPI::COMM_WORLD.Get_size ( );  //Get the number of processes.
	int procid = MPI::COMM_WORLD.Get_rank ( );  //Get the individual process ID.
	

	//set up vectors to fill with results
	//below is a stupid way to calculate the number of rows in the output file, value l (which = V1) 
	//used to monitor progress and as the maximum vector index for shared output vectors
	int l=0;
	for (int i=MinCoreSize;i<MaxCoreSize+1;i=i+SamplingFreq)
	{
		for (int j=0;j<NumReplicates;j++)
		{
			l++;
		}
	}
	
	double V1 = (double)l; //(MaxCoreSize - MinCoreSize + 1)*NumReplicates; //number of rows in output vectors
	vector<vector<double> > Results(V1, vector<double>(9)); //will contain numerical results
	vector<vector<string> > Members(V1); //will contain core set members
	
	//***MPI:  RECEIVE RESULTS AT MASTER 0
	//receive values from any slave, in any order, exiting when the number of 'receives' = the top vector size
	if ( procid == 0 ) 
	{
		//set up variables for monitoring progress
		int percent; //percent of analysis completed
		int progindex = 0;  //index to monitor progress, percent = 100*(progindex/l)

		//receive and process results from slave processors
		unsigned int i = 0;
		while (i<2*(Results.size())) //two receives per row
		{
			//probe the incoming message to determine its tag
			int nchar; //will contain the length of the char array passed with tag=1
			int vchar; //will contain the length of the vector passed with tag=0
			int tag; //tag of message from sender
			int source; //procid of sender
			
			MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
			//MPI_Get_count(&status, MPI_CHAR, &nchar); //probes the length of the message, saves it in nchar
			tag = status.MPI_TAG; //the tag defines which kind of comm it is, a vector of stats (0=resvec()) 
			                      //or a char array describing the members of the core (1=cc)
			source = status.MPI_SOURCE; //determine the source of the message so that you can define which sender to Recv from.  This will avoid an intervening message coming in after the MPI_Probe with a different length, causing a message truncated error.
			
			if (tag == 0)
			{
				//determine the length of the message tagged 0
				MPI_Get_count(&status, MPI_DOUBLE, &vchar);

				//cout <<" vchar="<<vchar<<" tag="<<tag<<" MPI_SOURCE="<<status.MPI_SOURCE<<" MPI_ERROR="<<status.MPI_ERROR<<"\n";

				//receive the vector of results, tagged 0, from:
				//MPI_Send(&resvec[0], resvec.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
				vector<double> t(10);
				MPI_Recv(&t[0], vchar, MPI_DOUBLE, source, 0, MPI_COMM_WORLD, &status);
			
				//load data from vector received onto Results, row number is last item t[9]
				for (int j=0;j<9;++j)
				{
					Results[ t[9] ][j] = t[j];
				}
				t.clear();
			}

			else if (tag == 1)
			{
				//determine the length of the message tagged 1
				MPI_Get_count(&status, MPI_CHAR, &nchar); //probes the length of the message, saves it in nchar

				//cout <<" nchar="<<nchar<<" tag="<<tag<<" MPI_SOURCE="<<status.MPI_SOURCE<<" MPI_ERROR="<<status.MPI_ERROR<<"\n";
				
				//receive the vector<string> of the core set, tagged 1, from:
				//MPI_Send(&m[0], nchar, MPI_CHAR, 0, 1, MPI_COMM_WORLD);
				//vector<string> m(nchar);
				char m[nchar];
				MPI_Recv(&m[0], nchar, MPI_CHAR, source, 1, MPI_COMM_WORLD, &status);
			
				//load core set onto Members
				//1. convert char array into a string
				string mstr(m);
			
				//2. split string on delimiter ',<!>,'
				string delim = ",<!>,";
				vector<string> mvec( countSubstring(mstr, delim) );
				unsigned int st = 0;
				std::size_t en = mstr.find(delim);
				int k = 0;
				while (en != std::string::npos)
				{
					mvec[k] = mstr.substr(st, en-st);
					st = en + delim.length();
					en = mstr.find(delim,st);
					++k;
				}
				string z = mstr.substr(st); //get row number as last item in mstr
				int zz = atoi(z.c_str()); //convert string to c-string then to int
			
				//3. load onto Members
				Members[zz] = mvec;
				
				//4. clean up
				memset(m, 0, nchar);; mstr=""; mvec.clear();
				

			}


			++i;
			
			//display progress
			progindex = progindex + 1;
			percent = 100*( progindex/(V1*2) ); //number of rows X 2 repeats needed to complete search
			printProgBar(percent); 
		}
	}//***MPI: END MASTER RECEIVE***/

	
	/***MPI:  SEND RESULTS FROM SLAVE PROCESSES***/
	else if ( procid != 0 )
	{
		unsigned int r; //r = core size, 
		//int nr, RandAcc, b, bsc, plateau; //nr = controller to repeat NumReplicates times
		int RandAcc, b, bsc, plateau; //nr = controller to repeat NumReplicates times
								//row = result vector row number, bsc = holds best sub core member, and other indexed accessions
									//plateau = index of the number of reps in optimization loop with same diversity value
		double RandomActiveDiversity;
		double AltRandomActiveDiversity;
		double StartingRandomActiveDiversity;
		double StartingAltRandomActiveDiversity;
		double RandomTargetDiversity;
		double AltRandomTargetDiversity;
		double StartingDiversity;
		double TempAltOptimizedActiveDiversity;
		double AltOptimizedActiveDiversity;
		double OptimizedTargetDiversity;
		double AltOptimizedTargetDiversity;
		double best;
		double nnew;
		vector<vector<vector<int> > > AlleleList;
		vector<vector<vector<int> > > CoreAlleles;
		vector<vector<vector<int> > > TdTempList;
		vector<vector<vector<int> > > BestSubCoreAlleles;
		std::string Standardize = "yes";  //a run that mimics the MSTRAT approach can be accomplished by setting Standardize="no", and setting up the var file so that each column in the .dat file is treated as a single locus, rather than two (or more) adjacent columns being treated as a single codominant locus.
		vector<int> AccessionsInCore;
		vector<int> AccessionsInSubCore;
		vector<int> BestSubCore;
		vector<int> BestSubCoreRevSorted;
		vector<int> TempList;
		vector<int> TempList2;
		vector<int> bestcore;
		vector<std::string> TempListStr;
	
		//seed the random number generator for each processor
		int tt;
		tt = (time(NULL));
		srand ( abs(((tt*181)*((procid-83)*359))%104729) );
			
		//do parallelization so that each rep by core size combo can be
		//handled by a distinct thread.  this involves figuring out the total
		//number of reps*coresizes taking into account the SamplingFreq
		int rsteps = 1 + floor( (MaxCoreSize - MinCoreSize) / SamplingFreq ); //number of steps from MinCoreSize to MaxCoreSize

		//***MPI: figure out where to start and stop loop for each processor
		int nreps = rsteps*NumReplicates;
		int count = nreps/(nproc-1); //p-1 assumes a master, i.e. one less processor than total
		int start = (procid-1) * count; //procid-1 makes you start at 0, assumes master is p0
		int stop;
		
		if (nreps % (nproc-1) > (procid-1))
		{
			start += procid - 1;
			stop = start + (count + 1); 
		}
		else
		{
			start += nreps % (nproc-1);
			stop = start + count;
		}
		
		//iterate thru the relevant rows
		for (int rnr=start;rnr<stop;++rnr)
		{
			r = MinCoreSize + ((rnr / NumReplicates) * SamplingFreq); //int rounds to floor
			
			//develop random starting core set
			//clear AccessionsInCore and set size
			AccessionsInCore.clear();
			AccessionsInCore.resize(r);
			
			//add kernel accessions to core, if necessary
			if (Kernel == "yes")
			{
				for (unsigned int i=0;i<KernelAccessionIndex.size();i++)
				{
					AccessionsInCore[i] = KernelAccessionIndex[i];
				}
			}

			//clear TempList and set size					
			TempList.clear();
			TempList.resize( AccessionNameList.size() );
			
			//set list of available accessions in TempList, by erasing those already in the core
			TempList = AccessionNameList;
			//expunge the kernel accessions, so they are not available for random addition below
			//KernelAccessionIndex has been reverse sorted so you don't go outside range after automatic resize by .erase
			for (unsigned int i=0;i<KernelAccessionIndex.size();i++)
			{
				b = KernelAccessionIndex[i];
				TempList.erase(TempList.begin()+b);
			}
		
			//randomly add accessions until r accessions are in the core. if there is a kernel, include those (done above)
			//plus additional, randomly selected accessions, until you get r accessions
			for (unsigned int i=KernelAccessionIndex.size();i<r;i++)
			{
				//choose an accession randomly from those available
				RandAcc = rand() % TempList.size();
				//add it to the list
				AccessionsInCore[i] = TempList[RandAcc];
			
				//remove it from the list of available accessions
				TempList.erase(TempList.begin()+RandAcc);
			}
	
			//assemble genotypes for random core and calculate diversity
			//1. put together initial list of active alleles
			CoreAlleles.clear();
			CoreAlleles.resize( AccessionsInCore.size() );
			for (unsigned int i=0;i<AccessionsInCore.size();i++)
			{
				b = AccessionsInCore[i];
				CoreAlleles[i] = ActiveAlleleByPopList[b];
			}

			//2. calculate diversity from random selection at active loci
			AlleleList.clear();
			AlleleList = CoreAlleles;
	
			MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity);
			//in MyCalculateDiversity, latter two variables are updated as references
			//save them away in non-updated variables
			StartingRandomActiveDiversity = RandomActiveDiversity;
			StartingAltRandomActiveDiversity = AltRandomActiveDiversity;

			//3. calculate diversity from random selection at target loci
			AlleleList.clear();
			AlleleList.resize( AccessionsInCore.size() );
			for (unsigned int j=0;j<AccessionsInCore.size();j++)
			{
				b = AccessionsInCore[j];
				AlleleList[j] = TargetAlleleByPopList[b];
			}
			MyCalculateDiversity(AlleleList, TargetMaxAllelesList, Standardize, RandomTargetDiversity, AltRandomTargetDiversity);


			//BEGIN OPTIMIZATION
			StartingDiversity = 0; //this is the diversity recovered during the prior iteration.
			plateau = 0; //count of the number of times you have found the best value, evaluates when you are
						 //stuck on a plateau, assuming acceptance criterion allows downhill steps
			//this is the iterations step, now an indefinite loop that is broken when 
			//no improvement is made during the course of the optimization algorithm
			//If r = kernel size = MinCoreSize then do no optimization but still calculate all variables.
			if (KernelAccessionIndex.size() == r)
			{
				//assemble genotypes for core
				//1. put together initial list
				CoreAlleles.clear();
				CoreAlleles.resize(r);
				for (unsigned int i=0;i<r;i++)
				{
					b = AccessionsInCore[i];
					CoreAlleles[i] = ActiveAlleleByPopList[b];
				}
				
				AlleleList = CoreAlleles;
				
				MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity);
				best = RandomActiveDiversity; //best is equivalent to OptimizedActiveDiversity
				AltOptimizedActiveDiversity = AltRandomActiveDiversity;
			}
			else
			{
				//do optimization
				while ( true )
				{
					//assemble genotypes for core
					//1. put together initial list
					CoreAlleles.clear();
					CoreAlleles.resize(r);
					for (unsigned int i=0;i<r;i++)
					{
						b = AccessionsInCore[i];
						CoreAlleles[i] = ActiveAlleleByPopList[b];
					}
		
					//2. go through all possible subsets of size r-1, one at a time, noting which is best.
					//If there is a kernel, do not swap out any of those accessions (they are retained as the
					//first KernelAccessionIndex.size() items in CoreAlleles).  Accomplished by starting for loop
					//at KernelAccessionIndex.size().
					best=0;
					for (unsigned int i=KernelAccessionIndex.size();i<CoreAlleles.size();i++)
					{
						//remove each item consecutively from the list of all populations in the core
						AlleleList.clear();
						TdTempList.clear();
				
						TdTempList = CoreAlleles; //swap to temporary vector
						TdTempList.erase( TdTempList.begin() + i);
						AlleleList = TdTempList;
			
						TempList2.clear();
						TempList2 = AccessionsInCore;
						TempList2.erase(TempList2.begin() + i);
						AccessionsInSubCore = TempList2;

						/*Data structure for SubCoreAlleles:
						SubCore 1..r
							Population 1..(r-1)
								AlleleArray 1..NumLoci		
			
						--3. fuse alleles from the same locus into a single array, for all accessions, for the current subcore
						--4. assemble a list of diversity (M) for each locus separately
						--5. standardize the M values to the maximum possible number of alleles at that locus, and add them up to get final estimate of standardized allelic diversity in the core.  then divide by the number of loci to get a number that is comparable across data sets.
						--5.5. simultaneous to the calculation, keep track of which subcore is best
						*/
			
						MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity);
						nnew = RandomActiveDiversity;

						if (nnew >= best) // >= allows sideways movement during hill climbing
						{
							best = nnew;

							BestSubCore.clear();
							BestSubCore = AccessionsInSubCore;
							BestSubCoreAlleles.clear();
							BestSubCoreAlleles = AlleleList;
						}
					}  //for loop cycles thru all subcores

					//reverse sort BestSubCore to support easy assembly of pared TempList below
					BestSubCoreRevSorted = BestSubCore;
					std::sort(BestSubCoreRevSorted.begin(), BestSubCoreRevSorted.end(), std::greater<int>());
	
					/*
					6. take the subcore with greatest diversity and consecutively add each 
					possible additional accession from the base collection.  find the core of size r 
					(not r-1 subcore) that has the greatest diversity.

					suppress the IDs of those accessions found in the BestSubCore from the 
					list of all accessions to get a list of remaining accessions.*/
					TempList = AccessionNameList;
					for (unsigned int k=0;k<BestSubCoreRevSorted.size();k++)
					{
						bsc = BestSubCoreRevSorted[k];
						TempList.erase( TempList.begin() + bsc );
					}
			
					//shuffle the list of remaining accessions, so addition order is not predictable
					std::random_shuffle (TempList.begin(), TempList.end());
				
					//add each remaining accession consecutively, calculate diversity, test 
					//whether it is better than the prior one
					best = 0;
					for (unsigned int k=0;k<TempList.size();k++)
					{
						bsc = TempList[k];
			
						//define the core
						TempList2 = BestSubCore;
						TempList2.resize( TempList2.size() + 1 );
						//TempList2.push_back(i);
						TempList2[TempList2.size()-1] = bsc; //add new accession to last vector element
						AccessionsInCore = TempList2;
			
						//assemble the allelelist for the core
						TdTempList = BestSubCoreAlleles;
						TdTempList.resize( TdTempList.size() + 1 );
						//TdTempList.push_back( ActiveAlleleByPopList[i] );
						TdTempList[TdTempList.size()-1] = ActiveAlleleByPopList[bsc];
						AlleleList = TdTempList;
			
						//calculate diversity
						MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, nnew, TempAltOptimizedActiveDiversity); 
		
						//test whether current diversity is higher than the best diversity found so far
						if (nnew >= best) // >= allows sideways movement during hill climbing
						{
							best = nnew;
							bestcore = AccessionsInCore;
							//save the alternative diversity value for the best core
							AltOptimizedActiveDiversity = TempAltOptimizedActiveDiversity;
						}
					}

					AccessionsInCore = bestcore; //define starting variable for next MSTRAT iteration
		
					//if there has been no improvement from the prior iteration, you have reached
					// the plateau and should exit the repeat
					if (best == StartingDiversity) 
					{
						plateau++;
						if (plateau > 0) break;
					}
					//update starting value and repeat
					else if (best > StartingDiversity) StartingDiversity = best;
				
				} //while(true) endless loop
			}

			//7. Calculate diversity at target loci
			//assemble the target loci allelelist for the accessions in the best core
			AlleleList.clear();
			AlleleList.resize( AccessionsInCore.size() );
			for (unsigned int j=0;j<AccessionsInCore.size();j++)
			{
				b = AccessionsInCore[j];
				AlleleList[j] = TargetAlleleByPopList[b];
			}
	
			//calculate diversity at target loci based upon the optimized core selection
			MyCalculateDiversity(AlleleList, TargetMaxAllelesList, Standardize, OptimizedTargetDiversity, AltOptimizedTargetDiversity);

			//8. Assemble stats for optimized core and add to output vectors
			//create a list of accession names from the list of accession ID's in AccessionsInCore
			sort( AccessionsInCore.begin(), AccessionsInCore.end() );
			
			TempListStr.clear();
			TempListStr.resize(r);
			for (unsigned int i=0;i<AccessionsInCore.size();i++)
			{
				b = AccessionsInCore[i];
				TempListStr[i] = FullAccessionNameList[b];
			}

			/***MPI: BUILD & SEND RESULTS VECTOR***/
			//load the variables onto the results vectors
			
			//no need to calculate row number, it is the same as rnr, formula saved because it might be useful later
			//row = ((r - MinCoreSize)*NumReplicates) + nr - ( (NumReplicates*(SamplingFreq-1))*( (r-MinCoreSize)/SamplingFreq ) );
			// (r - MinCoreSize)*NumReplicates) + nr specifies row number if SamplingFreq=1
			// (NumReplicates*(SamplingFreq-1)) specifies a step value to correct when SamplingFreq>1
			// ( (r-MinCoreSize)/SamplingFreq ) specifies the replicate on core size, accounting for SamplingFreq
			// see file Calculation of row value.xlsx for development of the 'row' index
			
			//put results 0-8 into a vector, resvec, return row as last item
			vector<double> resvec(10);

			resvec[0] = double(r);
			resvec[1] = StartingRandomActiveDiversity;//RandomActiveDiversity;
			resvec[2] = best; //equivalent to OptimizedActiveDiversity
			resvec[3] = RandomTargetDiversity;
			resvec[4] = OptimizedTargetDiversity;
			resvec[5] = StartingAltRandomActiveDiversity;//AltRandomActiveDiversity;
			resvec[6] = AltOptimizedActiveDiversity;
			resvec[7] = AltRandomTargetDiversity;
			resvec[8] = AltOptimizedTargetDiversity;
			resvec[9] = double(rnr);
			
			
			//cout<<"MPI_Rank="<<MPI_Rank<<" 
			
			
			//send result vector to master 0, send row number, rnr, as last element.
			//message is tagged as 0
			//here you are pointing to the first element, then returning resvec.size() doubles-
			//worth of memory from that starting location.
			MPI_Send(&resvec[0], resvec.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
			/***MPI: END BUILD & SEND RESULTS VECTOR***/


			/***MPI: BUILD & SEND MEMBERS VECTOR***/
			//add row number as last item in TempListStr
			TempListStr.resize(TempListStr.size()+1);
			stringstream ss;
			ss << rnr;	//convert int to stringstream to string			
			TempListStr[ TempListStr.size() - 1 ] = ss.str();
		
			//convert vector<string> to a single, ',<!>,' delimited, string
			string concat;
			for (unsigned int i=0;i<TempListStr.size();++i)
			{
				concat += TempListStr[i]; //add vector element
				if (i<TempListStr.size()-1) concat += ",<!>,"; //add delimiter, except for last item
			}
			//convert the string to a char array
			char cc[concat.size()+1];
			strcpy(cc, concat.c_str());
			
			//send the char array to master0 tagged as 1
			//tagged as 1 to distinguish from result vector send
			MPI_Send(&cc, sizeof(cc), MPI_CHAR, 0, 1, MPI_COMM_WORLD);

		} //end for loop over rows
	} //***MPI:  END SEND
	

	/*MPI: MASTER 0 WRITES OUTPUT*/
	if ( procid == 0 )
	{
		//set up file stream for output file
		ofstream output; 
		output.open(OutFilePath);
		output.close(); //quick open close done to clear any existing file each time program is run
		output.open(OutFilePath, ios::out | ios::app); //open file in append mode
		output << "core size	random reference diversity	optimized reference diversity	random target diversity	optimized target diversity	alt random reference diversity	alt optimized reference diversity	alt random target diversity	alt optimized target diversity	core members" << "\n";
		
		//write out results row by row
		for (int i=0;i<V1;i++)
		{
			//write variables
			output 	<< Results[i][0] 
					<< "	" << Results[i][1] 
					<< "	" << Results[i][2] 
					<< "	" << Results[i][3] 
					<< "	" << Results[i][4] 
					<< "	" << Results[i][5] 
					<< "	" << Results[i][6] 
					<< "	" << Results[i][7] 
					<< "	" << Results[i][8] 
					<< "	" << "(";
			//write Accessions retained
			for (unsigned int j=0;j<Members[i].size();j++)
			{
				if ( j==(Members[i].size() - 1) )
				{
					//add trailing parentheses and move to next row
					output << Members[i][j] << ")\n";
				}
				else
				{
					output << Members[i][j] << ",";
				}
			}
		}
	
		//wrap up write step
		output.close();
	} /***MPI: END MASTER WRITE***/
	
	//Terminate MPI.
	//MPI::Finalize ( );

}
Example #3
0
//=========================================================================================
void runLUMatrixTests(Epetra_CrsMatrix * L,  Epetra_MultiVector * bL, Epetra_MultiVector * btL, Epetra_MultiVector * xexactL,
		      Epetra_CrsMatrix * U,  Epetra_MultiVector * bU, Epetra_MultiVector * btU, Epetra_MultiVector * xexactU,
		      bool StaticProfile, bool verbose, bool summary) {

  if (L->NoDiagonal()) {
    bL->Update(1.0, *xexactL, 1.0); // Add contribution of a unit diagonal to bL
    btL->Update(1.0, *xexactL, 1.0); // Add contribution of a unit diagonal to btL
  }
  if (U->NoDiagonal()) {
    bU->Update(1.0, *xexactU, 1.0); // Add contribution of a unit diagonal to bU
    btU->Update(1.0, *xexactU, 1.0); // Add contribution of a unit diagonal to btU
  }

  Epetra_MultiVector z(*bL);
  Epetra_MultiVector r(*bL);
  Epetra_SerialDenseVector resvec(bL->NumVectors());

  //Timings
  Epetra_Flops flopcounter;
  L->SetFlopCounter(flopcounter);
  U->SetFlopCounter(flopcounter);
  Epetra_Time timer(L->Comm());
  std::string statdyn =        "dynamic";
  if (StaticProfile) statdyn = "static ";

  for (int j=0; j<4; j++) { // j = 0/2 is notrans, j = 1/3 is trans

    bool TransA = (j==1 || j==3);
    std::string contig = "without";
    if (j>1) contig =    "with   ";

    if (j==2) {
      L->OptimizeStorage();
      U->OptimizeStorage();
    }

    flopcounter.ResetFlops();
    timer.ResetStartTime();

    //10 lower solves
    bool Upper = false;
    bool UnitDiagonal = L->NoDiagonal();  // If no diagonal, then unit must be used
    Epetra_MultiVector * b = TransA ? btL : bL;  // solve with the appropriate b vector
    for( int i = 0; i < 10; ++i )
      L->Solve(Upper, TransA, UnitDiagonal, *b, z); // Solve Lz = bL or L'z = bLt

    double elapsed_time = timer.ElapsedTime();
    double total_flops = L->Flops();

    // Compute residual
    r.Update(-1.0, z, 1.0, *xexactL, 0.0); // r = bt - z
    r.Norm2(resvec.Values());

    if (resvec.NormInf()>0.000001) {
      cout << "resvec = " << resvec << endl;
      cout << "z = " << z << endl;
      cout << "xexactL = " << *xexactL << endl;
      cout << "r = " << r << endl;
    }

    if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": ";
    double MFLOPs = total_flops/elapsed_time/1000000.0;
    if (verbose) cout << "Total MFLOPs for 10 " << " Lower solves " << statdyn << " Profile (Trans = " << TransA
		      << ")  and " << contig << " opt storage = " << MFLOPs << " (" << elapsed_time << " s)" <<endl;
    if (summary) {
      if (L->Comm().NumProc()==1) {
	if (TransA) cout << "TransLSv" << statdyn<< "Prof" << contig << "OptStor" << '\t';
	else cout << "NoTransLSv" << statdyn << "Prof" << contig << "OptStor" << '\t';
      }
      cout << MFLOPs << endl;
    }
    flopcounter.ResetFlops();
    timer.ResetStartTime();

    //10 upper solves
    Upper = true;
    UnitDiagonal = U->NoDiagonal();  // If no diagonal, then unit must be used
    b = TransA ? btU : bU;  // solve with the appropriate b vector
    for( int i = 0; i < 10; ++i )
      U->Solve(Upper, TransA, UnitDiagonal, *b, z); // Solve Lz = bL or L'z = bLt

    elapsed_time = timer.ElapsedTime();
    total_flops = U->Flops();

    // Compute residual
    r.Update(-1.0, z, 1.0, *xexactU, 0.0); // r = bt - z
    r.Norm2(resvec.Values());

    if (resvec.NormInf()>0.001) {
      cout << "U = " << *U << endl;
      //cout << "resvec = " << resvec << endl;
      cout << "z = " << z << endl;
      cout << "xexactU = " << *xexactU << endl;
      //cout << "r = " << r << endl;
      cout << "b = " << *b << endl;
    }


    if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": ";
    MFLOPs = total_flops/elapsed_time/1000000.0;
    if (verbose) cout << "Total MFLOPs for 10 " << " Upper solves " << statdyn << " Profile (Trans = " << TransA
		      << ")  and " << contig << " opt storage = " << MFLOPs <<endl;
    if (summary) {
      if (L->Comm().NumProc()==1) {
	if (TransA) cout << "TransUSv" << statdyn<< "Prof" << contig << "OptStor" << '\t';
	else cout << "NoTransUSv" << statdyn << "Prof" << contig << "OptStor" << '\t';
      }
      cout << MFLOPs << endl;
    }
  }
  return;
}
Example #4
0
void runMatrixTests(Epetra_CrsMatrix * A,  Epetra_MultiVector * b, Epetra_MultiVector * bt,
		    Epetra_MultiVector * xexact, bool StaticProfile, bool verbose, bool summary) {

  Epetra_MultiVector z(*b);
  Epetra_MultiVector r(*b);
  Epetra_SerialDenseVector resvec(b->NumVectors());

  //Timings
  Epetra_Flops flopcounter;
  A->SetFlopCounter(flopcounter);
  Epetra_Time timer(A->Comm());
  std::string statdyn =        "dynamic";
  if (StaticProfile) statdyn = "static ";

  for (int j=0; j<4; j++) { // j = 0/2 is notrans, j = 1/3 is trans

    bool TransA = (j==1 || j==3);
    std::string contig = "without";
    if (j>1) contig =    "with   ";

#ifdef EPETRA_SHORT_PERFTEST
    int kstart = 1;
#else
    int kstart = 0;
#endif
    for (int k=kstart; k<2; k++) { // Loop over old multiply vs. new multiply

      std::string oldnew = "old";
      if (k>0) oldnew =    "new";

      if (j==2) A->OptimizeStorage();

      flopcounter.ResetFlops();
      timer.ResetStartTime();

      if (k==0) {
	//10 matvecs
#ifndef EPETRA_SHORT_PERFTEST
	for( int i = 0; i < 10; ++i )
	  A->Multiply1(TransA, *xexact, z); // Compute z = A*xexact or z = A'*xexact using old Multiply method
#endif
      }
      else {
	//10 matvecs
	for( int i = 0; i < 10; ++i )
	  A->Multiply(TransA, *xexact, z); // Compute z = A*xexact or z = A'*xexact
      }

      double elapsed_time = timer.ElapsedTime();
      double total_flops = A->Flops();

      // Compute residual
      if (TransA)
	r.Update(-1.0, z, 1.0, *bt, 0.0); // r = bt - z
      else
	r.Update(-1.0, z, 1.0, *b, 0.0); // r = b - z

      r.Norm2(resvec.Values());

      if (verbose) cout << "ResNorm = " << resvec.NormInf() << ": ";
      double MFLOPs = total_flops/elapsed_time/1000000.0;
      if (verbose) cout << "Total MFLOPs for 10 " << oldnew << " MatVec's with " << statdyn << " Profile (Trans = " << TransA
			<< ")  and " << contig << " optimized storage = " << MFLOPs << " (" << elapsed_time << " s)" <<endl;
      if (summary) {
	if (A->Comm().NumProc()==1) {
	  if (TransA) cout << "TransMv" << statdyn<< "Prof" << contig << "OptStor" << '\t';
	  else cout << "NoTransMv" << statdyn << "Prof" << contig << "OptStor" << '\t';
	}
	cout << MFLOPs << endl;
      }
    }
  }
  return;
}
Example #5
0
int main(int argc, char *argv[])
{
  int ierr = 0;
  double elapsed_time;
  double total_flops;
  double MFLOPs;


#ifdef EPETRA_MPI

  // Initialize MPI
  MPI_Init(&argc,&argv);
  Epetra_MpiComm comm( MPI_COMM_WORLD );
#else
  Epetra_SerialComm comm;
#endif

  bool verbose = false;
  bool summary = false;

  // Check if we should print verbose results to standard out
  if (argc>6) if (argv[6][0]=='-' && argv[6][1]=='v') verbose = true;

  // Check if we should print verbose results to standard out
  if (argc>6) if (argv[6][0]=='-' && argv[6][1]=='s') summary = true;

  if(argc < 6) {
    cerr << "Usage: " << argv[0]
         << " NumNodesX NumNodesY NumProcX NumProcY NumPoints [-v|-s]" << endl
         << "where:" << endl
         << "NumNodesX         - Number of mesh nodes in X direction per processor" << endl
         << "NumNodesY         - Number of mesh nodes in Y direction per processor" << endl
         << "NumProcX          - Number of processors to use in X direction" << endl
         << "NumProcY          - Number of processors to use in Y direction" << endl
         << "NumPoints         - Number of points to use in stencil (5, 9 or 25 only)" << endl
         << "-v|-s             - (Optional) Run in verbose mode if -v present or summary mode if -s present" << endl
         << " NOTES: NumProcX*NumProcY must equal the number of processors used to run the problem." << endl << endl
	 << " Serial example:" << endl
         << argv[0] << " 16 12 1 1 25 -v" << endl
	 << " Run this program in verbose mode on 1 processor using a 16 X 12 grid with a 25 point stencil."<< endl <<endl
	 << " MPI example:" << endl
         << "mpirun -np 32 " << argv[0] << " 10 12 4 8 9 -v" << endl
	 << " Run this program in verbose mode on 32 processors putting a 10 X 12 subgrid on each processor using 4 processors "<< endl
	 << " in the X direction and 8 in the Y direction.  Total grid size is 40 points in X and 96 in Y with a 9 point stencil."<< endl
         << endl;
    return(1);

  }
    //char tmp;
    //if (comm.MyPID()==0) cout << "Press any key to continue..."<< endl;
    //if (comm.MyPID()==0) cin >> tmp;
    //comm.Barrier();

  comm.SetTracebackMode(0); // This should shut down any error traceback reporting
  if (verbose && comm.MyPID()==0)
    cout << Epetra_Version() << endl << endl;
  if (summary && comm.MyPID()==0) {
    if (comm.NumProc()==1)
      cout << Epetra_Version() << endl << endl;
    else
      cout << endl << endl; // Print two blank line to keep output columns lined up
  }

  if (verbose) cout << comm <<endl;


  // Redefine verbose to only print on PE 0

  if (verbose && comm.MyPID()!=0) verbose = false;
  if (summary && comm.MyPID()!=0) summary = false;

  int numNodesX = atoi(argv[1]);
  int numNodesY = atoi(argv[2]);
  int numProcsX = atoi(argv[3]);
  int numProcsY = atoi(argv[4]);
  int numPoints = atoi(argv[5]);

  if (verbose || (summary && comm.NumProc()==1)) {
    cout << " Number of local nodes in X direction  = " << numNodesX << endl
	 << " Number of local nodes in Y direction  = " << numNodesY << endl
	 << " Number of global nodes in X direction = " << numNodesX*numProcsX << endl
	 << " Number of global nodes in Y direction = " << numNodesY*numProcsY << endl
	 << " Number of local nonzero entries       = " << numNodesX*numNodesY*numPoints << endl
	 << " Number of global nonzero entries      = " << numNodesX*numNodesY*numPoints*numProcsX*numProcsY << endl
	 << " Number of Processors in X direction   = " << numProcsX << endl
	 << " Number of Processors in Y direction   = " << numProcsY << endl
	 << " Number of Points in stencil           = " << numPoints << endl << endl;
  }
  // Print blank line to keep output columns lined up
  if (summary && comm.NumProc()>1)
    cout << endl << endl << endl << endl << endl << endl << endl << endl<< endl << endl;

  if (numProcsX*numProcsY!=comm.NumProc()) {
    cerr << "Number of processors = " << comm.NumProc() << endl
	 << " is not the product of " << numProcsX << " and " << numProcsY << endl << endl;
    return(1);
  }

  if (numPoints!=5 && numPoints!=9 && numPoints!=25) {
    cerr << "Number of points specified = " << numPoints << endl
	 << " is not 5, 9, 25" << endl << endl;
    return(1);
  }

  if (numNodesX*numNodesY<=0) {
    cerr << "Product of number of nodes is <= zero" << endl << endl;
    return(1);
  }

  Epetra_IntSerialDenseVector Xoff, XLoff, XUoff;
  Epetra_IntSerialDenseVector Yoff, YLoff, YUoff;
  if (numPoints==5) {

     // Generate a 5-point 2D Finite Difference matrix
    Xoff.Size(5);
    Yoff.Size(5);
    Xoff[0] = -1; Xoff[1] = 1; Xoff[2] = 0; Xoff[3] = 0;  Xoff[4] = 0;
    Yoff[0] = 0;  Yoff[1] = 0; Yoff[2] = 0; Yoff[3] = -1; Yoff[4] = 1;

     // Generate a 2-point 2D Lower triangular Finite Difference matrix
    XLoff.Size(2);
    YLoff.Size(2);
    XLoff[0] = -1; XLoff[1] =  0;
    YLoff[0] =  0; YLoff[1] = -1;

     // Generate a 3-point 2D upper triangular Finite Difference matrix
    XUoff.Size(3);
    YUoff.Size(3);
    XUoff[0] =  0; XUoff[1] =  1; XUoff[2] = 0;
    YUoff[0] =  0; YUoff[1] =  0; YUoff[2] = 1;
  }
  else if (numPoints==9) {
    // Generate a 9-point 2D Finite Difference matrix
    Xoff.Size(9);
    Yoff.Size(9);
    Xoff[0] = -1;  Xoff[1] =  0; Xoff[2] =  1;
    Yoff[0] = -1;  Yoff[1] = -1; Yoff[2] = -1;
    Xoff[3] = -1;  Xoff[4] =  0; Xoff[5] =  1;
    Yoff[3] =  0;  Yoff[4] =  0; Yoff[5] =  0;
    Xoff[6] = -1;  Xoff[7] =  0; Xoff[8] =  1;
    Yoff[6] =  1;  Yoff[7] =  1; Yoff[8] =  1;

    // Generate a 5-point lower triangular 2D Finite Difference matrix
    XLoff.Size(5);
    YLoff.Size(5);
    XLoff[0] = -1;  XLoff[1] =  0; Xoff[2] =  1;
    YLoff[0] = -1;  YLoff[1] = -1; Yoff[2] = -1;
    XLoff[3] = -1;  XLoff[4] =  0;
    YLoff[3] =  0;  YLoff[4] =  0;

    // Generate a 4-point upper triangular 2D Finite Difference matrix
    XUoff.Size(4);
    YUoff.Size(4);
    XUoff[0] =  1;
    YUoff[0] =  0;
    XUoff[1] = -1;  XUoff[2] =  0; XUoff[3] =  1;
    YUoff[1] =  1;  YUoff[2] =  1; YUoff[3] =  1;

  }
  else {
    // Generate a 25-point 2D Finite Difference matrix
    Xoff.Size(25);
    Yoff.Size(25);
    int xi = 0, yi = 0;
    int xo = -2, yo = -2;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ;
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ;
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ;
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ;
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ;

    // Generate a 13-point lower triangular 2D Finite Difference matrix
    XLoff.Size(13);
    YLoff.Size(13);
    xi = 0, yi = 0;
    xo = -2, yo = -2;
    XLoff[xi++] = xo++;  XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++;
    YLoff[yi++] = yo  ;  YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ;
    xo = -2, yo++;
    XLoff[xi++] = xo++;  XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++;
    YLoff[yi++] = yo  ;  YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ;
    xo = -2, yo++;
    XLoff[xi++] = xo++;  XLoff[xi++] = xo++; XLoff[xi++] = xo++;
    YLoff[yi++] = yo  ;  YLoff[yi++] = yo  ; YLoff[yi++] = yo  ;

    // Generate a 13-point upper triangular 2D Finite Difference matrix
    XUoff.Size(13);
    YUoff.Size(13);
    xi = 0, yi = 0;
    xo = 0, yo = 0;
    XUoff[xi++] = xo++;  XUoff[xi++] = xo++; XUoff[xi++] = xo++;
    YUoff[yi++] = yo  ;  YUoff[yi++] = yo  ; YUoff[yi++] = yo  ;
    xo = -2, yo++;
    XUoff[xi++] = xo++;  XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++;
    YUoff[yi++] = yo  ;  YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ;
    xo = -2, yo++;
    XUoff[xi++] = xo++;  XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++;
    YUoff[yi++] = yo  ;  YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ;

  }

  Epetra_Map * map;
  Epetra_Map * mapL;
  Epetra_Map * mapU;
  Epetra_CrsMatrix * A;
  Epetra_CrsMatrix * L;
  Epetra_CrsMatrix * U;
  Epetra_MultiVector * b;
  Epetra_MultiVector * bt;
  Epetra_MultiVector * xexact;
  Epetra_MultiVector * bL;
  Epetra_MultiVector * btL;
  Epetra_MultiVector * xexactL;
  Epetra_MultiVector * bU;
  Epetra_MultiVector * btU;
  Epetra_MultiVector * xexactU;
  Epetra_SerialDenseVector resvec(0);

  //Timings
  Epetra_Flops flopcounter;
  Epetra_Time timer(comm);

#ifdef EPETRA_VERY_SHORT_PERFTEST
  int jstop = 1;
#elif EPETRA_SHORT_PERFTEST
  int jstop = 1;
#else
  int jstop = 2;
#endif
  for (int j=0; j<jstop; j++) {
    for (int k=1; k<17; k++) {
#ifdef EPETRA_VERY_SHORT_PERFTEST
      if (k<3 || (k%4==0 && k<9)) {
#elif EPETRA_SHORT_PERFTEST
      if (k<6 || k%4==0) {
#else
      if (k<7 || k%2==0) {
#endif
      int nrhs=k;
      if (verbose) cout << "\n*************** Results for " << nrhs << " RHS with ";

      bool StaticProfile = (j!=0);
      if (verbose) {
        if (StaticProfile) cout << " static profile\n";
        else cout << " dynamic profile\n";
      }
      GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, numPoints,
			 Xoff.Values(), Yoff.Values(), nrhs, comm, verbose, summary,
			 map, A, b, bt, xexact, StaticProfile, false);


#ifdef EPETRA_HAVE_JADMATRIX

      timer.ResetStartTime();
      Epetra_JadMatrix JA(*A);
      elapsed_time = timer.ElapsedTime();
      if (verbose) cout << "Time to create Jagged diagonal matrix = " << elapsed_time << endl;

      //cout << "A = " << *A << endl;
      //cout << "JA = " << JA << endl;

      runJadMatrixTests(&JA, b, bt, xexact, StaticProfile, verbose, summary);

#endif
      runMatrixTests(A, b, bt, xexact, StaticProfile, verbose, summary);

      delete A;
      delete b;
      delete bt;
      delete xexact;

      GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, XLoff.Length(),
			 XLoff.Values(), YLoff.Values(), nrhs, comm, verbose, summary,
			 mapL, L, bL, btL, xexactL, StaticProfile, true);


      GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, XUoff.Length(),
			 XUoff.Values(), YUoff.Values(), nrhs, comm, verbose, summary,
			 mapU, U, bU, btU, xexactU, StaticProfile, true);


      runLUMatrixTests(L, bL, btL, xexactL, U, bU, btU, xexactU, StaticProfile, verbose, summary);

      delete L;
      delete bL;
      delete btL;
      delete xexactL;
      delete mapL;

      delete U;
      delete bU;
      delete btU;
      delete xexactU;
      delete mapU;

      Epetra_MultiVector q(*map, nrhs);
      Epetra_MultiVector z(q);
      Epetra_MultiVector r(q);

      delete map;
      q.SetFlopCounter(flopcounter);
      z.SetFlopCounter(q);
      r.SetFlopCounter(q);

      resvec.Resize(nrhs);


      flopcounter.ResetFlops();
      timer.ResetStartTime();

      //10 norms
      for( int i = 0; i < 10; ++i )
	q.Norm2( resvec.Values() );

      elapsed_time = timer.ElapsedTime();
      total_flops = q.Flops();
      MFLOPs = total_flops/elapsed_time/1000000.0;
      if (verbose) cout << "\nTotal MFLOPs for 10 Norm2's= " << MFLOPs << endl;

      if (summary) {
	if (comm.NumProc()==1) cout << "Norm2" << '\t';
	cout << MFLOPs << endl;
      }

      flopcounter.ResetFlops();
      timer.ResetStartTime();

      //10 dot's
      for( int i = 0; i < 10; ++i )
	q.Dot(z, resvec.Values());

      elapsed_time = timer.ElapsedTime();
      total_flops = q.Flops();
      MFLOPs = total_flops/elapsed_time/1000000.0;
      if (verbose) cout << "Total MFLOPs for 10 Dot's  = " << MFLOPs << endl;

      if (summary) {
	if (comm.NumProc()==1) cout << "DotProd" << '\t';
	cout << MFLOPs << endl;
      }

      flopcounter.ResetFlops();
      timer.ResetStartTime();

      //10 dot's
      for( int i = 0; i < 10; ++i )
	q.Update(1.0, z, 1.0, r, 0.0);

      elapsed_time = timer.ElapsedTime();
      total_flops = q.Flops();
      MFLOPs = total_flops/elapsed_time/1000000.0;
      if (verbose) cout << "Total MFLOPs for 10 Updates= " << MFLOPs << endl;

      if (summary) {
	if (comm.NumProc()==1) cout << "Update" << '\t';
	cout << MFLOPs << endl;
      }
    }
    }
  }
#ifdef EPETRA_MPI
  MPI_Finalize() ;
#endif

return ierr ;
}

// Constructs a 2D PDE finite difference matrix using the list of x and y offsets.
//
// nx      (In) - number of grid points in x direction
// ny      (In) - number of grid points in y direction
//   The total number of equations will be nx*ny ordered such that the x direction changes
//   most rapidly:
//      First equation is at point (0,0)
//      Second at                  (1,0)
//       ...
//      nx equation at             (nx-1,0)
//      nx+1st equation at         (0,1)

// numPoints (In) - number of points in finite difference stencil
// xoff    (In) - stencil offsets in x direction (of length numPoints)
// yoff    (In) - stencil offsets in y direction (of length numPoints)
//   A standard 5-point finite difference stencil would be described as:
//     numPoints = 5
//     xoff = [-1, 1, 0,  0, 0]
//     yoff = [ 0, 0, 0, -1, 1]

// nrhs - Number of rhs to generate. (First interface produces vectors, so nrhs is not needed

// comm    (In) - an Epetra_Comm object describing the parallel machine (numProcs and my proc ID)
// map    (Out) - Epetra_Map describing distribution of matrix and vectors/multivectors
// A      (Out) - Epetra_CrsMatrix constructed for nx by ny grid using prescribed stencil
//                Off-diagonal values are random between 0 and 1.  If diagonal is part of stencil,
//                diagonal will be slightly diag dominant.
// b      (Out) - Generated RHS.  Values satisfy b = A*xexact
// bt     (Out) - Generated RHS.  Values satisfy b = A'*xexact
// xexact (Out) - Generated exact solution to Ax = b and b' = A'xexact

// Note: Caller of this function is responsible for deleting all output objects.

void GenerateCrsProblem(int numNodesX, int numNodesY, int numProcsX, int numProcsY, int numPoints,
			int * xoff, int * yoff,
			const Epetra_Comm  &comm, bool verbose, bool summary,
			Epetra_Map *& map,
			Epetra_CrsMatrix *& A,
			Epetra_Vector *& b,
			Epetra_Vector *& bt,
			Epetra_Vector *&xexact, bool StaticProfile, bool MakeLocalOnly) {

  Epetra_MultiVector * b1, * bt1, * xexact1;
	
  GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, numPoints,
		     xoff, yoff, 1, comm, verbose, summary,
		     map, A, b1, bt1, xexact1, StaticProfile, MakeLocalOnly);

  b = dynamic_cast<Epetra_Vector *>(b1);
  bt = dynamic_cast<Epetra_Vector *>(bt1);
  xexact = dynamic_cast<Epetra_Vector *>(xexact1);

  return;
}

void GenerateCrsProblem(int numNodesX, int numNodesY, int numProcsX, int numProcsY, int numPoints,
			int * xoff, int * yoff, int nrhs,
			const Epetra_Comm  &comm, bool verbose, bool summary,
			Epetra_Map *& map,
			Epetra_CrsMatrix *& A,
			Epetra_MultiVector *& b,
			Epetra_MultiVector *& bt,
			Epetra_MultiVector *&xexact, bool StaticProfile, bool MakeLocalOnly) {

  Epetra_Time timer(comm);
  // Determine my global IDs
  long long * myGlobalElements;
  GenerateMyGlobalElements(numNodesX, numNodesY, numProcsX, numProcsY, comm.MyPID(), myGlobalElements);

  int numMyEquations = numNodesX*numNodesY;

  map = new Epetra_Map((long long)-1, numMyEquations, myGlobalElements, 0, comm); // Create map with 2D block partitioning.
  delete [] myGlobalElements;

  long long numGlobalEquations = map->NumGlobalElements64();

  int profile = 0; if (StaticProfile) profile = numPoints;

#ifdef EPETRA_HAVE_STATICPROFILE

  if (MakeLocalOnly)
    A = new Epetra_CrsMatrix(Copy, *map, *map, profile, StaticProfile); // Construct matrix with rowmap=colmap
  else
    A = new Epetra_CrsMatrix(Copy, *map, profile, StaticProfile); // Construct matrix

#else

  if (MakeLocalOnly)
    A = new Epetra_CrsMatrix(Copy, *map, *map, profile); // Construct matrix with rowmap=colmap
  else
    A = new Epetra_CrsMatrix(Copy, *map, profile); // Construct matrix

#endif

  long long * indices = new long long[numPoints];
  double * values = new double[numPoints];

  double dnumPoints = (double) numPoints;
  int nx = numNodesX*numProcsX;

  for (int i=0; i<numMyEquations; i++) {

    long long rowID = map->GID64(i);
    int numIndices = 0;

    for (int j=0; j<numPoints; j++) {
      long long colID = rowID + xoff[j] + nx*yoff[j]; // Compute column ID based on stencil offsets
      if (colID>-1 && colID<numGlobalEquations) {
	indices[numIndices] = colID;
	double value = - ((double) rand())/ ((double) RAND_MAX);
	if (colID==rowID)
	  values[numIndices++] = dnumPoints - value; // Make diagonal dominant
	else
	  values[numIndices++] = value;
      }
    }
    //cout << "Building row " << rowID << endl;
    A->InsertGlobalValues(rowID, numIndices, values, indices);
  }

  delete [] indices;
  delete [] values;
  double insertTime = timer.ElapsedTime();
  timer.ResetStartTime();
  A->FillComplete(false);
  double fillCompleteTime = timer.ElapsedTime();

  if (verbose)
    cout << "Time to insert matrix values = " << insertTime << endl
	 << "Time to complete fill        = " << fillCompleteTime << endl;
  if (summary) {
    if (comm.NumProc()==1) cout << "InsertTime" << '\t';
    cout << insertTime << endl;
    if (comm.NumProc()==1) cout << "FillCompleteTime" << '\t';
    cout << fillCompleteTime << endl;
  }

  if (nrhs<=1) {
    b = new Epetra_Vector(*map);
    bt = new Epetra_Vector(*map);
    xexact = new Epetra_Vector(*map);
  }
  else {
    b = new Epetra_MultiVector(*map, nrhs);
    bt = new Epetra_MultiVector(*map, nrhs);
    xexact = new Epetra_MultiVector(*map, nrhs);
  }

  xexact->Random(); // Fill xexact with random values

  A->Multiply(false, *xexact, *b);
  A->Multiply(true, *xexact, *bt);

  return;
}
Example #6
0
/*
OPWEIGHTEDL2: Solves weighted L2 regularized inverse problems.
Minimizes the cost function
X* = argmin_X ||A(X)-b_FC||_2^2 + lambda ||W |D(X)| ||_2^2
where     X* = recovered image
          A  = linear measurement operator
          b_FC  = (noisy) measurements
                 %           W  = diagonal weight matrix built from the edge mask
          |D(X)| = gradient magnitude at each pixel

Inputs:  A = function handle representing the forward
              model/measurement operator
         At = function handle representing the backwards model/
              the transpose of the measurment operator.
              (e.g. if A is a downsampling, At is a upsampling)
         b_FC =  a vector of measurements; should match the
              dimensions of A(X)
         lambda = regularization parameter that balances data fidelity
              and smoothness. set lambda high for more smoothing.
         siz = output image size, e.g. siz = [512,512]
         Niter = is the number of iterations; should be ~100-500

Output:  X = high-resolution output image
         cost = array of cost function value vs. iteration
Define AtA fourier mask
PrecisionType lambda, uvec ind_samples, frowvec res, int Niter, double tol, PrecisionType gam, FloatImageType::Pointer& X, frowvec& cost, frowvec& resvec)
 */
FloatImageType::Pointer OpWeightedL2(FloatImageType::Pointer norm01_lowres, FloatImageType::Pointer edgemask)
{
  const PrecisionType lambda = 1e-3F ;
  constexpr int Niter = 100;
  const PrecisionType tol = 1e-8F ;

  const PrecisionType gam = 1.0F ;

  //typedef itk::VectorMagnitudeImageFilter<CVImageType, FloatImageType> GMType;

  //The optimal filter for modeling the measurement operator is low pass filter in this case
  // NOTE: That the A operator is a projection operator, so A^{T}A = A, That is to say that applying
  //       the A^{T} to A results in A.
  FloatImageType::Pointer p_image = GetDiracDeltaImage(edgemask);
  // Precompute

  //Make high-res coefficients
  const HalfHermetianImageType::Pointer b_FC = GetAFP_of_b(norm01_lowres, edgemask);
  //TODO: too many copies of Atb here.
  FloatImageType::Pointer Atb = At_fhp(b_FC,
                                       edgemask->GetLargestPossibleRegion().GetSize()[0]%2 == 1,
                                       edgemask.GetPointer());
  FloatImageType::Pointer TwoAtb = MakeTwoAtb(Atb);
  FloatImageType::Pointer X = DeepImageCopy<FloatImageType>(Atb);
  Atb = nullptr; //Save memory here

  CVImageType::Pointer DX = GetGradient(X);
  CVImageType::Pointer L = CreateEmptyImage<CVImageType>(DX);
  CVImageType::Pointer Y = CreateEmptyImage<CVImageType>(DX);
  //CVImageType::Pointer WDX = CreateEmptyImage<CVImageType>(DX);
  CVImageType::Pointer residue = CreateEmptyImage<CVImageType>(DX);
  CVImageType::Pointer YminusL = CreateEmptyImage<CVImageType>(DX);
  FloatImageType::Pointer tempValue=CreateEmptyImage<FloatImageType>(DX);

  std::vector<PrecisionType> resvec(Niter,0);
  std::vector<PrecisionType> cost(Niter,0);

#ifdef USE_WRITE_DEGUBBING
  itk::ComplexToModulusImageFilter<HalfHermetianImageType,FloatImageType>::Pointer cpx2abs =
  itk::ComplexToModulusImageFilter<HalfHermetianImageType,FloatImageType>::New();
#endif

  CVImageType::Pointer gradIm = GetGradient(p_image);
  FloatImageType::Pointer divIm = GetDivergence(gradIm);
  HalfHermetianImageType::Pointer DtDhat = GetForwardFFT(divIm);
  // TODO:  ALL SAME TO HERE!
  typedef HalfHermetianImageType::PixelType FCType;
  HalfHermetianImageType::Pointer TwoTimesAtAhatPlusLamGamDtDhat = CreateEmptyImage<HalfHermetianImageType>(DtDhat);
  {
    HalfHermetianImageType::Pointer TwoTimesAtAhat = GetLowpassOperator(norm01_lowres,p_image, 2.0F);
    TwoTimesAtAhatPlusLamGamDtDhat = opIC(TwoTimesAtAhatPlusLamGamDtDhat,FCType(lambda*gam),'*',DtDhat);
    //TODO:  Make This an inverse!
    TwoTimesAtAhatPlusLamGamDtDhat = opII(TwoTimesAtAhatPlusLamGamDtDhat,TwoTimesAtAhat,'+',TwoTimesAtAhatPlusLamGamDtDhat);
  }
  p_image = nullptr; //Save memory

  const bool edgemask_ActualXDimensionIsOdd = edgemask->GetLargestPossibleRegion().GetSize()[0] % 2 == 1;

  CVImageType::Pointer InvTwoMuPlusGamma = ComputeInvTwoMuPlusGamma(edgemask,gam);
  //FloatImageType::Pointer SqrtMu = ComputeSqrtMu(edgemask);

#define USE_BLAS_WRAPPERS
#ifdef USE_BLAS_WRAPPERS
#else
  typedef itk::AddImageFilter<CVImageType,CVImageType> CVImageAdder;
  CVImageAdder::Pointer dxPlusL = CVImageAdder::New();
#endif

  itk::TimeProbe tp;
  tp.Start();

  HalfHermetianImageType::Pointer tempRatioFC = CreateEmptyImage<HalfHermetianImageType>(DtDhat);

  for (size_t i=0; i < Niter; ++i)
  {
    std::cout << "Iteration : " << i << std::endl;

#ifdef USE_BLAS_WRAPPERS
    //Z = 1.0*L+DX
    AddAllElements(DX,1.0F,L,DX,gam);//DX destroyed
    CVImageType::Pointer & Z = DX;
#else
    //Z = opII(Z,DX,'+',L);
    dxPlusL->SetInput1(DX);
    dxPlusL->SetInput2(L);
    dxPlusL->SetInPlace(true);
    dxPlusL->Update();
    CVImageType::Pointer Z=dxPlusL->GetOutput();
    MultiplyCVByScalar(Z,gam);
#endif
#ifdef USE_BLAS_WRAPPERS
    //Y=InvTwoMuPlusGamm.*Z
    MultiplyVectors(Y,InvTwoMuPlusGamma,Z);
#else
    Y = opII(Y,Z,'*',InvTwoMuPlusGamma);
#endif

    // X Subprob
    // Numerator = 2*Atb+lambda*gam*SRdiv(Y-L))
#ifdef USE_BLAS_WRAPPERS
    //YminusL = 1.0F* SRdiv( -1.0F*L + Y)
    Duplicate(Y,YminusL);
    AddAllElements(YminusL,-1.0F,L,YminusL,1.0F);
#else
    YminusL=opII(YminusL,Y,'-',L);
#endif
    FloatImageType::Pointer tempNumerator=GetDivergence(YminusL);
#ifdef USE_BLAS_WRAPPERS
    //lambd*gam*tempNumerator+TwoAtb
    Duplicate(TwoAtb,tempValue);
    AddAllElements(tempValue,lambda*gam,tempNumerator,tempValue,1.0F);
    HalfHermetianImageType::Pointer tempNumeratorFC = GetForwardFFT(tempValue);
#else
    tempNumerator=opIC(tempNumerator,lambda*gam,'*',tempNumerator);
    tempNumerator=opII(tempNumerator,TwoAtb,'+',tempNumerator);
    HalfHermetianImageType::Pointer tempNumeratorFC = GetForwardFFT(tempNumerator);
#endif

    //KEEP
    tempRatioFC = opII_scalar(tempRatioFC,tempNumeratorFC,'/',TwoTimesAtAhatPlusLamGamDtDhat);

    X = GetInverseFFT(tempRatioFC,edgemask_ActualXDimensionIsOdd,1.0); //TODO: Determine scale factor here. X

    // should be on same dynamic range as b
    DX = GetGradient(X);
    residue = opII(residue, DX, '-', Y); //TODO:  Implement math graph output here
    L = opII(L,L,'+',residue);

    //
    resvec[i] = 0; //TODO: Figure out the math for here
    if ( i > 900000 )
    {
      tp.Stop();
      std::cout << " Only iterations " << tp.GetTotal() << tp.GetUnit() << std::endl;
      return X;
    }
    if( i > 99 ) //HACK: Cutting this function short
    {
      return X;
    }
    //WDX = opII_CVmult(WDX,SqrtMu,'*',DX);
    //diff = opII(diff,A_fhp(X,norm01_lowres.GetPointer()),'-',b_FC);
    //
    //cost[i] = 0; //TODO: Need to figure out math for here
  }
  return X;
}