Beispiel #1
0
/**
 * Builds a Bayesian classifier from a reference taxonomy and reference sequences.
 *
 * Four shortcut files are derived from the taxonomy / reference names:
 *   <taxonomy root>tree.train, <taxonomy root>tree.sum,
 *   <taxonomy root><reference root><k>mer.prob and <taxonomy root><reference root><k>mer.numNonZero.
 * If all four can be opened and checkReleaseDate() accepts them, the taxonomy tree is read from
 * tree.train and the word probabilities come from the .prob/.numNonZero files (or from the
 * ReferenceDB when tfile is "saved"). Otherwise generateDatabaseAndNames() is called and the
 * per-genus log probability of every k-mer is computed here; when sh is true the results are also
 * written to the shortcut files.
 *
 * @param tfile     taxonomy file name, or "saved" to use the name stored in ReferenceDB
 * @param tempFile  reference sequence file name, or "saved" to use the name stored in ReferenceDB
 * @param method    forwarded unchanged to generateDatabaseAndNames()
 * @param ksize     k-mer size; stored in kmerSize and embedded in the shortcut file names
 * @param cutoff    stored in confidenceThreshold
 * @param i         stored in iters
 * @param tid       stored in threadID
 * @param f         stored in flip
 * @param sh        stored in shortcuts; enables writing the probability shortcut files
 *
 * Any std::exception raised inside is reported through m->errorOut() and the process exits with
 * status 1.
 */
Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f, bool sh) : 
Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
	try {
		ReferenceDB* rdb = ReferenceDB::getInstance();
		
		threadID = tid;
		flip = f;
		shortcuts = sh;
		
		//"saved" is replaced by the file names remembered in the reference database
		string baseName = tempFile;
		if (baseName == "saved") { baseName = rdb->getSavedReference(); }
		
		string baseTName = tfile;
		if (baseTName == "saved") { baseTName = rdb->getSavedTaxonomy(); }
		
		/************calculate the probability that each word will be in a specific taxonomy*************/
		string tfileroot = m->getFullPathName(baseTName.substr(0,baseTName.find_last_of(".")+1));
		string tempfileroot = m->getRootName(m->getSimpleName(baseName));
		string phyloTreeName = tfileroot + "tree.train";
		string phyloTreeSumName = tfileroot + "tree.sum";
		
		//char('0' + kmerSize) is only a digit for sizes 0-9; from 10 on it yields punctuation such as ':'.
		//Formatting the number gives the same names for 0-9 and valid ones for larger sizes.
		string kmerTag = toString(kmerSize);
		string probFileName = tfileroot + tempfileroot + kmerTag + "mer.prob";
		string probFileName2 = tfileroot + tempfileroot + kmerTag + "mer.numNonZero";
		
		ifstream phyloTreeTest(phyloTreeName.c_str());
		ifstream probFileTest2(probFileName2.c_str());
		ifstream probFileTest(probFileName.c_str());
		ifstream probFileTest3(phyloTreeSumName.c_str());
		
		int start = time(NULL);
		
		//if they are there make sure they were created after this release date
		bool FilesGood = false;
		if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3){
			FilesGood = checkReleaseDate(probFileTest, probFileTest2, phyloTreeTest, probFileTest3);
		}
		
		//tested after checkReleaseDate() because the streams are handed to it and their state may have changed
		bool useShortcutFiles = probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood;
		
		//if you want to save, but you dont need to calculate then just read
		if (rdb->save && useShortcutFiles && (tempFile != "saved")) {
			ifstream saveIn;
			m->openInputFile(tempFile, saveIn);
			
			while (!saveIn.eof()) {
				Sequence temp(saveIn);
				m->gobble(saveIn);
				
				rdb->referenceSeqs.push_back(temp);
			}
			saveIn.close();
		}
		
		if (useShortcutFiles) {
			if (tempFile == "saved") { m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");	m->mothurOutEndLine(); }
			
			m->mothurOut("Reading template taxonomy...     "); cout.flush();
			
			phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
			
			m->mothurOut("DONE."); m->mothurOutEndLine();
			
			genusNodes = phyloTree->getGenusNodes();
			genusTotals = phyloTree->getGenusTotals();
			
			if (tfile == "saved") {
				m->mothurOutEndLine();  m->mothurOut("Using probabilties from " + rdb->getSavedTaxonomy() + " that are saved in memory...    ");	cout.flush();
				wordGenusProb = rdb->wordGenusProb;
				WordPairDiffArr = rdb->WordPairDiffArr;
			}else {
				m->mothurOut("Reading template probabilities...     "); cout.flush();
				readProbFile(probFileTest, probFileTest2, probFileName, probFileName2);
			}
			
			//save probabilities
			if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
		}else{
			
			//create search database and names vector
			//NOTE(review): phyloTree, database and names are used below without being set here - presumably this call sets them up; confirm
			generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
			
			//prevents errors caused by creating shortcut files if you had an error in the sanity check.
			if (m->control_pressed) {  m->mothurRemove(phyloTreeName);  m->mothurRemove(probFileName); m->mothurRemove(probFileName2); }
			else{
				genusNodes = phyloTree->getGenusNodes();
				genusTotals = phyloTree->getGenusTotals();
				
				m->mothurOut("Calculating template taxonomy tree...     "); cout.flush();
				
				phyloTree->printTreeNodes(phyloTreeName);
				
				m->mothurOut("DONE."); m->mothurOutEndLine();
				
				m->mothurOut("Calculating template probabilities...     "); cout.flush();
				
				numKmers = database->getMaxKmer() + 1;
				
				//initialize probabilities
				wordGenusProb.resize(numKmers);
				WordPairDiffArr.resize(numKmers);
				
				for (size_t j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
				
				ofstream out;
				ofstream out2;
				
				//the shortcut files are written only when requested and, in MPI builds, only by rank 0.
				//The rank does not change, so it is queried once and folded into a single flag.
				bool writeShortcuts = shortcuts;
				#ifdef USE_MPI
					int pid;
					MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
					if (pid != 0) { writeShortcuts = false; }
				#endif
				
				if (writeShortcuts) {
					m->openOutputFile(probFileName, out);
					
					//output mothur version
					out << "#" << m->getVersion() << endl;
					
					out << numKmers << endl;
					
					m->openOutputFile(probFileName2, out2);
					
					//output mothur version
					out2 << "#" << m->getVersion() << endl;
				}
				
				//for each word
				for (int word = 0; word < numKmers; word++) {
					
					if (m->control_pressed) {  break; }
					
					if (writeShortcuts) {  out << word << '\t'; }
					
					vector<int> seqsWithWordi = database->getSequencesWithKmer(word);
					
					//for each sequence with that word
					vector<int> count; count.resize(genusNodes.size(), 0);
					for (size_t j = 0; j < seqsWithWordi.size(); j++) {
						int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]);
						count[temp]++;  //increment count of seq in this genus who have this word
					}
					
					//probabilityInTemplate = (# of seqs with that word in template + 0.50) / (total number of seqs in template + 1);
					float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
					diffPair tempProb(log(probabilityInTemplate), 0.0);
					WordPairDiffArr[word] = tempProb;
					
					int numNotZero = 0;
					for (size_t k = 0; k < genusNodes.size(); k++) {
						//probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
						wordGenusProb[word][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1));
						
						//only genera that actually contain the word are written to the .prob file
						if (count[k] != 0) {
							if (writeShortcuts) { out << k << '\t' << wordGenusProb[word][k] << '\t' ; }
							
							numNotZero++;
						}
					}
					
					if (writeShortcuts) {
						out << endl;
						out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl;
					}
				}
				
				if (writeShortcuts) {
					out.close();
					out2.close();
					
					//the loop above stops early when control_pressed is set, which leaves truncated
					//probability files behind; remove them so a later run does not pick them up as valid shortcuts
					if (m->control_pressed) { m->mothurRemove(probFileName); m->mothurRemove(probFileName2); }
				}
				
				//read in new phylotree with less info. - its faster
				ifstream trainedTreeIn(phyloTreeName.c_str());
				delete phyloTree;
				
				phyloTree = new PhyloTree(trainedTreeIn, phyloTreeName);
				
				//save probabilities
				if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
			}
		}
		
		if (m->debug) { m->mothurOut("[DEBUG]: about to generateWordPairDiffArr\n"); }
		generateWordPairDiffArr();
		if (m->debug) { m->mothurOut("[DEBUG]: done generateWordPairDiffArr\n"); }
		
		//save probabilities
		if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
		
		m->mothurOut("DONE."); m->mothurOutEndLine();
		m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine();
	}
	catch(exception& e) {
		m->errorOut(e, "Bayesian", "Bayesian");
		exit(1);
	}
}
Beispiel #2
0
/**
 * Creates a summary tied to a group map and loads the reference taxonomy tree structure.
 *
 * The tree is read from "<refTfile up to and including its last '.'>tree.sum". If that file
 * cannot be opened an error is printed and the process exits with status 1.
 *
 * @param refTfile  reference taxonomy file name, or "saved" to use the name stored in ReferenceDB
 * @param g         group map stored in groupmap (the count table pointer ct is set to NULL)
 *
 * Any std::exception raised inside is reported through m->errorOut() and the process exits with
 * status 1.
 */
PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){
	try {
		m = MothurOut::getInstance();
		
		//start from an empty summary bound to the caller's group map
		numSeqs = 0;
		maxLevel = 0;
		ignore = false;
		ct = NULL;
		groupmap = g;
		
		//"saved" stands for the taxonomy file remembered by the reference database
		if (refTfile == "saved") { refTfile = ReferenceDB::getInstance()->getSavedTaxonomy(); }
		
		//check for necessary files
		string taxonomyStem = refTfile.substr(0, refTfile.find_last_of(".") + 1);
		string treeSumPath = m->getFullPathName(taxonomyStem + "tree.sum");
		ifstream treeSumIn(treeSumPath.c_str());
		
		//nothing can be summarized without the tree structure
		if (!treeSumIn) {
			m->mothurOut("Error: can't find " + treeSumPath + ".");
			m->mothurOutEndLine();
			exit(1);
		}
		
		readTreeStruct(treeSumIn);
		
		//label the first node "0", then assign ranks starting from it
		tree[0].rank = "0";
		assignRank(0);
	}
	catch(exception& e) {
		m->errorOut(e, "PhyloSummary", "PhyloSummary");
		exit(1);
	}
}