Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f, bool sh) : Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { ReferenceDB* rdb = ReferenceDB::getInstance(); threadID = tid; flip = f; shortcuts = sh; string baseName = tempFile; if (baseName == "saved") { baseName = rdb->getSavedReference(); } string baseTName = tfile; if (baseTName == "saved") { baseTName = rdb->getSavedTaxonomy(); } /************calculate the probablity that each word will be in a specific taxonomy*************/ string tfileroot = m->getFullPathName(baseTName.substr(0,baseTName.find_last_of(".")+1)); string tempfileroot = m->getRootName(m->getSimpleName(baseName)); string phyloTreeName = tfileroot + "tree.train"; string phyloTreeSumName = tfileroot + "tree.sum"; string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob"; string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero"; ofstream out; ofstream out2; ifstream phyloTreeTest(phyloTreeName.c_str()); ifstream probFileTest2(probFileName2.c_str()); ifstream probFileTest(probFileName.c_str()); ifstream probFileTest3(phyloTreeSumName.c_str()); int start = time(NULL); //if they are there make sure they were created after this release date bool FilesGood = false; if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3){ FilesGood = checkReleaseDate(probFileTest, probFileTest2, phyloTreeTest, probFileTest3); } //if you want to save, but you dont need to calculate then just read if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood && (tempFile != "saved")) { ifstream saveIn; m->openInputFile(tempFile, saveIn); while (!saveIn.eof()) { Sequence temp(saveIn); m->gobble(saveIn); rdb->referenceSeqs.push_back(temp); } saveIn.close(); } if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){ if (tempFile == "saved") { m->mothurOutEndLine(); 
m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); } m->mothurOut("Reading template taxonomy... "); cout.flush(); phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); m->mothurOut("DONE."); m->mothurOutEndLine(); genusNodes = phyloTree->getGenusNodes(); genusTotals = phyloTree->getGenusTotals(); if (tfile == "saved") { m->mothurOutEndLine(); m->mothurOut("Using probabilties from " + rdb->getSavedTaxonomy() + " that are saved in memory... "); cout.flush();; wordGenusProb = rdb->wordGenusProb; WordPairDiffArr = rdb->WordPairDiffArr; }else { m->mothurOut("Reading template probabilities... "); cout.flush(); readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); } //save probabilities if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } }else{ //create search database and names vector generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0); //prevents errors caused by creating shortcut files if you had an error in the sanity check. if (m->control_pressed) { m->mothurRemove(phyloTreeName); m->mothurRemove(probFileName); m->mothurRemove(probFileName2); } else{ genusNodes = phyloTree->getGenusNodes(); genusTotals = phyloTree->getGenusTotals(); m->mothurOut("Calculating template taxonomy tree... "); cout.flush(); phyloTree->printTreeNodes(phyloTreeName); m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("Calculating template probabilities... 
"); cout.flush(); numKmers = database->getMaxKmer() + 1; //initialze probabilities wordGenusProb.resize(numKmers); WordPairDiffArr.resize(numKmers); for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } ofstream out; ofstream out2; #ifdef USE_MPI int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are if (pid == 0) { #endif if (shortcuts) { m->openOutputFile(probFileName, out); //output mothur version out << "#" << m->getVersion() << endl; out << numKmers << endl; m->openOutputFile(probFileName2, out2); //output mothur version out2 << "#" << m->getVersion() << endl; } #ifdef USE_MPI } #endif //for each word for (int i = 0; i < numKmers; i++) { //m->mothurOut("[DEBUG]: kmer = " + toString(i) + "\n"); if (m->control_pressed) { break; } #ifdef USE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are if (pid == 0) { #endif if (shortcuts) { out << i << '\t'; } #ifdef USE_MPI } #endif vector<int> seqsWithWordi = database->getSequencesWithKmer(i); //for each sequence with that word vector<int> count; count.resize(genusNodes.size(), 0); for (int j = 0; j < seqsWithWordi.size(); j++) { int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]); count[temp]++; //increment count of seq in this genus who have this word } //probabilityInTemplate = (# of seqs with that word in template + 0.50) / (total number of seqs in template + 1); float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1); diffPair tempProb(log(probabilityInTemplate), 0.0); WordPairDiffArr[i] = tempProb; int numNotZero = 0; for (int k = 0; k < genusNodes.size(); k++) { //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); if (count[k] != 0) { #ifdef USE_MPI int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are if (pid 
== 0) { #endif if (shortcuts) { out << k << '\t' << wordGenusProb[i][k] << '\t' ; } #ifdef USE_MPI } #endif numNotZero++; } } #ifdef USE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are if (pid == 0) { #endif if (shortcuts) { out << endl; out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl; } #ifdef USE_MPI } #endif } #ifdef USE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are if (pid == 0) { #endif if (shortcuts) { out.close(); out2.close(); } #ifdef USE_MPI } #endif //read in new phylotree with less info. - its faster ifstream phyloTreeTest(phyloTreeName.c_str()); delete phyloTree; phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); //save probabilities if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } } } if (m->debug) { m->mothurOut("[DEBUG]: about to generateWordPairDiffArr\n"); } generateWordPairDiffArr(); if (m->debug) { m->mothurOut("[DEBUG]: done generateWordPairDiffArr\n"); } //save probabilities if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine(); } catch(exception& e) { m->errorOut(e, "Bayesian", "Bayesian"); exit(1); } }
/**************************************************************************************************/
// Builds a PhyloSummary from the "tree.sum" file that sits next to the reference taxonomy.
//
//   refTfile - reference taxonomy file name, or "saved" to use the name recorded in ReferenceDB.
//   g        - group map pointer, stored as given in groupmap.
//
// If the tree.sum file cannot be opened an error is printed and the process exits with status 1.
// Any std::exception is reported through m->errorOut() and also ends the process.
PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){
	try {
		m = MothurOut::getInstance();

		// start from an empty summary
		groupmap = g;
		ct = NULL;
		numSeqs = 0;
		maxLevel = 0;
		ignore = false;

		//check for necessary files
		if (refTfile == "saved") {
			refTfile = ReferenceDB::getInstance()->getSavedTaxonomy();
		}

		// keep everything up to and including the last '.', then append "tree.sum"
		string taxonomyRoot = refTfile.substr(0, refTfile.find_last_of(".") + 1);
		string summaryFileName = m->getFullPathName(taxonomyRoot + "tree.sum");

		ifstream summaryIn(summaryFileName.c_str());
		if (!summaryIn) {
			m->mothurOut("Error: can't find " + summaryFileName + ".");
			m->mothurOutEndLine();
			exit(1);
		}

		readTreeStruct(summaryIn);

		// node 0 gets rank "0"; assignRank(0) is then started from that node
		tree[0].rank = "0";
		assignRank(0);
	}
	catch(exception& e) {
		m->errorOut(e, "PhyloSummary", "PhyloSummary");
		exit(1);
	}
}