void VJCascadeClassifier::savePosteriors(const string& dataFileName, const string& shypFileName, const string& outFileName, int numIterations) { // loading data InputData* pData = loadInputData(dataFileName, shypFileName); const int numOfExamples = pData->getNumExamples(); //get the index of positive label const NameMap& namemap = pData->getClassMap(); _positiveLabelIndex = namemap.getIdxFromName( _positiveLabelName ); if (_verbose > 0) cout << "Loading strong hypothesis..." << flush; // open outfile ofstream outRes(outFileName.c_str()); if (!outRes.is_open()) { cout << "Cannot open outfile!!! " << outFileName << endl; } // The class that loads the weak hypotheses UnSerialization us; // Where to put the weak hypotheses vector<vector<BaseLearner*> > weakHypotheses; // For stagewise thresholds vector<AlphaReal> thresholds(0); // loads them //us.loadHypotheses(shypFileName, weakHypotheses, pData); us.loadCascadeHypotheses(shypFileName, weakHypotheses, thresholds, pData); // output the number of stages outRes << "StageNum " << weakHypotheses.size() << endl; // output original labels outRes << "Labels"; for(int i=0; i<numOfExamples; ++i ) { vector<Label>& labels = pData->getLabels(i); if (labels[_positiveLabelIndex].y>0) // pos label outRes << " 1"; else outRes << " 0"; } outRes << endl; // store result vector<CascadeOutputInformation> cascadeData(0); vector<CascadeOutputInformation>::iterator it; cascadeData.resize(numOfExamples); for( it=cascadeData.begin(); it != cascadeData.end(); ++it ) { it->active=true; } for(int stagei=0; stagei < weakHypotheses.size(); ++stagei ) { // for posteriors vector<AlphaReal> posteriors(0); // calculate the posteriors after stage VJCascadeLearner::calculatePosteriors( pData, weakHypotheses[stagei], posteriors, _positiveLabelIndex ); // update the data (posteriors, active element index etc.) 
//VJCascadeLearner::forecastOverAllCascade( pData, posteriors, activeInstances, thresholds[stagei] ); updateCascadeData(pData, weakHypotheses, stagei, posteriors, thresholds, _positiveLabelIndex, cascadeData); int numberOfActiveInstance = 0; for( int i = 0; i < numOfExamples; ++i ) if (cascadeData[i].active) numberOfActiveInstance++; if (_verbose > 0 ) cout << "Number of active instances: " << numberOfActiveInstance << "(" << numOfExamples << ")" << endl; // output stats outRes << "Stage " << stagei << " " << weakHypotheses[stagei].size() << endl; outRes << "Forecast"; for(int i=0; i<numOfExamples; ++i ) { outRes << " " << cascadeData[i].forecast; } outRes << endl; outRes << "Active"; for(int i=0; i<numOfExamples; ++i ) { if( cascadeData[i].active) outRes << " 1"; else outRes << " 0"; } outRes << endl; outRes << "Posteriors"; for(int i=0; i<numOfExamples; ++i ) { outRes << " " << cascadeData[i].score; } outRes << endl; } outRes.close(); // free memory allocation vector<vector<BaseLearner*> >::iterator bvIt; for( bvIt = weakHypotheses.begin(); bvIt != weakHypotheses.end(); ++bvIt ) { vector<BaseLearner* >::iterator bIt; for( bIt = (*bvIt).begin(); bIt != (*bvIt).end(); ++bIt ) delete *bIt; } }
void VJCascadeClassifier::run(const string& dataFileName, const string& shypFileName, int numIterations, const string& outResFileName ) { // loading data InputData* pData = loadInputData(dataFileName, shypFileName); const int numOfExamples = pData->getNumExamples(); //get the index of positive label const NameMap& namemap = pData->getClassMap(); _positiveLabelIndex = namemap.getIdxFromName( _positiveLabelName ); if (_verbose > 0) cout << "Loading strong hypothesis..." << flush; // The class that loads the weak hypotheses UnSerialization us; // Where to put the weak hypotheses vector<vector<BaseLearner*> > weakHypotheses; // For stagewise thresholds vector<AlphaReal> thresholds(0); // loads them //us.loadHypotheses(shypFileName, weakHypotheses, pData); us.loadCascadeHypotheses(shypFileName, weakHypotheses, thresholds, pData); // store result vector<CascadeOutputInformation> cascadeData(0); vector<CascadeOutputInformation>::iterator it; cascadeData.resize(numOfExamples); for( it=cascadeData.begin(); it != cascadeData.end(); ++it ) { it->active=true; } if (!_outputInfoFile.empty()) { outputHeader(); } for(int stagei=0; stagei < weakHypotheses.size(); ++stagei ) { // for posteriors vector<AlphaReal> posteriors(0); // calculate the posteriors after stage VJCascadeLearner::calculatePosteriors( pData, weakHypotheses[stagei], posteriors, _positiveLabelIndex ); // update the data (posteriors, active element index etc.) 
updateCascadeData(pData, weakHypotheses, stagei, posteriors, thresholds, _positiveLabelIndex, cascadeData); if (!_outputInfoFile.empty()) { _output << stagei + 1 << "\t"; _output << weakHypotheses[stagei].size() << "\t"; outputCascadeResult( pData, cascadeData ); } int numberOfActiveInstance = 0; for( int i = 0; i < numOfExamples; ++i ) if (cascadeData[i].active) numberOfActiveInstance++; if (_verbose > 0 ) cout << "Number of active instances: " << numberOfActiveInstance << "(" << numOfExamples << ")" << endl; } vector<vector<int> > confMatrix(2); confMatrix[0].resize(2); fill( confMatrix[0].begin(), confMatrix[0].end(), 0 ); confMatrix[1].resize(2); fill( confMatrix[1].begin(), confMatrix[1].end(), 0 ); // print accuracy for(int i=0; i<numOfExamples; ++i ) { vector<Label>& labels = pData->getLabels(i); if (labels[_positiveLabelIndex].y>0) // pos label if (cascadeData[i].forecast==1) confMatrix[1][1]++; else confMatrix[1][0]++; else // negative label if (cascadeData[i].forecast==0) confMatrix[0][0]++; else confMatrix[0][1]++; } double acc = 100.0 * (confMatrix[0][0] + confMatrix[1][1]) / ((double) numOfExamples); // output it cout << endl; cout << "Error Summary" << endl; cout << "=============" << endl; cout << "Accuracy: " << setprecision(4) << acc << endl; cout << setw(10) << "\t" << setw(10) << namemap.getNameFromIdx(1-_positiveLabelIndex) << setw(10) << namemap.getNameFromIdx(_positiveLabelIndex) << endl; cout << setw(10) << namemap.getNameFromIdx(1-_positiveLabelIndex) << setw(10) << confMatrix[0][0] << setw(10) << confMatrix[0][1] << endl; cout << setw(10) << namemap.getNameFromIdx(_positiveLabelIndex) << setw(10) << confMatrix[1][0] << setw(10) << confMatrix[1][1] << endl; // output forecast if (!outResFileName.empty() ) outputForecast(pData, outResFileName, cascadeData ); // free memory allocation vector<vector<BaseLearner*> >::iterator bvIt; for( bvIt = weakHypotheses.begin(); bvIt != weakHypotheses.end(); ++bvIt ) { vector<BaseLearner* >::iterator bIt; 
for( bIt = (*bvIt).begin(); bIt != (*bvIt).end(); ++bIt ) delete *bIt; } }
/**
 * Assigns a taxonomy to one sequence using the k-mer tree.
 *
 * For every tree level the log-likelihood of the query's k-mer profile is
 * collected for each taxon on that level plus one "outlier" entry (index -1).
 * These are normalised into posteriors and the taxon with the largest
 * posterior is picked per level. Levels 1..saneDepth (as returned by
 * sanityCheck()) are reported while their confidence, the posterior times 100
 * truncated to int, stays at or above confidenceThreshold. Every remaining
 * level up to numLevels-1 is reported as unclassified. Level 0 is never
 * reported.
 *
 * @param thisSeq    sequence to classify; its aligned and unaligned strings are read
 * @param simpleTax  out: the chosen names, each followed by ';'
 * @param flipped    neither read nor written by this function
 * @return the chosen names with confidence, formatted "name(score);" per level.
 *         If the control flag is pressed the string built so far is returned.
 *
 * Any std::exception is reported through m->errorOut() and the process exits
 * with status 1.
 */
string KmerTree::getTaxonomy(Sequence* thisSeq, string& simpleTax, bool& flipped){
    try {
        simpleTax = "";
        string taxonProbabilityString = "";

        string querySequence = thisSeq->getAligned();
        string unalignedSeq = thisSeq->getUnaligned();

        // NOTE(review): the length term is unsigned arithmetic on the ALIGNED
        // sequence while the profile below comes from the unaligned one; it also
        // wraps around if the sequence is shorter than kmerSize-1 - confirm intent.
        double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers());

        vector<int> queryProfile = ripKmerProfile(unalignedSeq); //convert to kmer vector

        // Per level: candidate log-likelihoods and the tree index each belongs to.
        vector<vector<double> > pXgivenKj_D_j(numLevels);
        vector<vector<int> > indices(numLevels);

        // Slot 0 of every level is the outlier candidate, marked with index -1.
        for(int i=0;i<numLevels;i++){
            if (m->getControl_pressed()) { return taxonProbabilityString; }
            pXgivenKj_D_j[i].push_back(logPOutlier);
            indices[i].push_back(-1);
        }

        for(int i=0;i<numTaxa;i++){
            if (m->getControl_pressed()) { return taxonProbabilityString; }
            pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile));
            indices[tree[i]->getLevel()].push_back(i);
        }

        vector<double> sumLikelihood(numLevels, 0);
        vector<double> bestPosterior(numLevels, 0);
        vector<int> maxIndex(numLevels, 0);

        // Initialised so that getLogExpSum never receives an indeterminate value.
        int maxPosteriorIndex = 0;

        //let's find the best level and taxa within that level
        for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
            if (m->getControl_pressed()) { return taxonProbabilityString; }

            int numTaxaInLevel = (int)indices[i].size();
            vector<double> posteriors(numTaxaInLevel, 0);

            sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);

            // Whatever getLogExpSum left in maxPosteriorIndex is discarded; the
            // arg-max is recomputed from the normalised posteriors below.
            maxPosteriorIndex = 0;
            for(int j=0;j<numTaxaInLevel;j++){
                posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
                if(posteriors[j] > posteriors[maxPosteriorIndex]){
                    maxPosteriorIndex = j;
                }
            }

            // NOTE(review): the min-risk result is overwritten on the very next
            // line, so only the max-posterior choice is ever used. The call is
            // kept because its side effects are not visible from here - confirm
            // whether it can be dropped.
            maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors);
            maxIndex[i] = maxPosteriorIndex;

            bestPosterior[i] = posteriors[maxIndex[i]];
        }

        int saneDepth = sanityCheck(indices, maxIndex);

        // Deepest level that has been written to the output so far. It starts at
        // 0 (nothing written) so that the padding loop below also covers level 1
        // when level 1 itself fails the threshold; starting at 1 dropped that
        // level from both output strings.
        int lastReportedLevel = 0;

        for(int i=1;i<=saneDepth;i++){
            if (m->getControl_pressed()) { return taxonProbabilityString; }

            int confidenceScore = (int) (bestPosterior[i] * 100);
            if (confidenceScore < confidenceThreshold) { break; }

            int taxonIndex = indices[i][maxIndex[i]];
            if(taxonIndex != -1){
                taxonProbabilityString += tree[taxonIndex]->getName() + "(" + toString(confidenceScore) + ");";
                simpleTax += tree[taxonIndex]->getName() + ";";
            }
            else{
                // the outlier candidate won this level
                taxonProbabilityString += "unclassified(" + toString(confidenceScore) + ");";
                simpleTax += "unclassified;";
            }

            lastReportedLevel = i;
        }

        // Pad the levels that were not reported above.
        for(int i=lastReportedLevel+1;i<numLevels;i++){
            if (m->getControl_pressed()) { return taxonProbabilityString; }
            taxonProbabilityString += "unclassified(0);";
            simpleTax += "unclassified;";
        }

        return taxonProbabilityString;
    }
    catch(exception& e) {
        m->errorOut(e, "KmerTree", "getTaxonomy");
        exit(1);
    }
}