// feed adaptation data from a batch file containing entries (rawFile alignmentFile) void FMLLREstimator::feedAdaptationData(const char *strBatchFile, const char *strAlignmentFormat, double *dLikelihood) { BatchFile batchFile(strBatchFile,"features|alignment"); batchFile.load(); for(unsigned int i=0 ; i < batchFile.size() ; ++i) { //for(int i=0 ; i < 5 ; ++i) { // load the alignment Alignment *alignment = NULL; if (strcmp(strAlignmentFormat,"text") == 0) { AlignmentFile alignmentFile(m_phoneSet); VPhoneAlignment *vPhoneAlignment = alignmentFile.load(batchFile.getField(i,"alignment")); assert(vPhoneAlignment); alignment = AlignmentFile::toAlignment(m_phoneSet,m_hmmManager,vPhoneAlignment); AlignmentFile::destroyPhoneAlignment(vPhoneAlignment); } else { alignment = Alignment::load(batchFile.getField(i,"alignment"),NULL); assert(alignment); } // load the feature vectors FeatureFile featureFile(batchFile.getField(i,"features"),MODE_READ); featureFile.load(); Matrix<float> *mFeatures = featureFile.getFeatureVectors(); // load and apply the transform /* Transform *transform = new Transform(); transform->load("/data/daniel/tasks/wsj/experiments/may16th_2013_CMNUtterance/5/fmllr1/transforms/440m.fmllr.bin"); Matrix<float> *mFeaturesX = transform->apply(*mFeatures); mFeatures = mFeaturesX; delete transform; */ // check consistency if (mFeatures->getRows() != alignment->getFrames()) { BVC_ERROR << "inconsistent number of feature vectors / alignment file"; } // accumulate adaptation data double dLikelihoodAlignment = 0.0; feedAdaptationData(*mFeatures,alignment,&dLikelihoodAlignment); BVC_VERB << "loaded file: " << batchFile.getField(i,"alignment") << " likelihood: " << FLT(10,2) << dLikelihoodAlignment << " (" << mFeatures->getRows() << "frames)"; *dLikelihood += dLikelihoodAlignment; // clean-up delete alignment; delete mFeatures; } double dLikelihoodFrame = (*dLikelihood)/m_fOccupancyTotal; BVC_VERB << "total likelihood: " << FLT(20,6) << *dLikelihood << " (likelihood per 
frame: " << FLT(8,4) << dLikelihoodFrame << ")"; }
// feed adaptation data from a batch file containing entries (rawFile alignmentFile) void MLLRManager::feedAdaptationData(const char *strBatchFile, const char *strAlignmentFormat, double *dLikelihood, bool bVerbose) { BatchFile batchFile(strBatchFile,"features|alignment"); batchFile.load(); *dLikelihood = 0.0; for(unsigned int i=0 ; i < batchFile.size() ; ++i) { // load the alignment Alignment *alignment = NULL; // text format if (strcmp(strAlignmentFormat,"text") == 0) { AlignmentFile alignmentFile(m_phoneSet,NULL); VPhoneAlignment *vPhoneAlignment = alignmentFile.load(batchFile.getField(i,"alignment")); assert(vPhoneAlignment); alignment = AlignmentFile::toAlignment(m_phoneSet,m_hmmManager,vPhoneAlignment); AlignmentFile::destroyPhoneAlignment(vPhoneAlignment); } // binary format else { alignment = Alignment::load(batchFile.getField(i,"alignment"),NULL); assert(alignment); } // load the feature vectors FeatureFile featureFile(batchFile.getField(i,"features"),MODE_READ); featureFile.load(); unsigned int iFeatureVectors = 0; float *fFeatures = featureFile.getFeatureVectors(&iFeatureVectors); // check consistency if (iFeatureVectors != alignment->getFrames()) { BVC_ERROR << "inconsistent number of feature vectors / alignment file"; } // accumulate adaptation data double dLikelihoodAlignment = 0.0; feedAdaptationData(fFeatures,iFeatureVectors,alignment,&dLikelihoodAlignment); if (bVerbose) { printf("loaded file: %s likelihood: %12.2f\n",batchFile.getField(i,"alignment"),dLikelihoodAlignment); } *dLikelihood += dLikelihoodAlignment; // clean-up delete alignment; delete [] fFeatures; } if (bVerbose) { printf("total likelihood: %14.4f\n",*dLikelihood); } }
int main(int argc, char* argv[]) { bfs::path featureFile(argv[1]); bfs::path expressionFile(argv[2]); double estimatedReadLength = atod(argv[3]); double kmersPerRead = atod(argv[4]); uint64_t mappedKmers = atol(argv[5]); uint32_t mappedKmers = atoi(argv[6]); bfs::path outputFile(argv[7]); size_t numThreads = atoi(argv[8]); performBiasCorrection(featureFile, expressionFile, estimatedReadLength, kmersPerRead, mappedKmers, merLen, outputFile, numThreads); }
// Fill the two feature dictionaries from their word lists on disk.
// Every record is read as a "<word> <number>" pair; only the word is stored, the number is
// parsed and discarded. Reading a list stops at the first record that fails to parse.
void MainWindow::loadFeature()
{
    std::ifstream chiStream("E:\\final\\Qtfinal\\myData\\featureDic.txt");
    std::ifstream eceStream("E:\\final\\Qtfinal\\myData\\crossfeatureDic.txt");

    std::string token;
    double weight;

    // first list -> featureDicOnCHI
    for (;;) {
        if (!(chiStream >> token >> weight)) {
            break;
        }
        featureDicOnCHI.insert(token);
    }

    // second list -> featureDicOnECE
    for (;;) {
        if (!(eceStream >> token >> weight)) {
            break;
        }
        featureDicOnECE.insert(token);
    }

    chiStream.close();
    eceStream.close();
}
// accumulate statistics void DTAccumulator::accumulate() { // make sure the HMMs are already initialized assert(m_hmmManager->areInitialized()); const char *strErrorCode; double dLikelihoodTotalNum = 0.0; double dLikelihoodTotalNumAcoustic = 0.0; double dLikelihoodTotalDen = 0.0; long iFeatureVectorsTotal = 0; long iFeatureVectorsUsedTotal = 0; double dBegin = TimeUtils::getTimeMilliseconds(); // empty the accumulators m_hmmManager->resetAccumulators(); m_bMMI = false; if (strcmp(m_strObjectiveFunction,DISCRIMINATIVE_TRAINING_OBJECTIVE_FUNCTION_BMMI) == 0) { m_bMMI = true; } // create accumulators for each HMM-state and Gaussian component for(int i=0 ; i < m_hmmManager->getNumberHMMStatesPhysical() ; ++i) { for(unsigned int g=0 ; g < m_hmmManager->getHMMState(i)->getMixture().getNumberComponents() ; ++g) { unsigned int iKey = Accumulator::getPhysicalAccumulatorKey(i,g); // numerator Accumulator *accumulatorNum = new Accumulator(m_iFeatureDimensionality, m_hmmManager->getCovarianceModelling(),i,g); m_mAccumulatorNum.insert(MAccumulatorPhysical::value_type(iKey,accumulatorNum)); // denominator Accumulator *accumulatorDen = new Accumulator(m_iFeatureDimensionality, m_hmmManager->getCovarianceModelling(),i,g); m_mAccumulatorDen.insert(MAccumulatorPhysical::value_type(iKey,accumulatorDen)); } } // precompute constants used to speed-up emission probability computation m_hmmManager->precomputeConstants(); // (2) process each utterance in the MLF file int iUtterance = 0; VMLFUtterance *vMLFUtterance = m_mlfFile->getUtterances(); // at this point we might not know the total amount of audio but we do know the total number of utterances int iUtterancesTotal = (int)vMLFUtterance->size(); float fPercentageDisplayed = 0.0; for(VMLFUtterance::iterator it = vMLFUtterance->begin() ; it != vMLFUtterance->end() ; ++it, ++iUtterance) { // (2.1) load the features for the estimation ostringstream strFileFeatures; strFileFeatures << m_strFolderFeatures << PATH_SEPARATOR << 
(*it)->strFilePattern; FeatureFile featureFile(strFileFeatures.str().c_str(),MODE_READ,FORMAT_FEATURES_FILE_DEFAULT,m_iFeatureDimensionality); try { featureFile.load(); } catch (std::runtime_error &e) { std::cerr << e.what() << std::endl; BVC_WARNING << "unable to load the features file: " << strFileFeatures.str(); continue; } Matrix<float> *mFeatures = featureFile.getFeatureVectors(); // process the utterance using Forward-Backward (get the occupation counts) /*double dLikelihoodNum = -DBL_MAX; Alignment *alignmentNum = m_forwardBackwardX->processUtterance((*it)->vLexUnit,m_bMultiplePronunciations, m_vLexUnitOptional,fFeatures,fFeatures,iFeatureVectors,&dLikelihoodNum,iErrorCode); if (iErrorCode != UTTERANCE_PROCESSED_SUCCESSFULLY) { delete mFeatures; sprintf(strMessage,"unable to process utterance: \"%s\", reason: %s",strFileFeatures, m_forwardBackwardX->getErrorMessage(iErrorCode)); m_log->logInformation(strMessage); continue; }*/ // (2.2) load the hypothesis lattice ostringstream strFileAux; strFileAux << m_strFolderLattices << PATH_SEPARATOR << (*it)->strFilePattern; char strFileLattice[1024+1]; FileUtils::replaceExtension(strFileLattice,strFileAux.str().c_str(),"bin"); cout << "lattice: " << strFileLattice << endl; HypothesisLattice *lattice = new HypothesisLattice(m_phoneSet,m_lexiconManager); try { lattice->load(strFileLattice); } catch (std::runtime_error &e) { std::cerr << e.what() << std::endl; BVC_WARNING << "unable to load the lattice: " << strFileLattice; delete mFeatures; continue; } //lattice->printProperties(); // check lattice properties if ((lattice->isProperty(LATTICE_PROPERTY_AM_PROB) == false) || (lattice->isProperty(LATTICE_PROPERTY_LM_PROB) == false) || (lattice->isProperty(LATTICE_PROPERTY_INSERTION_PENALTY) == false) || (lattice->isProperty(LATTICE_PROPERTY_HMMS) == false) || (lattice->isProperty(LATTICE_PROPERTY_PHONE_ALIGN) == false)) { BVC_ERROR << "wrong lattice properties"; } //LatticeDepth *depth = lattice->computeDepth(); 
//printf("depth: %12.4f\n",depth->fDepth); //lattice->store("./lattice.txt",FILE_FORMAT_TEXT); // mark best path m_lexiconManager->removeNonStandardLexUnits((*it)->vLexUnit); LatticeWER *latticeWER = lattice->computeWER((*it)->vLexUnit,NULL,NULL,true,0); if ((!latticeWER) || (latticeWER->iErrors != 0)) { BVC_WARNING << "lattice WER is not zero, lattice does not contain the hand-made transcription: " << strFileLattice; if (latticeWER) { delete latticeWER; } delete lattice; delete mFeatures; continue; } //HypothesisLattice::print(latticeWER); delete latticeWER; // get the best-path from the lattice (transcription) and compute numerator stats from it //BestPath *bestPath = lattice->getBestPath(); //bestPath->print(); //m_lexiconManager->print((*it)->vLexUnit); VLPhoneAlignment *vLPhoneAlignment = lattice->getBestPathAlignment(); assert(vLPhoneAlignment); // compute numerator statistics double dLikelihoodNum = -DBL_MAX; Alignment *alignmentNum = m_forwardBackward->processPhoneAlignment(*mFeatures,vLPhoneAlignment,dLikelihoodNum,&strErrorCode); if (strcmp(strErrorCode,FB_RETURN_CODE_SUCCESS) != 0) { BVC_WARNING << "unable to compute numerator occupation statistics: " << strErrorCode << ", " << strFileLattice; delete vLPhoneAlignment; delete lattice; delete mFeatures; continue; } // get denominator statistics from the lattice double dLikelihoodDen = -DBL_MAX; MOccupation *mOccupationDen = m_forwardBackward->processLattice(lattice,*mFeatures,m_fScaleAM,m_fScaleLM, dLikelihoodDen,m_bMMI,m_fBoostingFactor,&strErrorCode); if (strcmp(strErrorCode,FB_RETURN_CODE_SUCCESS) != 0) { BVC_WARNING << "unable to compute lattice occupation statistics (denominator): " << strErrorCode << ", " << strFileLattice; delete alignmentNum; delete vLPhoneAlignment; delete lattice; delete mFeatures; continue; } // perform statistics cancellation between numerator and denominator if (m_bCanceledStatistics) { statisticsCancellation(alignmentNum,mOccupationDen); } Alignment *alignmentDen = 
ForwardBackward::getAlignment(mOccupationDen,mFeatures->getRows()); // accumulate statistics for both numerator and denominator accumulate(alignmentNum,*mFeatures,true); accumulate(alignmentDen,*mFeatures,false); // get the best path with updated am-scores, lm-prob and insertion penalties BestPath *bestPath = lattice->getBestPath(); assert(bestPath); LBestPathElement *lBestPathElement = bestPath->getBestPathElements(); double dLMIP = 0.0; for(LBestPathElement::iterator it = lBestPathElement->begin() ; it != lBestPathElement->end() ; ++it) { dLMIP += m_fScaleLM*(*it)->fScoreLanguageModel+m_fScaleAM*(*it)->fInsertionPenalty; } delete bestPath; dLikelihoodTotalNumAcoustic += dLikelihoodNum; // apply acoustic scaling and add the lm and insertion penalty scores dLikelihoodNum *= m_fScaleAM; dLikelihoodNum += dLMIP; dLikelihoodTotalNum += dLikelihoodNum; dLikelihoodTotalDen += dLikelihoodDen; //printf("%12.4f %12.4f\n",dLikelihoodNum,dLikelihoodDen); iFeatureVectorsTotal += mFeatures->getRows(); // clean-up delete vLPhoneAlignment; delete lattice; delete alignmentNum; delete alignmentDen; delete mOccupationDen; delete mFeatures; // update the progress bar if necessary float fPercentage = (((float)iUtterance)*100)/((float)iUtterancesTotal); if (fPercentage >= fPercentageDisplayed + 10.0) { fPercentageDisplayed += 10.0; printf("*"); fflush(stdout); } } // update the progress bar if necessary while (fPercentageDisplayed < 100.0) { printf("*"); fPercentageDisplayed += 10.0; } iFeatureVectorsUsedTotal = iFeatureVectorsTotal; // get the iteration end time double dEnd = TimeUtils::getTimeMilliseconds(); double dMillisecondsInterval = dEnd - dBegin; // compute the Real Time Factor of the reestimation process double dRTF = (dMillisecondsInterval/10.0)/((double)iFeatureVectorsTotal); int iGaussians = m_hmmManager->getNumberGaussianComponents(); // compute audio available for training int iHours,iMinutes,iSeconds; 
TimeUtils::convertHundredths((double)iFeatureVectorsTotal,iHours,iMinutes,iSeconds); // compute audio used int iHoursUsed,iMinutesUsed,iSecondsUsed; TimeUtils::convertHundredths((double)iFeatureVectorsUsedTotal,iHoursUsed,iMinutesUsed,iSecondsUsed); // show the accumulation information printf(" likelihood= (%.4f) %.4f %.4f %.4f [%8d Gauss][RTF=%.4f][%d:%02d'%02d''][%d:%02d'%02d'']\n", dLikelihoodTotalNumAcoustic,dLikelihoodTotalNum,dLikelihoodTotalDen,dLikelihoodTotalNum-dLikelihoodTotalDen, iGaussians,dRTF,iHours,iMinutes,iSeconds,iHoursUsed,iMinutesUsed,iSecondsUsed); // dump the accumulators Accumulator::storeAccumulators(m_strFileAccumulatorsNum,m_iFeatureDimensionality,m_iCovarianceModeling, m_iHMMStates,m_iGaussianComponents,m_mAccumulatorNum); Accumulator::storeAccumulators(m_strFileAccumulatorsDen,m_iFeatureDimensionality,m_iCovarianceModeling, m_iHMMStates,m_iGaussianComponents,m_mAccumulatorDen); // destroy the accumulators Accumulator::destroy(m_mAccumulatorNum); Accumulator::destroy(m_mAccumulatorDen); }
// Build the per-word class likelihood table from the training dictionary, write it to
// "mydata\\bayesOnECE.txt", then classify every sample of the test VSM file and update the
// artNum / P counters.
//
// naiveBayesTestVSM: path of the test file. Each sample is read as
//                    "<id> <count>" followed by <count> pairs "<row> <value>".
//
// NOTE(review): neither the number of dictionary words nor the <row> values read from the test
// file are checked against the dimensions of the global table `bayes`, which are not visible
// from here - confirm they are large enough for the data.
void naiveBayes(string naiveBayesTestVSM)
{
    ifstream iFile(trainBayes.c_str());
    // TF dictionary, e.g. "E:\\final\\final\\myData\\myTFDic.txt"
    ifstream featureFile(featureDicPath.c_str());
    cout<<featureDicPath<<endl;

    double featureVal;
    string featureStr;
    string words;
    int TF;
    int cnt;
    int flag;
    int i,j;

    // register every selected feature word (the numeric value is parsed and ignored)
    while(featureFile>>featureStr>>featureVal)
        featureDic[featureStr]++;

    // training records: "<word> <flag> <cnt>" followed by <cnt> pairs "<class> <TF>";
    // only words present in the feature dictionary are kept
    while(iFile>>words>>flag>>cnt)
    {
        for(i = 0; i < cnt; i++)
        {
            // stop on a truncated record instead of storing stale values
            if(!(iFile>>flag>>TF))
                break;
            if(featureDic.count(words))
                bayesDic[words][flag] = TF;
        }
    }

    j = 0;
    cout<<"******"<<bayesDic.size()<<endl;
    for(map<string,map<int,int> >::iterator itor = bayesDic.begin(); itor != bayesDic.end(); itor++)
    {
        map<int,int> &classCounts = itor->second;
        // the entry stored under key 9 is the denominator term for this word (operator[]
        // creates it with value 0 when the word has none)
        const int total = classCounts[9];

        // default value for classes without an explicit count
        for(int k = 0; k < 8; k++)
            bayes[j][k] = 1.0 / (1000 + total);

        for(map<int,int>::iterator it = classCounts.begin(); it != classCounts.end(); it++)
        {
            i = it->first - 1;
            // class ids below 1 have no column in the table
            if(i < 0)
                continue;
            // keys are visited in ascending order: key 9 (the total) and anything above it
            // do not map to one of the 8 class columns
            if(i >= 8)
                break;
            bayes[j][i] = 1.0 * (1 + it->second) / (1000.0 + total);
        }
        j++;
    }

    // dump the table, one line per class
    ofstream out("mydata\\bayesOnECE.txt");
    for(i = 0; i < 8; i++)
    {
        for(j = 0; j < 1000; j++)
            out<<bayes[j][i]<<" ";
        out<<endl;
    }

    // predicted class -> counter slot, and the range of sample indices [begin,end) that belongs
    // to each slot
    static const int classToSlot[8] = {2, 7, 0, 1, 3, 4, 5, 6};
    static const int slotBegin[9] = {0, 362, 753, 1166, 1568, 1967, 2367, 2768, 3117};

    double p[8];
    int c,pos,v;
    string kk;
    //ifstream in("testBayesOnCross.txt");
    ifstream in(naiveBayesTestVSM.c_str());
    int sampleIndex = 0;
    while(in>>kk>>c)
    {
        for(i = 0; i < 8; i++)
            p[i] = 0;

        // accumulate the log-likelihood of every class over the terms of the sample
        for(i = 0; i < c; i++)
        {
            if(!(in>>pos>>v))
                break;
            for(j = 0; j < 8; j++)
                p[j] += v * log(bayes[pos][j]);
        }

        // pick the first class with the highest score; starting from class 0 guarantees a
        // defined answer even when every score is extremely low
        int ans = 0;
        for(i = 1; i < 8; i++)
        {
            if(p[ans] < p[i])
                ans = i;
        }

        // tally immediately (no intermediate result buffer, so any number of samples is fine):
        // artNum counts predictions whose sample index lies inside the slot's range, P the rest
        const int slot = classToSlot[ans];
        if(sampleIndex >= slotBegin[slot] && sampleIndex < slotBegin[slot + 1])
            artNum[slot]++;
        else
            P[slot]++;
        sampleIndex++;
    }
}