Exemple #1
0
// Feed adaptation data from a batch file whose entries are (features alignment) pairs.
//
// strBatchFile:       path of the batch file, loaded with the "features|alignment" layout
// strAlignmentFormat: "text" selects the phone-alignment text loader (AlignmentFile);
//                     any other value selects Alignment::load
// dLikelihood:        in/out accumulator: the likelihood of every processed entry is ADDED
//                     to the value it already holds, so the caller has to initialize it
//
// After all entries are processed the accumulated likelihood and the likelihood per frame
// (accumulated likelihood divided by m_fOccupancyTotal) are written to the verbose log.
void FMLLREstimator::feedAdaptationData(const char *strBatchFile, const char *strAlignmentFormat, 
	double *dLikelihood) {

	BatchFile batchFile(strBatchFile,"features|alignment");
	batchFile.load();
	
	for(unsigned int i=0 ; i < batchFile.size() ; ++i) {
		
		// load the alignment
		Alignment *alignment = NULL;
		if (strcmp(strAlignmentFormat,"text") == 0) {
			// text format: load the phone alignment and convert it
			AlignmentFile alignmentFile(m_phoneSet);	
			VPhoneAlignment *vPhoneAlignment = alignmentFile.load(batchFile.getField(i,"alignment"));
			assert(vPhoneAlignment);
			alignment = AlignmentFile::toAlignment(m_phoneSet,m_hmmManager,vPhoneAlignment);
			AlignmentFile::destroyPhoneAlignment(vPhoneAlignment);
		} else {
			// any other format string: load the alignment directly
			alignment = Alignment::load(batchFile.getField(i,"alignment"),NULL);
		}
		// the alignment is dereferenced below, so check it whichever loader produced it
		// (originally only the non-text branch was checked)
		assert(alignment);
		
		// load the feature vectors
		FeatureFile featureFile(batchFile.getField(i,"features"),MODE_READ);
		featureFile.load();
		Matrix<float> *mFeatures = featureFile.getFeatureVectors();
		
		// check consistency: one feature vector per aligned frame
		// NOTE(review): whether BVC_ERROR stops execution is not visible from here
		if (mFeatures->getRows() != alignment->getFrames()) {
			BVC_ERROR << "inconsistent number of feature vectors / alignment file";
		}
		
		// accumulate adaptation data
		double dLikelihoodAlignment = 0.0;
		feedAdaptationData(*mFeatures,alignment,&dLikelihoodAlignment);
		BVC_VERB << "loaded file: " << batchFile.getField(i,"alignment") << " likelihood: " << FLT(10,2) 
			<< dLikelihoodAlignment << " (" << mFeatures->getRows() << "frames)";	
		*dLikelihood += dLikelihoodAlignment;
		
		// clean-up: both objects are released here once the entry has been accumulated
		delete alignment;
		delete mFeatures;
	}
	double dLikelihoodFrame = (*dLikelihood)/m_fOccupancyTotal;
	BVC_VERB << "total likelihood: " << FLT(20,6) << *dLikelihood << " (likelihood per frame: " 
		<< FLT(8,4) << dLikelihoodFrame << ")";
}
Exemple #2
0
// Feed adaptation data from a batch file whose entries are (features alignment) pairs.
//
// strBatchFile:       path of the batch file, loaded with the "features|alignment" layout
// strAlignmentFormat: "text" selects the phone-alignment text loader (AlignmentFile);
//                     any other value selects Alignment::load
// dLikelihood:        output; reset to 0.0 and then set to the sum of the likelihoods of
//                     all processed entries
// bVerbose:           when true, prints one line per entry and a final total to stdout
void MLLRManager::feedAdaptationData(const char *strBatchFile, const char *strAlignmentFormat, 
	double *dLikelihood, bool bVerbose) {

	BatchFile batchFile(strBatchFile,"features|alignment");
	batchFile.load();
	
	*dLikelihood = 0.0;
	
	for(unsigned int i=0 ; i < batchFile.size() ; ++i) {	
	
		// load the alignment
		Alignment *alignment = NULL;
		// text format
		if (strcmp(strAlignmentFormat,"text") == 0) {	
			AlignmentFile alignmentFile(m_phoneSet,NULL);
			VPhoneAlignment *vPhoneAlignment = alignmentFile.load(batchFile.getField(i,"alignment"));
			assert(vPhoneAlignment);
			alignment = AlignmentFile::toAlignment(m_phoneSet,m_hmmManager,vPhoneAlignment);
			AlignmentFile::destroyPhoneAlignment(vPhoneAlignment);
		} 
		// binary format
		else {
			alignment = Alignment::load(batchFile.getField(i,"alignment"),NULL);
		}
		// the alignment is dereferenced below, so check it whichever loader produced it
		// (originally only the binary branch was checked)
		assert(alignment);
		
		// load the feature vectors; the number of vectors comes back through iFeatureVectors
		FeatureFile featureFile(batchFile.getField(i,"features"),MODE_READ);
		featureFile.load();
		unsigned int iFeatureVectors = 0;
		float *fFeatures = featureFile.getFeatureVectors(&iFeatureVectors);
		
		// check consistency: one feature vector per aligned frame
		// NOTE(review): whether BVC_ERROR stops execution is not visible from here
		if (iFeatureVectors != alignment->getFrames()) {
			BVC_ERROR << "inconsistent number of feature vectors / alignment file";
		}
		
		// accumulate adaptation data
		double dLikelihoodAlignment = 0.0;
		feedAdaptationData(fFeatures,iFeatureVectors,alignment,&dLikelihoodAlignment);
		if (bVerbose) {
			printf("loaded file: %s likelihood: %12.2f\n",batchFile.getField(i,"alignment"),dLikelihoodAlignment);
		}
		*dLikelihood += dLikelihoodAlignment;
		
		// clean-up: the feature buffer is released as an array
		delete alignment;
		delete [] fFeatures;
	}
	
	if (bVerbose) {
		printf("total likelihood: %14.4f\n",*dLikelihood);
	}
}
// Command-line entry point: parses eight positional arguments and forwards them to
// performBiasCorrection.
//
//   argv[1] feature file path          argv[5] number of mapped k-mers
//   argv[2] expression file path       argv[6] value passed on as merLen
//   argv[3] estimated read length      argv[7] output file path
//   argv[4] k-mers per read            argv[8] number of threads
//
// Returns 0 once performBiasCorrection has returned, 1 when fewer than eight arguments
// were supplied.
int main(int argc, char* argv[]) {
        // every argv[1..8] is read below; reading past argc is undefined behaviour
        if (argc < 9) {
                return 1;
        }

        bfs::path featureFile(argv[1]);
        bfs::path expressionFile(argv[2]);
        // atof is the standard string-to-double conversion (there is no atod)
        double estimatedReadLength = atof(argv[3]);
        double kmersPerRead = atof(argv[4]);
        // strtoull keeps the full 64-bit range even where long is only 32 bits wide
        uint64_t mappedKmers = strtoull(argv[5], nullptr, 10);
        // this was a second declaration of mappedKmers; the call below expects merLen
        uint32_t merLen = static_cast<uint32_t>(atoi(argv[6]));
        bfs::path outputFile(argv[7]);
        size_t numThreads = atoi(argv[8]);

        performBiasCorrection(featureFile, expressionFile, estimatedReadLength, kmersPerRead,
                              mappedKmers, merLen, outputFile, numThreads);
        return 0;
}
Exemple #4
0
// Fill featureDicOnCHI and featureDicOnECE from their word-list files.
// Each record is a word followed by a number; the number is parsed only to advance
// the stream and is otherwise ignored. Reading a file ends at the first record that
// fails to parse (which includes end of file and a file that could not be opened).
void MainWindow::loadFeature()
{
    std::ifstream chiSource("E:\\final\\Qtfinal\\myData\\featureDic.txt");
    std::ifstream eceSource("E:\\final\\Qtfinal\\myData\\crossfeatureDic.txt");

    std::string token;
    double ignoredValue;

    // featureDic.txt -> featureDicOnCHI
    for (;;)
    {
        chiSource >> token >> ignoredValue;
        if (!chiSource)
        {
            break;
        }
        featureDicOnCHI.insert(token);
    }

    // crossfeatureDic.txt -> featureDicOnECE
    for (;;)
    {
        eceSource >> token >> ignoredValue;
        if (!eceSource)
        {
            break;
        }
        featureDicOnECE.insert(token);
    }

    chiSource.close();
    eceSource.close();
}
Exemple #5
0
// Accumulate numerator and denominator statistics over every utterance of the MLF file.
//
// For each utterance the features and the hypothesis lattice are loaded. The utterance is
// skipped with a warning when either cannot be loaded, when the lattice WER against the
// transcription is not zero, or when forward-backward reports an error. Otherwise
// numerator statistics are accumulated from the alignment of the lattice best path and
// denominator statistics from the lattice itself.
//
// When all utterances are done a summary line (likelihoods, number of Gaussians, real
// time factor, amount of audio) is printed, both accumulator sets are stored to
// m_strFileAccumulatorsNum / m_strFileAccumulatorsDen and then destroyed.
void DTAccumulator::accumulate() {

	// make sure the HMMs are already initialized
	assert(m_hmmManager->areInitialized());
	
	const char *strErrorCode;
	double dLikelihoodTotalNum = 0.0;
	double dLikelihoodTotalNumAcoustic = 0.0;
	double dLikelihoodTotalDen = 0.0;
	long iFeatureVectorsTotal = 0;
	long iFeatureVectorsUsedTotal = 0;
		
	double dBegin = TimeUtils::getTimeMilliseconds();
		
	// empty the accumulators
	m_hmmManager->resetAccumulators();
	
	// the boosted variant is selected by the name of the objective function
	m_bMMI = false;
	if (strcmp(m_strObjectiveFunction,DISCRIMINATIVE_TRAINING_OBJECTIVE_FUNCTION_BMMI) == 0) {
		m_bMMI = true;
	}
	
	// create accumulators for each HMM-state and Gaussian component	
	for(int i=0 ; i < m_hmmManager->getNumberHMMStatesPhysical() ; ++i) {
		for(unsigned int g=0 ; g < m_hmmManager->getHMMState(i)->getMixture().getNumberComponents() ; ++g) {
			unsigned int iKey = Accumulator::getPhysicalAccumulatorKey(i,g);
			// numerator
			Accumulator *accumulatorNum = new Accumulator(m_iFeatureDimensionality,
				m_hmmManager->getCovarianceModelling(),i,g);
			m_mAccumulatorNum.insert(MAccumulatorPhysical::value_type(iKey,accumulatorNum));
			// denominator
			Accumulator *accumulatorDen = new Accumulator(m_iFeatureDimensionality,
				m_hmmManager->getCovarianceModelling(),i,g);
			m_mAccumulatorDen.insert(MAccumulatorPhysical::value_type(iKey,accumulatorDen));
		}
	}
	
	// precompute constants used to speed-up emission probability computation
	m_hmmManager->precomputeConstants();

	// (2) process each utterance in the MLF file
	int iUtterance = 0;
	VMLFUtterance *vMLFUtterance = m_mlfFile->getUtterances();
	// at this point we might not know the total amount of audio but we do know the total number of utterances
	int iUtterancesTotal = (int)vMLFUtterance->size();
	float fPercentageDisplayed = 0.0;
	for(VMLFUtterance::iterator it = vMLFUtterance->begin() ; it != vMLFUtterance->end() ; ++it, ++iUtterance) {
	
		// (2.1) load the features for the estimation
		ostringstream strFileFeatures;
		strFileFeatures << m_strFolderFeatures << PATH_SEPARATOR << (*it)->strFilePattern;
		FeatureFile featureFile(strFileFeatures.str().c_str(),MODE_READ,FORMAT_FEATURES_FILE_DEFAULT,m_iFeatureDimensionality);
		try {
			featureFile.load();
		} catch (std::runtime_error &e) {
			std::cerr << e.what() << std::endl;
			BVC_WARNING << "unable to load the features file: " << strFileFeatures.str();
			continue;
		}	
		Matrix<float> *mFeatures = featureFile.getFeatureVectors();
		
		// (2.2) load the hypothesis lattice (same file pattern, "bin" extension)
		ostringstream strFileAux;
		strFileAux << m_strFolderLattices << PATH_SEPARATOR << (*it)->strFilePattern;
		char strFileLattice[1024+1];
		FileUtils::replaceExtension(strFileLattice,strFileAux.str().c_str(),"bin");
		cout << "lattice: " << strFileLattice << endl;
		HypothesisLattice *lattice = new HypothesisLattice(m_phoneSet,m_lexiconManager);
		try {
			lattice->load(strFileLattice);
		} catch (std::runtime_error &e) {
			std::cerr << e.what() << std::endl;
			BVC_WARNING << "unable to load the lattice: " << strFileLattice;
			// the lattice object was already allocated: release it too before skipping
			// the utterance (it used to be leaked on this path)
			delete lattice;
			delete mFeatures;
			continue;
		}
		
		//lattice->printProperties();
		// check lattice properties
		// NOTE(review): whether BVC_ERROR stops execution is not visible from here
		if ((lattice->isProperty(LATTICE_PROPERTY_AM_PROB) == false) ||
			(lattice->isProperty(LATTICE_PROPERTY_LM_PROB) == false) ||
			(lattice->isProperty(LATTICE_PROPERTY_INSERTION_PENALTY) == false) ||
			(lattice->isProperty(LATTICE_PROPERTY_HMMS) == false) ||
			(lattice->isProperty(LATTICE_PROPERTY_PHONE_ALIGN) == false)) {
			BVC_ERROR << "wrong lattice properties";
		}
		
		//LatticeDepth *depth = lattice->computeDepth();
		//printf("depth: %12.4f\n",depth->fDepth);
		
		//lattice->store("./lattice.txt",FILE_FORMAT_TEXT);
		
		// mark best path; the utterance is only usable when the lattice contains the
		// transcription, i.e. when the WER against it is zero
		m_lexiconManager->removeNonStandardLexUnits((*it)->vLexUnit);
		LatticeWER *latticeWER = lattice->computeWER((*it)->vLexUnit,NULL,NULL,true,0);
		if ((!latticeWER) || (latticeWER->iErrors != 0)) {
			BVC_WARNING << "lattice WER is not zero, lattice does not contain the hand-made transcription: " 
				<< strFileLattice;
			if (latticeWER) {
				delete latticeWER;
			}
			delete lattice;
			delete mFeatures;
			continue;	
		}	
		
		//HypothesisLattice::print(latticeWER);
		delete latticeWER;
		
		// get the best-path from the lattice (transcription) and compute numerator stats from it
		//BestPath *bestPath = lattice->getBestPath();
		//bestPath->print();
		//m_lexiconManager->print((*it)->vLexUnit);
		VLPhoneAlignment *vLPhoneAlignment = lattice->getBestPathAlignment();
		assert(vLPhoneAlignment);
		
		// compute numerator statistics
		double dLikelihoodNum = -DBL_MAX;
		Alignment *alignmentNum = m_forwardBackward->processPhoneAlignment(*mFeatures,vLPhoneAlignment,dLikelihoodNum,&strErrorCode);
		if (strcmp(strErrorCode,FB_RETURN_CODE_SUCCESS) != 0) {
			BVC_WARNING << "unable to compute numerator occupation statistics: " << strErrorCode << ", " << strFileLattice;
			delete vLPhoneAlignment;
			delete lattice;
			delete mFeatures;
			continue;	
		}
		
		// get denominator statistics from the lattice
		double dLikelihoodDen = -DBL_MAX;		
		MOccupation *mOccupationDen = m_forwardBackward->processLattice(lattice,*mFeatures,m_fScaleAM,m_fScaleLM,
			dLikelihoodDen,m_bMMI,m_fBoostingFactor,&strErrorCode);	
		if (strcmp(strErrorCode,FB_RETURN_CODE_SUCCESS) != 0) {
			BVC_WARNING << "unable to compute lattice occupation statistics (denominator): " << strErrorCode << ", " << strFileLattice;
			delete alignmentNum;
			delete vLPhoneAlignment;
			delete lattice;
			delete mFeatures;
			continue;	
		}
		
		// perform statistics cancellation between numerator and denominator
		if (m_bCanceledStatistics) {
			statisticsCancellation(alignmentNum,mOccupationDen);
		}
		
		Alignment *alignmentDen = ForwardBackward::getAlignment(mOccupationDen,mFeatures->getRows());
		
		// accumulate statistics for both numerator and denominator
		accumulate(alignmentNum,*mFeatures,true);
		accumulate(alignmentDen,*mFeatures,false);	
		
		// get the best path with updated am-scores, lm-prob and insertion penalties
		BestPath *bestPath = lattice->getBestPath();
		assert(bestPath);
		LBestPathElement *lBestPathElement = bestPath->getBestPathElements();
		double dLMIP = 0.0;
		// note: this inner iterator deliberately has its own scope and hides the utterance iterator
		for(LBestPathElement::iterator it = lBestPathElement->begin() ; it != lBestPathElement->end() ; ++it) {
			dLMIP += m_fScaleLM*(*it)->fScoreLanguageModel+m_fScaleAM*(*it)->fInsertionPenalty;
		}
		delete bestPath;
		// the unscaled numerator likelihood is tracked separately from the scaled one
		dLikelihoodTotalNumAcoustic += dLikelihoodNum;
		// apply acoustic scaling and add the lm and insertion penalty scores
		dLikelihoodNum *= m_fScaleAM;
		dLikelihoodNum += dLMIP;
		
		dLikelihoodTotalNum += dLikelihoodNum;
		dLikelihoodTotalDen += dLikelihoodDen;
		
		//printf("%12.4f %12.4f\n",dLikelihoodNum,dLikelihoodDen);
		
		iFeatureVectorsTotal += mFeatures->getRows();	
		
		// clean-up
		delete vLPhoneAlignment;
		delete lattice;
		delete alignmentNum;
		delete alignmentDen;
		delete mOccupationDen;
		delete mFeatures;	
		
		// update the progress bar if necessary (one '*' per 10% of the utterances)
		float fPercentage = (((float)iUtterance)*100)/((float)iUtterancesTotal);
		if (fPercentage >= fPercentageDisplayed + 10.0) {
			fPercentageDisplayed += 10.0;
			printf("*");
			fflush(stdout);
		}
	}
	// complete the progress bar: skipped utterances never reach the update above
	while (fPercentageDisplayed < 100.0) {
		printf("*");
		fPercentageDisplayed += 10.0;
	}
	
	iFeatureVectorsUsedTotal = iFeatureVectorsTotal;
	
	// get the iteration end time
	double dEnd = TimeUtils::getTimeMilliseconds();
	double dMillisecondsInterval = dEnd - dBegin;
	
	// compute the Real Time Factor of the reestimation process
	double dRTF = (dMillisecondsInterval/10.0)/((double)iFeatureVectorsTotal);
	
	int iGaussians = m_hmmManager->getNumberGaussianComponents();
	// compute audio available for training
	int iHours,iMinutes,iSeconds;
	TimeUtils::convertHundredths((double)iFeatureVectorsTotal,iHours,iMinutes,iSeconds);
	// compute audio used
	int iHoursUsed,iMinutesUsed,iSecondsUsed;
	TimeUtils::convertHundredths((double)iFeatureVectorsUsedTotal,iHoursUsed,iMinutesUsed,iSecondsUsed);
	
	// show the accumulation information
	printf(" likelihood= (%.4f) %.4f %.4f %.4f [%8d Gauss][RTF=%.4f][%d:%02d'%02d''][%d:%02d'%02d'']\n",
		dLikelihoodTotalNumAcoustic,dLikelihoodTotalNum,dLikelihoodTotalDen,dLikelihoodTotalNum-dLikelihoodTotalDen,
		iGaussians,dRTF,iHours,iMinutes,iSeconds,iHoursUsed,iMinutesUsed,iSecondsUsed);	
	
	// dump the accumulators
	Accumulator::storeAccumulators(m_strFileAccumulatorsNum,m_iFeatureDimensionality,m_iCovarianceModeling,
		m_iHMMStates,m_iGaussianComponents,m_mAccumulatorNum);
	Accumulator::storeAccumulators(m_strFileAccumulatorsDen,m_iFeatureDimensionality,m_iCovarianceModeling,
		m_iHMMStates,m_iGaussianComponents,m_mAccumulatorDen);
	
	// destroy the accumulators
	Accumulator::destroy(m_mAccumulatorNum);
	Accumulator::destroy(m_mAccumulatorDen);
}
Exemple #6
0
// Build the naive Bayes table from the training file named by trainBayes, restricted to
// the words listed in the file named by featureDicPath, write the table to
// mydata\bayesOnECE.txt, then classify every record of naiveBayesTestVSM and tally the
// outcome per class in the artNum / P counters.
//
// naiveBayesTestVSM: path of the test file. Each record is a leading token, a term count
//                    c, and c pairs (pos v); pos is used as the row index into bayes and
//                    v as the weight of that row's log-probability.
void naiveBayes(string naiveBayesTestVSM)
{
	ifstream iFile(trainBayes.c_str()); // TF dictionary, e.g. "E:\\final\\final\\myData\\myTFDic.txt"
	ifstream featureFile(featureDicPath.c_str());
	cout<<featureDicPath<<endl;
	double featureVal;
	string featureStr;
	string words;
	int TF;
	int cnt;
	int flag;
	int i,j;

	// collect the feature words; the numeric value of each record is read and ignored
	while(featureFile>>featureStr>>featureVal)
		featureDic[featureStr]++;

	// training records: word, one (flag cnt) header, then cnt pairs of (flag TF);
	// only words present in the feature dictionary are kept
	while(iFile>>words)
	{
		iFile>>flag>>cnt;
		for(i = 0; i < cnt; i++)
		{
			iFile>>flag>>TF;
			if(featureDic.count(words))
				bayesDic[words][flag] = TF;
		}
	}

	cout<<"******"<<bayesDic.size()<<endl;

	// one row of bayes per kept word, in map order
	// NOTE(review): bayes is declared elsewhere; rows beyond its first dimension and keys
	// below 1 would index outside it - confirm against the training data
	j = 0;
	for(map<string,map<int,int> >::iterator itor = bayesDic.begin(); itor != bayesDic.end(); ++itor)
	{
		// operator[] creates key 9 with value 0 when it is absent, so the walk over the
		// inner map below always reaches key 9 and stops there
		const int countAtKey9 = itor->second[9];

		// default for the eight columns, used where no count is stored
		for(int k = 0; k < 8; k++)
			bayes[j][k] = 1.0 / (1000 + countAtKey9);

		// keys 1..8 map to columns 0..7
		for(map<int,int>::iterator it = itor->second.begin(); it != itor->second.end(); ++it)
		{
			i = it->first - 1;
			if(i == 8) break;
			bayes[j][i] = 1.0 * (1 + it->second) / (1000.0 + countAtKey9);
		}
		j++;
	}
	
	// dump the table, one line per column, 1000 rows per line
	ofstream out("mydata\\bayesOnECE.txt");
	for( i = 0; i < 8; i++)
	{
		for(j = 0 ; j < 1000; j++)
				out<<bayes[j][i]<<" ";
		out<<endl;
	}

	// For predicted label k (the array index): the artNum / P slot that is incremented and
	// the half-open range [first,last) of record indices for which the prediction goes
	// to artNum; any other record index goes to P.
	struct LabelRange { int slot; int first; int last; };
	static const LabelRange labelRange[8] = {
		{2,  753, 1166},	// label 0
		{7, 2768, 3117},	// label 1
		{0,    0,  362},	// label 2
		{1,  362,  753},	// label 3
		{3, 1166, 1568},	// label 4
		{4, 1568, 1967},	// label 5
		{5, 1967, 2367},	// label 6
		{6, 2367, 2768}		// label 7
	};

	double p[8];
	int c,pos,v;
	string kk;
	ifstream in(naiveBayesTestVSM.c_str());
	// Each record is tallied as soon as it is classified, so no buffer of predictions is
	// needed (the previous fixed-size buffer overflowed on files with more records than
	// it had room for).
	int recordIndex = 0;
	while(in>>kk)
	{
		in>>c;
		for(i = 0; i < 8; i++)p[i] = 0;
		for( i = 0; i < c ;i++)
		{
			in>>pos>>v;
			for(j = 0; j < 8; j++)
			{
				p[j] += v * log(bayes[pos][j]);
			}
		}
	
		// argmax over the eight scores, first maximum wins; starting from p[0] keeps the
		// label defined even when no score compares greater than another (all -inf or
		// NaN after log(0)), where it used to be read uninitialized
		int ans = 0;
		double best = p[0];
		for(i = 1; i < 8; i++)
		{
			if(best < p[i])
			{
				best = p[i];
				ans = i;
			}
		}

		const LabelRange &range = labelRange[ans];
		if(recordIndex >= range.first && recordIndex < range.last)artNum[range.slot]++;
		else P[range.slot]++;
		recordIndex++;
	}
	
}