int main_run(char * args[]){ vector<vector<T>> ufile = read(args[1]); vector<vector<T>> tfile = read(args[2]); vector<vector<T>> logs = read(args[3]); HMM<T> hmm; /*for (auto i = transformed.begin(); i != transformed.end(); ++i) { for (auto j = i->begin(); j != i->end(); ++j) { cout<<j->first<<"\t"<<j->second<<"\n"; } }*/ hmm.Train(ufile, tfile); map<int, string> tagcodemap; vector<vector<T>> taglist = hmm.TagLogs(logs, tagcodemap); vector<vector<T>> transformed = transform2(taglist); //FP GROWTH PART FOR CLUSTERING after classification int s = atoi(args[4]); FPTree<T> tree(s); //consider the position of token in the line tree.build(transformed); std::vector<std::vector<T>> r; clock_t t = clock(); tree.mine(r); std::cout<<"Initial pattern no: "<<r.size()<<"\n"; Cluster<T> data; data.AssociatePatterns(transformed, r); cout << "Time: " << double(clock() - t) / CLOCKS_PER_SEC << "\n"; data.DisplayCluster(logs, transformed, r, tagcodemap); Analyze<T> analyze; vector<vector<T>> manual = read("manual"); cout<<"\n"<<analyze.Efficiency(taglist, manual); }
// Load every "*.hmm" model found in strPath into m_mHMM, keyed by the file
// name without its ".hmm" extension. Any previously loaded models are
// discarded first.
void HMMModel::loadHMMs(QString strPath)
{
    // Normalize the directory path so file names can be appended directly.
    if (!strPath.endsWith('/') && !strPath.endsWith('\\')) {
        strPath += "/";
    }

    QStringList filters;
    filters << "*.hmm";
    QDir dir(strPath);
    QStringList list = dir.entryList(filters);

    // Drop previously loaded models before (re-)loading.
    cleanHMMs();

    QString strFileName;
    int nPos;
    std::string strSignName, strPathFile;
    HMM* phmm;
    const char* pCharCom;
    char* tmp;
    for (int l = 0; l < list.count(); l++) {
        strFileName = list.at(l);
        nPos = strFileName.lastIndexOf(".hmm");
        strSignName = strFileName.left(nPos).toStdString();
        strPathFile = strPath.toLocal8Bit() + strFileName.toLocal8Bit();
        pCharCom = strPathFile.c_str();

        // Writable copy of the path (HMM::Load presumably takes a non-const
        // char* — that is why the copy exists).
        tmp = new char[strlen(pCharCom) + 1];
        strcpy(tmp, pCharCom);
        phmm = new HMM();
        phmm->Load(tmp);
        delete[] tmp; // FIX: the path copy was previously leaked on every file

        m_mHMM.insert(std::pair<std::string, HMM*>(strSignName, phmm));
    }
}
// Load a discrete-emission HMM from the given SaveRestoreUtility.
// The stored model must carry the type tag "discrete"; otherwise an error is
// reported and `hmm` is left untouched.
void LoadHMM(HMM<distribution::DiscreteDistribution>& hmm, util::SaveRestoreUtility& sr)
{
  std::string type;
  size_t states;

  sr.LoadParameter(type, "hmm_type");
  if (type != "discrete")
  {
    Rcpp::Rcout << "Cannot load non-discrete HMM (of type " << type << ") as "
        << "discrete HMM!" << std::endl;
    // FIX: execution previously fell through and kept loading mismatched
    // parameters into `hmm`; bail out instead.
    return;
  }

  sr.LoadParameter(states, "hmm_states");

  // Load transition matrix.
  sr.LoadParameter(hmm.Transition(), "hmm_transition");

  // Now each emission distribution.
  hmm.Emission().resize(states);
  for (size_t i = 0; i < states; ++i)
  {
    std::stringstream s;
    s << "hmm_emission_distribution_" << i;
    sr.LoadParameter(hmm.Emission()[i].Probabilities(), s.str());
  }

  // Discrete HMMs are always one-dimensional.
  hmm.Dimensionality() = 1;
}
// One pass of the Ney-Essen exchange algorithm: optionally re-train the
// per-class HMMs, then try to move every sufficiently frequent word type to
// its best class. Returns the number of words that changed class.
int Clusters:: reclusterNeyEssen() {
  cerr << "Calculating MLE for prior probabilities" << endl;
  // MLE class priors: each word type contributes 1/numberTypes to its class.
  vector<double> prior(numberClasses, 0.0l);
  double xxx = 1.0l/(double)numberTypes;
  for (int i = 0; i < numberTypes; i++){
    int c = classVector[i];
    prior[c] += xxx;
  }
  for (int i = 0; i < numberClasses; i++)
    cerr << i << " " << prior[i] << endl;

  if (numberStates > 0){
    cerr << "Training all the HMMs" << endl;
    // One EM step per class: accumulate sufficient statistics from every word
    // currently in the class into a fresh HMM, then swap it in for the old one.
    for (int c = 0; c < numberClasses; c++){
      // cerr << "Training HMM " << c << endl;
      HMM* hmmPtr = hmms[c];
      HMM* newHmmPtr = new HMM(numberStates, alphabetSize);
      for (int i = 0; i < numberTypes; i++){
        if (classVector[i] == c){
          // then this word is in the right class, so train it on word i
          const string & word = *(corpus.wordArray[i]);
          vector<int> v;
          hmmPtr->convertString(word, v);
          // FIXME
          double weight = 1.0l;
          if (USE_TRUE_WEIGHT){
            // weight by token frequency instead of one count per type
            weight = corpus.countArray[i];
          }
          hmmPtr->emSingle(*newHmmPtr, weight, v);
        }
      }
      newHmmPtr->normalise();
      hmms[c] = newHmmPtr;
      delete hmmPtr;
    }
  }

  // Exchange step: visit words in descending frequency order; bestCluster()
  // presumably moves the word if a better class exists and reports whether it
  // did — TODO confirm against its definition.
  int something = 0;
  for (int i = 0; i < numberTypes; i++){
    // cerr << "Word " << i;
    int w = sortedWords[i];
    //cerr << *(corpus.wordArray[w]) << endl;
    if (counts[w] > FREQ_CUTOFF){
      //cerr << "Doing " << w << endl;
      if (bestCluster(w, prior)){
        something++;
      }
    }
  }
  return something;
}
// [[Rcpp::export]] List launch_pmmh_cpp(List inputs, List modellist, List algoparameters){ string strmodel = as<string>(modellist["model"]); string strprior = as<string>(modellist["prior"]); NumericMatrix current = inputs["current"]; NumericMatrix observations = inputs["observations"]; List theta = inputs["theta"]; int nparticles = as<int>(algoparameters["nparticles"]); int niterations = as<int>(algoparameters["niterations"]); NumericMatrix cholesky_proposal = algoparameters["cholesky_proposal"]; NumericVector initial_theta = algoparameters["initial_theta"]; HMM* model; Prior* prior; if (strmodel.compare(string("model1")) == 0){ model = new BatteryModel(); if (strprior.compare(string("uniform")) == 0){ prior = new BatteryModelUniformPrior(); } if (strprior.compare(string("normal")) == 0){ prior = new BatteryModelNormalPrior(); } } if (strmodel.compare(string("model2")) == 0){ model = new BatteryModel2(); if (strprior.compare(string("uniform")) == 0){ prior = new BatteryModel2UniformPrior(); } if (strprior.compare(string("normal")) == 0){ prior = new BatteryModel2NormalPrior(); } } model->set_input(current); model->set_parameters(theta); model->set_observations(observations); PMMH pmmh(nparticles, niterations, model->dim_states); pmmh.set_prior(prior); pmmh.init(model, initial_theta); pmmh.set_proposal_cholesky(cholesky_proposal); pmmh.run(); delete model; delete prior; return Rcpp::List::create( Rcpp::Named("chain")= pmmh.chain_parameters, Rcpp::Named("naccepts") = pmmh.naccepts, Rcpp::Named("loglikelihood") = pmmh.loglikelihood, Rcpp::Named("loglikelihood_proposal") = pmmh.loglikelihood_proposal, Rcpp::Named("proposals") = pmmh.proposals, Rcpp::Named("nparticles") = nparticles, Rcpp::Named("niterations") = niterations, Rcpp::Named("cholesky_proposal") = cholesky_proposal); }
void LoadHMM(HMM<gmm::GMM<> >& hmm, util::SaveRestoreUtility& sr) { std::string type; size_t states; sr.LoadParameter(type, "hmm_type"); if (type != "gmm") { Rcpp::Rcout << "Cannot load non-GMM HMM (of type " << type << ") as " << "a Gaussian Mixture Model HMM!" << std::endl; } sr.LoadParameter(states, "hmm_states"); // Load transition matrix. sr.LoadParameter(hmm.Transition(), "hmm_transition"); // Now each emission distribution. hmm.Emission().resize(states, gmm::GMM<>(1, 1)); for (size_t i = 0; i < states; ++i) { std::stringstream s; s << "hmm_emission_" << i << "_gaussians"; size_t gaussians; sr.LoadParameter(gaussians, s.str()); s.str(""); // Extract dimensionality. arma::vec meanzero; s << "hmm_emission_" << i << "_gaussian_0_mean"; sr.LoadParameter(meanzero, s.str()); size_t dimensionality = meanzero.n_elem; // Initialize GMM correctly. hmm.Emission()[i].Gaussians() = gaussians; hmm.Emission()[i].Dimensionality() = dimensionality; for (size_t g = 0; g < gaussians; ++g) { s.str(""); s << "hmm_emission_" << i << "_gaussian_" << g << "_mean"; sr.LoadParameter(hmm.Emission()[i].Means()[g], s.str()); s.str(""); s << "hmm_emission_" << i << "_gaussian_" << g << "_covariance"; sr.LoadParameter(hmm.Emission()[i].Covariances()[g], s.str()); } s.str(""); s << "hmm_emission_" << i << "_weights"; sr.LoadParameter(hmm.Emission()[i].Weights(), s.str()); } hmm.Dimensionality() = hmm.Emission()[0].Dimensionality(); }
// Construct a posterior-Viterbi decoder over the given HMM.
// shouldAdd: when true, scores are combined by addition rather than
// multiplication (presumably log-space vs. probability-space — confirm
// against the decode implementation).
PosteriorViterbi::PosteriorViterbi(HMM &hmm, bool shouldAdd)
  : numStates(hmm.countStates()),
    hmmGraph(hmm),
    addRatherThanMultiply(shouldAdd)
{
  // ctor
}
// Persist a GMM-emission HMM: type tag, state count, transition matrix, then
// per-state mixture parameters (gaussian count, weights, means, covariances).
void SaveHMM(const HMM<gmm::GMM<> >& hmm, util::SaveRestoreUtility& sr)
{
  std::string type = "gmm";
  size_t states = hmm.Transition().n_rows;

  sr.SaveParameter(type, "hmm_type");
  sr.SaveParameter(states, "hmm_states");
  sr.SaveParameter(hmm.Transition(), "hmm_transition");

  // Emission distributions, one mixture per state.
  for (size_t state = 0; state < states; ++state)
  {
    const std::string prefix = "hmm_emission_" + std::to_string(state);

    sr.SaveParameter(hmm.Emission()[state].Gaussians(), prefix + "_gaussians");
    sr.SaveParameter(hmm.Emission()[state].Weights(), prefix + "_weights");

    for (size_t g = 0; g < hmm.Emission()[state].Gaussians(); ++g)
    {
      const std::string gPrefix = prefix + "_gaussian_" + std::to_string(g);
      sr.SaveParameter(hmm.Emission()[state].Means()[g], gPrefix + "_mean");
      sr.SaveParameter(hmm.Emission()[state].Covariances()[g],
          gPrefix + "_covariance");
    }
  }
}
// Persist a discrete-emission HMM: type tag, state count, transition matrix,
// then the emission probability vector of each state.
void SaveHMM(const HMM<distribution::DiscreteDistribution>& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type = "discrete";
  size_t states = hmm.Transition().n_rows;

  sr.SaveParameter(type, "hmm_type");
  sr.SaveParameter(states, "hmm_states");
  sr.SaveParameter(hmm.Transition(), "hmm_transition");

  // One named probability vector per state.
  for (size_t state = 0; state < states; ++state)
    sr.SaveParameter(hmm.Emission()[state].Probabilities(),
        "hmm_emission_distribution_" + std::to_string(state));
}
// Build an HMM over all models of an ESBTL system: each model is converted to
// a data matrix plus cluster labels by `func`, labels are shifted so they
// remain distinct across models, and the concatenated data trains the HMM via
// cached Baum-Welch seeded with the GMM produced by `creator`.
HMM buildHMM(const ESBTL::Default_system & system, GMMCreator creator, ClusteringFunctor func)
{
  arma::mat data;
  arma::urowvec labels;
  HMM hmm;
  unsigned int offset = 0; // label shift applied to each successive model
  for (ESBTL::Default_system::Models_const_iterator it_model = system.models_begin();
       it_model != system.models_end(); ++it_model)
  {
    const ESBTL::Default_system::Model & model = *it_model;
    arma::mat tempdata;
    arma::urowvec templabels = func(model, tempdata);
    templabels += offset;
    data = arma::join_rows(data, tempdata);
    labels = arma::join_rows(labels, templabels);
    // NOTE(review): offset becomes max(labels), which assumes func's labels
    // start at 1 (with 0-based labels the next model's first cluster would
    // collide with this model's last). Verify against ClusteringFunctor.
    offset = arma::max(labels);
  }
  hmm.baumWelchCached(data, creator(data, labels));
  return hmm;
}
// Load a Gaussian-emission HMM from the given SaveRestoreUtility.
// The stored model must carry the type tag "gaussian"; otherwise an error is
// reported and `hmm` is left untouched.
void LoadHMM(HMM<distribution::GaussianDistribution>& hmm, util::SaveRestoreUtility& sr)
{
  std::string type;
  size_t states;

  sr.LoadParameter(type, "hmm_type");
  if (type != "gaussian")
  {
    Rcpp::Rcout << "Cannot load non-Gaussian HMM (of type " << type << ") as "
        << "a Gaussian HMM!" << std::endl;
    // FIX: execution previously fell through and kept loading mismatched
    // parameters into `hmm`; bail out instead.
    return;
  }

  sr.LoadParameter(states, "hmm_states");

  // Load transition matrix.
  sr.LoadParameter(hmm.Transition(), "hmm_transition");

  // Now each emission distribution (mean vector + covariance matrix).
  hmm.Emission().resize(states);
  for (size_t i = 0; i < states; ++i)
  {
    std::stringstream s;
    s << "hmm_emission_mean_" << i;
    sr.LoadParameter(hmm.Emission()[i].Mean(), s.str());
    s.str("");
    s << "hmm_emission_covariance_" << i;
    sr.LoadParameter(hmm.Emission()[i].Covariance(), s.str());
  }

  hmm.Dimensionality() = hmm.Emission()[0].Mean().n_elem;
}
// Persist a Gaussian-emission HMM: type tag, state count, transition matrix,
// then one mean/covariance pair per state.
void SaveHMM(const HMM<distribution::GaussianDistribution>& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type = "gaussian";
  size_t states = hmm.Transition().n_rows;

  sr.SaveParameter(type, "hmm_type");
  sr.SaveParameter(states, "hmm_states");
  sr.SaveParameter(hmm.Transition(), "hmm_transition");

  // Emission distributions, one Gaussian per state.
  for (size_t state = 0; state < states; ++state)
  {
    const std::string index = std::to_string(state);
    sr.SaveParameter(hmm.Emission()[state].Mean(),
        "hmm_emission_mean_" + index);
    sr.SaveParameter(hmm.Emission()[state].Covariance(),
        "hmm_emission_covariance_" + index);
  }
}
int main() { kmeans("./dataset", "./result"); Forecast forecast("./result"); HMM hmm = HMM("./dataset", "./dictionnary.txt", forecast); hmm.print(); { cout << "Save" << endl; std::ofstream ofs("hmm_save"); boost::archive::text_oarchive oa(ofs); oa << hmm; } HMM hmm2 = HMM(); { cout << "Load" << endl; std::ifstream ifs("hmm_save"); boost::archive::text_iarchive ia(ifs); ia >> hmm2; } hmm2.print(); return (EXIT_SUCCESS); }
int main(int argc, char* argv[]) { HMM h; h.init(); std::cout << h; forward_viterbi(h.get_observations(), h.get_states(), h.get_start_probability(), h.get_transition_probability(), h.get_emission_probability()); }
// Calculate secondary structure for given HMM and return prediction void CalculateSS(HMM& q, char *ss_pred, char *ss_conf) { char tmpfile[]="/tmp/hhCalcSSXXXXXX"; if (mkstemp(tmpfile) == -1) { cerr << "ERROR! Could not create tmp-file!\n"; exit(4); } // Write log-odds matrix from q to tmpfile.mtx char filename[NAMELEN]; FILE* mtxf = NULL; strcpy(filename,tmpfile); strcat(filename,".mtx"); mtxf = fopen(filename,"w"); if (!mtxf) OpenFileError(filename); fprintf(mtxf,"%i\n",q.L); fprintf(mtxf,"%s\n",q.seq[q.nfirst]+1); fprintf(mtxf,"2.670000e-03\n4.100000e-02\n-3.194183e+00\n1.400000e-01\n2.670000e-03\n4.420198e-02\n-3.118986e+00\n1.400000e-01\n3.176060e-03\n1.339561e-01\n-2.010243e+00\n4.012145e-01\n"); for (int i = 1; i <= q.L; ++i) { fprintf(mtxf,"-32768 "); for (int a = 0; a < 20; ++a) { int tmp = iround(50*flog2(q.p[i][s2a[a]]/pb[s2a[a]])); fprintf(mtxf,"%5i ",tmp); if (a == 0) { // insert logodds value for B fprintf(mtxf,"%5i ",-32768); } else if (a == 18) { // insert logodds value for X fprintf(mtxf,"%5i ",-100); } else if (a == 19) { // insert logodds value for Z fprintf(mtxf,"%5i ",-32768); } } fprintf(mtxf,"-32768 -400\n"); } fclose(mtxf); // Calculate secondary structure CalculateSS(ss_pred, ss_conf, tmpfile); q.AddSSPrediction(ss_pred, ss_conf); // Remove temp-files std::string command = "rm " + (std::string)tmpfile + "*"; runSystem(command,v); }
///////////////////////////////////////////////////////////////////////////////////// // Do precalculations for q and t to prepare comparison ///////////////////////////////////////////////////////////////////////////////////// void PrepareTemplate(HMM& q, HMM& t, int format) { if (format==0) // HHM format { // Add transition pseudocounts to template t.AddTransitionPseudocounts(); // Don't use CS-pseudocounts because of runtime!!! // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] t.PreparePseudocounts(); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] t.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc); t.CalculateAminoAcidBackground(); } else // HHMER format { // Don't add transition pseudocounts to template // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0); // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] // t.PreparePseudocounts(); // DON'T ADD amino acid pseudocounts to temlate: pcm=0! t.p[i][a] = t.f[i][a] t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); t.CalculateAminoAcidBackground(); } if (par.forward>=1) t.Log2LinTransitionProbs(1.0); // Factor Null model into HMM t // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p t.IncludeNullModelInHMM(q,t); // Can go BEFORE the loop if not dependent on template return; }
int main (int argc, const char * argv[]) { TimeSeriesClassificationData trainingData; //This will store our training data GestureRecognitionPipeline pipeline; //This is a wrapper for our classifier and any pre/post processing modules string dirPath = "/home/vlad/AndroidStudioProjects/DataCapture/dataSetGenerator/build"; if (!trainingData.loadDatasetFromFile(dirPath + "/acc-training-set-segmented.data")) { printf("Cannot open training set\n"); return 0; } printf("Successfully opened training data set ...\n"); HMM hmm; hmm.setHMMType( HMM_CONTINUOUS ); hmm.setDownsampleFactor( 5 ); hmm.setAutoEstimateSigma( true ); hmm.setSigma( 20.0 ); hmm.setModelType( HMM_LEFTRIGHT ); hmm.setDelta( 1 ); // LowPassFilter lpf(0.1, 1, 3); // pipeline.setPreProcessingModule(lpf); pipeline.setClassifier( hmm ); pipeline.train(trainingData, 20); //You can then get then get the accuracy of how well the pipeline performed during the k-fold cross validation testing double accuracy = pipeline.getCrossValidationAccuracy(); printf("Accuracy: %f\n", accuracy); }
// Scaled forward-backward for an HMM whose states may emit variable-length
// substrings: a state of arity a consumes model.stateArity(state) == a
// symbols at a time. Returns (scaling constants cs, scaled forward matrix,
// scaled backward matrix). Throws runtime_error if the model is not finalized.
tuple<vector<double>, Matrix<double>, Matrix<double>> forward_backward(string obs, const HMM& model) {
  if (!model.isFinalized())
    throw runtime_error("Model should be finalized!");

  // Forward algorithm
  Matrix<double> forward(obs.length(), model.numStates(), 0);
  vector<double> cs(obs.length(), 0); // per-position scaling constants

  // Calculate c1
  for (size_t state = 0; state < model.numStates(); state++)
    cs[0] += model.startProb(state) * model.emissionProb(state, obs.substr(0,1));

  // Base case
  for (size_t state = 0; state < model.numStates(); state++)
    forward(0, state) = model.startProb(state) * model.emissionProb(state, obs.substr(0,1)) / cs[0];

  // Recursion: a state of arity a at position i reads its predecessors at
  // i - a, so the intermediate scaling constants cs[i-1] .. cs[i-a+1] must be
  // divided back out before the contribution is accumulated.
  for (size_t i = 1; i < obs.length(); i++) {
    vector<double> delta(model.numStates(), 0); // unscaled forward values at i
    for (size_t state = 0; state < model.numStates(); state++) {
      if (i < model.stateArity(state))
        continue; // not enough symbols consumed yet for this state's arity
      for (auto prevState : model.incommingStates(state)) {
        double val = forward(i - model.stateArity(state), prevState) * model.transitionProb(prevState, state);
        for (size_t k = 1; k < model.stateArity(state); k++)
          val /= cs[i - k];
        delta[state] += val;
      }
      // Emission over the arity-length substring ending at position i.
      delta[state] *= model.emissionProb(state, obs.substr(i - model.stateArity(state) + 1, model.stateArity(state)));
      cs[i] += delta[state];
    }
    // Normalize the column by cs[i] so scaled forward values sum to one.
    for (size_t state = 0; state < model.numStates(); state++) {
      forward(i, state) = delta[state] / cs[i];
    }
  }

  // Backward algorithm (reuses the forward scaling constants cs)
  Matrix<double> backward(obs.length(), model.numStates(), 0);
  const size_t N = obs.length() - 1;
  for (size_t state = 0; state < model.numStates(); state++)
    backward(N, state) = 1;

  // i is a signed long so the `i >= 0` termination test works.
  for (long i = N - 1; i >= 0; i--) {
    for (size_t state = 0; state < model.numStates(); state++) {
      double prob = 0;
      for (auto nextState : model.outgoingStates(state)) {
        if (i + model.stateArity(nextState) > N)
          continue; // successor's emission would run past the end of obs
        double val = backward(i + model.stateArity(nextState), nextState)
            * model.transitionProb(state, nextState)
            * model.emissionProb(nextState, obs.substr(i + 1, model.stateArity(nextState)));
        for (size_t k = 0; k < model.stateArity(nextState); k++)
          val /= cs[i + 1 + k];
        prob += val;
      }
      backward(i, state) = prob;
    }
  }

  return make_tuple(cs, forward, backward);
}
// Construct a fast Viterbi decoder over the given HMM.
// posterior: when true, presumably selects posterior decoding instead of
// plain Viterbi — confirm against the decode implementation.
FastViterbi::FastViterbi(HMM &hmm, bool posterior)
  : numStates(hmm.countStates()),
    hmmGraph(hmm),
    posterior(posterior)
{
  // ctor
}
// Construct the clustering state from a corpus: count word frequencies,
// assign initial classes (either random for frequent words, or the top
// numberClasses-1 words each get their own class with everything else in the
// last class), build a per-word linked index of token positions, accumulate
// class bigram/unigram statistics, and optionally create one randomized HMM
// per class.
Clusters:: Clusters(int numberClasses_, const SimpleCorpusOne & corpus_, int numberStates_, int alphabetSize_, bool randomised)
  : numberClasses(numberClasses_), numberTypes(corpus_.numberTypes), numberTokens(corpus_.numberTokens),
    numberStates(numberStates_), alphabetSize(alphabetSize_), data(corpus_.data), corpus(corpus_),
    clusterBigrams(numberClasses_, numberClasses_)
{
  classVector.resize(numberTypes);
  counts.resize(numberTypes);
  sortedWords.resize(numberTypes);
  first.resize(numberTypes);
  clusterUnigrams.resize(numberClasses);

  // next[] threads every occurrence of a word through the token stream;
  // numberTokens acts as the end-of-chain sentinel.
  next = new int[numberTokens];
  for (int i = 0; i < numberTokens; i++)
    next[i] = numberTokens;

  // Default class for every word type is the last class.
  for (int w = 0; w < numberTypes; w++){
    counts[w] = 0;
    classVector[w] = numberClasses - 1;
  }
  // counts are set
  for (int i = 0; i < numberTokens; i++)
    counts[data[i]]++;

  // now find the most frequent numberClasses - 1 of them.
  vector< pair<int,int> > countsTable(numberTypes);
  for (int i = 0; i < numberTypes; i++){
    countsTable[i] = pair<int,int>(counts[i], i);
    //cerr << counts[i] << " " << i << endl;
  }
  cerr << "Sorting words" << endl;
  sort(countsTable.begin(), countsTable.end());
  // sortedWords[i] is the i-th most frequent word type.
  for (int i = 0; i < numberTypes; i++){
    first[i] = -1;
    sortedWords[i] = countsTable[numberTypes - 1 - i].second;
    //cerr << "sort " << i << " " << sortedWords[i] << " , n =" << countsTable[numberTypes - 1 - i].first << endl;
  }

  if (randomised) {
    // Random initial class for every sufficiently frequent word.
    for (int i = 0; i < numberTypes; i++){
      if (counts[i] > FREQ_CUTOFF){
        int rc = (int) (1.0 * numberClasses * rand() / (RAND_MAX + 1.0));
        classVector[i] = rc;
      }
    }
  } else {
    // Each of the top numberClasses-1 words gets its own class.
    for (int i = 0; i < numberClasses - 1; i++){
      classVector[sortedWords[i]] = i;
    }
  }

  // Build the first/next occurrence index and the class bigram/unigram
  // counts in a single pass over adjacent token pairs.
  vector<int> last(numberTypes, 0);
  cerr << "Indexing data" << endl;
  for (int i = 0; i < numberTokens - 1; i++){
    int w = data[i];
    int w2 = data[i+1];
    if (w2 < 0 || w2 > numberTypes - 1){
      cerr << i+1 << " " << w2 << endl;
    }
    assert(w >= 0 && w < numberTypes);
    assert(w2 >= 0 && w2 < numberTypes);
    if (first[w] == -1){
      first[w] = i;
      last[w] = i;
    } else {
      next[last[w]] = i;
      last[w] = i;
    }
    int c1 = classVector[w];
    int c2 = classVector[w2];
    assert(c1 >= 0 && c1 < numberClasses);
    assert(c2 >= 0 && c2 < numberClasses);
    clusterBigrams(c1, c2)++;
    clusterUnigrams[c1]++;
  }
  cerr << "Finished indexing " << endl;
  // be careful: the loop above stops before the final token, so its unigram
  // count is added here.
  clusterUnigrams[classVector[data[numberTokens-1]]]++;

  cerr << "Numberstates " << numberStates << endl;
  if (numberStates > 0){
    // One randomized, normalised character HMM per class.
    cerr << "Starting to do the HMMs" << endl;
    hmms.resize(numberClasses);
    for (int i = 0; i < numberClasses; i++){
      HMM* hmmPtr = new HMM(numberStates, alphabetSize);
      hmmPtr->randomise();
      hmmPtr->normalise();
      hmms[i] = hmmPtr;
    }
  }
}
int main(int argc, const char * argv[]){ //Load the training data TimeSeriesClassificationData trainingData; if( !trainingData.load("HMMTrainingData.grt") ){ cout << "ERROR: Failed to load training data!\n"; return false; } //Remove 20% of the training data to use as test data TimeSeriesClassificationData testData = trainingData.partition( 80 ); //Create a new HMM instance HMM hmm; //Set the HMM as a Continuous HMM hmm.setHMMType( HMM_CONTINUOUS ); //Set the downsample factor, a higher downsample factor will speed up the prediction time, but might reduce the classification accuracy hmm.setDownsampleFactor( 5 ); //Set the committee size, this sets the (top) number of models that will be used to make a prediction hmm.setCommitteeSize( 10 ); //Tell the hmm algorithm that we want it to estimate sigma from the training data hmm.setAutoEstimateSigma( true ); //Set the minimum value for sigma, you might need to adjust this based on the range of your data //If you set setAutoEstimateSigma to false, then all sigma values will use the value below hmm.setSigma( 20.0 ); //Set the HMM model type to LEFTRIGHT with a delta of 1, this means the HMM can only move from the left-most state to the right-most state //in steps of 1 hmm.setModelType( HMM_LEFTRIGHT ); hmm.setDelta( 1 ); //Train the HMM model if( !hmm.train( trainingData ) ){ cout << "ERROR: Failed to train the HMM model!\n"; return false; } //Save the HMM model to a file if( !hmm.save( "HMMModel.grt" ) ){ cout << "ERROR: Failed to save the model to a file!\n"; return false; } //Load the HMM model from a file if( !hmm.load( "HMMModel.grt" ) ){ cout << "ERROR: Failed to load the model from a file!\n"; return false; } //Compute the accuracy of the HMM models using the test data double numCorrect = 0; double numTests = 0; for(UINT i=0; i<testData.getNumSamples(); i++){ UINT classLabel = testData[i].getClassLabel(); hmm.predict( testData[i].getData() ); if( classLabel == hmm.getPredictedClassLabel() ) numCorrect++; numTests++; 
VectorFloat classLikelihoods = hmm.getClassLikelihoods(); VectorFloat classDistances = hmm.getClassDistances(); cout << "ClassLabel: " << classLabel; cout << " PredictedClassLabel: " << hmm.getPredictedClassLabel(); cout << " MaxLikelihood: " << hmm.getMaximumLikelihood(); cout << " ClassLikelihoods: "; for(UINT k=0; k<classLikelihoods.size(); k++){ cout << classLikelihoods[k] << "\t"; } cout << "ClassDistances: "; for(UINT k=0; k<classDistances.size(); k++){ cout << classDistances[k] << "\t"; } cout << endl; } cout << "Test Accuracy: " << numCorrect/numTests*100.0 << endl; return true; }
// MEX gateway: fit a hidden Markov model with per-state Gaussian emissions to
// a data vector and decode it with Viterbi.
// Inputs  (prhs): [0] real double data matrix (NaN allowed at the edges),
//                 [1] MxM transition matrix,
//                 [2] Mx2 Gauss definition (column 1 mean, column 2 sigma —
//                     presumably; verify against GaussState's constructor),
//                 [3] optional configuration: MATLAB struct or file path.
// Outputs (plhs): [0] 1-based decoded state sequence (NaN where input was NaN),
//                 [1] fitted transition matrix, [2] emission matrix,
//                 [3] emission binning centers, [4] iteration count.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
  double *data, *transition, *gauss, *statesOut;
  unsigned int dataSize[2], dataPointCount, dataStart, dataEnd, statesCount, iterationCount;

  /* Check for proper number of arguments. */
  if(nrhs < 3) {
    mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:invalidNumInputs", "Three input arguments required." );
  } else if(nlhs > 5) {
    mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:maxlhs", "Too many output arguments." );
  }

  /* The input must be a noncomplex double.*/
  if( !mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]) || !mxIsDouble(prhs[1]) || mxIsComplex(prhs[1]) ){
    mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:inputNotRealDouble", "Input must be a noncomplex double." );
  }

  dataSize[0] = (int) mxGetM(prhs[0]);
  dataSize[1] = (int) mxGetN(prhs[0]);
  dataPointCount = dataSize[0] * dataSize[1];
  statesCount = (int) mxGetM(prhs[1]);

  // Validate the shapes of the transition and Gauss-definition matrices.
  if (mxGetN(prhs[1]) != statesCount){
    mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:noSquareTransitionMatrix", "Transition matrix has to be MxM." );
  }
  if (mxGetM(prhs[2]) != statesCount){
    mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:notMatchingStatesCount", "Gauss definition matrix has a different states count than the transition matrix." );
  }
  if (mxGetN(prhs[2]) != 2){
    mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:invalidGaussDefinition", "Gauss definition matrix has to be Mx2." );
  }

  /* Create matrix for the return argument. */
  plhs[0] = mxCreateDoubleMatrix(dataSize[0], dataSize[1], mxREAL);

  /* Assign pointers to each input and output. */
  data = mxGetPr(prhs[0]);
  transition = mxGetPr(prhs[1]);
  gauss = mxGetPr(prhs[2]);
  statesOut = mxGetPr(plhs[0]);

  using namespace hiddenMarkovModel;
  HMMConfiguration configuration;

  if (nrhs > 3){
    mxArray const *options = prhs[3];
    if (mxIsStruct(options)){
      // load configuration from MATLAB struct
      mxArray *value;
      value = mxGetField(options, 0, "verbose");
      if (value != NULL && mxIsLogicalScalarTrue(value)){
        configuration.verbose = true;
        DEFAULT_FALSE(verboseOutputEmission);
        DEFAULT_TRUE(verboseOutputTransition);
      }
      // DEFAULT_* macros presumably read the matching struct field and fall
      // back to the given default when it is absent — defined elsewhere;
      // verify against their definitions.
      DEFAULT_DOUBLE(minSelfTransition, 0);
      DEFAULT_DOUBLE(minEmission, 1e-6);
      DEFAULT_TRUE(doEmissionUpdate);
      DEFAULT_TRUE(doTransitionUpdate);
      DEFAULT_INT(binningCount, 300);
      DEFAULT_INT(maxIterations, 100);
      DEFAULT_INT(abortStateChanges, 5);
      DEFAULT_FALSE(useMinimalBinningRange);
      if (configuration.useMinimalBinningRange){
        DEFAULT_DOUBLE(lowerBinningRangeLimit, 0);
        DEFAULT_DOUBLE(upperBinningRangeLimit, 1);
      }
    } else if (mxIsChar(options)){
      // load configuration from file
      std::ifstream file(mxArrayToString(options));
      if (file.good()){
        configuration = HMMConfiguration::fromFile(file);
      } else {
        mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:configFileNotFound", "Configuration file not found." );
      }
    } else {
      mexErrMsgIdAndTxt( "MATLAB:Fit:HMM:invalidConfigParameter", "Invalid configuration parameter." );
    }
  }

  // One initial emission distribution per state, taken from the Mx2 Gauss
  // matrix (MATLAB column-major: gauss[i] is column 1, gauss[statesCount+i]
  // is column 2).
  std::vector<InitialEmissionProbability*> initStates(0);
  for (unsigned int i = 0; i < statesCount; i += 1){
    initStates.push_back(new GaussState(gauss[i], gauss[statesCount + i]));
  }

  // check for NaN at the start
  dataStart = 0;
  for (unsigned int i = 0; i < dataPointCount; i += 1){
    if (!mxIsNaN(data[i])){
      dataStart = i;
      break;
    }
  }
  // check for NaN at the end
  for (dataEnd = dataStart; dataEnd < dataPointCount; dataEnd += 1){
    if (mxIsNaN(data[dataEnd])){
      break;
    }
  }

  // The model is fitted on the NaN-trimmed window [dataStart, dataEnd).
  HMM model (data + dataStart, dataEnd - dataStart, initStates, configuration);

  // delete state pointers
  // NOTE(review): clear() only drops the pointers; the GaussState objects are
  // never deleted here — presumably the HMM takes ownership. Verify,
  // otherwise this leaks one allocation per state.
  initStates.clear();

  // Seed the model with the user-supplied transition matrix (column-major).
  for (unsigned int i = 0; i < statesCount; i += 1){
    for (unsigned int j = 0; j < statesCount; j += 1){
      model.setTransition(transition[i + statesCount * j], i, j);
    }
  }
  model.autoSetSelfTransition();

  // NOTE(review): iterationCount is uninitialized here; run() presumably
  // fills it by reference with the number of iterations performed — verify,
  // since it is read below for plhs[4].
  model.run(iterationCount);

  std::vector<unsigned int> states (dataEnd - dataStart, 0);
  model.viterbi(states);

  // Map decoded states back onto the full-length output: 1-based inside the
  // trimmed window, NaN outside it.
  for (unsigned int i = 0; i < dataSize[0] * dataSize[1]; i += 1){
    if (i < dataStart || i >= dataEnd){
      statesOut[i] = mxGetNaN();
    } else {
      statesOut[i] = (double) states[i - dataStart] + 1;
    }
  }

  if (nlhs > 1){
    /* Create matrix for the transition output. */
    plhs[1] = mxCreateDoubleMatrix(statesCount, statesCount, mxREAL);
    double *transitionOut = mxGetPr(plhs[1]);
    for (unsigned int from = 0; from < statesCount; from += 1){
      for (unsigned int to = 0; to < statesCount; to += 1){
        transitionOut[from + to * statesCount] = model.getTransition(from, to);
      }
    }
    if (nlhs > 2){
      /* Create matrix for the emission output. */
      plhs[2] = mxCreateDoubleMatrix(statesCount, configuration.binningCount, mxREAL);
      double *emissionOut = mxGetPr(plhs[2]);
      for (unsigned int state = 0; state < statesCount; state += 1){
        for (unsigned int bin = 0; bin < configuration.binningCount; bin += 1){
          emissionOut[state + bin * statesCount] = model.getEmissionPropability(state, bin);
        }
      }
      if (nlhs > 3){
        /* Create vector for the emission binning centers. */
        array1D range(2, 0);
        model.getBinningRange(range);
        double binStart = range[0];
        double binDiff = range[1] - range[0];
        plhs[3] = mxCreateDoubleMatrix(1, configuration.binningCount, mxREAL);
        double *binningCenters = mxGetPr(plhs[3]);
        for (unsigned int bin = 0; bin < configuration.binningCount; bin += 1){
          binningCenters[bin] = binStart + binDiff * ( (0.5 + (double) bin) / (double) configuration.binningCount );
        }
        if (nlhs > 4){
          /* Create matrix for the iteration count output. */
          plhs[4] = mxCreateDoubleScalar((double) iterationCount);
        }
      }
    }
  }
}
// Merge a CpG-island HMM and a background (non-CpG) HMM into one HMM with
// twice the states: indices [0, n) are the CpG copy, [n, 2n) the non-CpG
// copy. The average segment lengths set geometric switching probabilities
// (leave = 1/average_length, stay = 1 - leave).
shared_ptr< HMM<double> > Merge_Models(shared_ptr < HMM<double> > cpg, shared_ptr< HMM<double> > non_cpg, uint average_cpg_length, uint average_non_cpg_length)
{
  // Both models must have identical shapes for the block construction below.
  if (cpg->get_no_states() != non_cpg->get_no_states()) {
    throw("Models states number must be same");
  }
  if (cpg->get_alphabet_size() != non_cpg->get_alphabet_size()) {
    throw("Models alphabet size must be same");
  }

  double leave_cpg_probability = 1/(double)average_cpg_length;
  double stay_cpg_probability = 1 - leave_cpg_probability;
  double leave_non_cpg_probability = 1/(double)average_non_cpg_length;
  double stay_non_cpg_probability = 1 - leave_non_cpg_probability;

  //initial probabilities: each source model's start distribution halved, so
  //both halves are equally likely a priori.
  shared_ptr< HMMVector<double> > initial_probabilities(new HMMVector<double>(cpg->get_no_states()*2));
  uint i;
  for (i = 0; i < cpg->get_no_states(); ++i) {
    (*initial_probabilities)(i) = cpg->get_initial_probs()(i)/2;
  }
  for (uint j = 0; j < non_cpg->get_no_states(); ++j) {
    (*initial_probabilities)(j + i) = non_cpg->get_initial_probs()(j)/2;
  }

  //transition probabilities: a 2x2 block matrix — diagonal blocks keep each
  //model's transitions scaled by its stay probability; off-diagonal blocks
  //model switching, scaled by the leave probability.
  shared_ptr< HMMMatrix<double> > transition_probabilities(new HMMMatrix<double>(cpg->get_no_states()*2, cpg->get_no_states()*2));
  for (uint i = 0; i < transition_probabilities->get_no_rows(); ++i) {
    for (uint j = 0; j < transition_probabilities->get_no_columns(); ++j) {
      if (i < cpg->get_no_states() && j < cpg->get_no_states()) {
        (*transition_probabilities)(i, j) = cpg->get_trans_probs()(i, j)*stay_cpg_probability;
      }
      if (i < cpg->get_no_states() && j >= cpg->get_no_states()) {
        // NOTE(review): the CpG -> non-CpG block reuses the CpG model's own
        // transition structure (and vice versa below) — verify this is the
        // intended switching topology.
        (*transition_probabilities)(i, j) = cpg->get_trans_probs()(i, j - cpg->get_no_states())*leave_cpg_probability;
      }
      if (i >= cpg->get_no_states() && j < cpg->get_no_states()) {
        (*transition_probabilities)(i, j) = non_cpg->get_trans_probs()(i - cpg->get_no_states(), j)*leave_non_cpg_probability;
      }
      if (i >= cpg->get_no_states() && j >= cpg->get_no_states()) {
        (*transition_probabilities)(i, j) = non_cpg->get_trans_probs()(i - cpg->get_no_states(), j - cpg->get_no_states())*stay_non_cpg_probability;
      }
    }
  }

  //emission probabilities: rows index symbols, columns index merged states;
  //each half keeps its own emission distributions unchanged.
  shared_ptr< HMMMatrix<double> > emission_probabilities(new HMMMatrix<double>(cpg->get_alphabet_size(), cpg->get_no_states()*2));
  for (uint i = 0; i < cpg->get_alphabet_size(); ++i) {
    for (uint j = 0; j < cpg->get_no_states()*2; ++j) {
      if (j < cpg->get_no_states()) {
        (*emission_probabilities)(i, j) = cpg->get_emission_probs()(i, j);
      } else {
        (*emission_probabilities)(i, j) = non_cpg->get_emission_probs()(i, j - cpg->get_no_states());
      }
    }
  }

  HMM<double>* hmm = new HMM<double>(initial_probabilities, transition_probabilities, emission_probabilities);
  hmm->Save_Parameters();
  return shared_ptr< HMM<double> >(hmm);
}
int main() { HMM<int, int> hmm; hmm.loadHMM("hmm.model"); hmm.printHMM(); return 0; }
int main(int argc, const char * argv[]){ //Load the training data TimeSeriesClassificationData trainingData; if( !trainingData.loadDatasetFromFile("HMMTrainingData.grt") ){ cout << "ERROR: Failed to load training data!\n"; return false; } //Remove 20% of the training data to use as test data TimeSeriesClassificationData testData = trainingData.partition( 80 ); //The input to the HMM must be a quantized discrete value //We therefore use a KMeansQuantizer to covert the N-dimensional continuous data into 1-dimensional discrete data const UINT NUM_SYMBOLS = 10; KMeansQuantizer quantizer( NUM_SYMBOLS ); //Train the quantizer using the training data if( !quantizer.train( trainingData ) ){ cout << "ERROR: Failed to train quantizer!\n"; return false; } //Quantize the training data TimeSeriesClassificationData quantizedTrainingData( 1 ); for(UINT i=0; i<trainingData.getNumSamples(); i++){ UINT classLabel = trainingData[i].getClassLabel(); MatrixDouble quantizedSample; for(UINT j=0; j<trainingData[i].getLength(); j++){ quantizer.quantize( trainingData[i].getData().getRowVector(j) ); quantizedSample.push_back( quantizer.getFeatureVector() ); } if( !quantizedTrainingData.addSample(classLabel, quantizedSample) ){ cout << "ERROR: Failed to quantize training data!\n"; return false; } } //Create a new HMM instance HMM hmm; //Set the number of states in each model hmm.setNumStates( 4 ); //Set the number of symbols in each model, this must match the number of symbols in the quantizer hmm.setNumSymbols( NUM_SYMBOLS ); //Set the HMM model type to LEFTRIGHT with a delta of 1 hmm.setModelType( HiddenMarkovModel::LEFTRIGHT ); hmm.setDelta( 1 ); //Set the training parameters hmm.setMinImprovement( 1.0e-5 ); hmm.setMaxNumIterations( 100 ); hmm.setNumRandomTrainingIterations( 20 ); //Train the HMM model if( !hmm.train( quantizedTrainingData ) ){ cout << "ERROR: Failed to train the HMM model!\n"; return false; } //Save the HMM model to a file if( !hmm.save( "HMMModel.grt" ) ){ cout << 
"ERROR: Failed to save the model to a file!\n"; return false; } //Load the HMM model from a file if( !hmm.load( "HMMModel.grt" ) ){ cout << "ERROR: Failed to load the model from a file!\n"; return false; } //Quantize the test data TimeSeriesClassificationData quantizedTestData( 1 ); for(UINT i=0; i<testData.getNumSamples(); i++){ UINT classLabel = testData[i].getClassLabel(); MatrixDouble quantizedSample; for(UINT j=0; j<testData[i].getLength(); j++){ quantizer.quantize( testData[i].getData().getRowVector(j) ); quantizedSample.push_back( quantizer.getFeatureVector() ); } if( !quantizedTestData.addSample(classLabel, quantizedSample) ){ cout << "ERROR: Failed to quantize training data!\n"; return false; } } //Compute the accuracy of the HMM models using the test data double numCorrect = 0; double numTests = 0; for(UINT i=0; i<quantizedTestData.getNumSamples(); i++){ UINT classLabel = quantizedTestData[i].getClassLabel(); hmm.predict( quantizedTestData[i].getData() ); if( classLabel == hmm.getPredictedClassLabel() ) numCorrect++; numTests++; VectorDouble classLikelihoods = hmm.getClassLikelihoods(); VectorDouble classDistances = hmm.getClassDistances(); cout << "ClassLabel: " << classLabel; cout << " PredictedClassLabel: " << hmm.getPredictedClassLabel(); cout << " MaxLikelihood: " << hmm.getMaximumLikelihood(); cout << " ClassLikelihoods: "; for(UINT k=0; k<classLikelihoods.size(); k++){ cout << classLikelihoods[k] << "\t"; } cout << "ClassDistances: "; for(UINT k=0; k<classDistances.size(); k++){ cout << classDistances[k] << "\t"; } cout << endl; } cout << "Test Accuracy: " << numCorrect/numTests*100.0 << endl; return true; }
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. void ReadAndPrepare(char* infile, HMM& q, Alignment* qali=NULL) { char path[NAMELEN]; // Open query file and determine file type char line[LINELEN]=""; // input line FILE* inf=NULL; if (strcmp(infile,"stdin")) { inf = fopen(infile, "r"); if (!inf) OpenFileError(infile); Pathname(path,infile); } else { inf = stdin; if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name); *path='\0'; } fgetline(line,LINELEN-1,inf); // Is it an hhm file? if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) { if (v>=2) cout<<"Query file is in HHM format\n"; // Rewind to beginning of line and read query hhm file rewind(inf); q.Read(inf,path); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); // Add transition pseudocounts to query -> q.p[i][a] q.AddTransitionPseudocounts(); if (!*par.clusterfile) { //compute context-specific pseudocounts? // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);; } else { // Add context specific pseudocount to query q.AddContextSpecificPseudocounts(par.pcm); } q.CalculateAminoAcidBackground(); } // ... 
or is it an a2m/a3m alignment file else if (line[0]=='#' || line[0]=='>') { Alignment* pali; if (qali==NULL) pali=new(Alignment); else pali=qali; if (par.calibrate) { printf("\nError in %s: only HHM files can be calibrated.\n",program_name); printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile); exit(1); } if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n"; // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument) pali->Read(inf,infile,line); // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] // and store marked sequences in name[k] and seq[k] pali->Compress(infile); // Sort out the nseqdis most dissimilar sequences for display in the output alignments pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); if (par.Neff>=0.999) pali->FilterNeff(); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] pali->FrequenciesAndTransitions(q); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM); // Add transition pseudocounts to query -> p[i][a] q.AddTransitionPseudocounts(); if (!*par.clusterfile) { //compute context-specific pseudocounts? 
// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc); } else { // Add context specific pseudocount to query q.AddContextSpecificPseudocounts(par.pcm); } q.CalculateAminoAcidBackground(); if (qali==NULL) delete(pali); } else if (!strncmp(line,"HMMER",5)) { /////////////////////////////////////////////////////////////////////////////////////// // Don't allow HMMER format as input due to the severe loss of sensitivity!!!! (only allowed in HHmake) if (strncmp(program_name,"hhmake",6)) { cerr<<endl<<"Error in "<<program_name<<": HMMER format not allowed as input due to the severe loss of sensitivity!\n"; exit(1); } // Is infile a HMMER3 file? if (!strncmp(line,"HMMER3",6)) { if (v>=2) cout<<"Query file is in HMMER3 format\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer3(inf,path); // Don't add transition pseudocounts to query!! // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); q.CalculateAminoAcidBackground(); } // ... or is infile an old HMMER file? else if (!strncmp(line,"HMMER",5)) { if (v>=2) cout<<"Query file is in HMMER format\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer(inf,path); // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); q.CalculateAminoAcidBackground(); } } else { cerr<<endl<<"Error in "<<program_name<<": unrecognized input file format in \'"<<infile<<"\'\n"; cerr<<"line = "<<line<<"\n"; exit(1); } fclose(inf); if (par.addss==1) CalculateSS(q); if (par.columnscore == 5 && !q.divided_by_local_bg_freqs) q.DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs); if (par.forward>=1) q.Log2LinTransitionProbs(1.0); return; }
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. void ReadInput(char* infile, HMM& q, Alignment* qali=NULL) { char path[NAMELEN]; // Open query file and determine file type char line[LINELEN]=""; // input line FILE* inf=NULL; if (strcmp(infile,"stdin")) { inf = fopen(infile, "r"); if (!inf) OpenFileError(infile); Pathname(path,infile); } else { inf = stdin; if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name); *path='\0'; } fgetline(line,LINELEN-1,inf); // Is infile a HMMER3 file? if (!strncmp(line,"HMMER3",6)) { if (v>=2) cout<<"Query file is in HMMER3 format\n"; cerr<<"WARNING: Use of HMMER3 format as input will result in severe loss of sensitivity!\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer3(inf,path); } // ... or is infile an old HMMER file? else if (!strncmp(line,"HMMER",5)) { if (v>=2) cout<<"Query file is in HMMER format\n"; cerr<<"WARNING: Use of HMMER format as input will result in severe loss of sensitivity!\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer(inf,path); } // ... or is it an hhm file? else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) { if (v>=2) cout<<"Query file is in HHM format\n"; // Rewind to beginning of line and read query hhm file rewind(inf); q.Read(inf,path); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); } // ... 
or is it an alignment file else { Alignment* pali; if (qali==NULL) pali=new(Alignment); else pali=qali; if (par.calibrate) { printf("\nError in %s: only HHM files can be calibrated.\n",program_name); printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile); exit(1); } if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n"; // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument) pali->Read(inf,infile,line); // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] // and store marked sequences in name[k] and seq[k] pali->Compress(infile); // Sort out the nseqdis most dissimilar sequences for display in the output alignments pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); if (par.Neff>=0.999) pali->FilterNeff(); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] pali->FrequenciesAndTransitions(q); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM); if (qali==NULL) delete(pali); } fclose(inf); return; }
///////////////////////////////////////////////////////////////////////////////////// //// MAIN PROGRAM ///////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char* argv_conf[MAXOPT]; // Input arguments from .hhdefaults file (first=1: argv_conf[0] is not used) int argc_conf; // Number of arguments in argv_conf strcpy(par.infile, ""); strcpy(par.outfile, ""); strcpy(par.alnfile, ""); //Default parameter settings par.nseqdis = MAXSEQ - 1; // maximum number of sequences to be written par.showcons = 0; par.cons = 1; par.Ndiff = 0; par.max_seqid = 100; par.coverage = 0; par.pc_hhm_context_engine.pca = 0.0; // no amino acid pseudocounts par.pc_hhm_nocontext_a = 0.0; // no amino acid pseudocounts par.gapb = 0.0; // no transition pseudocounts // Make command line input globally available par.argv = argv; par.argc = argc; RemovePathAndExtension(program_name, argv[0]); // Enable changing verbose mode before defaults file and command line are processed int v = 2; for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "-def")) par.readdefaultsfile = 1; else if (strcmp(argv[i], "-v") == 0) { v = atoi(argv[i + 1]); } } par.v = Log::from_int(v); Log::reporting_level() = par.v; par.SetDefaultPaths(); // Read .hhdefaults file? if (par.readdefaultsfile) { // Process default otpions from .hhconfig file ReadDefaultsFile(argc_conf, argv_conf); ProcessArguments(argc_conf, argv_conf); } // Process command line options (they override defaults from .hhdefaults file) ProcessArguments(argc, argv); Alignment* qali = new Alignment(MAXSEQ, par.maxres); HMM* q = new HMM(MAXSEQDIS, par.maxres); //Create a HMM with maximum of par.maxres match states // q is only available after maxres is known, so we had to move this here for (int i = 1; i <= argc - 1; i++) { if (!strcmp(argv[i], "-name") && (i < argc - 1)) { strmcpy(q->name, argv[++i], NAMELEN - 1); //copy longname to name... 
strmcpy(q->longname, argv[i], DESCLEN - 1); //copy full name to longname } } // Check command line input and default values if (!*par.infile) { help(); HH_LOG(ERROR) << "Input file is missing!" << std::endl; exit(4); } // Get basename RemoveExtension(q->file, par.infile); //Get basename of infile (w/o extension): // Outfile not given? Name it basename.hhm if (!*par.outfile && !*par.alnfile) { RemoveExtension(par.outfile, par.infile); strcat(par.outfile, ".seq"); } // Prepare CS pseudocounts lib if (!par.nocontxt && *par.clusterfile) { InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); } // Set substitution matrix; adjust to query aa distribution if par.pcm==3 SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim); // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. char input_format = 0; ReadQueryFile(par, par.infile, input_format, par.wg, q, qali, pb, S, Sim); // Same code as in PrepareQueryHMM(par.infile,input_format,q,qali), except that we add SS prediction // Add Pseudocounts, if no HMMER input if (input_format == 0) { // Transform transition freqs to lin space if not already done q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, par.gapb, par.gapb); // Comput substitution matrix pseudocounts if (par.nocontxt) { // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q->PreparePseudocounts(R); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode, par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c); } else { // Add full context specific pseudocounts to query q->AddContextSpecificPseudocounts(pc_hhm_context_engine, pc_hhm_context_mode); } } else { q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c); } 
q->CalculateAminoAcidBackground(pb); if (par.columnscore == 5 && !q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs( par.half_window_size_local_aa_bg_freqs, pb); // Write consensus sequence to sequence file // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions() if (*par.outfile) { FILE* outf = NULL; if (strcmp(par.outfile, "stdout")) { outf = fopen(par.outfile, "a"); if (!outf) OpenFileError(par.outfile, __FILE__, __LINE__, __func__); } else outf = stdout; // OLD //// ">name_consensus" -> ">name consensus" //strsubst(q->sname[q->nfirst],"_consensus"," consensus"); //fprintf(outf,">%s\n%s\n",q->sname[q->nfirst],q->seq[q->nfirst]+1); // NEW (long header needed for NR30cons database) fprintf(outf, ">%s\n%s\n", q->longname, q->seq[q->nfirst] + 1); fclose(outf); } // Print A3M/A2M/FASTA output alignment if (*par.alnfile) { HalfAlignment qa; int n = imin(q->n_display, par.nseqdis + (q->nss_dssp >= 0) + (q->nss_pred >= 0) + (q->nss_conf >= 0) + (q->ncons >= 0)); qa.Set(q->name, q->seq, q->sname, n, q->L, q->nss_dssp, q->nss_pred, q->nss_conf, q->nsa_dssp, q->ncons); if (par.outformat == 1) qa.BuildFASTA(); else if (par.outformat == 2) qa.BuildA2M(); else if (par.outformat == 3) qa.BuildA3M(); if (qali->readCommentLine) qa.Print(par.alnfile, par.append, qali->longname); // print alignment to outfile else qa.Print(par.alnfile, par.append); // print alignment to outfile } delete qali; delete q; DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); }
// Viterbi decoding over a (possibly variable-arity) HMM.
// Returns (log-probability of the best state path, the path itself).
// Fixes: the memoized-log lambda performed two hash lookups (count + at) per
// hit where a single find() suffices; the final-state loop compared a signed
// int against numStates() (signed/unsigned mismatch) — now size_t.
pair<double,vector<size_t>> viterbi(string observation, const HMM& model) {
  if (!model.isFinalized())
    throw invalid_argument("Model should be finalized!");

  // omega(l, i) = (backpointer state, best log-prob) for being in state i
  // after consuming the observation up to position l.  (state, prob)
  Matrix<pair<int,double>> omega(observation.length(), model.numStates(),
                                 make_pair(-1, -numeric_limits<double>::infinity()));

  // Memoized natural log: the same probability values recur many times.
  unordered_map<double, double> logmemory;
  auto ln = [&logmemory] (double arg) {
    auto it = logmemory.find(arg); // single lookup (was count() followed by at())
    if (it != logmemory.end())
      return it->second;
    double val = log(arg);
    logmemory.insert(make_pair(arg, val));
    return val;
  };

  // Initialization: start probability times emission of the first symbol.
  for (size_t i = 0; i < model.numStates(); i++)
    omega(0, i) = make_pair(-1, ln(model.startProb(i)) + ln(model.emissionProb(i, observation.substr(0,1))));

  for (size_t l = 1; l < observation.length(); l++) {
    for (size_t i = 0; i < model.numStates(); i++) {
      // Find where we should come from
      pair<int, double> best = make_pair(-1, -numeric_limits<double>::infinity());
      for (auto k : model.incommingStates(i)) {
        // A state of arity a consumes a symbols; can't enter it before position a.
        if (l < model.stateArity(i)) continue;
        double candidate = omega(l - model.stateArity(i), k).second + ln(model.transitionProb(k, i));
        if (candidate > best.second)
          best = make_pair(k, candidate);
      }
      if (best.first == -1) {
        // State is not possible
        omega(l, i) = make_pair(-1, -numeric_limits<double>::infinity());
      } else {
        // Update current cell with right values
        omega(l, i) = make_pair(best.first, best.second + ln(model.emissionProb(i, observation.substr(l - model.stateArity(i) + 1, model.stateArity(i)))));
      }
    }
  }

  // Final result is now in the last row: pick the best-scoring end state.
  pair<int, double> best = make_pair(-1, -numeric_limits<double>::infinity());
  for (size_t i = 0; i < model.numStates(); i++) { // was int: signed/unsigned mix
    double candidate = omega(observation.length()-1, i).second;
    if (candidate > best.second)
      best = make_pair((int)i, candidate);
  }
  if (best.first == -1)
    return make_pair(-numeric_limits<double>::infinity(), vector<size_t>());

  // Backtrack through the stored backpointers.
  // TODO: Could probably be refactored
  vector<size_t> stateTrace;
  stateTrace.push_back(best.first);
  size_t pos = observation.length() - 1;
  auto cur = omega(pos, best.first);
  size_t prevState = best.first;
  while (cur.first != -1) {
    stateTrace.push_back(cur.first);
    pos -= model.stateArity(prevState);
    prevState = cur.first;
    cur = omega(pos, cur.first);
  }
  return make_pair(best.second, vector<size_t>(stateTrace.rbegin(), stateTrace.rend()));
}