Exemple #1
0
// Entry point for the log-classification pipeline.
//   args[1] = untagged training corpus, args[2] = tagged training corpus,
//   args[3] = logs to tag,              args[4] = minimum support for FP-growth.
// Trains an HMM tagger, tags the logs, clusters the tagged lines with
// FP-growth, prints timing, and reports tagging efficiency against the
// reference file "manual".
int main_run(char * args[]){
	vector<vector<T>> ufile = read(args[1]);  // untagged corpus
	vector<vector<T>> tfile = read(args[2]);  // tagged corpus
	vector<vector<T>> logs = read(args[3]);   // logs to classify
	HMM<T> hmm;
	hmm.Train(ufile, tfile);
	map<int, string> tagcodemap;
	vector<vector<T>> taglist = hmm.TagLogs(logs, tagcodemap);
	vector<vector<T>> transformed = transform2(taglist);
	// FP-growth clustering after classification.
	int s = atoi(args[4]);  // minimum support threshold
	FPTree<T> tree(s);
	// Consider the position of each token in the line.
	tree.build(transformed);
	std::vector<std::vector<T>> r;
	clock_t t = clock();
	tree.mine(r);
	std::cout<<"Initial pattern no: "<<r.size()<<"\n";
	Cluster<T> data;
	data.AssociatePatterns(transformed, r);
	cout << "Time: " << double(clock() - t) / CLOCKS_PER_SEC << "\n";
	data.DisplayCluster(logs, transformed, r, tagcodemap);
	Analyze<T> analyze;
	vector<vector<T>> manual = read("manual");
	cout<<"\n"<<analyze.Efficiency(taglist, manual);
	return 0;  // bug fix: flowing off the end of a non-main int function is UB
}
// Load every "*.hmm" file found in strPath into m_mHMM, keyed by the
// file name without its ".hmm" extension. Any previously loaded models
// are released first via cleanHMMs().
void HMMModel::loadHMMs(QString strPath)
{
	// Normalise the directory path so the file name can be appended.
	if (!strPath.endsWith('/') && !strPath.endsWith('\\'))
	{
		strPath += "/";
	}
	QStringList filters;
	filters << "*.hmm";
	QDir dir(strPath);
	QStringList list = dir.entryList(filters);

	cleanHMMs();  // drop any previously loaded models
	QString strFileName;
	int nPos;
	std::string strSignName, strPathFile;
	HMM* phmm;
	const char* pCharCom;
	char* tmp;
	for (int l=0; l<list.count(); l++)
	{
		strFileName = list.at(l);
		nPos = strFileName.lastIndexOf(".hmm");
		strSignName = strFileName.left(nPos).toStdString();
		strPathFile = strPath.toLocal8Bit()+strFileName.toLocal8Bit();
		pCharCom = strPathFile.c_str();
		// HMM::Load() takes a non-const char*, so pass it a writable copy.
		tmp = new char[strlen(pCharCom)+1];
		strcpy(tmp, pCharCom);

		phmm = new HMM();
		phmm->Load(tmp);
		delete[] tmp;  // bug fix: this buffer was leaked once per file
		m_mHMM.insert(std::pair<std::string, HMM*>(strSignName, phmm));
	}
}
// Restore a discrete-emission HMM saved by the matching SaveHMM():
// type tag, state count, transition matrix, then one probability
// vector per state.
void LoadHMM(HMM<distribution::DiscreteDistribution>& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type;
  size_t states;

  sr.LoadParameter(type, "hmm_type");
  if (type != "discrete")
  {
    Rcpp::Rcout << "Cannot load non-discrete HMM (of type " << type << ") as "
        << "discrete HMM!" << std::endl;
    return;  // bug fix: previously fell through and loaded mismatched data anyway
  }

  sr.LoadParameter(states, "hmm_states");

  // Load transition matrix.
  sr.LoadParameter(hmm.Transition(), "hmm_transition");

  // Now each emission distribution.
  hmm.Emission().resize(states);
  for (size_t i = 0; i < states; ++i)
  {
    std::stringstream s;
    s << "hmm_emission_distribution_" << i;
    sr.LoadParameter(hmm.Emission()[i].Probabilities(), s.str());
  }

  // Discrete observations are one-dimensional symbols.
  hmm.Dimensionality() = 1;
}
// One pass of Ney-Essen-style reclustering.
// 1. Recomputes the class prior from the current classVector (MLE over types).
// 2. If HMMs are enabled (numberStates > 0), retrains each class HMM on the
//    words currently assigned to that class, then swaps the new model in.
// 3. Tries to move every sufficiently frequent word (counts > FREQ_CUTOFF)
//    to its best class via bestCluster().
// Returns the number of words whose class assignment changed.
int 
Clusters::
reclusterNeyEssen()
{
  cerr << "Calculating MLE for prior probabilities" << endl;
  // Each type contributes equal mass 1/numberTypes to its class's prior.
  vector<double> prior(numberClasses,0.0l);
  double xxx = 1.0l/(double)numberTypes;
  for (int i = 0; i < numberTypes; i++){
    int c = classVector[i];
    prior[c] += xxx;
  }
  for (int i = 0; i < numberClasses;i++)
    cerr << i << " " << prior[i] << endl;
  if (numberStates > 0){
    cerr << "Training all the HMMs" << endl;
    for (int c = 0; c < numberClasses; c++){
      //      cerr << "Training HMM " << c << endl;
      // Train a fresh HMM from the old one via one EM pass per word,
      // then replace (and free) the old model.
      HMM* hmmPtr = hmms[c];
      HMM* newHmmPtr = new HMM(numberStates, alphabetSize);
      for (int i = 0; i < numberTypes; i++){
	if (classVector[i] == c){
	  // then this word is in the right class
	  // so train it on word i
	  const string & word = *(corpus.wordArray[i]);
	  vector<int> v;
	  hmmPtr->convertString(word,v);
	  // FIXME 
	  // Optionally weight each word by its corpus frequency.
	  double weight = 1.0l;
	  if (USE_TRUE_WEIGHT){
	    weight = corpus.countArray[i];
	  }
	  hmmPtr->emSingle(*newHmmPtr, weight, v);
	}
      }
      newHmmPtr->normalise();
      hmms[c] = newHmmPtr;
      delete hmmPtr;
    }
  }
  // Reassign frequent words to their best class; count the moves.
  int something = 0;
  for (int i = 0; i < numberTypes; i++){
    //    cerr << "Word " << i;
    int w = sortedWords[i];
    //cerr << *(corpus.wordArray[w]) << endl;
    if (counts[w] > FREQ_CUTOFF){
      //cerr << "Doing " << w << endl;
      if (bestCluster(w, prior)){
	something++;
      }
    }
  }
 
  return something;
}
// [[Rcpp::export]]
// Run a particle marginal Metropolis-Hastings (PMMH) chain for one of
// the battery models and return the chain plus diagnostics.
//   inputs:         current / observations matrices and theta list
//   modellist:      "model" ("model1"|"model2") and "prior" ("uniform"|"normal")
//   algoparameters: nparticles, niterations, cholesky_proposal, initial_theta
List launch_pmmh_cpp(List inputs, List modellist,
                 List algoparameters){
  string strmodel = as<string>(modellist["model"]);
  string strprior = as<string>(modellist["prior"]);
  NumericMatrix current = inputs["current"];
  NumericMatrix observations = inputs["observations"];
  List theta = inputs["theta"];
  int nparticles = as<int>(algoparameters["nparticles"]);
  int niterations = as<int>(algoparameters["niterations"]);
  NumericMatrix cholesky_proposal = algoparameters["cholesky_proposal"];
  NumericVector initial_theta = algoparameters["initial_theta"];
  // Bug fix: model/prior were previously left uninitialized when the
  // supplied names matched nothing, causing undefined behaviour at the
  // dereference/delete below.
  HMM* model = NULL;
  Prior* prior = NULL;
  if (strmodel.compare(string("model1")) == 0){
    model = new BatteryModel();
    if (strprior.compare(string("uniform")) == 0){
      prior = new BatteryModelUniformPrior();
    }
    if (strprior.compare(string("normal")) == 0){
      prior = new BatteryModelNormalPrior();
    }
  }
  if (strmodel.compare(string("model2")) == 0){
    model = new BatteryModel2();
    if (strprior.compare(string("uniform")) == 0){
      prior = new BatteryModel2UniformPrior();
    }
    if (strprior.compare(string("normal")) == 0){
      prior = new BatteryModel2NormalPrior();
    }
  }
  if (model == NULL){
    Rcpp::stop("unknown model: expected 'model1' or 'model2'");
  }
  if (prior == NULL){
    delete model;
    Rcpp::stop("unknown prior: expected 'uniform' or 'normal'");
  }
  model->set_input(current);
  model->set_parameters(theta);
  model->set_observations(observations);
  PMMH pmmh(nparticles, niterations, model->dim_states);
  pmmh.set_prior(prior);
  pmmh.init(model, initial_theta);
  pmmh.set_proposal_cholesky(cholesky_proposal);
  pmmh.run();

  delete model;
  delete prior;
  return Rcpp::List::create(
                            Rcpp::Named("chain")= pmmh.chain_parameters,
                            Rcpp::Named("naccepts") = pmmh.naccepts,
                            Rcpp::Named("loglikelihood") = pmmh.loglikelihood,
                            Rcpp::Named("loglikelihood_proposal") = pmmh.loglikelihood_proposal,
                            Rcpp::Named("proposals") = pmmh.proposals,
                            Rcpp::Named("nparticles") = nparticles,
                            Rcpp::Named("niterations") = niterations,
                            Rcpp::Named("cholesky_proposal") = cholesky_proposal);
}
// Restore a GMM-emission HMM saved by the matching SaveHMM():
// type tag, state count, transition matrix, then per state the number
// of gaussians, each gaussian's mean and covariance, and the weights.
void LoadHMM(HMM<gmm::GMM<> >& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type;
  size_t states;

  sr.LoadParameter(type, "hmm_type");
  if (type != "gmm")
  {
    Rcpp::Rcout << "Cannot load non-GMM HMM (of type " << type << ") as "
        << "a Gaussian Mixture Model HMM!" << std::endl;
    return;  // bug fix: previously fell through and loaded mismatched data anyway
  }

  sr.LoadParameter(states, "hmm_states");

  // Load transition matrix.
  sr.LoadParameter(hmm.Transition(), "hmm_transition");

  // Now each emission distribution.
  hmm.Emission().resize(states, gmm::GMM<>(1, 1));
  for (size_t i = 0; i < states; ++i)
  {
    std::stringstream s;
    s << "hmm_emission_" << i << "_gaussians";
    size_t gaussians;
    sr.LoadParameter(gaussians, s.str());

    s.str("");
    // The dimensionality is not stored explicitly; recover it from the
    // length of the first gaussian's mean vector.
    arma::vec meanzero;
    s << "hmm_emission_" << i << "_gaussian_0_mean";
    sr.LoadParameter(meanzero, s.str());
    size_t dimensionality = meanzero.n_elem;

    // Initialize GMM correctly.
    hmm.Emission()[i].Gaussians() = gaussians;
    hmm.Emission()[i].Dimensionality() = dimensionality;

    for (size_t g = 0; g < gaussians; ++g)
    {
      s.str("");
      s << "hmm_emission_" << i << "_gaussian_" << g << "_mean";
      sr.LoadParameter(hmm.Emission()[i].Means()[g], s.str());

      s.str("");
      s << "hmm_emission_" << i << "_gaussian_" << g << "_covariance";
      sr.LoadParameter(hmm.Emission()[i].Covariances()[g], s.str());
    }

    s.str("");
    s << "hmm_emission_" << i << "_weights";
    sr.LoadParameter(hmm.Emission()[i].Weights(), s.str());
  }

  hmm.Dimensionality() = hmm.Emission()[0].Dimensionality();
}
Exemple #7
0
// Construct a posterior Viterbi decoder over `hmm`.
// `shouldAdd` is stored as addRatherThanMultiply: when true, scores are
// combined by addition (log-space style) instead of multiplication.
PosteriorViterbi::PosteriorViterbi(HMM &hmm,
				   bool shouldAdd)
  : numStates(hmm.countStates()), hmmGraph(hmm),
    addRatherThanMultiply(shouldAdd)
{
  // ctor — all work happens in the initializer list.
}
// Persist a GMM-emission HMM. Layout (mirrored by LoadHMM): a type tag,
// the state count, the transition matrix, then per state the gaussian
// count, the mixture weights, and each gaussian's mean and covariance.
void SaveHMM(const HMM<gmm::GMM<> >& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type = "gmm";
  size_t states = hmm.Transition().n_rows;

  sr.SaveParameter(type, "hmm_type");
  sr.SaveParameter(states, "hmm_states");
  sr.SaveParameter(hmm.Transition(), "hmm_transition");

  for (size_t state = 0; state < states; ++state)
  {
    // Parameter names are keyed by state (and gaussian) index.
    std::ostringstream name;
    name << "hmm_emission_" << state << "_gaussians";
    sr.SaveParameter(hmm.Emission()[state].Gaussians(), name.str());

    name.str("");
    name << "hmm_emission_" << state << "_weights";
    sr.SaveParameter(hmm.Emission()[state].Weights(), name.str());

    for (size_t g = 0; g < hmm.Emission()[state].Gaussians(); ++g)
    {
      name.str("");
      name << "hmm_emission_" << state << "_gaussian_" << g << "_mean";
      sr.SaveParameter(hmm.Emission()[state].Means()[g], name.str());

      name.str("");
      name << "hmm_emission_" << state << "_gaussian_" << g << "_covariance";
      sr.SaveParameter(hmm.Emission()[state].Covariances()[g], name.str());
    }
  }
}
// Persist a discrete-emission HMM. Layout (mirrored by LoadHMM): a type
// tag, the state count, the transition matrix, and one probability
// vector per state.
void SaveHMM(const HMM<distribution::DiscreteDistribution>& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type = "discrete";
  size_t states = hmm.Transition().n_rows;

  sr.SaveParameter(type, "hmm_type");
  sr.SaveParameter(states, "hmm_states");
  sr.SaveParameter(hmm.Transition(), "hmm_transition");

  for (size_t state = 0; state < states; ++state)
  {
    // One parameter per state, keyed by state index.
    std::ostringstream name;
    name << "hmm_emission_distribution_" << state;
    sr.SaveParameter(hmm.Emission()[state].Probabilities(), name.str());
  }
}
Exemple #10
0
// Build an HMM from every model in an ESBTL molecular system.
// For each model, `func` extracts its data columns and returns a
// cluster-label row vector; labels are shifted by `offset` so they stay
// globally unique across models. The pooled data is then fit with
// Baum-Welch, initialized by the GMM produced by `creator`.
HMM
buildHMM(const ESBTL::Default_system & system, GMMCreator creator, ClusteringFunctor func) {
  arma::mat data;
  arma::urowvec labels;
  HMM hmm;
  unsigned int offset = 0;
  for (ESBTL::Default_system::Models_const_iterator it_model = system.models_begin();
      it_model != system.models_end();
      ++it_model) {
    const ESBTL::Default_system::Model & model = *it_model;
    arma::mat tempdata;
    arma::urowvec templabels = func(model, tempdata); 
    // Shift this model's labels past all labels seen so far.
    templabels += offset;
    data = arma::join_rows(data, tempdata);
    labels = arma::join_rows(labels, templabels);
    // NOTE(review): next offset is the max label so far — assumes func
    // returns labels starting at 0 (or 1) per model; confirm with func.
    offset = arma::max(labels);
  }
  hmm.baumWelchCached(data, creator(data, labels));
  return hmm;
}
// Restore a Gaussian-emission HMM saved by the matching SaveHMM():
// type tag, state count, transition matrix, then per state a mean
// vector and covariance matrix.
void LoadHMM(HMM<distribution::GaussianDistribution>& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type;
  size_t states;

  sr.LoadParameter(type, "hmm_type");
  if (type != "gaussian")
  {
    Rcpp::Rcout << "Cannot load non-Gaussian HMM (of type " << type << ") as "
        << "a Gaussian HMM!" << std::endl;
    return;  // bug fix: previously fell through and loaded mismatched data anyway
  }

  sr.LoadParameter(states, "hmm_states");

  // Load transition matrix.
  sr.LoadParameter(hmm.Transition(), "hmm_transition");

  // Now each emission distribution.
  hmm.Emission().resize(states);
  for (size_t i = 0; i < states; ++i)
  {
    std::stringstream s;
    s << "hmm_emission_mean_" << i;
    sr.LoadParameter(hmm.Emission()[i].Mean(), s.str());

    s.str("");
    s << "hmm_emission_covariance_" << i;
    sr.LoadParameter(hmm.Emission()[i].Covariance(), s.str());
  }

  // The observation dimensionality is implied by the mean vectors.
  hmm.Dimensionality() = hmm.Emission()[0].Mean().n_elem;
}
// Persist a Gaussian-emission HMM. Layout (mirrored by LoadHMM): a type
// tag, the state count, the transition matrix, then per state a mean
// vector and covariance matrix.
void SaveHMM(const HMM<distribution::GaussianDistribution>& hmm,
             util::SaveRestoreUtility& sr)
{
  std::string type = "gaussian";
  size_t states = hmm.Transition().n_rows;

  sr.SaveParameter(type, "hmm_type");
  sr.SaveParameter(states, "hmm_states");
  sr.SaveParameter(hmm.Transition(), "hmm_transition");

  for (size_t state = 0; state < states; ++state)
  {
    // Parameter names are keyed by state index.
    std::ostringstream name;
    name << "hmm_emission_mean_" << state;
    sr.SaveParameter(hmm.Emission()[state].Mean(), name.str());

    name.str("");
    name << "hmm_emission_covariance_" << state;
    sr.SaveParameter(hmm.Emission()[state].Covariance(), name.str());
  }
}
int main() {
    kmeans("./dataset", "./result");
    Forecast forecast("./result");
    HMM hmm = HMM("./dataset", "./dictionnary.txt", forecast);

    hmm.print();
    {
        cout << "Save" << endl;
        std::ofstream ofs("hmm_save");
        boost::archive::text_oarchive oa(ofs);
        oa << hmm;
    }
    
    HMM hmm2 = HMM();
    {
        cout << "Load" << endl;
        std::ifstream ifs("hmm_save");
        boost::archive::text_iarchive ia(ifs);
        ia >> hmm2;
    }
    hmm2.print();    
    return (EXIT_SUCCESS);
}
Exemple #14
0
int main(int argc, char* argv[])
{
  // Build the demo HMM, print it, and run forward-Viterbi over its
  // observation sequence using the model's own parameters.
  HMM h;
  h.init();
  std::cout << h;

  forward_viterbi(h.get_observations(),
                  h.get_states(),
                  h.get_start_probability(),
                  h.get_transition_probability(),
                  h.get_emission_probability());
  return 0;
}
Exemple #15
0
// Calculate secondary structure for given HMM and return prediction.
// Writes a BLAST-style log-odds profile (.mtx) for q into a unique temp
// file, runs the external predictor on it, stores the result in q, and
// removes the temp files.
void CalculateSS(HMM& q, char *ss_pred, char *ss_conf)
{
  char tmpfile[]="/tmp/hhCalcSSXXXXXX";
  int fd = mkstemp(tmpfile);
  if (fd == -1) {
    cerr << "ERROR! Could not create tmp-file!\n"; 
    exit(4);
  }
  // Bug fix: the descriptor returned by mkstemp was previously leaked;
  // only the unique file name is needed below, so close it right away.
  close(fd);
  
  // Write log-odds matrix from q to tmpfile.mtx
  char filename[NAMELEN];
  FILE* mtxf = NULL;

  strcpy(filename,tmpfile);
  strcat(filename,".mtx");
  mtxf = fopen(filename,"w");
  if (!mtxf) OpenFileError(filename);

  fprintf(mtxf,"%i\n",q.L);
  fprintf(mtxf,"%s\n",q.seq[q.nfirst]+1);
  // Fixed header block expected by the downstream predictor.
  fprintf(mtxf,"2.670000e-03\n4.100000e-02\n-3.194183e+00\n1.400000e-01\n2.670000e-03\n4.420198e-02\n-3.118986e+00\n1.400000e-01\n3.176060e-03\n1.339561e-01\n-2.010243e+00\n4.012145e-01\n");
  
  // One row per residue: scaled log-odds against the background pb[].
  for (int i = 1; i <= q.L; ++i) 
    {
      fprintf(mtxf,"-32768 ");
      for (int a = 0; a < 20; ++a)
	{
	  int tmp = iround(50*flog2(q.p[i][s2a[a]]/pb[s2a[a]]));
	  fprintf(mtxf,"%5i ",tmp);
	  if (a == 0) {   // insert logodds value for B
	    fprintf(mtxf,"%5i ",-32768);
	  } else if (a == 18) {   // insert logodds value for X
	    fprintf(mtxf,"%5i ",-100);
	  } else if (a == 19) {   // insert logodds value for Z
	    fprintf(mtxf,"%5i ",-32768);
	  }
	}
      fprintf(mtxf,"-32768 -400\n");
    }
  fclose(mtxf);

  // Calculate secondary structure
  CalculateSS(ss_pred, ss_conf, tmpfile);
  
  q.AddSSPrediction(ss_pred, ss_conf);

  // Remove temp-files
  std::string command = "rm " + (std::string)tmpfile + "*";
  runSystem(command,v);
}
Exemple #16
0
/////////////////////////////////////////////////////////////////////////////////////
// Do precalculations for q and t to prepare comparison
/////////////////////////////////////////////////////////////////////////////////////
// Prepare the template HMM `t` for comparison against the query `q`.
//   format == 0 : HHM format — apply transition and amino-acid
//                 pseudocounts to the template.
//   otherwise   : HMMER format — skip transition/CS pseudocounts and
//                 use pcm = 0 so t.p[i][a] = t.f[i][a].
// Finally folds the null model of q into t.
void PrepareTemplate(HMM& q, HMM& t, int format)
{
    if (format==0) // HHM format
    {
        // Add transition pseudocounts to template
        t.AddTransitionPseudocounts();

	// Don't use CS-pseudocounts because of runtime!!!
	// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	t.PreparePseudocounts();
	
	// Add amino acid pseudocounts to template:  p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	t.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);

        t.CalculateAminoAcidBackground();
    }
    else // HHMER format
    {
        // Don't add transition pseudocounts to template
        // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0);

        // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
        // t.PreparePseudocounts();

        // DON'T ADD amino acid pseudocounts to template: pcm=0!  t.p[i][a] = t.f[i][a]
        t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
        t.CalculateAminoAcidBackground();
    }

    // Forward-algorithm scoring needs linear-space transition probabilities.
    if (par.forward>=1) t.Log2LinTransitionProbs(1.0);

    // Factor Null model into HMM t
    // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p
    t.IncludeNullModelInHMM(q,t);  // Can go BEFORE the loop if not dependent on template

    return;
}
int main (int argc, const char * argv[])
{
    // Train a continuous HMM gesture classifier on a pre-segmented
    // accelerometer data set and report its cross-validation accuracy.
    TimeSeriesClassificationData trainingData;   // holds the training samples
    GestureRecognitionPipeline pipeline;         // wraps the classifier and any pre/post processing

	string dirPath = "/home/vlad/AndroidStudioProjects/DataCapture/dataSetGenerator/build";

	if (!trainingData.loadDatasetFromFile(dirPath + "/acc-training-set-segmented.data")) {
		printf("Cannot open training set\n");
		return 0;
	}
	printf("Successfully opened training data set ...\n");

    // Configure a continuous left-right HMM with auto-estimated sigma.
    HMM hmm;
    hmm.setHMMType( HMM_CONTINUOUS );
    hmm.setDownsampleFactor( 5 );
    hmm.setAutoEstimateSigma( true );
    hmm.setSigma( 20.0 );
    hmm.setModelType( HMM_LEFTRIGHT );
    hmm.setDelta( 1 );
//    LowPassFilter lpf(0.1, 1, 3);
//    pipeline.setPreProcessingModule(lpf);

    pipeline.setClassifier( hmm );

    pipeline.train(trainingData, 20);

    // Accuracy measured during the k-fold cross-validation run above.
    double accuracy = pipeline.getCrossValidationAccuracy();
    printf("Accuracy: %f\n", accuracy);
    return 0;
}
Exemple #18
0
// Scaled forward-backward over an observation string for a generalized
// HMM whose states may emit multiple symbols at once (stateArity > 1).
// Returns (cs, forward, backward) where cs[i] is the scaling constant
// for position i, forward(i, s) the scaled forward probability, and
// backward(i, s) the scaled backward probability.
// Throws runtime_error if the model has not been finalized.
tuple<vector<double>, Matrix<double>, Matrix<double>> forward_backward(string obs, const HMM& model)
{
    if (!model.isFinalized())
        throw runtime_error("Model should be finalized!");
    
    // Forward algorithm
    Matrix<double> forward(obs.length(), model.numStates(), 0);
    vector<double> cs(obs.length(), 0);
    
    // Calculate c1
    for (size_t state = 0; state < model.numStates(); state++)
        cs[0] += model.startProb(state) * model.emissionProb(state, obs.substr(0,1));
    // Base case
    for (size_t state = 0; state < model.numStates(); state++)
        forward(0, state) = model.startProb(state) * model.emissionProb(state, obs.substr(0,1)) / cs[0];

    // Recursion
    for (size_t i = 1; i < obs.length(); i++) {
        vector<double> delta(model.numStates(), 0);
        for (size_t state = 0; state < model.numStates(); state++) {
            // A state that emits k symbols cannot end before position k.
            if (i < model.stateArity(state))
                continue;
            
            for (auto prevState : model.incommingStates(state)) {
                double val = forward(i - model.stateArity(state), prevState) * model.transitionProb(prevState, state);
                // Undo the scaling constants of the positions this
                // multi-symbol emission spans.
                for (size_t k = 1; k < model.stateArity(state); k++)
                    val /= cs[i - k];
                
                delta[state] += val;
            }
            delta[state] *= model.emissionProb(state, obs.substr(i - model.stateArity(state) + 1, model.stateArity(state)));
            
            cs[i] += delta[state];
        }
        
        // Normalize by the per-position scaling constant.
        for (size_t state = 0; state < model.numStates(); state++) {
            forward(i, state) = delta[state] / cs[i];
        }
    }
    
    // Backward algorithm
    Matrix<double> backward(obs.length(), model.numStates(), 0);
    const size_t N = obs.length() - 1;
    for (size_t state = 0; state < model.numStates(); state++)
        backward(N, state) = 1;
    
    // i is signed so the loop can terminate below zero.
    for (long i = N - 1; i >= 0; i--) {
        for (size_t state = 0; state < model.numStates(); state++) {
            double prob = 0;
            
            for (auto nextState : model.outgoingStates(state)) {
                // Skip successors whose emission would run past the end.
                if (i + model.stateArity(nextState) > N)
                    continue;
                
                double val = backward(i + model.stateArity(nextState), nextState) * model.transitionProb(state, nextState)
                               * model.emissionProb(nextState, obs.substr(i + 1, model.stateArity(nextState)));
                
                // Divide out the scaling constants the emission spans.
                for (size_t k = 0; k < model.stateArity(nextState); k++)
                    val /= cs[i + 1 + k];
                
                prob += val;
            }
            backward(i, state) = prob;
        }
    }
    
    return make_tuple(cs, forward, backward);
}
Exemple #19
0
// Construct a Viterbi decoder over `hmm`; `posterior` selects
// posterior decoding (stored as-is for later use).
FastViterbi::FastViterbi(HMM &hmm,bool posterior)
  : numStates(hmm.countStates()), hmmGraph(hmm), posterior(posterior)
{
  // ctor — all work happens in the initializer list.
}
// Build the clustering state for a corpus:
//  - counts word-type frequencies,
//  - sorts types by descending frequency (sortedWords),
//  - assigns initial classes (randomised, or the numberClasses-1 most
//    frequent types each get their own class with everything else in
//    the last class),
//  - builds a linked index (first/next) of each type's token positions,
//  - accumulates class unigram and bigram counts,
//  - optionally creates one randomised HMM per class.
Clusters::
Clusters(int numberClasses_, 
	 const SimpleCorpusOne & corpus_,
	 int numberStates_,
	 int alphabetSize_,
	 bool randomised)
  :
  numberClasses(numberClasses_), 
  numberTypes(corpus_.numberTypes), 
  numberTokens(corpus_.numberTokens),
  numberStates(numberStates_),
  alphabetSize(alphabetSize_),
  data(corpus_.data),
  corpus(corpus_),
  clusterBigrams(numberClasses_,numberClasses_)
{
  classVector.resize(numberTypes);
  counts.resize(numberTypes); 
  sortedWords.resize(numberTypes);
  first.resize(numberTypes);
  clusterUnigrams.resize(numberClasses);
  // next[i] = position of the following occurrence of data[i]'s type;
  // numberTokens acts as the end-of-chain sentinel.
  next = new int[numberTokens];
  for (int i = 0; i < numberTokens; i++)
    next[i] = numberTokens;
  // Default every type to the last class until assigned below.
  for (int w = 0; w < numberTypes; w++){
    counts[w]=0;
    classVector[w] = numberClasses -1;
  }
  // counts are set
  for (int i = 0; i < numberTokens; i++)
    counts[data[i]]++;
  // now find the most frequent numberClasses -1 of them.
  vector< pair<int,int> > countsTable(numberTypes);
  for (int i = 0; i < numberTypes; i++){
    countsTable[i] = pair<int,int>(counts[i],i);
    //cerr << counts[i] << " " << i << endl;
  }
  
  cerr << "Sorting words" << endl;
  sort(countsTable.begin(),countsTable.end());

  // sortedWords holds type ids in descending frequency order.
  for (int i = 0; i < numberTypes; i++){
    first[i] = -1;
    sortedWords[i] = countsTable[numberTypes - 1 - i].second;
    //cerr << "sort " << i << " " << sortedWords[i] << " , n =" << countsTable[numberTypes - 1 - i].first << endl;
  }

  if (randomised)
    {
      // Random initial class for every sufficiently frequent type.
      for (int i = 0; i < numberTypes; i++){
	if (counts[i] > FREQ_CUTOFF){
	  int rc = (int) (1.0 * numberClasses *rand()/(RAND_MAX+1.0));
	  classVector[i] = rc;
	}
      }
    }
  else {
    // Deterministic init: i-th most frequent type gets class i.
    for (int i = 0; i < numberClasses-1; i++){
      classVector[sortedWords[i]]= i;
    }
  }
  
  // Thread the first/next occurrence lists and count class bigrams.
  vector<int> last(numberTypes,0);
  cerr << "Indexing data" << endl;
  for (int i = 0; i < numberTokens-1; i++){
    int w = data[i];
    int w2 = data[i+1];
    if (w2 < 0 || w2 > numberTypes -1){
      cerr << i+1 << " " << w2 << endl;
    }
    assert(w >= 0 && w < numberTypes);
    assert(w2 >= 0 && w2 < numberTypes);
    if (first[w] == -1){
      first[w] = i;
      last[w] = i;
    }
    else
      {
	next[last[w]] = i;
	last[w] = i;
      }
    int c1 = classVector[w];
    int c2 = classVector[w2];
    assert(c1 >= 0 && c1 < numberClasses);
    assert(c2 >= 0 && c2 < numberClasses);
    clusterBigrams(c1,c2)++;
    clusterUnigrams[c1]++;
  }
  cerr << "Finished indexing " << endl;

  // be careful
  // The loop above stops one short; count the final token's unigram.
  clusterUnigrams[classVector[data[numberTokens-1]]]++;
  cerr << "Numberstates " << numberStates << endl;
  if (numberStates > 0){
    cerr << "Starting to do the HMMs" << endl;
    hmms.resize(numberClasses);
    for (int i = 0; i < numberClasses; i++){
      HMM* hmmPtr = new HMM(numberStates, alphabetSize);
      hmmPtr->randomise();
      hmmPtr->normalise();
      hmms[i] = hmmPtr;
    }
  }
}
int main(int argc, const char * argv[]){

    // Full life cycle of a GRT continuous HMM: load data, hold out a
    // test split, train, round-trip the model through a file, then
    // score every held-out sample.
    TimeSeriesClassificationData trainingData;
    if( !trainingData.load("HMMTrainingData.grt") ){
        cout << "ERROR: Failed to load training data!\n";
        return false;
    }

    // Keep 80% for training; the remaining 20% becomes test data.
    TimeSeriesClassificationData testData = trainingData.partition( 80 );

    // Configure a continuous left-right HMM committee.
    HMM hmm;
    hmm.setHMMType( HMM_CONTINUOUS );
    hmm.setDownsampleFactor( 5 );   // higher = faster prediction, possibly lower accuracy
    hmm.setCommitteeSize( 10 );     // number of top models combined per prediction
    hmm.setAutoEstimateSigma( true );
    hmm.setSigma( 20.0 );           // sigma floor (or fixed sigma when auto-estimate is off)
    hmm.setModelType( HMM_LEFTRIGHT );
    hmm.setDelta( 1 );              // left-right: states advance by at most 1

    if( !hmm.train( trainingData ) ){
        cout << "ERROR: Failed to train the HMM model!\n";
        return false;
    }

    // Round-trip the trained model through a file.
    if( !hmm.save( "HMMModel.grt" ) ){
        cout << "ERROR: Failed to save the model to a file!\n";
        return false;
    }
    if( !hmm.load( "HMMModel.grt" ) ){
        cout << "ERROR: Failed to load the model from a file!\n";
        return false;
    }

    // Score the held-out samples and print per-sample diagnostics.
    double numCorrect = 0;
    double numTests = 0;
    for(UINT i=0; i<testData.getNumSamples(); i++){

        UINT classLabel = testData[i].getClassLabel();
        hmm.predict( testData[i].getData() );

        if( classLabel == hmm.getPredictedClassLabel() ) numCorrect++;
        numTests++;

        cout << "ClassLabel: " << classLabel;
        cout << " PredictedClassLabel: " << hmm.getPredictedClassLabel();
        cout << " MaxLikelihood: " << hmm.getMaximumLikelihood();

        cout << "  ClassLikelihoods: ";
        VectorFloat classLikelihoods = hmm.getClassLikelihoods();
        for(UINT k=0; k<classLikelihoods.size(); k++){
            cout << classLikelihoods[k] << "\t";
        }

        cout << "ClassDistances: ";
        VectorFloat classDistances = hmm.getClassDistances();
        for(UINT k=0; k<classDistances.size(); k++){
            cout << classDistances[k] << "\t";
        }
        cout << endl;
    }

    cout << "Test Accuracy: " << numCorrect/numTests*100.0 << endl;

    return true;
}
// MATLAB gateway: fit an HMM with Gaussian emissions to a data vector.
//   prhs[0] = data (real double matrix; leading/trailing NaNs are trimmed)
//   prhs[1] = MxM transition matrix
//   prhs[2] = Mx2 gaussian definitions (mean, sigma per state)
//   prhs[3] = optional options struct or configuration file name
// Outputs (up to 5): viterbi state sequence (NaN outside the trimmed
// range), fitted transition matrix, emission matrix, binning centers,
// and iteration count.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
	double *data, *transition, *gauss, *statesOut;
	unsigned int dataSize[2], dataPointCount, dataStart, dataEnd, statesCount, iterationCount;

	/* Check for proper number of arguments. */
	if(nrhs < 3) {
		mexErrMsgIdAndTxt(
			"MATLAB:Fit:HMM:invalidNumInputs",
			"Three input arguments required."
		);
	} else if(nlhs > 5) {
		mexErrMsgIdAndTxt(
			"MATLAB:Fit:HMM:maxlhs",
			"Too many output arguments."
		);
	}



	/* The input must be a noncomplex double.*/
	if(
		!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]) ||
		!mxIsDouble(prhs[1]) || mxIsComplex(prhs[1])
	){
		mexErrMsgIdAndTxt(
			"MATLAB:Fit:HMM:inputNotRealDouble",
			"Input must be a noncomplex double."
		);
	}

	dataSize[0] = (int) mxGetM(prhs[0]);
	dataSize[1] = (int) mxGetN(prhs[0]);
	dataPointCount = dataSize[0] * dataSize[1];
	statesCount = (int) mxGetM(prhs[1]);
	if (mxGetN(prhs[1]) != statesCount){
		mexErrMsgIdAndTxt(
			"MATLAB:Fit:HMM:noSquareTransitionMatrix",
			"Transition matrix has to be MxM."
		);
	}

	if (mxGetM(prhs[2]) != statesCount){
		mexErrMsgIdAndTxt(
			"MATLAB:Fit:HMM:notMatchingStatesCount",
			"Gauss definition matrix has a different states count than the transition matrix."
		);
	}
	if (mxGetN(prhs[2]) != 2){
		mexErrMsgIdAndTxt(
			"MATLAB:Fit:HMM:invalidGaussDefinition",
			"Gauss definition matrix has to be Mx2."
		);
	}

	/* Create matrix for the return argument. */
	plhs[0] = mxCreateDoubleMatrix(dataSize[0], dataSize[1], mxREAL);

	/* Assign pointers to each input and output. */
	data = mxGetPr(prhs[0]);
	transition = mxGetPr(prhs[1]);
	gauss = mxGetPr(prhs[2]);

	statesOut = mxGetPr(plhs[0]);

	using namespace hiddenMarkovModel;

	/* Optional 4th argument: struct of options or a config file path. */
	HMMConfiguration configuration;
	if (nrhs > 3){
		mxArray const *options = prhs[3];
		if (mxIsStruct(options)){
			// load configuration from MATLAB struct
			mxArray *value;
			
			value = mxGetField(options, 0, "verbose");
			if (value != NULL && mxIsLogicalScalarTrue(value)){
				configuration.verbose = true;
				
				DEFAULT_FALSE(verboseOutputEmission);
				DEFAULT_TRUE(verboseOutputTransition);
			}
			
			DEFAULT_DOUBLE(minSelfTransition, 0);
			DEFAULT_DOUBLE(minEmission, 1e-6);
			
			DEFAULT_TRUE(doEmissionUpdate);
			DEFAULT_TRUE(doTransitionUpdate);
			
			DEFAULT_INT(binningCount, 300);
			DEFAULT_INT(maxIterations, 100);
			DEFAULT_INT(abortStateChanges, 5);
            
            DEFAULT_FALSE(useMinimalBinningRange);
            if (configuration.useMinimalBinningRange){
                DEFAULT_DOUBLE(lowerBinningRangeLimit, 0);
                DEFAULT_DOUBLE(upperBinningRangeLimit, 1);
            }
		}
		else if (mxIsChar(options)){
			// load configuration from file
			std::ifstream file(mxArrayToString(options));
			if (file.good()){
				configuration = HMMConfiguration::fromFile(file);
			}
			else {
				mexErrMsgIdAndTxt(
					"MATLAB:Fit:HMM:configFileNotFound",
					"Configuration file not found."
				);
			}
		}
		else {
			mexErrMsgIdAndTxt(
				"MATLAB:Fit:HMM:invalidConfigParameter",
				"Invalid configuration parameter."
			);
		}
	}

	/* One initial Gaussian emission per state: gauss column 0 = mean,
	   column 1 = sigma (column-major MATLAB layout). */
	std::vector<InitialEmissionProbability*> initStates(0);
	for (unsigned int i = 0; i < statesCount; i += 1){
		initStates.push_back(new GaussState(gauss[i], gauss[statesCount + i]));
	}
	
	// check for NaN at the start
	dataStart = 0;
	for (unsigned int i = 0; i < dataPointCount; i += 1){
		if (!mxIsNaN(data[i])){
			dataStart = i;
			break;
		}
	}
	// check for NaN at the end
	for (dataEnd = dataStart; dataEnd < dataPointCount; dataEnd += 1){
		if (mxIsNaN(data[dataEnd])){
			break;
		}
	}

	HMM model (data + dataStart, dataEnd - dataStart, initStates, configuration);

	// delete state pointers
	// NOTE(review): clear() only empties the vector — the GaussState
	// objects created with `new` above are never deleted here. Either
	// the HMM ctor takes ownership, or this leaks; confirm with HMM.
	initStates.clear();

	for (unsigned int i = 0; i < statesCount; i += 1){
		for (unsigned int j = 0; j < statesCount; j += 1){
			model.setTransition(transition[i + statesCount * j], i, j);
		}
	}
	model.autoSetSelfTransition();
	// NOTE(review): iterationCount is uninitialized here and read for
	// plhs[4] below — assumes run() takes it by reference and writes
	// the number of iterations performed; confirm with HMM::run.
	model.run(iterationCount);

	std::vector<unsigned int> states (dataEnd - dataStart, 0);

	model.viterbi(states);

	/* Map 0-based states to MATLAB's 1-based labels; NaN outside the
	   trimmed [dataStart, dataEnd) range. */
	for (unsigned int i = 0; i < dataSize[0] * dataSize[1]; i += 1){
		if (i < dataStart || i >= dataEnd){
			statesOut[i] = mxGetNaN();
		}
		else {
			statesOut[i] = (double) states[i - dataStart] + 1;
		}
	}
	
	if (nlhs > 1){
		/* Create matrix for the transition output. */
		plhs[1] = mxCreateDoubleMatrix(statesCount, statesCount, mxREAL);
		double *transitionOut = mxGetPr(plhs[1]);
		for (unsigned int from = 0; from < statesCount; from += 1){
			for (unsigned int to = 0; to < statesCount; to += 1){
				transitionOut[from + to * statesCount] = model.getTransition(from, to);
			}
		}
		
		if (nlhs > 2){
			/* Create matrix for the emission output. */
			plhs[2] = mxCreateDoubleMatrix(statesCount, configuration.binningCount, mxREAL);
			double *emissionOut = mxGetPr(plhs[2]);
			for (unsigned int state = 0; state < statesCount; state += 1){
				for (unsigned int bin = 0; bin < configuration.binningCount; bin += 1){
					emissionOut[state + bin * statesCount] = model.getEmissionPropability(state, bin);
				}
			}
            
            if (nlhs > 3){
                /* Create vector for the emission binning centers. */
                array1D range(2, 0);
                model.getBinningRange(range);
                double binStart = range[0];
                double binDiff = range[1] - range[0];
                
                plhs[3] = mxCreateDoubleMatrix(1, configuration.binningCount, mxREAL);
                double *binningCenters = mxGetPr(plhs[3]);
                for (unsigned int bin = 0; bin < configuration.binningCount; bin += 1){
                    binningCenters[bin] = binStart +
						binDiff * (
							(0.5 + (double) bin) /
							(double) configuration.binningCount
						);
                }
                
                if (nlhs > 4){
                    /* Create matrix for the iteration count output. */
                    plhs[4] = mxCreateDoubleScalar((double) iterationCount);
                }
            }
		}
	}
}
Exemple #23
0
// Merge a CpG-island model and a background (non-CpG) model into a
// single HMM with twice the states. Within-block transitions are scaled
// by the probability of staying in that block, cross-block transitions
// by the probability of leaving it; both are the geometric-distribution
// rates implied by the average segment lengths. Initial mass is split
// evenly between the two blocks; emissions are copied per block.
shared_ptr< HMM<double> > Merge_Models(shared_ptr < HMM<double> > cpg, shared_ptr< HMM<double> > non_cpg, uint average_cpg_length, uint average_non_cpg_length)
{
    if (cpg->get_no_states() != non_cpg->get_no_states())
    {
        throw("Models states number must be same");
    }

    if (cpg->get_alphabet_size() != non_cpg->get_alphabet_size())
    {
        throw("Models alphabet size must be same");
    }

    const uint n = cpg->get_no_states();          // states per sub-model (equal, checked above)
    const uint alphabet = cpg->get_alphabet_size();

    const double leave_cpg_probability = 1/(double)average_cpg_length;
    const double stay_cpg_probability = 1 - leave_cpg_probability;
    const double leave_non_cpg_probability = 1/(double)average_non_cpg_length;
    const double stay_non_cpg_probability = 1 - leave_non_cpg_probability;

    // Initial probabilities: each sub-model contributes half its mass.
    shared_ptr< HMMVector<double> > initial_probabilities(new HMMVector<double>(n*2));
    for (uint s = 0; s < n; ++s)
    {
        (*initial_probabilities)(s) = cpg->get_initial_probs()(s)/2;
        (*initial_probabilities)(s + n) = non_cpg->get_initial_probs()(s)/2;
    }

    // Transition matrix: four n-by-n quadrants (stay-CpG, leave-CpG,
    // leave-non-CpG, stay-non-CpG).
    shared_ptr< HMMMatrix<double> > transition_probabilities(new HMMMatrix<double>(n*2, n*2));
    for (uint row = 0; row < transition_probabilities->get_no_rows(); ++row)
    {
        for (uint col = 0; col < transition_probabilities->get_no_columns(); ++col)
        {
            const bool from_cpg = row < n;
            const bool to_cpg = col < n;
            if (from_cpg && to_cpg)
            {
                (*transition_probabilities)(row, col) = cpg->get_trans_probs()(row, col)*stay_cpg_probability;
            }
            else if (from_cpg)
            {
                (*transition_probabilities)(row, col) = cpg->get_trans_probs()(row, col - n)*leave_cpg_probability;
            }
            else if (to_cpg)
            {
                (*transition_probabilities)(row, col) = non_cpg->get_trans_probs()(row - n, col)*leave_non_cpg_probability;
            }
            else
            {
                (*transition_probabilities)(row, col) =
                        non_cpg->get_trans_probs()(row - n, col - n)*stay_non_cpg_probability;
            }
        }
    }

    // Emission probabilities: left block copies the CpG model, right
    // block the background model.
    shared_ptr< HMMMatrix<double> > emission_probabilities(new HMMMatrix<double>(alphabet, n*2));
    for (uint symbol = 0; symbol < alphabet; ++symbol)
    {
        for (uint state = 0; state < n*2; ++state)
        {
            (*emission_probabilities)(symbol, state) = (state < n)
                    ? cpg->get_emission_probs()(symbol, state)
                    : non_cpg->get_emission_probs()(symbol, state - n);
        }
    }

    HMM<double>* merged = new HMM<double>(initial_probabilities, transition_probabilities, emission_probabilities);
    merged->Save_Parameters();

    return shared_ptr< HMM<double> >(merged);
}
int main() {
	HMM<int, int> hmm;
	hmm.loadHMM("hmm.model");
	hmm.printHMM();
	return 0;
}
Exemple #25
0
int main(int argc, const char * argv[]){
    
    //Load the training data
    TimeSeriesClassificationData trainingData;
    
    if( !trainingData.loadDatasetFromFile("HMMTrainingData.grt") ){
        cout << "ERROR: Failed to load training data!\n";
        return false;
    }
    
    //Remove 20% of the training data to use as test data
    TimeSeriesClassificationData testData = trainingData.partition( 80 );
    
    //The input to the HMM must be a quantized discrete value
    //We therefore use a KMeansQuantizer to covert the N-dimensional continuous data into 1-dimensional discrete data
    const UINT NUM_SYMBOLS = 10;
    KMeansQuantizer quantizer( NUM_SYMBOLS );
    
    //Train the quantizer using the training data
    if( !quantizer.train( trainingData ) ){
        cout << "ERROR: Failed to train quantizer!\n";
        return false;
    }
    
    //Quantize the training data
    TimeSeriesClassificationData quantizedTrainingData( 1 );
    
    for(UINT i=0; i<trainingData.getNumSamples(); i++){
        
        UINT classLabel = trainingData[i].getClassLabel();
        MatrixDouble quantizedSample;
        
        for(UINT j=0; j<trainingData[i].getLength(); j++){
            quantizer.quantize( trainingData[i].getData().getRowVector(j) );
            
            quantizedSample.push_back( quantizer.getFeatureVector() );
        }
        
        if( !quantizedTrainingData.addSample(classLabel, quantizedSample) ){
            cout << "ERROR: Failed to quantize training data!\n";
            return false;
        }
        
    }
    
    //Create a new HMM instance
    HMM hmm;
    
    //Set the number of states in each model
    hmm.setNumStates( 4 );
    
    //Set the number of symbols in each model, this must match the number of symbols in the quantizer
    hmm.setNumSymbols( NUM_SYMBOLS );
    
    //Set the HMM model type to LEFTRIGHT with a delta of 1
    hmm.setModelType( HiddenMarkovModel::LEFTRIGHT );
    hmm.setDelta( 1 );
    
    //Set the training parameters
    hmm.setMinImprovement( 1.0e-5 );
    hmm.setMaxNumIterations( 100 );
    hmm.setNumRandomTrainingIterations( 20 );
    
    //Train the HMM model
    if( !hmm.train( quantizedTrainingData ) ){
        cout << "ERROR: Failed to train the HMM model!\n";
        return false;
    }
    
    //Save the HMM model to a file
    if( !hmm.save( "HMMModel.grt" ) ){
        cout << "ERROR: Failed to save the model to a file!\n";
        return false;
    }
    
    //Load the HMM model from a file
    if( !hmm.load( "HMMModel.grt" ) ){
        cout << "ERROR: Failed to load the model from a file!\n";
        return false;
    }
    
    //Quantize the test data
    TimeSeriesClassificationData quantizedTestData( 1 );
    
    for(UINT i=0; i<testData.getNumSamples(); i++){
        
        UINT classLabel = testData[i].getClassLabel();
        MatrixDouble quantizedSample;
        
        for(UINT j=0; j<testData[i].getLength(); j++){
            quantizer.quantize( testData[i].getData().getRowVector(j) );
            
            quantizedSample.push_back( quantizer.getFeatureVector() );
        }
        
        if( !quantizedTestData.addSample(classLabel, quantizedSample) ){
            cout << "ERROR: Failed to quantize training data!\n";
            return false;
        }
    }
    
    //Compute the accuracy of the HMM models using the test data
    double numCorrect = 0;
    double numTests = 0;
    for(UINT i=0; i<quantizedTestData.getNumSamples(); i++){
        
        UINT classLabel = quantizedTestData[i].getClassLabel();
        hmm.predict( quantizedTestData[i].getData() );
        
        if( classLabel == hmm.getPredictedClassLabel() ) numCorrect++;
        numTests++;
        
        VectorDouble classLikelihoods = hmm.getClassLikelihoods();
        VectorDouble classDistances = hmm.getClassDistances();
        
        cout << "ClassLabel: " << classLabel;
        cout << " PredictedClassLabel: " << hmm.getPredictedClassLabel();
        cout << " MaxLikelihood: " << hmm.getMaximumLikelihood();
        
        cout << "  ClassLikelihoods: ";
        for(UINT k=0; k<classLikelihoods.size(); k++){
            cout << classLikelihoods[k] << "\t";
        }
        
        cout << "ClassDistances: ";
        for(UINT k=0; k<classDistances.size(); k++){
            cout << classDistances[k] << "\t";
        }
        cout << endl;
    }
    
    cout << "Test Accuracy: " << numCorrect/numTests*100.0 << endl;
    
    return true;
}
Exemple #26
0
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
void ReadAndPrepare(char* infile, HMM& q, Alignment* qali=NULL)
{
    char path[NAMELEN];

    // Open query file and determine file type
    char line[LINELEN]=""; // input line
    FILE* inf=NULL;
    if (strcmp(infile,"stdin"))
    {
        inf = fopen(infile, "r");
        if (!inf) OpenFileError(infile);
        Pathname(path,infile);
    }
    else
    {
        inf = stdin;
        if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
        *path='\0';
    }

    fgetline(line,LINELEN-1,inf);

    // Is it an hhm file?
    if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2))
    {
        if (v>=2) cout<<"Query file is in HHM format\n";

        // Rewind to beginning of line and read query hhm file
        rewind(inf);
        q.Read(inf,path);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);

        // Add transition pseudocounts to query -> q.p[i][a]
        q.AddTransitionPseudocounts();

        if (!*par.clusterfile) { //compute context-specific pseudocounts?
	  // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	  q.PreparePseudocounts();
	  // Add amino acid pseudocounts to query:  q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	  q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);;
        } else {
	  // Add context specific pseudocount to query
	  q.AddContextSpecificPseudocounts(par.pcm);
        }
        
        q.CalculateAminoAcidBackground();
    }

    // ... or is it an a2m/a3m alignment file
    else if (line[0]=='#' || line[0]=='>')
    {
        Alignment* pali;
        if (qali==NULL) pali=new(Alignment); else pali=qali;
        if (par.calibrate) {
            printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
            printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
            exit(1);
        }

        if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";

        // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
        pali->Read(inf,infile,line);

        // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
        // and store marked sequences in name[k] and seq[k]
        pali->Compress(infile);

        // Sort out the nseqdis most dissimilar sequences for display in the output alignments
        pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);

        // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
        pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);

 	if (par.Neff>=0.999) 
	  pali->FilterNeff();

	// Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
        pali->FrequenciesAndTransitions(q);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);

        // Add transition pseudocounts to query -> p[i][a]
        q.AddTransitionPseudocounts();

        if (!*par.clusterfile) { //compute context-specific pseudocounts?
	  // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	  q.PreparePseudocounts();
	  // Add amino acid pseudocounts to query:  p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	  q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);
        } else {
	  // Add context specific pseudocount to query
	  q.AddContextSpecificPseudocounts(par.pcm);
        }

        q.CalculateAminoAcidBackground();

        if (qali==NULL) delete(pali);
    
    } else if (!strncmp(line,"HMMER",5)) {

        ///////////////////////////////////////////////////////////////////////////////////////
        // Don't allow HMMER format as input due to the severe loss of sensitivity!!!! (only allowed in HHmake)
        if (strncmp(program_name,"hhmake",6)) {
	  cerr<<endl<<"Error in "<<program_name<<": HMMER format not allowed as input due to the severe loss of sensitivity!\n";
	  exit(1);
        }
      
        // Is infile a HMMER3 file?
	if (!strncmp(line,"HMMER3",6))
	  {
	    if (v>=2) cout<<"Query file is in HMMER3 format\n";
	    
	    // Read 'query HMMER file
	    rewind(inf);
	    q.ReadHMMer3(inf,path);
	    
	    // Don't add transition pseudocounts to query!!
	    // DON'T ADD amino acid pseudocounts to query: pcm=0!  q.p[i][a] = f[i][a]
	    q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
	    q.CalculateAminoAcidBackground();
	  }
	
	// ... or is infile an old HMMER file?
	else if (!strncmp(line,"HMMER",5))
	  {
	    if (v>=2) cout<<"Query file is in HMMER format\n";
	    
	    // Read 'query HMMER file
	    rewind(inf);
	    q.ReadHMMer(inf,path);
	    
	    // DON'T ADD amino acid pseudocounts to query: pcm=0!  q.p[i][a] = f[i][a]
	    q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
	    q.CalculateAminoAcidBackground();
	  }
	
    } else {
      cerr<<endl<<"Error in "<<program_name<<": unrecognized input file format in \'"<<infile<<"\'\n";
      cerr<<"line = "<<line<<"\n";
      exit(1);
    }
    fclose(inf);

    if (par.addss==1)
      CalculateSS(q);

    if (par.columnscore == 5 && !q.divided_by_local_bg_freqs) q.DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs);

    if (par.forward>=1) q.Log2LinTransitionProbs(1.0);
    return;
}
Exemple #27
0
// Read input file (HMM, HHM, or alignment format) into q.
// NOTE: unlike ReadAndPrepare(), this function only reads the input; it does
// NOT add pseudocounts (the original header comment was copied and misleading).
// infile: path of the query file, or "stdin" to read from standard input.
// q:      receives the query HMM (read directly, or built from an alignment).
// qali:   optional Alignment to fill; if NULL a temporary one is used and freed.
void ReadInput(char* infile, HMM& q, Alignment* qali=NULL)
{
    char path[NAMELEN];

    // Open query file and determine file type
    char line[LINELEN]=""; // input line
    FILE* inf=NULL;
    if (strcmp(infile,"stdin"))
    {
        inf = fopen(infile, "r");
        if (!inf) OpenFileError(infile);
        Pathname(path,infile);
    }
    else
    {
        inf = stdin;
        if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
        *path='\0';
    }

    // Peek at the first line to detect the input format
    fgetline(line,LINELEN-1,inf);

    // Is infile a HMMER3 file?
    if (!strncmp(line,"HMMER3",6))
    {
        if (v>=2) cout<<"Query file is in HMMER3 format\n";
	cerr<<"WARNING: Use of HMMER3 format as input will result in severe loss of sensitivity!\n";

        // Read 'query HMMER file
        rewind(inf);
        q.ReadHMMer3(inf,path);
    }

    // ... or is infile an old HMMER file?
    else if (!strncmp(line,"HMMER",5))
    {
        if (v>=2) cout<<"Query file is in HMMER format\n";
	cerr<<"WARNING: Use of HMMER format as input will result in severe loss of sensitivity!\n";

        // Read 'query HMMER file
        rewind(inf);
        q.ReadHMMer(inf,path);
    }

    // ... or is it an hhm file?
    else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2))
    {
        if (v>=2) cout<<"Query file is in HHM format\n";

        // Rewind to beginning of line and read query hhm file
        rewind(inf);
        q.Read(inf,path);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);

    }
    // ... or is it an alignment file
    else
    {
        Alignment* pali;
        // Use caller-supplied alignment if given, otherwise a temporary one
        if (qali==NULL) pali=new(Alignment); else pali=qali;
        if (par.calibrate) {
            printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
            printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
            exit(1);
        }

        if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";

        // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
        pali->Read(inf,infile,line);

        // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
        // and store marked sequences in name[k] and seq[k]
        pali->Compress(infile);

        // Sort out the nseqdis most dissimilar sequences for display in the output alignments
        pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);

        // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
        pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);

	if (par.Neff>=0.999) 
	  pali->FilterNeff();

        // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
        pali->FrequenciesAndTransitions(q);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);

        // Free the temporary alignment only if the caller did not supply one
        if (qali==NULL) delete(pali);
    }
    fclose(inf);

    return;
}
Exemple #28
0
/////////////////////////////////////////////////////////////////////////////////////
//// MAIN PROGRAM
/////////////////////////////////////////////////////////////////////////////////////
// Reads a query (HMM/HHM/alignment), adds pseudocounts, and writes the query's
// consensus sequence to par.outfile and/or its alignment (FASTA/A2M/A3M,
// selected by par.outformat) to par.alnfile.
int main(int argc, char **argv) {
    char* argv_conf[MAXOPT]; // Input arguments from .hhdefaults file (first=1: argv_conf[0] is not used)
    int argc_conf;               // Number of arguments in argv_conf

    strcpy(par.infile, "");
    strcpy(par.outfile, "");
    strcpy(par.alnfile, "");

    //Default parameter settings
    par.nseqdis = MAXSEQ - 1;        // maximum number of sequences to be written
    par.showcons = 0;
    par.cons = 1;
    par.Ndiff = 0;
    par.max_seqid = 100;
    par.coverage = 0;
    par.pc_hhm_context_engine.pca = 0.0;  // no amino acid pseudocounts
    par.pc_hhm_nocontext_a = 0.0;  // no amino acid pseudocounts
    par.gapb = 0.0; // no transition pseudocounts

    // Make command line input globally available
    par.argv = argv;
    par.argc = argc;
    RemovePathAndExtension(program_name, argv[0]);

    // Enable changing verbose mode before defaults file and command line are processed
    int v = 2;
    for (int i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-def"))
            par.readdefaultsfile = 1;
        else if (strcmp(argv[i], "-v") == 0) {
            v = atoi(argv[i + 1]);
        }
    }
    par.v = Log::from_int(v);
    Log::reporting_level() = par.v;

    par.SetDefaultPaths();

    // Read .hhdefaults file?
    if (par.readdefaultsfile) {
        // Process default options from .hhconfig file
        ReadDefaultsFile(argc_conf, argv_conf);
        ProcessArguments(argc_conf, argv_conf);
    }

    // Process command line options (they override defaults from .hhdefaults file)
    ProcessArguments(argc, argv);

    Alignment* qali = new Alignment(MAXSEQ, par.maxres);
    HMM* q = new HMM(MAXSEQDIS, par.maxres);        //Create a HMM with maximum of par.maxres match states

    // q is only available after maxres is known, so we had to move this here
    for (int i = 1; i <= argc - 1; i++) {
        if (!strcmp(argv[i], "-name") && (i < argc - 1)) {
            strmcpy(q->name, argv[++i], NAMELEN - 1); //copy longname to name...
            strmcpy(q->longname, argv[i], DESCLEN - 1);   //copy full name to longname
        }
    }

    // Check command line input and default values
    if (!*par.infile) {
        help();
        HH_LOG(ERROR) << "Input file is missing!" << std::endl;
        exit(4);
    }

    // Get basename
    RemoveExtension(q->file, par.infile); //Get basename of infile (w/o extension):

    // Outfile not given? Name it basename.seq (note: code appends ".seq", not ".hhm")
    if (!*par.outfile && !*par.alnfile) {
        RemoveExtension(par.outfile, par.infile);
        strcat(par.outfile, ".seq");
    }

    // Prepare CS pseudocounts lib
    if (!par.nocontxt && *par.clusterfile) {
        InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine,
                                     pc_hhm_context_mode, pc_prefilter_context_engine,
                                     pc_prefilter_context_mode);
    }

    // Set substitution matrix; adjust to query aa distribution if par.pcm==3
    SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim);

    // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
    char input_format = 0;
    ReadQueryFile(par, par.infile, input_format, par.wg, q, qali, pb, S, Sim);

    // Same code as in PrepareQueryHMM(par.infile,input_format,q,qali), except that we add SS prediction
    // Add Pseudocounts, if no HMMER input
    if (input_format == 0) {
        // Transform transition freqs to lin space if not already done
        q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
                                     par.gaph, par.gapi, par.gapb, par.gapb);

        // Compute substitution matrix pseudocounts
        if (par.nocontxt) {
            // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
            q->PreparePseudocounts(R);
            // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
            q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode,
                                        par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b,
                                        par.pc_hhm_nocontext_c);
        }
        else {
            // Add full context specific pseudocounts to query
            q->AddContextSpecificPseudocounts(pc_hhm_context_engine,
                                              pc_hhm_context_mode);
        }
    }
    else {
        // HMMER input: pseudocount mode 0 keeps q->p[i][a] = f[i][a]
        q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a,
                                    par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
    }

    q->CalculateAminoAcidBackground(pb);

    if (par.columnscore == 5 && !q->divided_by_local_bg_freqs)
        q->DivideBySqrtOfLocalBackgroundFreqs(
            par.half_window_size_local_aa_bg_freqs, pb);

    // Write consensus sequence to sequence file
    // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions()
    if (*par.outfile) {
        FILE* outf = NULL;
        if (strcmp(par.outfile, "stdout")) {
            outf = fopen(par.outfile, "a");  // append mode
            if (!outf)
                OpenFileError(par.outfile, __FILE__, __LINE__, __func__);
        }
        else
            outf = stdout;
        // OLD
        //// ">name_consensus" -> ">name consensus"
        //strsubst(q->sname[q->nfirst],"_consensus"," consensus");
        //fprintf(outf,">%s\n%s\n",q->sname[q->nfirst],q->seq[q->nfirst]+1);
        // NEW (long header needed for NR30cons database)
        fprintf(outf, ">%s\n%s\n", q->longname, q->seq[q->nfirst] + 1);
        fclose(outf);
    }

    // Print A3M/A2M/FASTA output alignment
    if (*par.alnfile) {
        HalfAlignment qa;
        // Cap at nseqdis plus one slot for each present annotation row
        // (dssp/pred/conf secondary structure and consensus)
        int n = imin(q->n_display,
                     par.nseqdis + (q->nss_dssp >= 0) + (q->nss_pred >= 0)
                     + (q->nss_conf >= 0) + (q->ncons >= 0));
        qa.Set(q->name, q->seq, q->sname, n, q->L, q->nss_dssp, q->nss_pred,
               q->nss_conf, q->nsa_dssp, q->ncons);

        if (par.outformat == 1)
            qa.BuildFASTA();
        else if (par.outformat == 2)
            qa.BuildA2M();
        else if (par.outformat == 3)
            qa.BuildA3M();
        if (qali->readCommentLine)
            qa.Print(par.alnfile, par.append, qali->longname); // print alignment to outfile
        else
            qa.Print(par.alnfile, par.append);   // print alignment to outfile
    }

    delete qali;
    delete q;

    DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine,
                             pc_hhm_context_mode, pc_prefilter_context_engine,
                             pc_prefilter_context_mode);
}
Exemple #29
0
// Viterbi decoding for an HMM whose states may emit a variable number of
// consecutive observation symbols: state i consumes model.stateArity(i)
// characters of the observation (see the substr() calls below).
// Returns (log-probability of the best path, state sequence of that path);
// if no state sequence can explain the observation, returns (-inf, empty).
// Throws invalid_argument if the model has not been finalized.
pair<double,vector<size_t>> viterbi(string observation, const HMM& model)
{
    if (!model.isFinalized())
        throw invalid_argument("Model should be finalized!");
    
    // omega(l, i) = (backpointer state, log prob) of the best path ending in
    // state i having consumed the observation up to and including index l.
    // A backpointer of -1 marks an unreachable cell or the start of a path.
    Matrix<pair<int,double>> omega(observation.length(), model.numStates(),
                                   make_pair(-1, -numeric_limits<double>::infinity()));

    // Memoize log() results: the same probability values recur many times.
    unordered_map<double, double> logmemory;
    auto ln = [&logmemory] (double arg) {
        if (logmemory.count(arg) > 0)
            return logmemory.at(arg);
        
        double val = log(arg);
        logmemory.insert(make_pair(arg, val));
        return val;
    };
    
    // Initialization: a single starting state emitting the first symbol.
    for (size_t i = 0; i < model.numStates(); i++)
        omega(0, i) = make_pair(-1, ln(model.startProb(i)) + ln(model.emissionProb(i, observation.substr(0,1))));
    
    // Recursion over observation positions.
    for (size_t l = 1; l < observation.length(); l++) {
        for (size_t i = 0; i < model.numStates(); i++) {
            // Find where we should come from
            pair<int, double> best = make_pair(-1, -numeric_limits<double>::infinity());
            for (auto k : model.incommingStates(i)) {
                // State i consumes stateArity(i) symbols; skip predecessors
                // that would reach before the start of the observation.
                if (l < model.stateArity(i))
                    continue;
                    
                double candidate = omega(l - model.stateArity(i) , k).second + ln(model.transitionProb(k, i));
                if (candidate > best.second)
                    best = make_pair(k, candidate);
            }
            
            if (best.first == -1) {
                // State is not possible
                omega(l, i) = make_pair(-1, -numeric_limits<double>::infinity());
            } else {
                // Update current cell with right values: predecessor's score plus
                // emission of the arity-sized chunk of symbols ending at index l.
                omega(l, i) = make_pair(best.first,
                                        best.second + ln(model.emissionProb(i, observation.substr(l - model.stateArity(i) + 1, model.stateArity(i)))));
            }
        }
    }
    
    // Final result is now in the last row: pick the best-scoring end state.
    pair<int, double> best = make_pair(-1, -numeric_limits<double>::infinity());
    for (int i = 0; i < model.numStates(); i++) {
        double candidate = omega(observation.length()-1, i).second;
        if (candidate > best.second)
            best = make_pair(i, candidate);
    }
    
    if (best.first == -1)
        return make_pair(-numeric_limits<double>::infinity(), vector<size_t>());
    
    // Backtrack along the stored backpointers until a -1 marker is reached,
    // stepping back by the arity of the state just visited.
    vector<size_t> stateTrace;
    stateTrace.push_back(best.first);
    size_t pos = observation.length() - 1;
    auto cur = omega(pos, best.first);
    size_t prevState = best.first; // TODO: Could probably be refactored
    while (cur.first != -1) {
        stateTrace.push_back(cur.first);

        pos -= model.stateArity(prevState);
        prevState = cur.first;
        cur = omega(pos, cur.first);
    }
    
    // The trace was built end-to-start; reverse it for the caller.
    return make_pair(best.second,
                     vector<size_t>(stateTrace.rbegin(), stateTrace.rend()));
}