// Reads fine-grained counts from fcounts, sums them into coarse-grained
// counts using the coarse2fine / coarseamb2fineamb / coarse_output mappings,
// and passes the coarse counts (together with *this and corpus_length) to
// SmoothUtils::calculate_smoothed_parameters.
//
// NOTE(review): COARSE_N and COARSE_M look like the number of coarse tags and
// of coarse ambiguity classes respectively -- confirm against the class.
void 
TaggerMergingData::calculate_parameters_merging(istream& fcounts, int corpus_length) {
  // Fine-grained counts, filled in by read_counts().
  map<int, map<int, double> > fine_pairs;
  map<int, map<int, double> > fine_emis;
  map<int, double> fine_tags;
  map<int, double> fine_ambclasses;
  map<int, double> fine_tags_for_emis;

  // Coarse-grained counts, accumulated below.
  map<int, map<int, double> > merged_pairs;
  map<int, map<int, double> > merged_emis;
  map<int, double> merged_tags;
  map<int, double> merged_ambclasses;
  map<int, double> merged_tags_for_emis;

  read_counts(fcounts, fine_pairs, fine_emis, fine_tags, fine_ambclasses, fine_tags_for_emis);

  cerr<<"Calculating new parameters ... "<<flush;

  // Tag-pair counts: every coarse pair (src, dst) receives the sum over all
  // fine pairs whose members map to src and dst.
  for (int src = 0; src < COARSE_N; ++src) {
    for (int dst = 0; dst < COARSE_N; ++dst) {
      double& cell = merged_pairs[src][dst];
      cell = 0.0;

      for (set<int>::iterator fs = coarse2fine[src].begin(); fs != coarse2fine[src].end(); ++fs) {
        for (set<int>::iterator fd = coarse2fine[dst].begin(); fd != coarse2fine[dst].end(); ++fd) {
          cell += fine_pairs[*fs][*fd];
        }
      }
    }
  }

  // Emission counts: for each coarse ambiguity class and each coarse tag in
  // it, add up the fine emissions, skipping fine (tag, class) combinations
  // where the fine class does not contain the fine tag.
  for (int amb = 0; amb < COARSE_M; ++amb) {
    for (set<int>::iterator ctag = coarse_output[amb].begin(); ctag != coarse_output[amb].end(); ++ctag) {
      double& cell = merged_emis[*ctag][amb];
      cell = 0.0;

      for (set<int>::iterator ftag = coarse2fine[*ctag].begin(); ftag != coarse2fine[*ctag].end(); ++ftag) {
        for (set<int>::iterator famb = coarseamb2fineamb[amb].begin(); famb != coarseamb2fineamb[amb].end(); ++famb) {
          if (TaggerData::getOutput()[*famb].find(*ftag) == TaggerData::getOutput()[*famb].end()) {
            continue;
          }
          cell += fine_emis[*ftag][*famb];
        }
      }
    }
  }

  // Per-tag totals (both the plain count and the count used for emissions).
  for (int tag = 0; tag < COARSE_N; ++tag) {
    double& total = merged_tags[tag];
    double& total_for_emis = merged_tags_for_emis[tag];
    total = 0.0;
    total_for_emis = 0.0;

    for (set<int>::iterator ftag = coarse2fine[tag].begin(); ftag != coarse2fine[tag].end(); ++ftag) {
      total += fine_tags[*ftag];
      total_for_emis += fine_tags_for_emis[*ftag];
    }
  }

  // Per-ambiguity-class totals.
  for (int amb = 0; amb < COARSE_M; ++amb) {
    double& total = merged_ambclasses[amb];
    total = 0.0;

    for (set<int>::iterator famb = coarseamb2fineamb[amb].begin(); famb != coarseamb2fineamb[amb].end(); ++famb) {
      total += fine_ambclasses[*famb];
    }
  }

  SmoothUtils::calculate_smoothed_parameters(*this, merged_tags, merged_pairs,
                                             merged_ambclasses, merged_emis,
                                             merged_tags_for_emis, corpus_length);
  cerr<<"done.\n";
}
// Example no. 2
// 0
void run(std::string tree_filename, std::string fasta_filename, std::string model_name) {
  Model Mod;                 // The model
  Counts data;               // the counts
  Parameters Par;            // the parameters
  std::vector<double> br;    // branch lengths
  double eps = 1e-8;         // The threshold for the EM algorithm.

  Parameters Parsim;         // used for simulating data.
  std::vector<double> brsim; // branch lengths of simulated data.

  std::vector<std::vector<double> > Cov;  // Covariance matrix
  std::vector<double> variances;          // The variances


  bool simulate;
  bool nonident;
  std::string parameters_filename;
  std::string covariances_filename;

  // initialize random number generator with time(0).
  random_initialize();

  parameters_filename = strip_extension(fasta_filename) + ".dat";
  covariances_filename = strip_extension(fasta_filename) + ".cov";

  // Creates the pointers to the model-specific functions.
  Mod = create_model(model_name);
  std::cout << "Model: " << Mod.name << std::endl;

  // Reads the tree.
  Tree T = read_tree(tree_filename);

  // Prints the Tree
  std::cout << "Tree:" << std::endl;
  print_tree(T);

  // Check for possible nonidentifiability issues.
  nonident = nonident_warning(T);

  // Initialize the parameters for simulation of K81 data for testing
  Parsim = create_parameters(T);

  if (fasta_filename == ":test") {      // if fasta file is :test generate random data.
    simulate = true;

    // Warn
    std::cout << "WARNING: Using simulated data " << std::endl << std::endl;

    // Generate random parameters
    random_parameters_length(T, Mod, Parsim);

    // Simulate the data
    data = random_fake_counts(T, 1000, Parsim);

    // Prints branch-lengths for future check.
    branch_lengths(Parsim, brsim);
    std::cout << "Simulated branch lengths:" << std::endl;
    print_vector(brsim);

  } else {                                  // otherwise read the data
    simulate = false;

    // Read the counts.
    std::cout << "Reading fasta file:" << std::endl;
    read_counts(T, data, fasta_filename);
    add_pseudocounts(0.01, data);
    std::cout << std::endl;
  }

  // Check whether the data and the tree match.
  if (T.nalpha != data.nalpha || T.nleaves != data.nspecies) {
    throw std::invalid_argument("The order of the sequences or their number and the phylogenetic tree do not match.");
  }

  //Par = create_parameters(T);
  //print_parameters(Par);
  //print_vector(Par.r);

  //clock_t
  long start_time, end_time;

  // Runs the EM algorithm. Par is used as initial parameters.
  // After execution, Par contains the MLE computed by the algorithm.

 // for local max over multiple iterations
  Parameters Parmax = Par;
  Model Modmax = Mod;

  float likelL = 0.0;
  float likelMax = -1000000.0;
  float timerec;
  float timemax;

  int outfiles; //whether to save output
  std::cout << "Starting the EM algorithm: " << std::endl;

  int s;
  int S = 0; //count of cases with neg branches

  int iter;
  int iterMax;

  for (int it_runs = 0; it_runs < 10; it_runs++) {
      Par = create_parameters(T);
      Mod = create_model(model_name);
      std::cout << it_runs << ", " ;

      start_time = clock();

      std::tie(likelL, iter) = EMalgorithm(T, Mod, Par, data, eps);

      end_time = clock();
      //print_parameters(Par);

      // Choses the best permutation.
      guess_permutation(T, Mod, Par);

      branch_lengths(Par, br);

      //print_vector(br);
      s = find_negative(br);
      S +=s;
      timerec = ((float)end_time - start_time) / CLOCKS_PER_SEC;

      //assign the 1st iter time value, inc ase it's the best
      if (it_runs == 0){
        timemax = timerec;
        iterMax = iter;
      }

      if (likelL > likelMax){
        Parmax = Par;
        Modmax = Mod;
        timemax = timerec;
        likelMax = likelL;
        iterMax = iter;
      }

  }


  // If parameters are not identifiable, the computation of the covariance matrix will
  // fail as the Fisher info matrix will not be invertible.
  if (!nonident) {
    // Compute the covariance matrix using observed Fisher.
    full_MLE_observed_covariance_matrix(T, Modmax, Parmax, data, Cov);
    variances.resize(Cov.size());
    for(unsigned int i=0; i < Cov.size(); i++) {
      variances[i] = Cov[i][i];
    }

    // OUTPUT Save the sigmas into a file
    //save_sigmas_to(covariances_filename, Cov);
  }

  std::cout << std::endl;
  std::cout << "Finished." << std::endl;
  std::cout << "Likelihood: " << log_likelihood(T, Parmax, data) << std::endl ;
  std::cout << "Time: " << timemax << std::endl << std::endl;
  std::cout << "negative branches: "  << S << std::endl;
  std::cout << "Iter: "  << iterMax << std::endl;

  //std::cout << "Branch lengths: " << std::endl;
  //print_vector(br);
  outfiles = 0;
  if (!nonident && outfiles) {
    std::cout << "Parameter variances: " << std::endl;
    print_vector(variances);
  }

  std::cout << "Newick Tree:" << std::endl;
  print_newick_tree(T, br);

  // if is a simulation, print the L2 distance !
  if (simulate) {
    std::cout << "L2 distance:   " << parameters_distance(Par, Parsim) << std::endl;
    std::cout << "KL divergence: " << KL_divergence(T, Par, Parsim) << std::endl;
    std::cout << std::endl;
  }

  // if it is not a simulation, store the parameters in a file !
  if (!simulate && outfiles) {
    std::fstream st;
    st.precision(15);
    st.setf(std::ios::fixed,std::ios::floatfield);
    st.open(parameters_filename.c_str(), std::ios::out);
    print_parameters(Par, st);
  }
}