Ejemplo n.º 1
0
/// \brief Estimate the empirical letter frequencies of an alignment from its letter counts.
///
/// The tallies from letter_counts(A) are handed to the alphabet's
/// get_frequencies_from_counts(), together with the number of sequences in
/// chop_internal(A) (presumably used as the pseudocount -- confirm against the alphabet class).
///
/// \param args The command line parameters (not consulted by this version).
/// \param A The alignment.
/// \return The frequencies computed by the alphabet from the observed counts.
///
valarray<double> empirical_frequencies(const variables_map& args,const alignment& A) 
{
  const alphabet& letters = A.get_alphabet();

  // Tally how often each letter occurs in the alignment.
  valarray<double> observed = letter_counts(A);

  // Let the alphabet convert the tallies into frequencies.
  valarray<double> frequencies = letters.get_frequencies_from_counts(observed,chop_internal(A).n_sequences());

  return frequencies;
}
Ejemplo n.º 2
0
/// Tally the contents of column \a c of alignment \a A.
///
/// The result has one slot per alphabet letter plus one trailing slot.  The
/// trailing slot counts the entries equal to alphabet::gap or alphabet::unknown.
vector<int> column_count(const alignment& A, int c)
{
  const alphabet& letters = A.get_alphabet();
  vector<int> tally(letters.size()+1,0);

  for(int row=0; row<A.n_sequences(); row++)
  {
    const int x = A(c,row);

    // a proper letter goes into its own slot
    if (letters.is_letter(x))
      ++tally[x];

    // gaps and unknowns share the last slot
    const bool missing = (x == alphabet::gap or x == alphabet::unknown);
    if (missing)
      ++tally.back();
  }

  return tally;
}
Ejemplo n.º 3
0
  /// Fill in the cached likelihoods for directed branch \a b0, then mark the branch valid.
  ///
  /// The routine used depends on how many branches lie before \a b0:
  ///  - 0 (leaf branch): the F81, plain, or modulated leaf routine;
  ///  - 2 (internal branch): the F81 or plain internal routine;
  ///  - anything else: the program aborts.
  /// The F81 routines are selected when base model 0 of \a MModel is an F81_Model.
  void peel_branch(int b0,Likelihood_Cache& cache, const alignment& A, const Tree& T, 
                   const MatCache& transition_P, const MultiModel& MModel)
  {
    total_peel_branches++;

    // how many branches feed into b0?
    const int n_before = T.directed_branch(b0).branches_before().size();

    switch (n_before)
    {
    case 0:
    {
      const int n_states  = cache.scratch(0).size2();
      const int n_letters = A.get_alphabet().n_letters();

      // more states than letters => use the modulated routine
      if (n_states != n_letters)
        peel_leaf_branch_modulated(b0, cache, A, T, transition_P, MModel);
      else if (dynamic_cast<const F81_Model*>(&MModel.base_model(0)))
        peel_leaf_branch_F81(b0, cache, A, T, MModel);
      else
        peel_leaf_branch(b0, cache, A, T, transition_P, MModel);
      break;
    }

    case 2:
      if (dynamic_cast<const F81_Model*>(&MModel.base_model(0)))
        peel_internal_branch_F81(b0, cache, A, T, MModel);
      else
        peel_internal_branch(b0, cache, A, T, transition_P, MModel);
      break;

    default:
      // only 0 or 2 preceding branches are handled
      std::abort();
    }

    cache.validate_branch(b0);
  }
Ejemplo n.º 4
0
/// For every column of \a A, pick a majority character and decide whether the column is "safe".
///
/// \param A The alignment.
/// \param allowed_differences The largest number of sequences that may differ from the
///        majority character for the column to be flagged as safe.
/// \return A pair (majority, safe), each with one entry per column.  majority[c] is the
///         argmax of column_count(A,c), or alphabet::gap when the last slot of that count
///         (gaps/unknowns) exceeds 1.  safe[c] is 1 for safe columns and 0 otherwise; a
///         column whose majority was forced to gap is never marked safe.
std::pair<vector<int>,vector<int> > find_major_character(const alignment& A,int allowed_differences)
{
  const alphabet& letters = A.get_alphabet();

  vector<int> majority(A.length(), alphabet::unknown);
  vector<int> safe(A.length(), 0);

  for(int col=0; col<majority.size(); col++)
  {
    vector<int> tally = column_count(A,col);
    const int best = argmax(tally);

    // NOTE! More than one gap makes the gap the major character.
    if (tally[letters.size()] > 1) {
      majority[col] = alphabet::gap;
      continue;
    }

    majority[col] = best;

    // few enough sequences disagree with the majority
    if (A.n_sequences() - tally[best] <= allowed_differences)
      safe[col] = 1;
  }

  return std::pair<vector<int>,vector<int> >(majority,safe);
}
Ejemplo n.º 5
0
/// \brief Choose the letter frequencies for an alignment, as directed by the "frequencies" option.
///
/// The option is interpreted as follows:
///  - absent: the alphabet computes frequencies from the letter counts of \a A;
///  - "uniform": every letter gets 1/(alphabet size);
///  - "nucleotides": triplet frequencies are built from nucleotide frequencies, which are
///    in turn estimated from the triplet counts (requires a Triplets alphabet);
///  - anything else: parsed as a comma-separated list with one value per letter.
/// Where counts are converted to frequencies, A.n_sequences()/2 is passed as the second
/// argument (presumably the pseudocount -- confirm against the alphabet class).
///
/// \param args The command line parameters.
/// \param A The alignment.
/// \throws myexception if "nucleotides" is requested but the alphabet cannot be cast to
///         Triplets, or if the number of listed values differs from the alphabet size.
///
valarray<double> empirical_frequencies(const variables_map& args,const alignment& A) 
{
  const alphabet& letters = A.get_alphabet();

  // Tally how often each letter occurs in the alignment.
  valarray<double> observed = letter_counts(A);

  valarray<double> result(letters.size());

  // no option given: empirical frequencies
  if (not args.count("frequencies")) {
    result = letters.get_frequencies_from_counts(observed,A.n_sequences()/2);
    return result;
  }

  const string request = args["frequencies"].as<string>();

  // uniform frequencies
  if (request == "uniform") {
    result = 1.0/letters.size();
    return result;
  }

  // triplet frequencies <- nucleotide frequencies
  if (request == "nucleotides") {
    const Triplets* triplets = dynamic_cast<const Triplets*>(&letters);

    if (not triplets)
      throw myexception()<<"You can only specify nucleotide frequencies on Triplet or Codon alphabets.";

    valarray<double> nuc_counts = get_nucleotide_counts_from_codon_counts(*triplets,observed);
    valarray<double> nuc_freqs  = triplets->getNucleotides().get_frequencies_from_counts(nuc_counts,A.n_sequences()/2);

    result = get_codon_frequencies_from_independent_nucleotide_frequencies(*triplets,nuc_freqs);
    return result;
  }

  // explicitly listed frequencies
  vector<double> listed = split<double>(request,',');

  if (listed.size() != letters.size())
    throw myexception()<<"You specified "<<listed.size()<<" frequencies, but there are "
                       <<letters.size()<<" letters of the alphabet!";

  for(int k=0; k<listed.size(); k++)
    result[k] = listed[k];

  return result;
}
Ejemplo n.º 6
0
void count_letters(const alignment& A, int c, valarray<int>& counts)
{
  const alphabet& a = A.get_alphabet();
  assert(counts.size() == a.size());

  counts = 0;
  for(int i=0;i<A.n_sequences();i++) 
  {
    int l = A(c,i);
    if (a.is_letter(l))
      counts[l]++;
  }
}
Ejemplo n.º 7
0
  void peel_leaf_branch_modulated(int b0,Likelihood_Cache& cache, const alignment& A, 
				  const Tree& T, 
				  const MatCache& transition_P,const MultiModel& MModel)
  {
    total_peel_leaf_branches++;

    const alphabet& a = A.get_alphabet();

    // The number of directed branches is twice the number of undirected branches
    const int B        = T.n_branches();

    // scratch matrix
    Matrix& S = cache.scratch(0);
    const int n_models  = S.size1();
    const int n_states  = S.size2();
    const int n_letters = a.n_letters();
    //    const int N = n_states/n_letters;
    assert(MModel.n_states() == n_states);

    //    std::clog<<"length of subA for branch "<<b0<<" is "<<length<<"\n";
    if (not subA_index_valid(A,b0))
      update_subA_index_branch(A,T,b0);

    const vector<unsigned>& smap = MModel.state_letters();

    for(int i=0;i<subA_length(A,b0);i++)
    {
      // compute the distribution at the parent node
      int l2 = A.note(0,i+1,b0);

      if (a.is_letter(l2))
	for(int m=0;m<n_models;m++) {
	  const Matrix& Q = transition_P[m][b0%B];
	  for(int s1=0;s1<n_states;s1++)
	    cache(i,b0)(m,s1) = sum(Q,smap,n_letters,s1,l2);
	}
      else if (a.is_letter_class(l2)) {
	for(int m=0;m<n_models;m++) {
	  const Matrix& Q = transition_P[m][b0%B];
	  for(int s1=0;s1<n_states;s1++)
	    cache(i,b0)(m,s1) = sum(Q,smap,s1,l2,a);
	}
      }
      else
	for(int m=0;m<n_models;m++)
	  for(int s=0;s<n_states;s++)
	    cache(i,b0)(m,s) = 1;
    }
  }
Ejemplo n.º 8
0
// Flag the columns of A for which variable_counts() accepts the per-column letter counts.
// Only what count_letters() tallies is considered, so information in ambiguous letters is ignored.
dynamic_bitset<> letter_variable_sites(const alignment& A)
{
  const alphabet& letters = A.get_alphabet();

  // scratch tallies, refilled for every column
  valarray<int> tally(0, letters.size());

  dynamic_bitset<> variable(A.length());
  for(int col=0; col<A.length(); col++)
  {
    count_letters(A,col,tally);

    if (variable_counts(tally))
      variable.set(col);
  }

  return variable;
}
Ejemplo n.º 9
0
// This function ignores information in ambiguous letters
unsigned n_letter_variable_sites(const alignment& A)
{
  const alphabet& a = A.get_alphabet();

  valarray<int> counts(0, a.size());

  unsigned n=0;
  for(int c=0; c<A.length(); c++)
  {
    count_letters(A,c,counts);
    if (variable_counts(counts))
      n++;
  }
  return n;
}
Ejemplo n.º 10
0
void count_gaps(const alignment& A, int c, valarray<int>& counts)
{
  const alphabet& a = A.get_alphabet();
  assert(counts.size() == 2);

  counts = 0;
  for(int i=0;i<A.n_sequences();i++) 
  {
    int l = A(c,i);
    if (a.is_feature(l))
      counts[0]++;
    else if (l == alphabet::gap)
      counts[1]++;
  }
}
Ejemplo n.º 11
0
/// Tally, over the whole of \a A, how many times each letter of the alphabet occurs.
/// Entries that the alphabet does not classify as letters are not counted.
valarray<double> letter_counts(const alignment& A) 
{
  const alphabet& letters = A.get_alphabet();

  valarray<double> tally(0.0, letters.size());

  for(int col=0; col<A.length(); col++)
    for(int row=0; row<A.n_sequences(); row++)
    {
      const int x = A(col,row);
      if (not letters.is_letter(x)) continue;

      tally[x] += 1.0;
    }

  return tally;
}
Ejemplo n.º 12
0
/// Build a new alignment whose i-th row is row order[i] of \a A.
///
/// Both the sequence record (seq) and the column entries are copied.  The result has
/// order.size() rows and the same alphabet and length as \a A.
alignment shuffle_alignment(const alignment& A, const vector<int>& order)
{
  const unsigned n_columns = A.length();

  alignment shuffled(A.get_alphabet(), order.size(), n_columns);

  for(int dest=0; dest<order.size(); dest++)
  {
    const int src = order[dest];
    assert(0 <= src and src < A.n_sequences());

    // copy the sequence record, then the aligned row
    shuffled.seq(dest) = A.seq(src);

    for(int col=0; col<n_columns; col++)
      shuffled(col,dest) = A(col,src);
  }

  return shuffled;
}
Ejemplo n.º 13
0
/// Build a copy of \a A in which every cell has been replaced by its complement.
///
/// The column order is left unchanged: only Nucleotides::complement() is applied to each cell.
///
/// \param A The alignment; its alphabet must be a Nucleotides alphabet.
/// \return The complemented copy of \a A.
/// \throws myexception if the alphabet of \a A is not a Nucleotides alphabet.
alignment complement(const alignment& A)
{
  const alphabet& a = A.get_alphabet();

  // Check the alphabet type with a *pointer* cast.  A failed reference dynamic_cast
  // throws std::bad_cast, which would bypass the descriptive error below.
  const Nucleotides* nuc = dynamic_cast<const Nucleotides*>(&a);

  if (not nuc)
    throw myexception()<<"Sequences have alphabet "<<a.name<<" -- reverse complement not allowed";

  owned_ptr<Nucleotides> N(*nuc);

  alignment A2 = A;

  // Complement every cell (no reversal happens here)
  for(int i=0;i<A2.n_sequences();i++) 
    for(int j=0;j<A2.length();j++)
      A2(j,i) = N->complement(A(j,i));

  return A2;
}
Ejemplo n.º 14
0
void check_leaf_sequences(const alignment& A,int n_leaves) {

  vector<sequence> sequences = A.convert_to_sequences();

  const alphabet& a = A.get_alphabet();

  for(int i=0;i<n_leaves;i++) {

    sequences[i].strip_gaps();
    if (not (a(sequences[i]) == a(A.seq(i)))) {
      cerr<<"leaf sequence "<<i<<" corrupted!\n";

      cerr<<"orig: "<<A.seq(i)<<endl;

      cerr<<"new : "<<sequences[i]<<endl;

      std::abort();
    }
  }
}
Ejemplo n.º 15
0
/// Sum, over all columns of \a A, the per-column result of the letter-level n_mutations<B>()
/// overload on tree \a T under the cost matrix \a cost.
///
/// B is the cost type; its template declaration is not visible in this excerpt.
/// Only the first T.n_leaves() rows of \a A are used.
B n_mutations(const alignment& A, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  const alphabet& alpha = A.get_alphabet();

  // Traverse toward the target node of directed branch 0.
  const int root = T.directed_branch(0).target();
  vector<const_branchview> toward_root = branches_toward_node(T,root);

  // work space shared by all columns
  vector<int> leaf_letters(T.n_leaves());
  ublas::matrix<B> scratch(T.n_nodes(), alpha.size());

  double total = 0;
  for(int col=0; col<A.length(); col++)
  {
    // collect the letters at the leaves for this column
    for(int leaf=0; leaf<T.n_leaves(); leaf++)
      leaf_letters[leaf] = A(col,leaf);

    total += n_mutations<B>(alpha,leaf_letters,T,cost,scratch,toward_root);
  }

  return total;
}
Ejemplo n.º 16
0
void check_letters_OK(const alignment& A) {
  const alphabet& a = A.get_alphabet();

  bool bad=false;
  for(int i=0;i<A.length();i++)
    for(int j=0;j<A.n_sequences();j++)
      if (A(i,j) >=0 and A(i,j) < a.size())
	; // this is a letter
      else if (A(i,j) >= a.n_letters() and A(i,j) < a.n_letter_classes())
	; // this is a letter class
      else if (A(i,j) == alphabet::gap)
	; // this is a '-'
      else if (A(i,j) == alphabet::not_gap)
	; // this is a '*'
      else if (A(i,j) == alphabet::unknown)
	; // this is a '?'
      else {
	bad = true;
	cerr<<"A("<<i<<","<<j<<") = "<<A(i,j)<<endl;
      }
  if (bad)
    std::abort();
}
Ejemplo n.º 17
0
/// Build the substitution model named by the "smodel" option for alignment \a A.
///
/// Forwards to the four-argument get_smodel() overload with the alphabet of \a A and
/// the frequencies returned by empirical_frequencies(args,A).
OwnedPointer<MultiModel> get_smodel(const variables_map& args, 
                                    const alignment& A) 
{
  const string model_name = args["smodel"].as<string>();

  const alphabet& letters = A.get_alphabet();

  return get_smodel(args, model_name, letters, empirical_frequencies(args,A));
}
Ejemplo n.º 18
0
  /// Fill the cache entries of leaf branch \a b0 for F81 sub-models, without using
  /// precomputed transition matrices.
  ///
  /// For sub-model m, with e = exp(-t * alpha_m) and t the length of directed branch b0:
  ///  - observed letter l2: every entry (m,s1) is (1-e)*pi_m[l2], and entry (m,l2) gets an extra e;
  ///  - observed letter class: every entry (m,s1) is (1-e)*(sum of F(m,l) over the letters l
  ///    matching the class), and each matching entry (m,l) gets an extra e;
  ///  - anything else: every entry is set to 1.
  ///
  /// \param b0 The directed branch to fill in.
  /// \param cache The likelihood cache that is written to; scratch matrices 0 and 1 are used.
  /// \param A The alignment; observed letters are read with A.note(0,i+1,b0).
  /// \param T The tree; supplies the branch length.
  /// \param MModel The mixture model; every base model is treated as an F81_Model.
  void peel_leaf_branch_F81(int b0,Likelihood_Cache& cache, const alignment& A, const Tree& T, 
			    const MultiModel& MModel)
  {
    total_peel_leaf_branches++;

    //    std::cerr<<"got here! (leaf)"<<endl;

    const alphabet& a = A.get_alphabet();

    // The number of directed branches is twice the number of undirected branches
    // NOTE(review): B is not used anywhere else in this function.
    const int B        = T.n_branches();

    // scratch matrix -- only consulted here for its dimensions
    Matrix& S = cache.scratch(0);
    const int n_models  = S.size1();
    const int n_states  = S.size2();
    //    const int n_letters = a.n_letters();
    assert(MModel.n_states() == n_states);

    //    std::clog<<"length of subA for branch "<<b0<<" is "<<length<<"\n";
    // make sure the sub-alignment index for this branch exists before it is used
    if (not subA_index_valid(A,b0))
      update_subA_index_branch(A,T,b0);

    //    const vector<unsigned>& smap = MModel.state_letters();

    // View each base model as an F81 model.
    // NOTE(review): static_cast performs no run-time type check, so the assert below
    // cannot detect a base model of a different type.
    vector<const F81_Model*> SubModels(n_models);
    for(int m=0;m<n_models;m++) {
      SubModels[m] = static_cast<const F81_Model*>(&MModel.base_model(m));
      assert(SubModels[m]);
    }
    const double t = T.directed_branch(b0).length();

    // exp_a_t[m] = exp(-t * alpha) for sub-model m
    valarray<double> exp_a_t(n_models);
    for(int m=0;m<n_models;m++) 
      exp_a_t[m] = exp(-t * SubModels[m]->alpha());

    // F is filled by FrequencyMatrix(); its exact contents are defined outside this excerpt.
    Matrix& F = cache.scratch(1);
    FrequencyMatrix(F,MModel); // F(m,l2)

    for(int i=0;i<subA_length(A,b0);i++)
    {
      // the letter observed at the leaf for this index
      int l2 = A.note(0,i+1,b0);

      if (a.is_letter(l2))
	for(int m=0;m<n_models;m++) {
	  const valarray<double>& pi = SubModels[m]->frequencies();
	  // every starting state s1: (1-e)*pi[l2] ...
	  for(int s1=0;s1<n_states;s1++)
	    cache(i,b0)(m,s1) = (1.0-exp_a_t[m])*pi[l2];
	  // ... plus e when the starting state is l2 itself
	  cache(i,b0)(m,l2) += exp_a_t[m];
	}
      else if (a.is_letter_class(l2)) 
      {
	for(int m=0;m<n_models;m++) 
	{
	  // total of F(m,l) over the letters l that match the class
	  double sum=0;
	  for(int l=0;l<a.size();l++)
	    if (a.matches(l,l2))
	      sum += F(m,l);
	  for(int s1=0;s1<n_states;s1++)
	    cache(i,b0)(m,s1) = (1.0-exp_a_t[m])*sum;
	  // extra e for each starting state that matches the class
	  for(int l=0;l<a.size();l++)
	    if (a.matches(l,l2))
	      cache(i,b0)(m,l) += exp_a_t[m];
	}
      }
      else
	// neither a letter nor a letter class: every entry becomes 1
	for(int m=0;m<n_models;m++)
	  for(int s=0;s<n_states;s++)
	    cache(i,b0)(m,s) = 1;
    }
  }
Ejemplo n.º 19
0
  /// Compute, for each row of the root sub-alignment index, the normalized probability of
  /// each sub-model of \a MModel.
  ///
  /// For every index row i and sub-model m, the prior-weighted state frequencies are
  /// multiplied by the cached likelihoods of the branches pointing into the root, optionally
  /// conditioned on the root's own letter, and summed over states.  The resulting values are
  /// then divided by their sum over m.
  ///
  /// \param A The alignment.
  /// \param MC The cached transition matrices, passed on to calculate_caches().
  /// \param T The tree.
  /// \param cache The likelihood cache; it is brought up to date and its scratch matrix 0 is overwritten.
  /// \param MModel The mixture model.
  /// \return A matrix with A.length() rows and MModel.n_base_models() columns.  Rows that are
  ///         not visited by the main loop keep the prior probabilities of the sub-models.
  Matrix get_rate_probabilities(const alignment& A,const MatCache& MC,const Tree& T,
				Likelihood_Cache& cache,const MultiModel& MModel)
  {
    const alphabet& a = A.get_alphabet();

    const int root = cache.root;
    
    // make sure that we are up-to-date
    calculate_caches(A,MC,T,cache,MModel);

    // declare a matrix to store our results in
    Matrix probs(A.length(),MModel.n_base_models());

    // initialize the entries to prior probability of each sub-model
    for(int m=0;m<probs.size2();m++)
      for(int c=0;c<probs.size1();c++)
	probs(c,m) = MModel.distribution()[m];

    // collect the branches that point into the root
    vector<int> rb;
    for(const_in_edges_iterator i = T[root].branches_in();i;i++)
      rb.push_back(*i);

    // get the index: entry (i,j) is read below as the cache position for root branch j
    ublas::matrix<int> index = subA_index(root,A,T);

    // scratch matrix 
    Matrix & S = cache.scratch(0);
    const int n_models = S.size1();
    const int n_states    = S.size2();

    // cache matrix of frequencies: F(m,s) = (prior of sub-model m) * (frequency of state s in m)
    Matrix F(n_models,n_states);
    for(int m=0;m<n_models;m++) {
      double p = MModel.distribution()[m];
      const valarray<double>& f = MModel.base_model(m).frequencies();
      for(int s=0;s<n_states;s++) 
	F(m,s) = f[s]*p;
    }

    const vector<unsigned>& smap = MModel.state_letters();

    for(int i=0;i<index.size1();i++) {
      // total over all sub-models, used to normalize this row
      double p_col = 0;
      for(int m=0;m<n_models;m++) {

	//-------------- Set letter & model prior probabilities  ---------------//
	for(int s=0;s<n_states;s++) 
	  S(m,s) = F(m,s);

	//-------------- Propagate and collect information at 'root' -----------//
	for(int j=0;j<rb.size();j++) {
	  int i0 = index(i,j);
	  // entries equal to alphabet::gap are skipped for that branch
	  if (i0 != alphabet::gap)
	    for(int s=0;s<n_states;s++) 
	      S(m,s) *= cache(i0,rb[j])(m,s);
	}

	//--------- If there is a letter at the root, condition on it ---------//
	// Only applies when the root is a leaf node; states whose letter does not
	// match the observed letter class are zeroed.
	if (root < T.n_leaves()) {
	  int rl = A.seq(root)[i];
	  if (a.is_letter_class(rl))
	    for(int s=0;s<n_states;s++)
	      if (not a.matches(smap[s],rl))
		S(m,s) = 0;
	}

	//--------- Sum over states to get the value for sub-model m ----------//
	probs(i,m) = 0;
	for(int s=0;s<n_states;s++)
	  probs(i,m) += S(m,s);

	// A specific model (e.g. the INV model) could be impossible
	assert(0 <= probs(i,m) and probs(i,m) <= 1.00000000001);

	p_col += probs(i,m);
      }

      // SOME model must be possible
      // NOTE(review): the assert below accepts p_col == 0, in which case the
      // normalization that follows divides by zero.
      assert(0 <= p_col and p_col <= 1.00000000001);
      for(int m=0;m<n_models;m++)
	probs(i,m) /= p_col;
    }
    return probs;
  }
Ejemplo n.º 20
0
int n_mutations(const alignment& A, const SequenceTree& T)
{
  return n_mutations<int>(A,T,unit_cost_matrix(A.get_alphabet()));
}