/// \brief Estimate the empirical frequencies of different letters from the alignment, with pseudocounts /// /// \param args The command line parameters. /// \param A The alignment. /// valarray<double> empirical_frequencies(const variables_map& args,const alignment& A) { const alphabet& a = A.get_alphabet(); // Count the occurrence of the different letters valarray<double> counts = letter_counts(A); valarray<double> frequencies(a.size()); // empirical frequencies frequencies = A.get_alphabet().get_frequencies_from_counts(counts,chop_internal(A).n_sequences()); return frequencies; }
/// Tally the contents of column \a c of \a A.
///
/// The result has one slot per alphabet letter, followed by one extra slot
/// (the last one) that accumulates gap and unknown characters.
vector<int> column_count(const alignment& A, int c)
{
  const alphabet& a = A.get_alphabet();

  vector<int> tally(a.size()+1, 0);

  for(int row=0; row<A.n_sequences(); row++)
  {
    const int x = A(c,row);

    // proper letters are indexed by their own code
    if (a.is_letter(x))
      tally[x]++;

    // gaps and unknowns share the trailing slot
    if (x == alphabet::gap or x == alphabet::unknown)
      tally.back()++;
  }

  return tally;
}
void peel_branch(int b0,Likelihood_Cache& cache, const alignment& A, const Tree& T, const MatCache& transition_P, const MultiModel& MModel) { total_peel_branches++; // compute branches-in int bb = T.directed_branch(b0).branches_before().size(); if (bb == 0) { int n_states = cache.scratch(0).size2(); int n_letters = A.get_alphabet().n_letters(); if (n_states == n_letters) { if (dynamic_cast<const F81_Model*>(&MModel.base_model(0))) peel_leaf_branch_F81(b0, cache, A, T, MModel); else peel_leaf_branch(b0, cache, A, T, transition_P, MModel); } else peel_leaf_branch_modulated(b0, cache, A, T, transition_P, MModel); } else if (bb == 2) { if (dynamic_cast<const F81_Model*>(&MModel.base_model(0))) peel_internal_branch_F81(b0, cache, A, T, MModel); else peel_internal_branch(b0, cache, A, T, transition_P, MModel); } else std::abort(); cache.validate_branch(b0); }
/// For each column, find the most common entry and decide whether the column is "safe".
///
/// \param A The alignment.
/// \param allowed_differences Maximum number of sequences that may differ
///        from the majority entry for the column to be flagged safe.
/// \return first: per-column majority entry (alphabet::gap if the column
///         holds more than one gap/unknown); second: per-column 0/1 safe flag.
std::pair<vector<int>,vector<int> > find_major_character(const alignment& A,int allowed_differences)
{
  const alphabet& a = A.get_alphabet();

  vector<int> consensus(A.length(), alphabet::unknown);
  vector<int> is_safe(A.length(), 0);

  for(int col=0; col<consensus.size(); col++)
  {
    vector<int> tally = column_count(A,col);
    const int best = argmax(tally);

    // NOTE! Major character is gap if there is more than 1 gap!
    // (the trailing slot of the tally holds the gap/unknown count)
    if (tally[a.size()] > 1)
      consensus[col] = alphabet::gap;
    else
    {
      consensus[col] = best;

      if (A.n_sequences() - tally[best] <= allowed_differences)
	is_safe[col] = 1;
    }
  }

  std::pair<vector<int>,vector<int> > result(consensus, is_safe);
  return result;
}
/// \brief Estimate the empirical frequencies of different letters from the alignment, with pseudocounts /// /// \param args The command line parameters. /// \param A The alignment. /// valarray<double> empirical_frequencies(const variables_map& args,const alignment& A) { const alphabet& a = A.get_alphabet(); // Count the occurrence of the different letters valarray<double> counts = letter_counts(A); valarray<double> frequencies(a.size()); // empirical frequencies if (not args.count("frequencies")) frequencies = A.get_alphabet().get_frequencies_from_counts(counts,A.n_sequences()/2); // uniform frequencies else if (args["frequencies"].as<string>() == "uniform") frequencies = 1.0/a.size(); // triplet frequencies <- nucleotide frequencies else if (args["frequencies"].as<string>() == "nucleotides") { const Triplets* T = dynamic_cast<const Triplets*>(&a); if (not T) throw myexception()<<"You can only specify nucleotide frequencies on Triplet or Codon alphabets."; valarray<double> N_counts = get_nucleotide_counts_from_codon_counts(*T,counts); valarray<double> fN = T->getNucleotides().get_frequencies_from_counts(N_counts,A.n_sequences()/2); frequencies = get_codon_frequencies_from_independent_nucleotide_frequencies(*T,fN); } // specified frequencies else { vector<double> f = split<double>(args["frequencies"].as<string>(),','); if (f.size() != a.size()) throw myexception()<<"You specified "<<f.size()<<" frequencies, but there are " <<a.size()<<" letters of the alphabet!"; for(int i=0;i<f.size();i++) frequencies[i] = f[i]; } return frequencies; }
/// Count the letters in column \a c of \a A into \a counts.
///
/// \a counts must already have one slot per alphabet letter; it is zeroed
/// first.  Entries that are not letters are skipped.
void count_letters(const alignment& A, int c, valarray<int>& counts)
{
  const alphabet& a = A.get_alphabet();
  assert(counts.size() == a.size());

  counts = 0;

  for(int row=0; row<A.n_sequences(); row++)
  {
    const int x = A(c,row);

    if (not a.is_letter(x)) continue;

    counts[x]++;
  }
}
void peel_leaf_branch_modulated(int b0,Likelihood_Cache& cache, const alignment& A, const Tree& T, const MatCache& transition_P,const MultiModel& MModel) { total_peel_leaf_branches++; const alphabet& a = A.get_alphabet(); // The number of directed branches is twice the number of undirected branches const int B = T.n_branches(); // scratch matrix Matrix& S = cache.scratch(0); const int n_models = S.size1(); const int n_states = S.size2(); const int n_letters = a.n_letters(); // const int N = n_states/n_letters; assert(MModel.n_states() == n_states); // std::clog<<"length of subA for branch "<<b0<<" is "<<length<<"\n"; if (not subA_index_valid(A,b0)) update_subA_index_branch(A,T,b0); const vector<unsigned>& smap = MModel.state_letters(); for(int i=0;i<subA_length(A,b0);i++) { // compute the distribution at the parent node int l2 = A.note(0,i+1,b0); if (a.is_letter(l2)) for(int m=0;m<n_models;m++) { const Matrix& Q = transition_P[m][b0%B]; for(int s1=0;s1<n_states;s1++) cache(i,b0)(m,s1) = sum(Q,smap,n_letters,s1,l2); } else if (a.is_letter_class(l2)) { for(int m=0;m<n_models;m++) { const Matrix& Q = transition_P[m][b0%B]; for(int s1=0;s1<n_states;s1++) cache(i,b0)(m,s1) = sum(Q,smap,s1,l2,a); } } else for(int m=0;m<n_models;m++) for(int s=0;s<n_states;s++) cache(i,b0)(m,s) = 1; } }
// This function ignores information in ambiguous letters dynamic_bitset<> letter_variable_sites(const alignment& A) { const alphabet& a = A.get_alphabet(); valarray<int> counts(0, a.size()); dynamic_bitset<> columns(A.length()); for(int c=0; c<A.length(); c++) { count_letters(A,c,counts); if (variable_counts(counts)) columns[c] = true; } return columns; }
// This function ignores information in ambiguous letters unsigned n_letter_variable_sites(const alignment& A) { const alphabet& a = A.get_alphabet(); valarray<int> counts(0, a.size()); unsigned n=0; for(int c=0; c<A.length(); c++) { count_letters(A,c,counts); if (variable_counts(counts)) n++; } return n; }
/// Count features and gaps in column \a c of \a A.
///
/// \a counts must have exactly two slots and is zeroed first.  On return
/// counts[0] is the number of entries for which is_feature() holds and
/// counts[1] is the number of remaining entries equal to alphabet::gap.
void count_gaps(const alignment& A, int c, valarray<int>& counts)
{
  const alphabet& a = A.get_alphabet();
  assert(counts.size() == 2);

  counts = 0;

  for(int row=0; row<A.n_sequences(); row++)
  {
    const int x = A(c,row);

    if (a.is_feature(x))
      counts[0]++;
    else if (x == alphabet::gap)
      counts[1]++;
  }
}
/// Compute the number of times each letter of the alphabet occurs in \a A. valarray<double> letter_counts(const alignment& A) { const alphabet& a = A.get_alphabet(); // Count the occurrence of the different letters valarray<double> counts(0.0, a.size()); for(int i=0;i<A.length();i++) { for(int j=0;j<A.n_sequences();j++) { if (a.is_letter(A(i,j))) counts[A(i,j)]++; } } return counts; }
/// Build a new alignment whose row i is row order[i] of \a A.
///
/// The result has order.size() rows and the same number of columns as \a A.
/// Both the sequence record and the column entries are copied per row.
alignment shuffle_alignment(const alignment& A, const vector<int>& order)
{
  unsigned n_columns = A.length();

  alignment shuffled(A.get_alphabet(), order.size(), n_columns);

  for(int dest=0; dest<order.size(); dest++)
  {
    const int src = order[dest];
    assert(0 <= src and src < A.n_sequences());

    shuffled.seq(dest) = A.seq(src);

    for(int col=0; col<n_columns; col++)
      shuffled(col,dest) = A(col,src);
  }

  return shuffled;
}
/// Return a copy of \a A in which every entry is replaced by its complement.
///
/// The column order is left untouched; only the entries change.
///
/// \param A An alignment whose alphabet must be a Nucleotides alphabet.
/// \throws myexception if the alphabet of \a A is not a Nucleotides alphabet.
alignment complement(const alignment& A)
{
  const alphabet& a = A.get_alphabet();

  // Cast through a pointer: a failed dynamic_cast to a *reference* throws
  // std::bad_cast, which would bypass the diagnostic below.
  const Nucleotides* nuc = dynamic_cast<const Nucleotides*>(&a);
  if (not nuc)
    throw myexception()<<"Sequences have alphabet "<<a.name<<" -- reverse complement not allowed";

  owned_ptr<Nucleotides> N(*nuc);

  alignment A2 = A;

  // Complement every entry in place
  for(int i=0;i<A2.n_sequences();i++)
    for(int j=0;j<A2.length();j++)
      A2(j,i) = N->complement(A(j,i));

  return A2;
}
void check_leaf_sequences(const alignment& A,int n_leaves) { vector<sequence> sequences = A.convert_to_sequences(); const alphabet& a = A.get_alphabet(); for(int i=0;i<n_leaves;i++) { sequences[i].strip_gaps(); if (not (a(sequences[i]) == a(A.seq(i)))) { cerr<<"leaf sequence "<<i<<" corrupted!\n"; cerr<<"orig: "<<A.seq(i)<<endl; cerr<<"new : "<<sequences[i]<<endl; std::abort(); } } }
/// Sum, over all columns of \a A, the per-column value returned by the
/// column-level n_mutations<B>() on tree \a T with substitution costs \a cost.
///
/// B is the cost type; it is presumably a template parameter declared just
/// ahead of this definition (not visible here).  The per-column values are
/// accumulated in a double and converted to B on return.
B n_mutations(const alignment& A, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  const alphabet& a = A.get_alphabet();

  // leaf characters of the column currently being scored
  vector<int> column(T.n_leaves());

  // root the traversal at the target of directed branch 0
  const int root = T.directed_branch(0).target();
  vector<const_branchview> branches = branches_toward_node(T,root);

  // workspace reused across columns
  ublas::matrix<B> workspace(T.n_nodes(), a.size());

  double total = 0;

  for(int c=0; c<A.length(); c++)
  {
    for(int leaf=0; leaf<T.n_leaves(); leaf++)
      column[leaf] = A(c,leaf);

    const double column_cost = n_mutations<B>(a,column,T,cost,workspace,branches);

    total += column_cost;
  }

  return total;
}
void check_letters_OK(const alignment& A) { const alphabet& a = A.get_alphabet(); bool bad=false; for(int i=0;i<A.length();i++) for(int j=0;j<A.n_sequences();j++) if (A(i,j) >=0 and A(i,j) < a.size()) ; // this is a letter else if (A(i,j) >= a.n_letters() and A(i,j) < a.n_letter_classes()) ; // this is a letter class else if (A(i,j) == alphabet::gap) ; // this is a '-' else if (A(i,j) == alphabet::not_gap) ; // this is a '*' else if (A(i,j) == alphabet::unknown) ; // this is a '?' else { bad = true; cerr<<"A("<<i<<","<<j<<") = "<<A(i,j)<<endl; } if (bad) std::abort(); }
/// Build the substitution model named by the "smodel" option in \a args.
///
/// Convenience wrapper: reads the model name from \a args and forwards to the
/// four-argument get_smodel(), passing the alphabet of \a A and the
/// frequencies returned by empirical_frequencies(args,A).
///
/// \param args The command line parameters; "smodel" is read unconditionally.
/// \param A The alignment.
OwnedPointer<MultiModel> get_smodel(const variables_map& args, const alignment& A) { string smodel_name = args["smodel"].as<string>(); return get_smodel(args,smodel_name,A.get_alphabet(),empirical_frequencies(args,A)); }
void peel_leaf_branch_F81(int b0,Likelihood_Cache& cache, const alignment& A, const Tree& T, const MultiModel& MModel) { total_peel_leaf_branches++; // std::cerr<<"got here! (leaf)"<<endl; const alphabet& a = A.get_alphabet(); // The number of directed branches is twice the number of undirected branches const int B = T.n_branches(); // scratch matrix Matrix& S = cache.scratch(0); const int n_models = S.size1(); const int n_states = S.size2(); // const int n_letters = a.n_letters(); assert(MModel.n_states() == n_states); // std::clog<<"length of subA for branch "<<b0<<" is "<<length<<"\n"; if (not subA_index_valid(A,b0)) update_subA_index_branch(A,T,b0); // const vector<unsigned>& smap = MModel.state_letters(); vector<const F81_Model*> SubModels(n_models); for(int m=0;m<n_models;m++) { SubModels[m] = static_cast<const F81_Model*>(&MModel.base_model(m)); assert(SubModels[m]); } const double t = T.directed_branch(b0).length(); valarray<double> exp_a_t(n_models); for(int m=0;m<n_models;m++) exp_a_t[m] = exp(-t * SubModels[m]->alpha()); Matrix& F = cache.scratch(1); FrequencyMatrix(F,MModel); // F(m,l2) for(int i=0;i<subA_length(A,b0);i++) { // compute the distribution at the parent node int l2 = A.note(0,i+1,b0); if (a.is_letter(l2)) for(int m=0;m<n_models;m++) { const valarray<double>& pi = SubModels[m]->frequencies(); for(int s1=0;s1<n_states;s1++) cache(i,b0)(m,s1) = (1.0-exp_a_t[m])*pi[l2]; cache(i,b0)(m,l2) += exp_a_t[m]; } else if (a.is_letter_class(l2)) { for(int m=0;m<n_models;m++) { double sum=0; for(int l=0;l<a.size();l++) if (a.matches(l,l2)) sum += F(m,l); for(int s1=0;s1<n_states;s1++) cache(i,b0)(m,s1) = (1.0-exp_a_t[m])*sum; for(int l=0;l<a.size();l++) if (a.matches(l,l2)) cache(i,b0)(m,l) += exp_a_t[m]; } } else for(int m=0;m<n_models;m++) for(int s=0;s<n_states;s++) cache(i,b0)(m,s) = 1; } }
/// For each column indexed at the root, compute the normalized probability
/// of each sub-model of \a MModel.
///
/// \return A matrix with A.length() rows and one column per base model.
///         Rows covered by the root index hold per-model probabilities
///         divided by their sum over models; any remaining rows keep the
///         prior probability of each sub-model.
Matrix get_rate_probabilities(const alignment& A,const MatCache& MC,const Tree& T,
			      Likelihood_Cache& cache,const MultiModel& MModel)
{
  const alphabet& a = A.get_alphabet();

  const int root = cache.root;

  // make sure that we are up-to-date
  calculate_caches(A,MC,T,cache,MModel);

  // the matrix the results are stored in
  Matrix probs(A.length(),MModel.n_base_models());

  // initialize the entries to the prior probability of each sub-model
  for(int model=0; model<probs.size2(); model++)
    for(int col=0; col<probs.size1(); col++)
      probs(col,model) = MModel.distribution()[model];

  // collect the branches pointing into the root
  vector<int> root_branches;
  for(const_in_edges_iterator e = T[root].branches_in();e;e++)
    root_branches.push_back(*e);

  // get the index
  ublas::matrix<int> index = subA_index(root,A,T);

  // scratch matrix
  Matrix & S = cache.scratch(0);
  const int n_models = S.size1();
  const int n_states = S.size2();

  // F(m,s) = (prior of sub-model m) * (frequency of state s under m)
  Matrix F(n_models,n_states);
  for(int m=0; m<n_models; m++)
  {
    double prior = MModel.distribution()[m];
    const valarray<double>& pi = MModel.base_model(m).frequencies();
    for(int s=0; s<n_states; s++)
      F(m,s) = pi[s]*prior;
  }

  const vector<unsigned>& smap = MModel.state_letters();

  for(int row=0; row<index.size1(); row++)
  {
    double p_col = 0;

    for(int m=0; m<n_models; m++)
    {
      //-------------- Set letter & model prior probabilities ---------------//
      for(int s=0; s<n_states; s++)
	S(m,s) = F(m,s);

      //-------------- Propagate and collect information at 'root' -----------//
      for(int k=0; k<root_branches.size(); k++)
      {
	int i0 = index(row,k);
	if (i0 == alphabet::gap) continue;

	for(int s=0; s<n_states; s++)
	  S(m,s) *= cache(i0,root_branches[k])(m,s);
      }

      //--------- If there is a letter at the root, condition on it ---------//
      // NOTE(review): this reads the root's stored sequence at position
      // 'row' of the index -- confirm that this is the intended coordinate.
      if (root < T.n_leaves())
      {
	int rl = A.seq(root)[row];
	if (a.is_letter_class(rl))
	  for(int s=0; s<n_states; s++)
	    if (not a.matches(smap[s],rl))
	      S(m,s) = 0;
      }

      //------------------------ Sum over the states -------------------------//
      probs(row,m) = 0;
      for(int s=0; s<n_states; s++)
	probs(row,m) += S(m,s);

      // A specific model (e.g. the INV model) could be impossible
      assert(0 <= probs(row,m) and probs(row,m) <= 1.00000000001);

      p_col += probs(row,m);
    }

    // SOME model must be possible
    assert(0 <= p_col and p_col <= 1.00000000001);

    // normalize across models
    for(int m=0; m<n_models; m++)
      probs(row,m) /= p_col;
  }

  return probs;
}
/// Integer-cost convenience wrapper around n_mutations<int>().
///
/// Forwards \a A and \a T together with the matrix returned by
/// unit_cost_matrix() for the alphabet of \a A.
int n_mutations(const alignment& A, const SequenceTree& T) { return n_mutations<int>(A,T,unit_cost_matrix(A.get_alphabet())); }