/// \brief Count the transitions between pairwise states along the columns of \a A.
///
/// Every column in which at least one of rows \a node1 and \a node2 is not a gap is
/// classified as states::M (neither is a gap), states::G1 (only \a node1 is a gap)
/// or states::G2 (only \a node2 is a gap).  Columns where both are gaps are skipped.
/// The walk starts in states::S, and a final transition into states::E is recorded.
///
/// \param A The alignment.
/// \param node1 The first row of \a A to compare.
/// \param node2 The second row of \a A to compare.
/// \return A 5x5 matrix whose (i,j) entry counts the transitions from state i to state j.
ublas::matrix<int> get_path_counts(const alignment& A,int node1, int node2)
{
  using namespace A2;

  ublas::matrix<int> counts(5,5);
  counts.clear();

  int previous = states::S;
  for(int column=0; column<A.length(); column++)
  {
    const bool gap1 = A.gap(column,node1);
    const bool gap2 = A.gap(column,node2);

    // A column with gaps in both rows contributes no state.
    if (gap1 and gap2) continue;

    int current = -1;
    if (gap1)
      current = states::G1;
    else if (gap2)
      current = states::G2;
    else
      current = states::M;

    counts(previous,current)++;
    previous = current;
  }

  // Close the path.
  counts(previous,states::E)++;

  return counts;
}
/// \brief Append one named sequence to \a A for every node of \a T whose index is >= T.n_leaves().
///
/// \param A The alignment (taken by value); it must have exactly T.n_leaves() sequences.
/// \param T The tree whose node labels supply the names of the added sequences.
/// \return The copy of \a A with the new sequences added.
/// \throws myexception if the number of sequences differs from the number of leaves,
///         or if one of the nodes to be added has an empty label.
alignment add_internal(alignment A,const SequenceTree& T)
{
  // The alignment and the tree must describe the same set of leaves.
  if (A.n_sequences() != T.n_leaves())
    throw myexception()<<"Number of sequence in alignment doesn't match number of leaves in tree"
		       <<"- can't add internal sequences";

  // Build a default-constructed sequence, carrying only a name, for each extra node.
  vector<sequence> internal;
  for(int node=T.n_leaves(); node<T.n_nodes(); node++)
  {
    if (T.label(node) == "")
      throw myexception()<<"Adding internal sequences: Tree has missing internal node name!";

    sequence named;
    named.name = T.label(node);
    internal.push_back(named);
  }

  A.add_sequences(internal);

  return A;
}
/// \brief Reorder the sequences of \a A so that their names follow the order given in \a names.
///
/// \param A The alignment.
/// \param names The sequence names, in the desired order.
/// \return \a A itself if its names are already in that order; otherwise the result of
///         reorder_sequences(A, mapping) for the mapping computed from the two name lists.
/// \throws bad_mapping<string> if the mapping cannot be computed.  The exception's message is
///         cleared and replaced with a description in terms of the alignment before it is
///         re-thrown.
alignment reorder_sequences(const alignment& A, const vector<string>& names)
{
  // Nothing to do if the names are already in the requested order.
  vector<string> current_names = sequence_names(A);

  if (names == current_names) return A;

  try {
    vector<int> new_order = compute_mapping(names,current_names);

    return reorder_sequences(A,new_order);
  }
  catch(bad_mapping<string>& e)
  {
    // Replace the generic mapping message with one phrased in terms of the alignment.
    e.clear();
    if (e.size2 < e.size1)
      e<<"Alignment has too few sequences! (Got "<<A.n_sequences()<<", expected "<<names.size()<<")\n";

    if (e.size1 < e.size2)
      e<<"Alignment has too many sequences! (Got "<<A.n_sequences()<<", expected "<<names.size()<<")\n";

    if (e.from == 0)
      e<<"Alignment is missing sequence \""<<e.missing<<"\".";
    else
      e<<"Alignment has extra sequence \""<<e.missing<<"\".";

    // Re-raise the exception object we just modified (caught by reference),
    // rather than throwing a copy of its static type.
    throw;
  }
}
/// \brief Find the most common character of each column, and flag the columns that are nearly constant.
///
/// \param A The alignment.
/// \param allowed_differences The largest value of (number of sequences - count of the most
///        common character) for which a column is still flagged as safe.
/// \return A pair (majority, safe) of vectors with one entry per column.  majority[c] is
///         argmax of column_count(A,c), or alphabet::gap when the count stored at index
///         a.size() exceeds 1.  safe[c] is 1 for flagged columns and 0 otherwise; a column
///         whose majority was replaced by alphabet::gap is never flagged.
std::pair<vector<int>,vector<int> > find_major_character(const alignment& A,int allowed_differences)
{
  const alphabet& a = A.get_alphabet();

  vector<int> majority(A.length(), alphabet::unknown);
  vector<int> safe(A.length(), 0);

  for(int c=0;c<majority.size();c++)
  {
    vector<int> count = column_count(A,c);
    int max_letter = argmax(count);

    // NOTE! Major character is gap if there is more than 1 gap!
    if (count[a.size()] > 1)
    {
      majority[c] = alphabet::gap;
      continue;
    }

    majority[c] = max_letter;

    const int n_different = A.n_sequences() - count[max_letter];
    if (n_different <= allowed_differences)
      safe[c] = 1;
  }

  return std::pair<vector<int>,vector<int> >(majority,safe);
}
/// Score the alignment \a al as a product of the factors selected by \a distortionType.
///
/// \param al The alignment to score.
/// \param distortionType Bit mask.  If bit 1 is set, the value of
///        prob_of_target_and_alignment_given_source_1(al,verb) is multiplied in.  If bit 2 is
///        set, one factor per position j in 1..m with al(j) != 0 is multiplied in: a probFirst
///        entry when j is the head of its cept, a probSecond entry otherwise.
/// \param verb If true, each bit-2 factor is traced to cerr.
/// \return The product, or 1E-299 if the product converts to false.
LogProb transpair_model4::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const
{
  static const LogProb almostZero = 1E-299 ;
  LogProb total = 1.0 ;

  if( distortionType&1 )
    {
      total *= prob_of_target_and_alignment_given_source_1(al,verb);
    }

  if( distortionType&2 )
    {
      for(WordIndex j=1;j<=m;j++)
	{
	  // Positions for which al(j) is zero contribute no factor.
	  if( !al(j) )
	    continue;

	  if( al.get_head(al(j))==j)
	    {
	      // j is the head of its cept: condition on the centre of the previous cept.
	      int ep=al.prev_cept(al(j));
	      float x2=probFirst[ep](j,al.get_center(ep));
	      massert(x2<=1.0);
	      total*=x2;
	      if( verb)
		cerr << "IBM-4: d=1 of " << j << ": " << x2 << " -> " << total << endl;
	    }
	  else
	    {
	      // j is not the head: condition on the previous position in the same cept.
	      float x2=probSecond(j,al.prev_in_cept(j));
	      massert(x2<=1.0);
	      total*=x2;
	      if( verb)
		cerr << "IBM-4: d>1 of " << j << ": " << x2 << " -> " << total << endl;
	    }
	}
    }

  if( total )
    return total;
  return almostZero;
}
/// \brief Load a tree and an alignment based on command line parameters. /// /// \param args The command line parameters. /// \param alignments The alignments. /// \param T The leaf-labelled tree. /// \param internal_sequences Should each resulting alignment have sequences for internal nodes on the tree? /// void load_A_and_T(const variables_map& args,alignment& A,RootedSequenceTree& T,bool internal_sequences) { A = load_A(args,internal_sequences); T = load_T(args); //------------- Link Alignment and Tree -----------------// link(A,T,internal_sequences); //---------------- Randomize alignment? -----------------// if (args.count("randomize-alignment")) A = randomize(A,T.n_leaves()); else if (args.count("unalign-all")) A = unalign_all(A,T.n_leaves()); //------------------ Analyze 'internal'------------------// if ((args.count("internal") and args["internal"].as<string>() == "+") or args.count("randomize-alignment")) for(int column=0;column< A.length();column++) { for(int i=T.n_leaves();i<A.n_sequences();i++) A.set_value(column,i, alphabet::not_gap ); } //---- Check that internal sequence satisfy constraints ----// check_alignment(A,T,internal_sequences); }
/// \brief Build an alignment of the sequences in \a A1 whose cells are laid out according to \a M.
///
/// \param M A matrix read as M(column,sequence).  A non-negative entry is an index into the
///          non-gap letters of that sequence in \a A1, and selects the letter stored in the
///          cell; a negative entry is stored in the cell as-is.
/// \param A1 The alignment that supplies the sequences.
/// \return A copy of \a A1 whose length is changed to M.size1() and whose cells are set from \a M.
alignment get_alignment(const ublas::matrix<int>& M, alignment& A1)
{
  // Collect the non-gap letters of every row of A1.
  vector<vector<int> > letters;
  for(int i=0;i<A1.n_sequences();i++)
  {
    vector<int> row;
    for(int c=0;c<A1.length();c++)
    {
      if (A1.gap(c,i)) continue;
      row.push_back(A1(c,i));
    }
    letters.push_back(row);
  }

  alignment A2 = A1;
  A2.changelength(M.size1());

  // Fill in each cell of the new alignment.
  for(int i=0;i<A2.n_sequences();i++)
    for(int c=0;c<A2.length();c++)
    {
      const int entry = M(c,i);

      int value = entry;
      if (entry >= 0)
	value = letters[i][entry];

      A2.set_value(c,i, value);
    }

  return A2;
}
/// \brief Create an alignment with randomized homology /// /// \param A An alignment containing the sequences to re-align alignment randomize(const alignment& A,int n) { if (n == -1) n = A.n_sequences(); int maxlength = -1; for(int s=0; s<n; s++) { if (A.seqlength(s) > maxlength) maxlength = A.seqlength(s); } // Choose the length of the new alignment alignment A2 = A; int newlength = int( maxlength + 2 + 0.1*maxlength); A2.changelength(newlength); // For each row of the alignment const int temp = alphabet::gap; for(int i=0; i<n; i++) { /// Collect the letters of the row vector<int> s = alignment_row_letters(A,i); /// Randomly insert gaps until the row is filled while(s.size() < newlength) { int pos = myrandom(s.size()+1); s.insert(s.begin()+pos,temp); } for(int c=0; c<A2.length(); c++) A2(c,i) = s[c]; } remove_empty_columns(A2); return A2; }
/// \brief TODOCUMENT /// /// \relates display_colourer display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< TODOCUMENT const pdb_list &arg_pdbs, ///< TODOCUMENT const str_vec &arg_names, ///< TODOCUMENT const alignment &arg_alignment ///< TODOCUMENT ) { const alignment::size_type num_entries = arg_alignment.num_entries(); const alignment::size_type aln_length = arg_alignment.length(); if ( aln_length <= 0 || num_entries <= 0 ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty")); } if ( num_entries != arg_pdbs.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs")); } if ( num_entries != arg_names.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names")); } auto &&result_spec = arg_colourer.get_colour_spec( alignment_context( arg_pdbs, arg_names, arg_alignment ) ); return has_score_colour_handler( arg_colourer ) ? adjust_display_colour_spec_copy( std::forward< decltype( result_spec ) >( result_spec ), get_score_colour_handler( arg_colourer ), arg_alignment ) : result_spec; }
/// \brief Load an alignment based on command line parameters and generate a random tree. /// /// \param args The command line parameters. /// \param alignments The alignments. /// \param T The leaf-labelled tree. /// \param internal_sequences Should each resulting alignment have sequences for internal nodes on the tree? /// void load_A_and_random_T(const variables_map& args,alignment& A,SequenceTree& T,bool internal_sequences) { // NO internal sequences, yet! A = load_A(args,internal_sequences); //------------- Load random tree ------------------------// SequenceTree TC = star_tree(sequence_names(A)); if (args.count("t-constraint")) TC = load_constraint_tree(args["t-constraint"].as<string>(),sequence_names(A)); T = TC; RandomTree(T,1.0); //------------- Link Alignment and Tree -----------------// link(A,T,internal_sequences); //---------------- Randomize alignment? -----------------// if (args.count("randomize-alignment")) A = randomize(A,T.n_leaves()); //------------------ Analyze 'internal'------------------// if ((args.count("internal") and args["internal"].as<string>() == "+") or args.count("randomize-alignment")) for(int column=0;column< A.length();column++) { for(int i=T.n_leaves();i<A.n_sequences();i++) A(column,i) = alphabet::not_gap; } //---- Check that internal sequence satisfy constraints ----// check_alignment(A,T,internal_sequences); }
/// Is there a sequence of \a A that is not a gap in both column \a c1 and column \a c2?
bool intersect(int c1, int c2, const alignment& A)
{
  for(int row=0; row<A.n_sequences(); row++)
  {
    // Skip rows that have a gap in either column.
    if (A.gap(c1,row) or A.gap(c2,row)) continue;

    return true;
  }

  return false;
}
/// Count the sequences of \a A for which A.character(column,i) is true.
int n_characters(const alignment& A, int column)
{
  int total = 0;

  for(int row=0; row<A.n_sequences(); row++)
  {
    if (not A.character(column,row)) continue;
    total++;
  }

  return total;
}
/// Collect, in column order, the entries A(c,i) of row \a i for which A.character(c,i) is true.
vector<int> alignment_row_letters(const alignment& A, int i)
{
  vector<int> letters;

  for(int column=0; column<A.length(); column++)
  {
    if (not A.character(column,i)) continue;

    letters.push_back(A(column,i));
  }

  return letters;
}
/// \brief TODOCUMENT float_score_type score_colour_handler::get_score_of_postion(const alignment &arg_alignment, ///< TODOCUMENT const size_t &arg_entry, ///< TODOCUMENT const size_t &arg_index ///< TODOCUMENT ) const { const bool using_scores = show_scores_if_present && arg_alignment.is_scored(); const bool using_this_score = using_scores && has_score( arg_alignment.get_alignment_residue_scores(), arg_entry, arg_index ); return using_this_score ? get_score( arg_alignment.get_alignment_residue_scores(), arg_entry, arg_index, ! scores_to_equivs, normalise_scores ) : 1.0; }
void check_names_unique(const alignment& A) { // check that names are all unique for(int i=0;i<A.n_sequences();i++) { for(int j=0;j<i;j++) if (A.seq(i).name == A.seq(j).name) throw myexception()<<"Sequence name '"<<A.seq(i).name<<"' occurs multiple times in the alignment!"; } }
bool names_are_unique(const alignment& A) { // check that names are all unique for(int i=0;i<A.n_sequences();i++) for(int j=0;j<i;j++) if (A.seq(i).name == A.seq(j).name) return false; return true; }
void transpair_model4::computeScores(const alignment&al,vector<double>&d)const { LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ; total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0))); for (WordIndex i = 1 ; i <= al.fert(0) ; i++) total1 *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ; for (WordIndex i = 1 ; i <= l ; i++) total2 *= get_fertility(i, al.fert(i));// * (LogProb) factorial(al.fert(i)); for (WordIndex j = 1 ; j <= m ; j++) total3*= get_t(al(j), j) ; for(WordIndex j=1;j<=m;j++) if( al(j) ) if( al.get_head(al(j))==j) { int ep=al.prev_cept(al(j)); float x2=probFirst[ep](j,al.get_center(ep)); total4*=x2; } else { float x2=probSecond(j,al.prev_in_cept(j)); total4*=x2; } d.push_back(total1);//9 d.push_back(total2);//10 d.push_back(total3);//11 d.push_back(total4);//12 }
/// Count the number of times the letter with index \a l occurs in \a A. int letter_count(const alignment& A,int l) { // Count the occurrence of the different letters int count=0; for(int i=0;i<A.length();i++) for(int j=0;j<A.n_sequences();j++) if (A(i,j) == l) count++; return count; }
/// Count the columns of \a A in which A.character() is true for both sequence \a s1 and sequence \a s2.
unsigned n_homologous(const alignment& A,int s1,int s2)
{
  unsigned shared = 0;

  for(int column=0; column<A.length(); column++)
  {
    // Skip columns where either sequence lacks a character.
    if (not A.character(column,s1)) continue;
    if (not A.character(column,s2)) continue;

    shared++;
  }

  return shared;
}
/// \brief Compute the fraction of jointly non-gap columns in which two sequences agree.
///
/// Only columns in which neither \a s1 nor \a s2 is a gap are considered.
///
/// \param A The alignment.
/// \param s1 The first sequence.
/// \param s2 The second sequence.
/// \return (number of considered columns with A(column,s1) == A(column,s2)) divided by the
///         number of considered columns.  If no column is considered, this is the
///         floating-point quotient 0.0/0 (not a number under IEEE arithmetic).
double getSimilarity(const alignment& A,int s1,int s2)
{
  int n_compared = 0;
  int n_equal = 0;

  for(int column=0; column<A.length(); column++)
  {
    const bool both_present = not (A.gap(column,s1) or A.gap(column,s2));
    if (not both_present) continue;

    n_compared++;

    if (A(column,s1) == A(column,s2))
      n_equal++;
  }

  return double(n_equal)/n_compared;
}
// FIXME - should perhaps also check names? // use this function in alignment-gild, alignment-compare, alignment-diff, etc. void check_same_sequence_lengths(const vector<int>& L, const alignment& A) { if (A.n_sequences() != L.size()) throw myexception()<<"Expected alignment has "<<L.size()<<", but this one has "<<A.n_sequences(); for(int i=0;i<L.size();i++) { int L2 = A.seqlength(i); if (L[i] != L2) throw myexception()<<"Sequence "<<i+1<<": length "<<L2<<" differs from expected length "<<L[i]; } }
/// \brief Find the columns of \a A whose gap counts are reported as variable.
///
/// \return A bitset with one bit per column; bit c is set when variable_counts() is true
///         for the two counts that count_gaps() produces for column c.
dynamic_bitset<> gap_variable_sites(const alignment& A)
{
  dynamic_bitset<> columns(A.length());

  // Scratch space that count_gaps() fills in for each column.
  valarray<int> counts(0, 2);

  for(int column=0; column<A.length(); column++)
  {
    count_gaps(A,column,counts);

    if (not variable_counts(counts)) continue;

    columns[column] = true;
  }

  return columns;
}
/// Check that internal node states are consistent void check_internal_nodes_connected(const alignment& A,const Tree& T,const vector<int>& ignore) { for(int column=0;column<A.length();column++) { dynamic_bitset<> present(T.n_nodes()); for(int i=0;i<T.n_nodes();i++) present[i] = not A.gap(column,i); if (not all_characters_connected(T,present,ignore)) { cerr<<"Internal node states are inconsistent in column "<<column<<endl; cerr<<A<<endl; throw myexception()<<"Internal node states are inconsistent in column "<<column; } } }
/// Check that internal nodes don't have letters (or anything wierder!) void check_internal_sequences_composition(const alignment& A,int n_leaves) { for(int column=0;column<A.length();column++) for(int i=n_leaves;i<A.n_sequences();i++) if (A(column,i) == alphabet::gap) ; else if (A(column,i) == alphabet::not_gap) ; else throw myexception()<<"Found a illegal index "<<A(column,i) <<"in column "<<column<<" of internal sequence '" <<A.seq(i).name<<"': only - and * are allowed"; }
/// \brief Compare the presence patterns of columns \a c1 and \a c2, visiting sequences in the order \a nodes.
///
/// \return For the first sequence in \a nodes that is a gap in exactly one of the two columns:
///         true if it is present in \a c2 (and a gap in \a c1), false if it is present in
///         \a c1 (and a gap in \a c2).  false if there is no such sequence.
bool after(int c1, int c2, const alignment& A,const vector<int>& nodes)
{
  assert(nodes.size() == A.n_sequences());

  for(int i=0;i<nodes.size();i++)
  {
    const int node = nodes[i];

    const bool in_c1 = not A.gap(c1,node);
    const bool in_c2 = not A.gap(c2,node);

    // Sequences that are present in both columns, or in neither, decide nothing.
    if (in_c1 == in_c2) continue;

    return in_c2;
  }

  return false;
}
/// \brief Count how often each letter of the alphabet occurs in column \a c of \a A.
///
/// \param A The alignment.
/// \param c The column.
/// \param counts Receives the counts, indexed by letter; its size must equal the size of the
///        alphabet.  Entries for which the alphabet's is_letter() is false are not counted.
void count_letters(const alignment& A, int c, valarray<int>& counts)
{
  const alphabet& a = A.get_alphabet();
  assert(counts.size() == a.size());

  // Start from zero for every letter.
  counts = 0;

  for(int row=0; row<A.n_sequences(); row++)
  {
    const int entry = A(c,row);

    if (not a.is_letter(entry)) continue;

    counts[entry]++;
  }
}
/// \brief Estimate the empirical frequencies of different letters from the alignment, with pseudocounts /// /// \param args The command line parameters. /// \param A The alignment. /// valarray<double> empirical_frequencies(const variables_map& args,const alignment& A) { const alphabet& a = A.get_alphabet(); // Count the occurrence of the different letters valarray<double> counts = letter_counts(A); valarray<double> frequencies(a.size()); // empirical frequencies frequencies = A.get_alphabet().get_frequencies_from_counts(counts,chop_internal(A).n_sequences()); return frequencies; }
/// \brief TODOCUMENT alignment_split_list cath::align::detail::get_preexisting_alignment_splits(const alignment &arg_alignment ///< TODOCUMENT ) { const size_t num_entries = arg_alignment.num_entries(); const size_t aln_length = arg_alignment.length(); alignment_split_list new_alignment_splits; for (size_t aln_index = 0; aln_index < aln_length; ++aln_index) { const size_vec present_positions = entries_present_at_index( arg_alignment, aln_index ); const alignment_split multi_split = make_alignment_split( present_positions, num_entries ); if ( is_valid_split( multi_split) ) { new_alignment_splits.insert( get_least_version( multi_split ) ); } } return new_alignment_splits; }
unsigned n_with_identity(const alignment& A,int s1,int s2,double I) { // Get matches vector<int> F(A.length()+1); unsigned L=0; unsigned T = 0; F[0]=0; for(int i=0;i<A.length();i++) { if (not A.character(i,s1) and not A.character(i,s2)) continue; L++; if (A(i,s1) == A(i,s2)) T++; F[L] = T; } F.resize(L+1); // Get positions vector<int> FI(T+1); FI[0]=0; for(int i=0;i<L;i++) if (F[i+1] > F[i]) FI[F[i+1]] = i+1; // tag positions that vector<int> tagged(L,0); const unsigned w = 4; for(int i=1;i<=T;i++) { for(int j=20;j>=w;j--) { int i2 = i+j; if (i2 > T) continue; assert(FI[i] > 0 and FI[i] <=L); assert(FI[i2] > 0 and FI[i2] <=L); assert(FI[i2] > FI[i]); if (double(i2-i+1)/(FI[i2]-FI[i]+1) > I) { for(int k=FI[i];k<=FI[i2];k++) tagged[k-1]=1; break; } } } return sum(tagged); }
/// \brief Check that \a A1 and \a A2 agree on every sequence that is not ignored.
///
/// \param A1 The first alignment.
/// \param A2 The second alignment; must have as many sequences as \a A1.
/// \param ignore One bit per sequence (at least); sequences whose bit is set are skipped.
/// \return false if a non-ignored sequence has a different seqlength() in the two alignments,
///         or if A_match() fails for some column of \a A1 and some pair of non-ignored
///         sequences; true otherwise.
bool A_constant(alignment A1, alignment A2, const dynamic_bitset<>& ignore)
{
  assert(A1.n_sequences() == A2.n_sequences());

  // equality holds if we have internal node sequences -- otherwise ignore is larger
  assert(A1.n_sequences() <= ignore.size());

  // convert to feature-number notation
  ublas::matrix<int> M1 = M(A1);
  ublas::matrix<int> M2 = M(A2);

  // lookup and cache the column each feature is in
  vector< vector< int> > column_indices = column_lookup(A2);

  //----- Check that the sequence lengths match ------//
  for(int i=0;i<M1.size2();i++)
  {
    if (not ignore[i] and A1.seqlength(i) != A2.seqlength(i))
      return false;
  }

  //----- Check that each homology in A1 is in A2 -----//
  for(int column=0; column<A1.length(); column++)
  {
    for(int s1=0; s1 < A1.n_sequences(); s1++)
    {
      if (ignore[s1]) continue;

      for(int s2=s1+1; s2 < A1.n_sequences(); s2++)
      {
	if (ignore[s2]) continue;

	const bool matches = A_match(M1,column,s1,s2,M2,column_indices);
	if (not matches)
	  return false;
      }
    }
  }

  return true;
}