/// Correction term associated with the internal node nodes[0].
///
/// @param P      data partition being sampled.
/// @param nodes  node list; only nodes[0] (the internal node) is read.
/// @return 1 when the alignment is not variable; otherwise the square of
///         P.sequence_length_pr() evaluated at the length of nodes[0].
log_double_t correction(const data_partition& P,const vector<int>& nodes)
{
    // A fixed alignment needs no correction at all.
    if (not P.variable_alignment())
        return 1;

    // Length of the sequence at the internal node.
    const int internal_length = P.seqlength(nodes[0]);

    return pow(P.sequence_length_pr(internal_length), 2);
}
/// Build the combined bit-path for the three branches around nodes[0].
///
/// The pairwise alignments on the branches (nodes[1],nodes[0]),
/// (nodes[0],nodes[2]) and (nodes[0],nodes[3]) are each converted to bitmask
/// form and then glued together with Glue_A.
///
/// @param P      data partition holding the tree and pairwise alignments.
/// @param nodes  nodes[0] is the centre; nodes[1..3] are its neighbours.
/// @return the glued bitmask path.
vector<HMM::bitmask_t> get_bitpath(const data_partition& P, const vector<int>& nodes)
{
    auto tree = P.t();

    // Branch indices: the first points INTO nodes[0], the other two point AWAY.
    const int branch1 = tree.find_branch(nodes[1], nodes[0]);
    const int branch2 = tree.find_branch(nodes[0], nodes[2]);
    const int branch3 = tree.find_branch(nodes[0], nodes[3]);

    // The two integer arguments are the bit positions passed to convert_to_bits
    // for the two ends of each branch (nodes[0] always gets bit 3).
    vector<HMM::bitmask_t> bits1 = convert_to_bits(P.get_pairwise_alignment(branch1), 0, 3);
    vector<HMM::bitmask_t> bits2 = convert_to_bits(P.get_pairwise_alignment(branch2), 3, 1);
    vector<HMM::bitmask_t> bits3 = convert_to_bits(P.get_pairwise_alignment(branch3), 3, 2);

    // Glue 2 and 3 first, then attach 1.
    vector<HMM::bitmask_t> bits23 = Glue_A(bits2, bits3);
    return Glue_A(bits1, bits23);
}
/// Scale factor computed from the sequence lengths at the internal nodes.
///
/// Starting from 1, the result is divided by lengthp(l)^2 for every node with
/// index in [T.n_leaves(), T.n_nodes()), where l is that node's sequence length.
///
/// @param P  data partition; must have an indel model (asserted in debug builds).
/// @return the accumulated scale factor.
efloat_t prior_HMM_rootless_scale(const data_partition& P)
{
    const Tree& T = *P.T;

#ifndef NDEBUG
    assert(P.has_IModel());
    check_internal_nodes_connected(*P.A,T);
#endif

    efloat_t scale = 1;

    // Nodes numbered from n_leaves() upward are the non-leaf nodes.
    for(int node = T.n_leaves(); node < T.n_nodes(); node++)
    {
        const int node_length = P.seqlength(node);
        const efloat_t length_pr = P.IModel().lengthp(node_length);

        scale /= (length_pr*length_pr);
    }

    return scale;
}
/// Probability of a multiple alignment if branch alignments independant efloat_t prior_HMM_nogiven(const data_partition& P) { const alignment& A = *P.A; const Tree& T = *P.T; #ifndef NDEBUG assert(P.has_IModel()); check_internal_nodes_connected(A,T); #endif efloat_t Pr = 1; for(int b=0;b<T.n_branches();b++) { int target = T.branch(b).target(); int source = T.branch(b).source(); Pr *= prior_branch(A, P.branch_HMMs[b], target, source); } return Pr; }
/// Resample the sequence at internal node nodes[0] given its three neighbours.
///
/// The columns in which any of nodes[1..3] has a character are laid out along a
/// single index, a constrained 1-D DP array is built over the 3-way HMM states,
/// a path is sampled from it, and the alignment in P is rebuilt from that path.
/// The three branches around nodes[0] are then flagged as changed.
///
/// @param P      data partition to modify; must have a variable alignment.
/// @param nodes  nodes[0] is the node to resample; nodes[1..3] are its neighbours.
/// @return the DP array that the path was sampled from.
boost::shared_ptr<DParrayConstrained> sample_node_base(data_partition& P,const vector<int>& nodes)
{
    default_timer_stack.push_timer("alignment::DP1/3-way");

    const Tree& T = *P.T;

    assert(P.variable_alignment());

    // Work from a snapshot, since *P.A is overwritten below.
    alignment old = *P.A;

    /*------------- Compute sequence properties --------------*/
    const int n0 = nodes[0];
    const int n1 = nodes[1];
    const int n2 = nodes[2];
    const int n3 = nodes[3];

    vector<int> columns = getorder(old,n0,n1,n2,n3);

    // Find sub-alignments and sequences: the columns in which each neighbour
    // (and any neighbour at all) has a character.
    vector<int> seq1;
    vector<int> seq2;
    vector<int> seq3;
    vector<int> seq123;
    for(int idx = 0; idx < columns.size(); idx++)
    {
        const int column = columns[idx];

        const bool present1 = not old.gap(column,n1);
        const bool present2 = not old.gap(column,n2);
        const bool present3 = not old.gap(column,n3);

        if (present1) seq1.push_back(column);
        if (present2) seq2.push_back(column);
        if (present3) seq3.push_back(column);
        if (present1 or present2 or present3) seq123.push_back(column);
    }

    // Map columns with n1, n2 or n3 to a single index 'c':
    // icol[c], jcol[c], kcol[c] = number of characters of n1, n2, n3 seen in
    // the first c columns of seq123.
    vector<int> icol(seq123.size()+1, 0);
    vector<int> jcol(seq123.size()+1, 0);
    vector<int> kcol(seq123.size()+1, 0);
    {
        int seen1 = 0;
        int seen2 = 0;
        int seen3 = 0;
        for(int c = 1; c < seq123.size()+1; c++)
        {
            const int column = seq123[c-1];

            if (not old.gap(column,n1)) seen1++;
            if (not old.gap(column,n2)) seen2++;
            if (not old.gap(column,n3)) seen3++;

            icol[c] = seen1;
            jcol[c] = seen2;
            kcol[c] = seen3;
        }
    }

    /*-------------- Create alignment matrices ---------------*/

    // Cache which states emit which sequences: bit 0 is set when the state
    // emits in any of the three neighbour sequences.
    vector<int> state_emit(nstates+1, 0);
    for(int S = 0; S < state_emit.size(); S++)
        if (di(S) or dj(S) or dk(S))
            state_emit[S] = (1<<0);

    // The three branches connecting nodes[0] to its neighbours.
    vector<int> branches;
    for(int i = 1; i < nodes.size(); i++)
        branches.push_back( T.branch(nodes[0],nodes[i]) );

    const Matrix Q = createQ(P.branch_HMMs,branches);
    vector<double> start_P = get_start_P(P.branch_HMMs,branches);

    // Actually create the Matrices & Chain
    boost::shared_ptr<DParrayConstrained>
        Matrices( new DParrayConstrained(seq123.size(),state_emit,start_P,Q, P.get_beta()) );

    // Determine which states are allowed to match (c2)
    for(int c2 = 0; c2 < Matrices->size(); c2++)
    {
        const int i2 = icol[c2];
        const int j2 = jcol[c2];
        const int k2 = kcol[c2];

        Matrices->states(c2).reserve(Matrices->nstates());

        for(int n = 0; n < Matrices->nstates(); n++)
        {
            const int S2 = Matrices->order(n);

            //---------- Get (i1,j1,k1) ----------
            const int i1 = di(S2) ? i2-1 : i2;
            const int j1 = dj(S2) ? j2-1 : j2;
            const int k1 = dk(S2) ? k2-1 : k2;

            //------ Get c1, check if valid ------
            // Everything is allowed at c2==0.  Otherwise the state must either
            // consume nothing, or consume exactly what column c2 added.
            // (The c2-1 lookups are only evaluated when c2 > 0.)
            bool allowed = (c2 == 0);
            if (not allowed)
                allowed = (i1 == i2 and j1 == j2 and k1 == k2);
            if (not allowed)
                allowed = (i1 == icol[c2-1] and j1 == jcol[c2-1] and k1 == kcol[c2-1]);

            if (allowed)
                Matrices->states(c2).push_back(S2);
            // otherwise this state is not allowed here
        }
    }

    /*------------------ Compute the DP matrix ---------------------*/

    // Matrices.prune(); prune is broken!

    Matrices->forward();

    //------------- Sample a path from the matrix -------------------//
    vector<int> path_g = Matrices->sample_path();
    vector<int> path = Matrices->ungeneralize(path_g);

    *P.A = construct(old,path,n0,n1,n2,n3,T,seq1,seq2,seq3);

    // Flag each of the three branches around nodes[0] as re-aligned.
    for(int i = 1; i < 4; i++)
    {
        const int b = T.branch(nodes[0],nodes[i]);
        P.note_alignment_changed_on_branch(b);
    }

#ifndef NDEBUG
    vector<int> path_new = get_path_3way(project(*P.A,n0,n1,n2,n3),0,1,2,3);
    vector<int> path_new2 = get_path_3way(*P.A,n0,n1,n2,n3);
    assert(path_new == path_new2); // <- current implementation probably guarantees this
                                   //    but its not a NECESSARY effect of the routine.

    // get the generalized paths - no sequential silent states that can loop
    vector<int> path_new_g = Matrices->generalize(path_new);
    assert(path_new_g == path_g);
    assert(valid(*P.A));
#endif

    default_timer_stack.pop_timer();
    return Matrices;
}
/// Resample the pairwise alignment along branch b by dynamic programming.
///
/// The two ends of the branch are node1 = target(b) and node2 = source(b).
/// A 2-D DP matrix is built from the per-column distributions on each side of
/// the branch, run forward subject to the alignment-constraint pins, and the
/// resulting path is used to rebuild the alignment stored in P.  The likelihood
/// cache is resized and invalidated along b, and b is flagged as changed.
///
/// @param P  data partition to modify; must have an indel model.
/// @param b  index of the branch whose alignment is resampled.
/// @return the DP matrix used, or a null pointer when either end of the branch
///         has no characters at all (in which case P is left untouched).
boost::shared_ptr<DPmatrixSimple> sample_alignment_base(data_partition& P,int b)
{
    assert(P.has_IModel());

    // Constraint status before the move; only reported in the debug block below.
    dynamic_bitset<> s1 = constraint_satisfied(P.alignment_constraint, *P.A);

    const Tree& T = *P.T;

    //FIXME - partitions
    data_partition P0 = P; // We COULD make this conditional... perhaps we should
    //FIXME - partitions
    alignment& old = *P0.A;

    const Matrix frequency = substitution::frequency_matrix(P.SModel());

    int node1 = T.branch(b).target();
    int node2 = T.branch(b).source();

    dynamic_bitset<> group1 = T.partition(node2,node1);

    // Find sub-alignments and sequences
    vector<int> seq1;
    vector<int> seq2;
    vector<int> seq12;

    for(int column=0; column<old.length(); column++)
    {
        const bool present1 = not old.gap(column,node1);
        const bool present2 = not old.gap(column,node2);

        if (present1)
            seq1.push_back(column);
        if (present2)
            seq2.push_back(column);
        // seq12 is the UNION: columns where at least one end has a character.
        // (The previous test lacked the second `not` and so selected columns
        // where node2 was gapped instead.)
        if (present1 or present2)
            seq12.push_back(column);
    }

    //FIXME - this makes the debug routines crash
    if (not seq1.size() or not seq2.size())
        return boost::shared_ptr<DPmatrixSimple>(); //NULL;

    /******** Precompute distributions at node2 from the 2 subtrees **********/
    distributions_t_local distributions = distributions_tree;
    if (not P.smodel_full_tree)
        distributions = distributions_star;

    vector< Matrix > dists1 = distributions(P0,seq1,b,true);
    vector< Matrix > dists2 = distributions(P0,seq2,b,false);

    // Emission mask per state: bit 0 and bit 1 each mark one of the two sequences.
    vector<int> state_emit(4,0);
    state_emit[0] |= (1<<1)|(1<<0);
    state_emit[1] |= (1<<1);
    state_emit[2] |= (1<<0);
    state_emit[3] |= 0;

    boost::shared_ptr<DPmatrixSimple>
        Matrices( new DPmatrixSimple(state_emit, P.branch_HMMs[b].start_pi(), P.branch_HMMs[b], P.beta[0],
                                     P.SModel().distribution(), dists1, dists2, frequency) );

    //------------------ Compute the DP matrix ---------------------//
    vector<int> path_old = get_path(old,node1,node2);
    vector<vector<int> > pins = get_pins(P.alignment_constraint,old,group1,~group1,seq1,seq2,seq12);

    vector<int> path = Matrices->forward(pins);

    // Drop the final path element before construct(); the debug block below
    // re-appends state 3 before comparing against get_path().
    path.erase(path.begin()+path.size()-1);

    *P.A = construct(old,path,node1,node2,T,seq1,seq2);

    // The alignment length may have changed, and cached values along b are stale.
    P.LC.set_length(P.A->length());
    P.LC.invalidate_branch_alignment(T,b);
    P.note_alignment_changed_on_branch(b);

#ifndef NDEBUG_DP
    assert(valid(*P.A));

    dynamic_bitset<> s2 = constraint_satisfied(P.alignment_constraint, *P.A);
    report_constraints(s1,s2);

    vector<int> path_new = get_path(*P.A, node1, node2);
    path.push_back(3);
    assert(path_new == path);
#endif

    return Matrices;
}
/// Convenience overload: evaluate Pr() for a data partition using the supplied
/// likelihood cache rather than one taken from the partition.
///
/// @param P   data partition supplying the alignment, MC, tree and substitution model.
/// @param LC  likelihood cache forwarded to the underlying overload.
/// @return whatever the five-argument Pr() overload returns.
efloat_t Pr(const data_partition& P,Likelihood_Cache& LC)
{
    const auto& A = *P.A;
    const auto& T = *P.T;

    return Pr(A, P.MC, T, LC, P.SModel());
}
/// Find the probabilities of each letter at the root, given the data at the nodes in 'group' vector<Matrix> get_column_likelihoods(const data_partition& P, const vector<int>& b, const vector<int>& req,const vector<int>& seq,int delta) { const alphabet& a = P.get_alphabet(); const alignment& A = *P.A; const Tree& T = *P.T; Likelihood_Cache& LC = P.LC; #ifndef NDEBUG subA_index_check_footprint(A,T); subA_index_check_regenerate(A,T); #endif //------ Check that all branches point to a 'root' node -----------// assert(b.size()); int root = T.directed_branch(b[0]).target(); for(int i=1;i<b.size();i++) assert(T.directed_branch(b[i]).target() == root); LC.root = root; ublas::matrix<int> index = subA_index_any(b,A,T,req,seq); IF_DEBUG(int n_br =) calculate_caches(P); #ifndef NDEBUG std::clog<<"get_column_likelihoods: Peeled on "<<n_br<<" branches.\n"; #endif vector<Matrix> L; L.reserve(A.length()+2); Matrix& S = LC.scratch(0); const int n_models = S.size1(); const int n_states = S.size2(); //Add the padding matrices { for(int i=0;i<S.size1();i++) for(int j=0;j<S.size2();j++) S(i,j) = 0; for(int i=0;i<delta;i++) L.push_back(S); } const vector<unsigned>& smap = P.SModel().state_letters(); for(int i=0;i<index.size1();i++) { for(int m=0;m<n_models;m++) { for(int s=0;s<n_states;s++) S(m,s) = 1; //-------------- Propagate and collect information at 'root' -----------// for(int j=0;j<b.size();j++) { int i0 = index(i,j); if (i0 != alphabet::gap) for(int s=0;s<n_states;s++) S(m,s) *= LC(i0,b[j])(m,s); } if (root < T.n_leaves()) { int rl = A.seq(root)[i]; if (a.is_letter_class(rl)) for(int s=0;s<n_states;s++) if (not a.matches(smap[s],rl)) S(m,s) = 0; } } L.push_back(S); } return L; }
/// Convenience overload: run calculate_caches() on the pieces of a data partition.
///
/// @param P  data partition supplying the alignment, MC, tree, likelihood cache
///           and substitution model.
/// @return the value returned by the five-argument calculate_caches() overload.
int calculate_caches(const data_partition& P)
{
    const auto& A = *P.A;
    const auto& T = *P.T;

    return calculate_caches(A, P.MC, T, P.LC, P.SModel());
}
/// Convenience overload: run calc_root_probability() on the pieces of a data partition.
///
/// @param P      data partition supplying the alignment, tree, likelihood cache
///               and substitution model.
/// @param rb     forwarded unchanged to the underlying overload.
/// @param index  forwarded unchanged to the underlying overload.
/// @return the value returned by the six-argument calc_root_probability() overload.
efloat_t calc_root_probability(const data_partition& P,const vector<int>& rb, const ublas::matrix<int>& index)
{
    const auto& A = *P.A;
    const auto& T = *P.T;

    return calc_root_probability(A, T, P.LC, P.SModel(), rb, index);
}
/// Resample the alignment around nodes[0] with a 2-D constrained DP matrix.
///
/// One DP axis is the sequence at nodes[1]; the other is the merged columns of
/// nodes[2] and nodes[3] (seq23).  The alignment constraints are turned into
/// pins, the matrix is run forward under those pins, a path is sampled, and the
/// alignment in P is rebuilt from it.
///
/// @param P      data partition to modify; must have a variable alignment.
/// @param nodes  nodes[0] is the centre; nodes[1..3] must each be connected to
///               it (asserted).
/// @return the DP matrix.  If the total path probability is <= 0 (including the
///         case where the constraints cannot be expressed as pins and forward
///         is never run), it is returned early and the alignment is left unchanged.
boost::shared_ptr<DPmatrixConstrained> tri_sample_alignment_base(data_partition& P,const vector<int>& nodes)
{
    const Tree& T = *P.T;
    alignment& A = *P.A;

    assert(P.variable_alignment());

    assert(T.is_connected(nodes[0],nodes[1]));
    assert(T.is_connected(nodes[0],nodes[2]));
    assert(T.is_connected(nodes[0],nodes[3]));

    const Matrix frequency = substitution::frequency_matrix(P.SModel());

    //------------- Compute sequence properties --------------//
    dynamic_bitset<> group1 = T.partition(nodes[0],nodes[1]);
    dynamic_bitset<> group2 = T.partition(nodes[0],nodes[2]);
    dynamic_bitset<> group3 = T.partition(nodes[0],nodes[3]);

    vector<int> columns = getorder(A,nodes[0],nodes[1],nodes[2],nodes[3]);

#ifndef NDEBUG
    // getorder(project(A,...)...) is not the same as getorder(A,...) because columns that are
    // in both project(A,...) and A have different columns numbers in each alignment, and
    // project(A,...) is shorter.
    // However, the NUMBER of columns should be the same.
    vector<int> columns2 = getorder(project(A,nodes[0],nodes[1],nodes[2],nodes[3]),0,1,2,3);
    assert(columns.size() == columns2.size());
#endif

    // Find sub-alignments and sequences
    vector<int> seq1;  seq1.reserve(A.length());
    vector<int> seq2;  seq2.reserve(A.length());
    vector<int> seq3;  seq3.reserve(A.length());
    vector<int> seq23; seq23.reserve(A.length());
    for(int i=0; i<columns.size(); i++)
    {
        int column = columns[i];
        if (not A.gap(column,nodes[1]))
            seq1.push_back(column);
        if (not A.gap(column,nodes[2]))
            seq2.push_back(column);
        if (not A.gap(column,nodes[3]))
            seq3.push_back(column);

        if (not A.gap(column,nodes[2]) or not A.gap(column,nodes[3]))
            seq23.push_back(column);
    }

    // Map columns with n2 or n3 to single index 'c':
    // jcol[c], kcol[c] = number of characters of n2, n3 in the first c columns of seq23.
    vector<int> jcol(seq23.size()+1);
    vector<int> kcol(seq23.size()+1);

    jcol[0] = 0;
    kcol[0] = 0;
    for(int c=1,j=0,k=0; c<seq23.size()+1; c++)
    {
        if (not A.gap(seq23[c-1],nodes[2]))
            j++;
        if (not A.gap(seq23[c-1],nodes[3]))
            k++;
        jcol[c] = j;
        kcol[c] = k;
    }

    // Precompute distributions at nodes[0]
    distributions_t distributions = distributions_tree;
    if (not P.smodel_full_tree)
        distributions = distributions_star;

    vector< Matrix > dists1 = distributions(P,seq1,nodes[0],group1);
    vector< Matrix > dists23 = distributions(P,seq23,nodes[0],group2|group3);

    //-------------- Create alignment matrices ---------------//

    vector<int> branches(3);
    for(int i=0; i<3; i++)
        branches[i] = T.branch(nodes[0],nodes[i+1]);

    const Matrix Q = createQ(P.branch_HMMs, branches);
    vector<double> start_P = get_start_P(P.branch_HMMs,branches);

    // Actually create the Matrices & Chain
    boost::shared_ptr<DPmatrixConstrained>
        Matrices(new DPmatrixConstrained(get_state_emit(), start_P, Q, P.beta[0],
                                         P.SModel().distribution(), dists1, dists23, frequency) );

    // Determine which states are allowed to match (,c2)
    for(int c2=0; c2<dists23.size()-1; c2++)
    {
        int j2 = jcol[c2];
        int k2 = kcol[c2];

        // The allowed states for column c2 are stored at slot c2+1, so that is
        // the vector whose capacity must be reserved.  (This used to reserve
        // slot c2, which is not the slot being filled in this iteration.)
        Matrices->states(c2+1).reserve(Matrices->nstates());

        for(int i=0; i<Matrices->nstates(); i++)
        {
            int S2 = Matrices->order(i);

            //---------- Get (,j1,k1) ----------
            int j1 = j2;
            if (dj(S2))
                j1--;

            int k1 = k2;
            if (dk(S2))
                k1--;

            //------ Get c1, check if valid ------
            // Everything is allowed at c2==0; otherwise the state must consume
            // nothing on the 23 axis, or exactly what column c2 added.
            if (c2==0
                or (j1 == j2 and k1 == k2)
                or (j1 == jcol[c2-1] and k1 == kcol[c2-1]) )
                Matrices->states(c2+1).push_back(S2);
            else
            { } // this state not allowed here
        }
    }

    //------------------ Compute the DP matrix ---------------------//

    // Matrices.prune(); prune is broken!

    vector<vector<int> > pins = get_pins(P.alignment_constraint,A,group1,group2 | group3,seq1,seq23,columns);

    // A single pin vector starting with -1 marks the case where the constraints
    // are currently met but cannot be met via DP matrix paths; forward is then skipped.
    const bool constraints_inexpressible = (pins.size() == 1 and pins[0][0] == -1);
    if (not constraints_inexpressible)
    {
        Matrices->forward_constrained(pins);
        if (Matrices->Pr_sum_all_paths() <= 0.0)
            std::cerr<<"Constraints give this choice probability 0"<<std::endl;
    }

    // Nothing to sample from: hand back the matrix without touching the alignment.
    if (Matrices->Pr_sum_all_paths() <= 0.0)
        return Matrices;

    vector<int> path_g = Matrices->sample_path();

    vector<int> path = Matrices->ungeneralize(path_g);

    A = construct(A,path,nodes[0],nodes[1],nodes[2],nodes[3],T,seq1,seq2,seq3);

    // Flag each of the three branches around nodes[0] as re-aligned.
    for(int i=1; i<4; i++)
    {
        int b = T.branch(nodes[0],nodes[i]);
        P.note_alignment_changed_on_branch(b);
    }

#ifndef NDEBUG_DP
    //--------------- Check alignment construction ------------------//
    vector<int> path_new = get_path_3way(project(A,nodes),0,1,2,3);

    vector<int> path_new2 = get_path_3way(A,nodes);
    assert(path_new == path_new2); // <- current implementation probably guarantees this
                                   //    but its not a NECESSARY effect of the routine.
                                   //    due to ordering stuff required in the path but
                                   //    not store in the alignment A.
    vector<int> path_new_g = Matrices->generalize(path_new);
    if (path_new_g != path_g)
    {
        std::clog<<"A' (reordered) = "<<project(A,nodes)<<endl;
        std::clog<<"A' = "<<A<<endl;
        std::abort();
    }

    assert(valid(A));
#endif

#ifndef NDEBUG_DP
    check_alignment(A,T,"sample_tri_base:out");
#else
    Matrices->clear();
#endif

    // The alignment length may have changed; invalidate the cache along the
    // branch between nodes[0] and nodes[1].
    P.LC.set_length(A.length());
    int b = T.branch(nodes[0],nodes[1]);
    P.LC.invalidate_branch_alignment(T, b);

    return Matrices;
}