/// Re-align the sequences at the two endpoints of branch `b` with a pairwise DP matrix.
///
/// The pre-existing alignment is read from a private copy of `P`; the rebuilt
/// alignment is written to `*P.A`, after which the likelihood cache length is
/// reset, the branch alignment is invalidated in the cache, and the partition is
/// told that the alignment on branch `b` changed.
///
/// @param P  data partition to update; must have an indel model (asserted).
/// @param b  branch whose two endpoint sequences are re-aligned.
/// @return   the DP matrix that produced the new path, or an empty shared_ptr
///           when either endpoint has no non-gap column (returned before the
///           alignment is rebuilt).
boost::shared_ptr<DPmatrixSimple> sample_alignment_base(data_partition& P,int b)
{
  assert(P.has_IModel());

  // Constraint status before the move; only consumed by the NDEBUG_DP check below.
  dynamic_bitset<> s1 = constraint_satisfied(P.alignment_constraint, *P.A);

  const Tree& T = *P.T;

  //FIXME - partitions
  data_partition P0 = P;       // We COULD make this conditional... perhaps we should
  //FIXME - partitions
  alignment& old = *P0.A;

  const Matrix frequency = substitution::frequency_matrix(P.SModel());

  int node1 = T.branch(b).target();
  int node2 = T.branch(b).source();

  dynamic_bitset<> group1 = T.partition(node2,node1);

  // Find sub-alignments and sequences
  vector<int> seq1;
  vector<int> seq2;
  vector<int> seq12;

  for(int column=0;column<old.length();column++)
  {
    const bool present1 = not old.gap(column,node1);
    const bool present2 = not old.gap(column,node2);

    if (present1)
      seq1.push_back(column);
    if (present2)
      seq2.push_back(column);

    // Union of the two: columns where at least one endpoint is present.
    // (The previous test `not gap(node1) or gap(node2)` admitted columns in which
    //  both endpoints are gaps and dropped columns in which only node2 is present.)
    if (present1 or present2)
      seq12.push_back(column);
  }

  //FIXME - this makes the debug routines crash
  if (not seq1.size() or not seq2.size())
    return boost::shared_ptr<DPmatrixSimple>(); //NULL;

  /******** Precompute distributions at node2 from the 2 subtrees **********/
  distributions_t_local distributions = distributions_tree;
  if (not P.smodel_full_tree)
    distributions = distributions_star;

  vector< Matrix > dists1 = distributions(P0,seq1,b,true);
  vector< Matrix > dists2 = distributions(P0,seq2,b,false);

  // Emission bit mask per state: state 0 sets both bits, states 1 and 2 set one
  // bit each, state 3 sets none (presumably one bit per sequence - TODO confirm).
  vector<int> state_emit(4,0);
  state_emit[0] |= (1<<1)|(1<<0);
  state_emit[1] |= (1<<1);
  state_emit[2] |= (1<<0);
  state_emit[3] |= 0;

  boost::shared_ptr<DPmatrixSimple>
    Matrices( new DPmatrixSimple(state_emit,
                                 P.branch_HMMs[b].start_pi(),
                                 P.branch_HMMs[b],
                                 P.beta[0],
                                 P.SModel().distribution(),
                                 dists1,
                                 dists2,
                                 frequency)
            );

  //------------------ Compute the DP matrix ---------------------//
  // NOTE(review): path_old is not referenced again in this function.
  vector<int> path_old = get_path(old,node1,node2);
  vector<vector<int> > pins = get_pins(P.alignment_constraint,old,group1,~group1,seq1,seq2,seq12);

  vector<int> path = Matrices->forward(pins);

  // Drop the final element of the path before rebuilding the alignment
  // (the debug check below re-appends state 3 in its place).
  path.erase(path.begin()+path.size()-1);

  *P.A = construct(old,path,node1,node2,T,seq1,seq2);
  P.LC.set_length(P.A->length());
  P.LC.invalidate_branch_alignment(T,b);
  P.note_alignment_changed_on_branch(b);

#ifndef NDEBUG_DP
  assert(valid(*P.A));

  dynamic_bitset<> s2 = constraint_satisfied(P.alignment_constraint, *P.A);
  report_constraints(s1,s2);

  // The path read back from the new alignment must equal the path we built it from.
  vector<int> path_new = get_path(*P.A, node1, node2);
  path.push_back(3);
  assert(path_new == path);
#endif

  return Matrices;
}
/// Convenience overload: unpacks the alignment, `MC`, tree and substitution model
/// from `P` and forwards them, together with the caller-supplied likelihood cache
/// `LC`, to the five-argument Pr() overload.
///
/// @param P   data partition whose members are forwarded.
/// @param LC  likelihood cache handed through to the underlying overload.
/// @return    whatever the underlying Pr() overload returns.
efloat_t Pr(const data_partition& P,Likelihood_Cache& LC)
{
  efloat_t result = Pr(*P.A,
                       P.MC,
                       *P.T,
                       LC,
                       P.SModel());
  return result;
}
/// Find the probabilities of each letter at the root, given the data at the nodes in 'group' vector<Matrix> get_column_likelihoods(const data_partition& P, const vector<int>& b, const vector<int>& req,const vector<int>& seq,int delta) { const alphabet& a = P.get_alphabet(); const alignment& A = *P.A; const Tree& T = *P.T; Likelihood_Cache& LC = P.LC; #ifndef NDEBUG subA_index_check_footprint(A,T); subA_index_check_regenerate(A,T); #endif //------ Check that all branches point to a 'root' node -----------// assert(b.size()); int root = T.directed_branch(b[0]).target(); for(int i=1;i<b.size();i++) assert(T.directed_branch(b[i]).target() == root); LC.root = root; ublas::matrix<int> index = subA_index_any(b,A,T,req,seq); IF_DEBUG(int n_br =) calculate_caches(P); #ifndef NDEBUG std::clog<<"get_column_likelihoods: Peeled on "<<n_br<<" branches.\n"; #endif vector<Matrix> L; L.reserve(A.length()+2); Matrix& S = LC.scratch(0); const int n_models = S.size1(); const int n_states = S.size2(); //Add the padding matrices { for(int i=0;i<S.size1();i++) for(int j=0;j<S.size2();j++) S(i,j) = 0; for(int i=0;i<delta;i++) L.push_back(S); } const vector<unsigned>& smap = P.SModel().state_letters(); for(int i=0;i<index.size1();i++) { for(int m=0;m<n_models;m++) { for(int s=0;s<n_states;s++) S(m,s) = 1; //-------------- Propagate and collect information at 'root' -----------// for(int j=0;j<b.size();j++) { int i0 = index(i,j); if (i0 != alphabet::gap) for(int s=0;s<n_states;s++) S(m,s) *= LC(i0,b[j])(m,s); } if (root < T.n_leaves()) { int rl = A.seq(root)[i]; if (a.is_letter_class(rl)) for(int s=0;s<n_states;s++) if (not a.matches(smap[s],rl)) S(m,s) = 0; } } L.push_back(S); } return L; }
/// Convenience overload: unpacks the alignment, `MC`, tree, likelihood cache and
/// substitution model from `P` and forwards them to the five-argument
/// calculate_caches() overload.
///
/// @param P  data partition whose members are forwarded.
/// @return   the integer returned by the underlying overload (reported by
///           get_column_likelihoods() as the number of branches peeled).
int calculate_caches(const data_partition& P)
{
  const int n_peeled = calculate_caches(*P.A,
                                        P.MC,
                                        *P.T,
                                        P.LC,
                                        P.SModel());
  return n_peeled;
}
/// Convenience overload: unpacks the alignment, tree, likelihood cache and
/// substitution model from `P` and forwards them, with `rb` and `index`, to the
/// six-argument calc_root_probability() overload.
///
/// @param P      data partition whose members are forwarded.
/// @param rb     passed through unchanged.
/// @param index  passed through unchanged.
/// @return       whatever the underlying overload returns.
efloat_t calc_root_probability(const data_partition& P,const vector<int>& rb, const ublas::matrix<int>& index)
{
  efloat_t result = calc_root_probability(*P.A,
                                          *P.T,
                                          P.LC,
                                          P.SModel(),
                                          rb,
                                          index);
  return result;
}
/// Re-align the three neighbours nodes[1..3] against the central node nodes[0]
/// with a constrained DP matrix.
///
/// Sequence 1 (nodes[1]) forms one dimension of the matrix; the merged columns of
/// nodes[2] and nodes[3] form the other. Only states consistent with the existing
/// relative order of nodes[2]/nodes[3] columns are permitted in each merged column.
/// On success the alignment in `P` is rebuilt from a sampled path, the three
/// branches are flagged as changed, and the likelihood cache is updated.
///
/// @param P      data partition to update; must have a variable alignment (asserted).
/// @param nodes  four node indices; nodes[0] must be connected to each of
///               nodes[1], nodes[2] and nodes[3] (asserted).
/// @return       the DP matrix. If the total path probability is <= 0 the matrix
///               is returned immediately and the alignment is left untouched.
boost::shared_ptr<DPmatrixConstrained> tri_sample_alignment_base(data_partition& P,const vector<int>& nodes)
{
  const Tree& T = *P.T;
  alignment& A = *P.A;

  assert(P.variable_alignment());

  assert(T.is_connected(nodes[0],nodes[1]));
  assert(T.is_connected(nodes[0],nodes[2]));
  assert(T.is_connected(nodes[0],nodes[3]));

  const Matrix frequency = substitution::frequency_matrix(P.SModel());

  // std::cerr<<"A = "<<A<<endl;

  //------------- Compute sequence properties --------------//
  dynamic_bitset<> group1 = T.partition(nodes[0],nodes[1]);
  dynamic_bitset<> group2 = T.partition(nodes[0],nodes[2]);
  dynamic_bitset<> group3 = T.partition(nodes[0],nodes[3]);

  //  std::clog<<"n0 = "<<nodes[0]<<" n1 = "<<nodes[1]<<" n2 = "<<nodes[2]<<" n3 = "<<nodes[3]<<std::endl;
  //  std::clog<<"A (reordered) = "<<project(A,nodes[0],nodes[1],nodes[2],nodes[3])<<endl;

  vector<int> columns = getorder(A,nodes[0],nodes[1],nodes[2],nodes[3]);

#ifndef NDEBUG
  // getorder(project(A,...)...) is not the same as getorder(A,...) because columns that are
  // in both project(A,...) and A have different columns numbers in each alignment, and
  // project(A,...) is shorter.
  // However, the NUMBER of columns should be the same.
  vector<int> columns2 = getorder(project(A,nodes[0],nodes[1],nodes[2],nodes[3]),0,1,2,3);
  assert(columns.size() == columns2.size());
#endif

  // Find sub-alignments and sequences
  vector<int> seq1; seq1.reserve(A.length());
  vector<int> seq2; seq2.reserve(A.length());
  vector<int> seq3; seq3.reserve(A.length());
  vector<int> seq23; seq23.reserve(A.length());
  for(int i=0;i<columns.size();i++)
  {
    int column = columns[i];
    if (not A.gap(column,nodes[1]))
      seq1.push_back(column);
    if (not A.gap(column,nodes[2]))
      seq2.push_back(column);
    if (not A.gap(column,nodes[3]))
      seq3.push_back(column);

    // seq23 is the union of the nodes[2] and nodes[3] columns.
    if (not A.gap(column,nodes[2]) or not A.gap(column,nodes[3]))
      seq23.push_back(column);
  }

  // Map columns with n2 or n3 to single index 'c':
  // jcol[c] / kcol[c] count how many nodes[2] / nodes[3] characters occur in the
  // first c entries of seq23.
  vector<int> jcol(seq23.size()+1);
  vector<int> kcol(seq23.size()+1);

  jcol[0] = 0;
  kcol[0] = 0;
  for(int c=1,j=0,k=0;c<seq23.size()+1;c++)
  {
    if (not A.gap(seq23[c-1],nodes[2]))
      j++;
    if (not A.gap(seq23[c-1],nodes[3]))
      k++;
    jcol[c] = j;
    kcol[c] = k;
  }

  // Precompute distributions at nodes[0]
  distributions_t distributions = distributions_tree;
  if (not P.smodel_full_tree)
    distributions = distributions_star;

  vector< Matrix > dists1 = distributions(P,seq1,nodes[0],group1);
  vector< Matrix > dists23 = distributions(P,seq23,nodes[0],group2|group3);

  //-------------- Create alignment matrices ---------------//

  vector<int> branches(3);
  for(int i=0;i<3;i++)
    branches[i] = T.branch(nodes[0],nodes[i+1]);

  const Matrix Q = createQ(P.branch_HMMs, branches);
  vector<double> start_P = get_start_P(P.branch_HMMs,branches);

  // Actually create the Matrices & Chain
  boost::shared_ptr<DPmatrixConstrained>
    Matrices(new DPmatrixConstrained(get_state_emit(),
                                     start_P,
                                     Q,
                                     P.beta[0],
                                     P.SModel().distribution(),
                                     dists1,
                                     dists23,
                                     frequency)
            );

  // Determine which states are allowed to match (,c2)
  for(int c2=0;c2<dists23.size()-1;c2++)
  {
    int j2 = jcol[c2];
    int k2 = kcol[c2];

    // Reserve on the vector that is actually filled below. The reservation used
    // to be made on states(c2), which never helped the push_back calls on
    // states(c2+1).
    // NOTE(review): the +1 offset is taken from the existing push_back; confirm
    // that it matches the padding convention of `distributions`.
    Matrices->states(c2+1).reserve(Matrices->nstates());

    for(int i=0;i<Matrices->nstates();i++)
    {
      int S2 = Matrices->order(i);

      //---------- Get (,j1,k1) ----------
      int j1 = j2;
      if (dj(S2))
        j1--;

      int k1 = k2;
      if (dk(S2))
        k1--;

      //------ Get c1, check if valid ------
      if (c2==0
          or (j1 == j2 and k1 == k2)
          or (j1 == jcol[c2-1] and k1 == kcol[c2-1]) )
        Matrices->states(c2+1).push_back(S2);
      else
        { } // this state not allowed here
    }
  }

  //------------------ Compute the DP matrix ---------------------//

  //   Matrices.prune();  prune is broken!

  //  vector<int> path_old = get_path_3way(project(A,nodes[0],nodes[1],nodes[2],nodes[3]),0,1,2,3);
  //  vector<int> path_old_g = Matrices.generalize(path_old);

  //  vector<int> path_g = Matrices.forward(P.features,(int)P.constants[0],path_old_g);

  vector<vector<int> > pins = get_pins(P.alignment_constraint,A,group1,group2 | group3,seq1,seq23,columns);

  // if the constraints are currently met but cannot be met
  if (pins.size() == 1 and pins[0][0] == -1)
    ; //std::cerr<<"Constraints cannot be expressed in terms of DP matrix paths!"<<std::endl;
  else
  {
    Matrices->forward_constrained(pins);
    if (Matrices->Pr_sum_all_paths() <= 0.0)
      std::cerr<<"Constraints give this choice probability 0"<<std::endl;
  }

  // Nothing to sample from: hand back the matrix without touching the alignment.
  if (Matrices->Pr_sum_all_paths() <= 0.0)
    return Matrices;

  vector<int> path_g = Matrices->sample_path();

  vector<int> path = Matrices->ungeneralize(path_g);

  A = construct(A,path,nodes[0],nodes[1],nodes[2],nodes[3],T,seq1,seq2,seq3);

  for(int i=1;i<4;i++)
  {
    int b = T.branch(nodes[0],nodes[i]);
    P.note_alignment_changed_on_branch(b);
  }

#ifndef NDEBUG_DP
  //--------------- Check alignment construction ------------------//
  vector<int> path_new = get_path_3way(project(A,nodes),0,1,2,3);

  vector<int> path_new2 = get_path_3way(A,nodes);
  assert(path_new == path_new2); // <- current implementation probably guarantees this
                                 //    but its not a NECESSARY effect of the routine.
                                 //    due to ordering stuff required in the path but
                                 //    not store in the alignment A.

  vector<int> path_new_g = Matrices->generalize(path_new);
  if (path_new_g != path_g)
  {
    std::clog<<"A' (reordered) = "<<project(A,nodes)<<endl;
    std::clog<<"A' = "<<A<<endl;
    std::abort();
  }

  assert(valid(A));
#endif

  //  std::cerr<<"[tri]bandwidth = "<<bandwidth(Matrices,path_g)<<std::endl;

  //  std::cerr<<"[tri]bandwidth2 = "<<bandwidth2(Matrices,path_g)<<std::endl;

#ifndef NDEBUG_DP
  check_alignment(A,T,"sample_tri_base:out");
#else
  Matrices->clear();
#endif

  P.LC.set_length(A.length());
  int b = T.branch(nodes[0],nodes[1]);
  P.LC.invalidate_branch_alignment(T, b);

  return Matrices;
}