/************************************************************************ Expands the single multi path into all possible sequence variants. Since this turns out to be the time-limiting process for long de nvoo, this is implemented using a quick branch and bound that works on edge variant indices. The SeqPaths are created only for the final set of sequences. *************************************************************************/ void PrmGraph::fast_expand_multi_path_for_combo_scores( AllScoreModels *model, const MultiPath& multi_path, vector<SeqPath>& seq_paths, score_t min_score, score_t forbidden_pair_penalty, int max_num_paths) { const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos(); vector<score_t> max_attainable_scores; const int num_edges = multi_path.edge_idxs.size(); max_attainable_scores.resize(num_edges+1,0); int num_path_variants=1; int e; for (e=num_edges-1; e<num_edges; e++) { const int e_idx = multi_path.edge_idxs[e]; const MultiEdge& edge = multi_edges[e_idx]; num_path_variants *= edge.variant_ptrs.size(); } if (num_path_variants == 0) { cout << "Error: had an edge with 0 variants!" <<endl; exit(1); } // re-check the max attainable scores, this time compute all combo scores along the path // this will give a more accurate max attainable score, and also allow quick computation // of the expanded scores max_attainable_scores.clear(); max_attainable_scores.resize(num_edges+1,0); vector<score_t> max_node_scores; max_node_scores.resize(num_edges+1,NEG_INF); for (e=0; e<=num_edges; e++) { const int n_edge_idx = (e>0 ? multi_path.edge_idxs[e-1] : -1); const int c_edge_idx = (e<num_edges ? multi_path.edge_idxs[e] : -1); const int node_idx = (n_edge_idx>=0 ? multi_edges[n_edge_idx].c_idx : multi_edges[c_edge_idx].n_idx); const int num_n_vars = (e>0 ? multi_edges[n_edge_idx].variant_ptrs.size() : 1); const int num_c_vars = (e<num_edges ? multi_edges[c_edge_idx].variant_ptrs.size() : 1); int j,k; for (j=0; j<num_n_vars; j++) for (k=0; k<num_c_vars; k++) { score_t combo_score = model->get_node_combo_score(this,node_idx,n_edge_idx,j,c_edge_idx,k); if (max_node_scores[e]<combo_score) max_node_scores[e]=combo_score; } } max_attainable_scores[num_edges]=max_node_scores[num_edges]; for (e=num_edges-1; e>=0; e--) { const int e_idx = multi_path.edge_idxs[e]; const MultiEdge& edge = multi_edges[e_idx]; max_attainable_scores[e]= max_attainable_scores[e+1] + edge.max_variant_score + max_node_scores[e]; } score_t max_path_score = max_attainable_scores[0] - multi_path.num_forbidden_nodes * forbidden_pair_penalty; score_t min_delta_allowed = min_score - max_path_score; if (min_delta_allowed>0) return; // in this expansion method, all edges are stored (not only ones with more than one variant) // perform expansion using a heap and condensed path representation vector<int> var_positions; // holds the edge idxs vector<int> num_vars; vector<int> var_edge_positions; vector< vector< score_t > > var_scores; var_scores.clear(); var_positions.clear(); num_vars.clear(); int num_aas_in_multipath = 0; for (e=0; e<num_edges; e++) { const int e_idx = multi_path.edge_idxs[e]; const MultiEdge& edge = multi_edges[e_idx]; const int num_vars_in_edge = edge.variant_ptrs.size(); vector<score_t> scores; num_aas_in_multipath+=edge.num_aa; int i; for (i=0; i<num_vars_in_edge; i++) scores.push_back(edge.variant_scores[i]); var_scores.push_back(scores); var_positions.push_back(e); num_vars.push_back(num_vars_in_edge); var_edge_positions.push_back(num_aas_in_multipath-edge.num_aa); } // create the SeqPaths from the edge_combos... const int num_positions = num_aas_in_multipath+1; SeqPath template_path; // holds common elements to all paths template_path.n_term_aa = multi_path.n_term_aa; template_path.c_term_aa = multi_path.c_term_aa; template_path.n_term_mass = multi_path.n_term_mass; template_path.c_term_mass = multi_path.c_term_mass; template_path.charge = charge; template_path.multi_path_rank = multi_path.original_rank; template_path.path_score = 0; template_path.prm_ptr = (PrmGraph *)this; template_path.positions.clear(); template_path.positions.resize(num_positions); template_path.num_forbidden_nodes = multi_path.num_forbidden_nodes; template_path.path_score -= template_path.num_forbidden_nodes * forbidden_pair_penalty; int pos_idx=0; for (e=0; e<multi_path.edge_idxs.size(); e++) { const int e_idx = multi_path.edge_idxs[e]; const MultiEdge& edge = get_multi_edge(e_idx); PathPos& pos = template_path.positions[pos_idx++]; pos.breakage = edge.n_break; pos.edge_idx = e_idx; pos.mass = edge.n_break->mass; pos.node_idx = edge.n_idx; int *variant_ptr = edge.variant_ptrs[0]; const int num_aa = *variant_ptr++; int *aas = variant_ptr; if (edge.variant_ptrs.size() == 1) { pos.variant_ptr = edge.variant_ptrs[0]; pos.edge_variant_score = edge.variant_scores[0]; pos.aa = aas[0]; } if (edge.num_aa == 1) continue; int j; for (j=1; j<edge.num_aa; j++) { PathPos& pos = template_path.positions[pos_idx++]; pos.breakage = NULL; pos.aa = aas[j]; } } if (pos_idx != num_aas_in_multipath) { cout << "Error: mismatches between positions and multipath length!" << endl; exit(1); } PathPos& last_pos = template_path.positions[num_positions-1]; const int last_e_idx = multi_path.edge_idxs[e-1]; const MultiEdge& last_edge = get_multi_edge(last_e_idx); last_pos.breakage = last_edge.c_break; last_pos.edge_idx =-1; last_pos.mass = last_edge.c_break->mass; last_pos.node_idx = last_edge.c_idx; const int num_multi_var_edges = var_positions.size(); vector<var_combo> combo_heap; vector<int> var_idxs; var_idxs.resize(num_multi_var_edges,0); const int last_var_pos = num_multi_var_edges-1; const int last_var_val = num_vars[last_var_pos]; const score_t needed_var_score = min_score; // this is only the penalty for forbidden pairs while (1) { score_t idxs_score = 0; int k; for (k=0; k<var_idxs.size(); k++) idxs_score += var_scores[k][var_idxs[k]]; // this is the score for the edge variants (usually 0) const vector<PathPos>& positions = template_path.positions; score_t path_score = idxs_score + template_path.path_score; // compute the node scores based on the edge variants // N-term node score vector<score_t> node_scores; node_scores.resize(var_idxs.size()+1,NEG_INF); node_scores[0] = model->get_node_combo_score(this,positions[0].node_idx,-1,0, positions[0].edge_idx,var_idxs[0]); path_score += node_scores[0]; // middle nodes score int prev_edge_idx = positions[0].edge_idx; for (k=1; k<var_idxs.size(); k++) { const int p=var_edge_positions[k]; const PathPos& pos = positions[p]; node_scores[k] = model->get_node_combo_score(this,pos.node_idx,prev_edge_idx,var_idxs[k-1], pos.edge_idx,var_idxs[k]); path_score += node_scores[k]; prev_edge_idx = pos.edge_idx; } // C-term node score node_scores[k] = model->get_node_combo_score(this,last_pos.node_idx,prev_edge_idx,var_idxs[k-1],-1,0); path_score += node_scores[k]; if (path_score>=needed_var_score) { if (combo_heap.size()<max_num_paths) { var_combo v; v.path_score = path_score; v.var_idxs = var_idxs; v.node_scores = node_scores; combo_heap.push_back(v); if (combo_heap.size()== max_num_paths) make_heap(combo_heap.begin(),combo_heap.end()); } else { const score_t score = path_score; if (score>combo_heap[0].path_score) { pop_heap(combo_heap.begin(),combo_heap.end()); var_combo& combo = combo_heap[max_num_paths-1]; combo.path_score = path_score; combo.var_idxs = var_idxs; combo.node_scores = node_scores; push_heap(combo_heap.begin(),combo_heap.end()); } } } int j=0; while (j<num_multi_var_edges) { var_idxs[j]++; if (var_idxs[j]==num_vars[j]) { var_idxs[j++]=0; } else break; } if (j==num_multi_var_edges) break; } seq_paths.clear(); seq_paths.resize(combo_heap.size(),template_path); int i; for (i=0; i<combo_heap.size(); i++) { const var_combo& combo = combo_heap[i]; SeqPath& seq_path = seq_paths[i]; // fill in info that changes with multi-variant edges int j; for (j=0; j<num_multi_var_edges; j++) { const int var_pos = var_positions[j]; const int e_idx = multi_path.edge_idxs[var_pos]; const MultiEdge& edge = get_multi_edge(e_idx); const int pos_idx = var_edge_positions[j]; if (seq_path.positions[pos_idx].edge_idx != e_idx) { cout << "POS: " << pos_idx << endl; cout << "Error: mismatch in pos_idx and e_idx of a multipath!" << endl; cout << "looking for " << e_idx << endl; cout << "edge idxs:" << endl; int k; for (k=0; k<seq_path.positions.size(); k++) cout << k << "\t" << seq_path.positions[k].edge_idx << "\tnidx: " << seq_path.positions[k].node_idx << endl; cout << endl; multi_path.print(config); this->print(); exit(1); } PathPos& pos = seq_path.positions[pos_idx]; const int variant_idx = combo.var_idxs[j]; int *variant_ptr = edge.variant_ptrs[variant_idx]; const int num_aa = *variant_ptr++; int *aas = variant_ptr; if (num_aa != edge.num_aa) { cout << "Error: edge and variant mixup!" << endl; exit(1); } pos.node_score = combo.node_scores[j]; pos.edge_variant_score = edge.variant_scores[variant_idx]; pos.variant_ptr = edge.variant_ptrs[variant_idx]; pos.aa = aas[0]; if (edge.num_aa == 1) continue; int k; for (k=1; k<edge.num_aa; k++) { PathPos& pos = seq_path.positions[pos_idx+k]; pos.aa = aas[k]; } } seq_path.positions[seq_path.positions.size()-1].node_score = combo.node_scores[num_multi_var_edges]; seq_path.path_score=combo.path_score; float pen = seq_path.adjust_complete_sequence_penalty(this); seq_path.make_seq_str(config); } }
/************************************************************************ Expands the single multi path into all possible sequence variants. Since this turns out to be the time-limiting process for long de nvoo, this is implemented using a quick branch and bound that works on edge variant indices. The SeqPaths are created only for the final set of sequences. *************************************************************************/ void PrmGraph::fast_expand_multi_path(const MultiPath& multi_path, vector<SeqPath>& seq_paths, score_t min_score, score_t forbidden_pair_penalty, int max_num_paths) const { const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos(); vector<score_t> max_attainable_scores; const int num_edges = multi_path.edge_idxs.size(); max_attainable_scores.resize(num_edges+1,0); int num_path_variants=1; int e; for (e=num_edges-1; e>=0; e--) { const int e_idx = multi_path.edge_idxs[e]; const MultiEdge& edge = multi_edges[e_idx]; max_attainable_scores[e]= max_attainable_scores[e+1] + edge.max_variant_score + edge.c_break->score; num_path_variants *= edge.variant_ptrs.size(); } const score_t max_path_score = multi_edges[multi_path.edge_idxs[0]].n_break->score + max_attainable_scores[0]; const score_t min_delta_allowed = min_score - max_path_score; if (min_delta_allowed>0) return; // this multipath won't help much if (num_path_variants == 0) { cout << "Error: had an edge with 0 variants!" <<endl; exit(1); } // perform expansion using a heap and condensed path representation vector<int> var_positions; // holds the edge idxs vector<int> num_vars; vector<int> var_edge_positions; vector< vector< score_t > > var_scores; var_scores.clear(); var_positions.clear(); num_vars.clear(); int num_aas_in_multipath = 0; for (e=0; e<num_edges; e++) { const int e_idx = multi_path.edge_idxs[e]; const MultiEdge& edge = multi_edges[e_idx]; const int num_vars_in_edge = edge.variant_ptrs.size(); vector<score_t> scores; num_aas_in_multipath+=edge.num_aa; if (num_vars_in_edge>1) { int i; for (i=0; i<num_vars_in_edge; i++) scores.push_back(edge.variant_scores[i]); var_scores.push_back(scores); var_positions.push_back(e); num_vars.push_back(num_vars_in_edge); var_edge_positions.push_back(num_aas_in_multipath-edge.num_aa); // cout << e << "\t" << e_idx << "\t" << num_vars_in_edge << "\t" << edge.num_aa << endl; } } // create the SeqPaths from the edge_combos... const int num_positions = num_aas_in_multipath+1; SeqPath template_path; // holds common elements to all paths template_path.n_term_aa = multi_path.n_term_aa; template_path.c_term_aa = multi_path.c_term_aa; template_path.n_term_mass = multi_path.n_term_mass; template_path.c_term_mass = multi_path.c_term_mass; template_path.multi_path_rank = multi_path.original_rank; template_path.path_score = 0; template_path.prm_ptr = (PrmGraph *)this; template_path.positions.clear(); template_path.positions.resize(num_positions); template_path.num_forbidden_nodes = multi_path.num_forbidden_nodes; template_path.path_score -= template_path.num_forbidden_nodes * forbidden_pair_penalty; int pos_idx=0; for (e=0; e<multi_path.edge_idxs.size(); e++) { const int e_idx = multi_path.edge_idxs[e]; const MultiEdge& edge = get_multi_edge(e_idx); PathPos& pos = template_path.positions[pos_idx++]; pos.breakage = edge.n_break; pos.edge_idx = e_idx; pos.mass = edge.n_break->mass; pos.node_score = edge.n_break->score; pos.node_idx = edge.n_idx; template_path.path_score += pos.node_score; int *variant_ptr = edge.variant_ptrs[0]; const int num_aa = *variant_ptr++; int *aas = variant_ptr; if (edge.variant_ptrs.size() == 1) { pos.edge_variant_score = edge.variant_scores[0]; pos.aa = aas[0]; template_path.path_score += pos.edge_variant_score; } if (edge.num_aa == 1) continue; int j; for (j=1; j<edge.num_aa; j++) { PathPos& pos = template_path.positions[pos_idx++]; pos.breakage = NULL; pos.aa = aas[j]; } } if (pos_idx != num_aas_in_multipath) { cout << "Error: mismatches between positions and multipath length!" << endl; exit(1); } PathPos& last_pos = template_path.positions[num_positions-1]; const int last_e_idx = multi_path.edge_idxs[e-1]; const MultiEdge& last_edge = get_multi_edge(last_e_idx); last_pos.breakage = last_edge.c_break; last_pos.edge_idx =-1; last_pos.mass = last_edge.c_break->mass; last_pos.node_idx = last_edge.c_idx; last_pos.node_score = last_edge.c_break->score; template_path.path_score += last_pos.node_score; const int num_multi_var_edges = var_positions.size(); if (num_multi_var_edges == 0) { seq_paths.resize(1); seq_paths[0]=template_path; return; } vector<var_combo> combo_heap; vector<int> idxs; idxs.resize(num_multi_var_edges,0); const int last_var_pos = num_multi_var_edges-1; const int last_var_val = num_vars[last_var_pos]; const score_t needed_var_score = min_score - template_path.path_score; while (1) { score_t idxs_score = 0; int k; for (k=0; k<idxs.size(); k++) idxs_score += var_scores[k][idxs[k]]; if (idxs_score>=needed_var_score) { if (combo_heap.size()<max_num_paths) { var_combo v; v.path_score = template_path.path_score + idxs_score; v.var_idxs = idxs; combo_heap.push_back(v); if (combo_heap.size()== max_num_paths) make_heap(combo_heap.begin(),combo_heap.end()); } else { const score_t score = template_path.path_score + idxs_score; if (score>combo_heap[0].path_score) { pop_heap(combo_heap.begin(),combo_heap.end()); combo_heap[max_num_paths-1].path_score = score; combo_heap[max_num_paths-1].var_idxs = idxs; push_heap(combo_heap.begin(),combo_heap.end()); } } } int j=0; while (j<num_multi_var_edges) { idxs[j]++; if (idxs[j]==num_vars[j]) { idxs[j++]=0; } else break; } if (j==num_multi_var_edges) break; } seq_paths.clear(); seq_paths.resize(combo_heap.size(),template_path); int i; for (i=0; i<combo_heap.size(); i++) { const var_combo& combo = combo_heap[i]; SeqPath& seq_path = seq_paths[i]; // fill in info that changes with multi-variant edges int j; for (j=0; j<num_multi_var_edges; j++) { const int var_pos = var_positions[j]; const int e_idx = multi_path.edge_idxs[var_pos]; const MultiEdge& edge = get_multi_edge(e_idx); const int pos_idx = var_edge_positions[j]; if (seq_path.positions[pos_idx].edge_idx != e_idx) { cout << "POS: " << pos_idx << endl; cout << "Error: mismatch in pos_idx and e_idx of a multipath!" << endl; cout << "looking for " << e_idx << endl; cout << "edge idxs:" << endl; int k; for (k=0; k<seq_path.positions.size(); k++) cout << k << "\t" << seq_path.positions[k].edge_idx << "\tnidx: " << seq_path.positions[k].node_idx << endl; cout << endl; multi_path.print(config); this->print(); exit(1); } PathPos& pos = seq_path.positions[pos_idx]; const int variant_idx = combo.var_idxs[j]; int *variant_ptr = edge.variant_ptrs[variant_idx]; const int num_aa = *variant_ptr++; int *aas = variant_ptr; if (num_aa != edge.num_aa) { cout << "Error: edge and variant mixup!" << endl; exit(1); } pos.edge_variant_score = edge.variant_scores[variant_idx]; pos.aa = aas[0]; // pos.edge_var_idx = variant_idx; if (edge.num_aa == 1) continue; int k; for (k=1; k<edge.num_aa; k++) { PathPos& pos = seq_path.positions[pos_idx+k]; pos.aa = aas[k]; } } //seq_path.path_score = max_path_score + combo.path_delta; seq_path.path_score=combo.path_score; seq_path.make_seq_str(config); } }