Esempio n. 1
0
/************************************************************************

  Expands the single multi path into all possible sequence variants.
  Since this turns out to be the time-limiting process for long de nvoo,
  this is implemented using a quick branch and bound that works on
  edge variant indices. The SeqPaths are created only for the final
  set of sequences.

*************************************************************************/
void PrmGraph::fast_expand_multi_path_for_combo_scores(
									  AllScoreModels *model,
									  const MultiPath& multi_path, 
									  vector<SeqPath>& seq_paths,
									  score_t min_score,
									  score_t forbidden_pair_penalty,
									  int max_num_paths)
{
	const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos();
	vector<score_t> max_attainable_scores;

	const int num_edges = multi_path.edge_idxs.size();
	max_attainable_scores.resize(num_edges+1,0);
	
	int num_path_variants=1;
	int e;
	for (e=num_edges-1; e<num_edges; e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];
		num_path_variants *= edge.variant_ptrs.size();
	}


	if (num_path_variants == 0)
	{
		cout << "Error: had an edge with 0 variants!" <<endl;
		exit(1);
	}

	// re-check the max attainable scores, this time compute all combo scores along the path
	// this will give a more accurate max attainable score, and also allow quick computation
	// of the expanded scores

	max_attainable_scores.clear();
	max_attainable_scores.resize(num_edges+1,0);
	vector<score_t> max_node_scores;
	max_node_scores.resize(num_edges+1,NEG_INF);

	for (e=0; e<=num_edges; e++)
	{
		const int n_edge_idx = (e>0 ? multi_path.edge_idxs[e-1] : -1);
		const int c_edge_idx = (e<num_edges ? multi_path.edge_idxs[e] : -1);
		const int node_idx = (n_edge_idx>=0 ? multi_edges[n_edge_idx].c_idx : multi_edges[c_edge_idx].n_idx);
		const int num_n_vars = (e>0 ? multi_edges[n_edge_idx].variant_ptrs.size() : 1);
		const int num_c_vars = (e<num_edges ? multi_edges[c_edge_idx].variant_ptrs.size() : 1);

		int j,k;
		for (j=0; j<num_n_vars; j++)
			for (k=0; k<num_c_vars; k++)
			{
				score_t combo_score = model->get_node_combo_score(this,node_idx,n_edge_idx,j,c_edge_idx,k);
				if (max_node_scores[e]<combo_score)
					max_node_scores[e]=combo_score;
			}
	}

	max_attainable_scores[num_edges]=max_node_scores[num_edges];
	for (e=num_edges-1; e>=0; e--)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];

		max_attainable_scores[e]= max_attainable_scores[e+1] + 
			edge.max_variant_score + max_node_scores[e]; 
	}
	score_t max_path_score =  max_attainable_scores[0] - multi_path.num_forbidden_nodes * forbidden_pair_penalty;
	score_t min_delta_allowed = min_score - max_path_score;

	if (min_delta_allowed>0)
		return;

	// in this expansion method, all edges are stored (not only ones with more than one variant)
	// perform expansion using a heap and condensed path representation
	vector<int> var_positions;				   // holds the edge idxs
	vector<int> num_vars;
	vector<int> var_edge_positions;
	vector< vector< score_t > > var_scores;
	var_scores.clear();
	var_positions.clear();
	num_vars.clear();

	int num_aas_in_multipath = 0;
	for (e=0; e<num_edges; e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];
		const int num_vars_in_edge = edge.variant_ptrs.size();
		vector<score_t> scores;

		num_aas_in_multipath+=edge.num_aa;

		int i;
		for (i=0; i<num_vars_in_edge; i++)
			scores.push_back(edge.variant_scores[i]);
		
		var_scores.push_back(scores);
		var_positions.push_back(e);
		num_vars.push_back(num_vars_in_edge);
		var_edge_positions.push_back(num_aas_in_multipath-edge.num_aa);
	}

	// create the SeqPaths from the edge_combos...
	const int num_positions = num_aas_in_multipath+1;
	SeqPath template_path;				// holds common elements to all paths
	template_path.n_term_aa = multi_path.n_term_aa;
	template_path.c_term_aa = multi_path.c_term_aa;
	template_path.n_term_mass = multi_path.n_term_mass;
	template_path.c_term_mass = multi_path.c_term_mass;
	template_path.charge = charge;
	template_path.multi_path_rank = multi_path.original_rank;
	template_path.path_score = 0;
	template_path.prm_ptr = (PrmGraph *)this;
	template_path.positions.clear();
	template_path.positions.resize(num_positions);

	template_path.num_forbidden_nodes = multi_path.num_forbidden_nodes;
	template_path.path_score -= template_path.num_forbidden_nodes * forbidden_pair_penalty;

	int pos_idx=0;
	for (e=0; e<multi_path.edge_idxs.size(); e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = get_multi_edge(e_idx);
		PathPos& pos = template_path.positions[pos_idx++];

		pos.breakage = edge.n_break;
		pos.edge_idx = e_idx;
		pos.mass =  edge.n_break->mass;
		pos.node_idx = edge.n_idx;
		
		int *variant_ptr = edge.variant_ptrs[0];
		const int num_aa = *variant_ptr++;
		int *aas = variant_ptr;

		if (edge.variant_ptrs.size() == 1)
		{
			pos.variant_ptr = edge.variant_ptrs[0];
			pos.edge_variant_score = edge.variant_scores[0];
			pos.aa = aas[0];
		}

		if (edge.num_aa == 1)
			continue;
		
		int j;
		for (j=1; j<edge.num_aa; j++)
		{
			PathPos& pos = template_path.positions[pos_idx++];
			pos.breakage = NULL;
			pos.aa = aas[j];
		}
	}

	if (pos_idx != num_aas_in_multipath)
	{
		cout << "Error: mismatches between positions and multipath length!" << endl;
		exit(1);
	}
	
	PathPos& last_pos = template_path.positions[num_positions-1];
	const int last_e_idx = multi_path.edge_idxs[e-1];
	const MultiEdge& last_edge = get_multi_edge(last_e_idx);
	last_pos.breakage = last_edge.c_break;
	last_pos.edge_idx =-1;
	last_pos.mass = last_edge.c_break->mass;
	last_pos.node_idx = last_edge.c_idx;
	

	const int num_multi_var_edges = var_positions.size();
	
	vector<var_combo>      combo_heap;
	vector<int> var_idxs;
	var_idxs.resize(num_multi_var_edges,0);
	const int last_var_pos = num_multi_var_edges-1;
	const int last_var_val = num_vars[last_var_pos];
	const score_t needed_var_score = min_score; // this is only the penalty for forbidden pairs
	while (1)
	{
		score_t idxs_score = 0;
		int k;
		for (k=0; k<var_idxs.size(); k++)
			idxs_score += var_scores[k][var_idxs[k]]; // this is the score for the edge variants (usually 0) 
		
		const vector<PathPos>& positions = template_path.positions;
		score_t path_score = idxs_score + template_path.path_score;
		
		// compute the node scores based on the edge variants
		// N-term node score
		vector<score_t> node_scores;
		node_scores.resize(var_idxs.size()+1,NEG_INF);

		node_scores[0] = model->get_node_combo_score(this,positions[0].node_idx,-1,0,
													positions[0].edge_idx,var_idxs[0]);
		path_score += node_scores[0];
		// middle nodes score
		int prev_edge_idx = positions[0].edge_idx;
		for (k=1; k<var_idxs.size(); k++)
		{
			const int p=var_edge_positions[k];
			const PathPos& pos = positions[p];
			node_scores[k] = model->get_node_combo_score(this,pos.node_idx,prev_edge_idx,var_idxs[k-1],
													pos.edge_idx,var_idxs[k]);
			path_score += node_scores[k];
			prev_edge_idx = pos.edge_idx;
		}

		// C-term node score
		node_scores[k] = model->get_node_combo_score(this,last_pos.node_idx,prev_edge_idx,var_idxs[k-1],-1,0);

		path_score += node_scores[k];
		
		if (path_score>=needed_var_score)
		{
			if (combo_heap.size()<max_num_paths)
			{
				var_combo v;
				v.path_score  = path_score;
				v.var_idxs    = var_idxs;
				v.node_scores = node_scores;
				combo_heap.push_back(v);
				if (combo_heap.size()== max_num_paths)
					make_heap(combo_heap.begin(),combo_heap.end());
			}
			else
			{
				const score_t score = path_score;
				if (score>combo_heap[0].path_score)
				{
					pop_heap(combo_heap.begin(),combo_heap.end());
					var_combo& combo  = combo_heap[max_num_paths-1];
					combo.path_score  = path_score;
					combo.var_idxs    = var_idxs;
					combo.node_scores = node_scores;
					push_heap(combo_heap.begin(),combo_heap.end());
				}
			}	
		}

		int j=0;
		while (j<num_multi_var_edges)
		{
			var_idxs[j]++;
			if (var_idxs[j]==num_vars[j])
			{
				var_idxs[j++]=0;
			}
			else
				break;
		}
		if (j==num_multi_var_edges)
			break;

	}


	seq_paths.clear();
	seq_paths.resize(combo_heap.size(),template_path);

	int i;
	for (i=0; i<combo_heap.size(); i++)
	{
		const var_combo& combo  = combo_heap[i];
		SeqPath& seq_path = seq_paths[i];

		// fill in info that changes with multi-variant edges
		int j;
		for (j=0; j<num_multi_var_edges; j++)
		{
			const int var_pos = var_positions[j];
			const int e_idx = multi_path.edge_idxs[var_pos];
			const MultiEdge& edge = get_multi_edge(e_idx);

			const int pos_idx = var_edge_positions[j];
		
			if (seq_path.positions[pos_idx].edge_idx !=  e_idx)
			{
				cout << "POS: " << pos_idx << endl;
				cout << "Error: mismatch in pos_idx and e_idx of a multipath!" << endl;
				cout << "looking for " << e_idx << endl;
				cout << "edge idxs:" << endl;
				int k;
				for (k=0; k<seq_path.positions.size(); k++)
					cout << k << "\t" << seq_path.positions[k].edge_idx << "\tnidx: " <<
					seq_path.positions[k].node_idx << endl;
				cout << endl;
				multi_path.print(config);

				this->print();

				exit(1);
			}

			PathPos& pos = seq_path.positions[pos_idx];
			const int variant_idx = combo.var_idxs[j];
			int *variant_ptr = edge.variant_ptrs[variant_idx];
			const int num_aa = *variant_ptr++;
			int *aas = variant_ptr;

			if (num_aa != edge.num_aa)
			{
				cout << "Error: edge and variant mixup!" << endl;
				exit(1);
			}

			pos.node_score = combo.node_scores[j];
			pos.edge_variant_score = edge.variant_scores[variant_idx];
			pos.variant_ptr = edge.variant_ptrs[variant_idx];
			pos.aa = aas[0];
	
			if (edge.num_aa == 1)
				continue;

			int k;
			for (k=1; k<edge.num_aa; k++)
			{
				PathPos& pos = seq_path.positions[pos_idx+k];
				pos.aa = aas[k];
			}
		}

		seq_path.positions[seq_path.positions.size()-1].node_score = combo.node_scores[num_multi_var_edges];
		seq_path.path_score=combo.path_score;
		float pen = seq_path.adjust_complete_sequence_penalty(this);

		seq_path.make_seq_str(config);
	}
}
Esempio n. 2
0
/************************************************************************

  Expands the single multi path into all possible sequence variants.
  Since this turns out to be the time-limiting process for long de nvoo,
  this is implemented using a quick branch and bound that works on
  edge variant indices. The SeqPaths are created only for the final
  set of sequences.

*************************************************************************/
void PrmGraph::fast_expand_multi_path(const MultiPath& multi_path, 
									  vector<SeqPath>& seq_paths,
									  score_t min_score,
									  score_t forbidden_pair_penalty,
									  int max_num_paths) const
{
	const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos();
	vector<score_t> max_attainable_scores;

	const int num_edges = multi_path.edge_idxs.size();
	max_attainable_scores.resize(num_edges+1,0);
	
	int num_path_variants=1;
	int e;
	for (e=num_edges-1; e>=0; e--)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];

		max_attainable_scores[e]= max_attainable_scores[e+1] + 
			edge.max_variant_score + edge.c_break->score; 
		num_path_variants *= edge.variant_ptrs.size();
	}

	const score_t max_path_score =  multi_edges[multi_path.edge_idxs[0]].n_break->score + 
									max_attainable_scores[0];
	const score_t min_delta_allowed = min_score - max_path_score;

	if (min_delta_allowed>0)
		return; // this multipath won't help much

	if (num_path_variants == 0)
	{
		cout << "Error: had an edge with 0 variants!" <<endl;
		exit(1);
	}

	// perform expansion using a heap and condensed path representation
	vector<int> var_positions;				   // holds the edge idxs
	vector<int> num_vars;
	vector<int> var_edge_positions;
	vector< vector< score_t > > var_scores;
	var_scores.clear();
	var_positions.clear();
	num_vars.clear();
	int num_aas_in_multipath = 0;
	for (e=0; e<num_edges; e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];
		const int num_vars_in_edge = edge.variant_ptrs.size();
		vector<score_t> scores;

		num_aas_in_multipath+=edge.num_aa;

		if (num_vars_in_edge>1)
		{
			int i;
			for (i=0; i<num_vars_in_edge; i++)
				scores.push_back(edge.variant_scores[i]);
		
			var_scores.push_back(scores);
			var_positions.push_back(e);
			num_vars.push_back(num_vars_in_edge);
			var_edge_positions.push_back(num_aas_in_multipath-edge.num_aa);

		//	cout << e << "\t" << e_idx << "\t" << num_vars_in_edge << "\t" << edge.num_aa << endl;
		}	
	}

		// create the SeqPaths from the edge_combos...
	const int num_positions = num_aas_in_multipath+1;
	SeqPath template_path;				// holds common elements to all paths
	template_path.n_term_aa = multi_path.n_term_aa;
	template_path.c_term_aa = multi_path.c_term_aa;
	template_path.n_term_mass = multi_path.n_term_mass;
	template_path.c_term_mass = multi_path.c_term_mass;
	template_path.multi_path_rank = multi_path.original_rank;
	template_path.path_score = 0;
	template_path.prm_ptr = (PrmGraph *)this;
	template_path.positions.clear();
	template_path.positions.resize(num_positions);

	template_path.num_forbidden_nodes = multi_path.num_forbidden_nodes;
	template_path.path_score -= template_path.num_forbidden_nodes * forbidden_pair_penalty;

	int pos_idx=0;
	for (e=0; e<multi_path.edge_idxs.size(); e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = get_multi_edge(e_idx);
		PathPos& pos = template_path.positions[pos_idx++];

		pos.breakage = edge.n_break;
		pos.edge_idx = e_idx;
		pos.mass =  edge.n_break->mass;
		pos.node_score = edge.n_break->score;
		pos.node_idx = edge.n_idx;
		template_path.path_score += pos.node_score;

		int *variant_ptr = edge.variant_ptrs[0];
		const int num_aa = *variant_ptr++;
		int *aas = variant_ptr;

		if (edge.variant_ptrs.size() == 1)
		{
			pos.edge_variant_score = edge.variant_scores[0];
			pos.aa = aas[0];
			template_path.path_score += pos.edge_variant_score;
		}

		if (edge.num_aa == 1)
			continue;
		
		int j;
		for (j=1; j<edge.num_aa; j++)
		{
			PathPos& pos = template_path.positions[pos_idx++];
			pos.breakage = NULL;
			pos.aa = aas[j];
		}
	}

	if (pos_idx != num_aas_in_multipath)
	{
		cout << "Error: mismatches between positions and multipath length!" << endl;
		exit(1);
	}
	
	PathPos& last_pos = template_path.positions[num_positions-1];
	const int last_e_idx = multi_path.edge_idxs[e-1];
	const MultiEdge& last_edge = get_multi_edge(last_e_idx);
	last_pos.breakage = last_edge.c_break;
	last_pos.edge_idx =-1;
	last_pos.mass = last_edge.c_break->mass;
	last_pos.node_idx = last_edge.c_idx;
	last_pos.node_score = last_edge.c_break->score;

	template_path.path_score += last_pos.node_score;

	const int num_multi_var_edges = var_positions.size();
	if (num_multi_var_edges == 0)
	{
		seq_paths.resize(1);
		seq_paths[0]=template_path;
		return;
	}

	vector<var_combo>      combo_heap;
	vector<int> idxs;
	idxs.resize(num_multi_var_edges,0);
	const int last_var_pos = num_multi_var_edges-1;
	const int last_var_val = num_vars[last_var_pos];
	const score_t needed_var_score = min_score - template_path.path_score;
	while (1)
	{
		score_t idxs_score = 0;
		int k;
		for (k=0; k<idxs.size(); k++)
			idxs_score += var_scores[k][idxs[k]];
		
		if (idxs_score>=needed_var_score)
		{
			if (combo_heap.size()<max_num_paths)
			{
				var_combo v;
				v.path_score = template_path.path_score + idxs_score;
				v.var_idxs = idxs;
				combo_heap.push_back(v);
				if (combo_heap.size()== max_num_paths)
					make_heap(combo_heap.begin(),combo_heap.end());
			}
			else
			{
				const score_t score = template_path.path_score + idxs_score;
				if (score>combo_heap[0].path_score)
				{
					pop_heap(combo_heap.begin(),combo_heap.end());
					combo_heap[max_num_paths-1].path_score = score;
					combo_heap[max_num_paths-1].var_idxs = idxs;
					push_heap(combo_heap.begin(),combo_heap.end());
				}
			}	
		}

		int j=0;
		while (j<num_multi_var_edges)
		{
			idxs[j]++;
			if (idxs[j]==num_vars[j])
			{
				idxs[j++]=0;
			}
			else
				break;
		}
		if (j==num_multi_var_edges)
			break;

	}

	seq_paths.clear();
	seq_paths.resize(combo_heap.size(),template_path);

	int i;
	for (i=0; i<combo_heap.size(); i++)
	{
		const var_combo& combo  = combo_heap[i];
		SeqPath& seq_path = seq_paths[i];

		// fill in info that changes with multi-variant edges
		int j;
		for (j=0; j<num_multi_var_edges; j++)
		{
			const int var_pos = var_positions[j];
			const int e_idx = multi_path.edge_idxs[var_pos];
			const MultiEdge& edge = get_multi_edge(e_idx);

			const int pos_idx = var_edge_positions[j];
		
			if (seq_path.positions[pos_idx].edge_idx !=  e_idx)
			{
				cout << "POS: " << pos_idx << endl;
				cout << "Error: mismatch in pos_idx and e_idx of a multipath!" << endl;
				cout << "looking for " << e_idx << endl;
				cout << "edge idxs:" << endl;
				int k;
				for (k=0; k<seq_path.positions.size(); k++)
					cout << k << "\t" << seq_path.positions[k].edge_idx << "\tnidx: " <<
					seq_path.positions[k].node_idx << endl;
				cout << endl;
				multi_path.print(config);

				this->print();

				exit(1);
			}

			PathPos& pos = seq_path.positions[pos_idx];
			const int variant_idx = combo.var_idxs[j];
			int *variant_ptr = edge.variant_ptrs[variant_idx];
			const int num_aa = *variant_ptr++;
			int *aas = variant_ptr;

			if (num_aa != edge.num_aa)
			{
				cout << "Error: edge and variant mixup!" << endl;
				exit(1);
			}

			pos.edge_variant_score = edge.variant_scores[variant_idx];
			pos.aa = aas[0];
		//	pos.edge_var_idx = variant_idx;

			if (edge.num_aa == 1)
				continue;

			int k;
			for (k=1; k<edge.num_aa; k++)
			{
				PathPos& pos = seq_path.positions[pos_idx+k];
				pos.aa = aas[k];
			}
		}

		//seq_path.path_score = max_path_score + combo.path_delta;
		seq_path.path_score=combo.path_score;
		seq_path.make_seq_str(config);
	}
}