예제 #1
0
/**********************************************************************
This fills in all possible sequence features that can be used in any
of the models (these features are good for all fragments/mobility/
size/charge). The models can later choose to ignore some of these
features by asigning the weight 0. All features are derived directly
from the peptide sequence and concern the specified cut idx.
***********************************************************************/
void PeakRankModel::fill_advanced_peak_features(
								 const  vector<int>& org_amino_acids,
								 int    cut_idx,
								 mass_t cut_mass,
								 mass_t pm_with_19,
								 int	spec_charge,
								 const FragmentType& fragment,
								 RankBoostSample& sample) const
{
	const int length = org_amino_acids.size();
	const int num_aas = model_aa_labels.size();
	int r_idx=0;
	int i;

	vector<int> amino_acids;
	convert_aas_to_model_aas(org_amino_acids, amino_acids);

	if (amino_acids.size() != org_amino_acids.size())
	{
		cout << "Error: aa size mismatch!" << endl;
		exit(1);
	}

	if (cut_idx<=0 || cut_idx>=amino_acids.size())
	{
		cout << "Error: cut_idx is bad!" << endl;
		exit(1);
	}

	// need to use the special Idx variables and not the regular enumerations
	const int HisIdx = session_aas_to_model_aas[His];
	const int LysIdx = session_aas_to_model_aas[Lys];
	const int ArgIdx = session_aas_to_model_aas[Arg];
	const int SerIdx = session_aas_to_model_aas[Ser];
	const int ThrIdx = session_aas_to_model_aas[Thr];
	const int ProIdx = session_aas_to_model_aas[Pro];
	const int GlyIdx = session_aas_to_model_aas[Gly];
	const int AlaIdx = session_aas_to_model_aas[Ala];
	const int LeuIdx = session_aas_to_model_aas[Leu];
	const int AsnIdx = session_aas_to_model_aas[Asn];
	const int AspIdx = session_aas_to_model_aas[Asp];
	const int GluIdx = session_aas_to_model_aas[Glu];


	sample.clear();

	// special N C side aa indicators
	int num_nH=0, num_cH=0;
	int num_nK=0, num_cK=0;
	int num_nR=0, num_cR=0;
	
	for (i=0; i<cut_idx; i++)
	{
		if (amino_acids[i] == HisIdx)
			num_nH++;

		if (amino_acids[i] == LysIdx)
			num_nK++;

		if (amino_acids[i] == ArgIdx)
			num_nR++;
	}

	for (i=cut_idx; i<length; i++)
	{
		if (amino_acids[i] == HisIdx)
			num_cH++;

		if (amino_acids[i] == LysIdx)
			num_cK++;

		if (amino_acids[i] == ArgIdx)
			num_cR++;
	}

	// MASS / LOCATION FEATURES (REAL + BINARY)
	const mass_t exp_peak_mass = fragment.calc_expected_mass(cut_mass,pm_with_19);
	const mass_t min_obs_mass = calc_min_detected_mass(pm_with_19,spec_charge);
	const mass_t max_obs_mass = (pm_with_19>max_detected_mass ? max_detected_mass : pm_with_19);
	const float peak_mass_prop = ((exp_peak_mass - min_obs_mass)/(max_obs_mass - min_obs_mass));
	const float rounded_peak_prop = 0.1*floor(peak_mass_prop * 10.0);

	// give values within a resolution of 20 Da
	const mass_t dis_from_min = 25.0*floor((exp_peak_mass - min_obs_mass)*0.04);
	const mass_t dis_from_max = 25.0*floor((max_obs_mass  - exp_peak_mass)*0.04);

	const int RKH_n_combo_idx = calc_RKH_combo_idx(num_nR,num_nK,num_nH);
	const int RKH_c_combo_idx = calc_RKH_combo_idx(num_cR,num_cK,num_cH);

	const int RKH_pair_idx = (RKH_n_combo_idx * num_RKH_combos) + RKH_c_combo_idx;

	const float RKH_liniar_pair_idx = RKH_pair_matrix[RKH_n_combo_idx][RKH_c_combo_idx];
	const int n_aa = amino_acids[cut_idx-1];
	const int c_aa = amino_acids[cut_idx];

	// proportion of mass of the N/C fragments (special values are given to the first 3 
	// cuts on each side. If the cut is not in those regions, prop is assigned the
	// index of the fifth in which it falls 
	int side_length=3;
	if (length>=12) side_length=4;
	if (length>=15) side_length=5;

	float cut_prop;
	if (cut_idx<=side_length)
	{
		cut_prop=(float)cut_idx;
	}
	else if (cut_idx>=length-side_length)
	{
		cut_prop=(float)(11+cut_idx-length);
	}
	else
	{
		cut_prop = 5.1+floor(3.0*(cut_mass/pm_with_19))*0.1;
	}


	// fill N RKH and C RKH
	sample.add_real_feature(r_idx++,RKH_n_combo_idx);
	sample.add_real_feature(r_idx++,RKH_c_combo_idx);

	// peak prop
	sample.add_real_feature(r_idx++,rounded_peak_prop);
	sample.add_real_feature(r_idx+RKH_pair_idx,rounded_peak_prop);
	r_idx+=num_RKH_pairs;
	
	// fill dis features
	if (dis_from_min<dis_from_max)
	{
		sample.add_real_feature(r_idx,dis_from_min);
		r_idx++;
		sample.add_real_feature(r_idx+RKH_pair_idx,dis_from_min);
		r_idx+=(2*num_RKH_pairs+1);
	}
	else
	{
		r_idx+=(num_RKH_pairs+1);
		sample.add_real_feature(r_idx,dis_from_max);
		r_idx++;
		sample.add_real_feature(r_idx+RKH_pair_idx,dis_from_max);
		r_idx+=num_RKH_pairs;
	} 

	//  fill prop features
	sample.add_real_feature(r_idx++,cut_prop);
	sample.add_real_feature(r_idx+RKH_pair_idx,cut_prop);
	r_idx+=num_RKH_pairs;

	// fill prop X dis features
	if (dis_from_min<dis_from_max)
	{
		if (dis_from_min<75.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;
		if (dis_from_min<150.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;
		if (dis_from_min<250.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;
		if (dis_from_min<=400.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;

		r_idx+=4;
	}
	else
	{
		r_idx+=4;
		if (dis_from_max<75.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;
		if (dis_from_max<150.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;
		if (dis_from_max<250.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;
		if (dis_from_max<=400.0)
			sample.add_real_feature(r_idx,cut_prop);
		r_idx++;
	}

	// fill aa count features (up top 3 aa's away from cut)
	vector<int> n_aa_counts, c_aa_counts;
	n_aa_counts.resize(num_aas+1,0);
	c_aa_counts.resize(num_aas+1,0);

	for (i=0; i<cut_idx-3; i++)
		n_aa_counts[amino_acids[i]]++;

	for (i=cut_idx+3; i<length; i++)
		c_aa_counts[amino_acids[i]]++;

	int a;
	for (a=0; a<num_aas; a++)
		sample.add_real_feature(r_idx++,n_aa_counts[a]);
	
	for (a=0; a<num_aas; a++)
		sample.add_real_feature(r_idx++,c_aa_counts[a]);

	// including the aas up top the count
	int start_cut = cut_idx-3;
	if (start_cut<0)
		start_cut=0;
	for (i=start_cut; i<cut_idx; i++)
		n_aa_counts[amino_acids[i]]++;

	int end_cut = cut_idx+3;
	if (end_cut>length)
		end_cut = length;
	for (i=cut_idx; i<end_cut; i++)
		c_aa_counts[amino_acids[i]]++;

	for (a=0; a<num_aas; a++)
		sample.add_real_feature(r_idx++,n_aa_counts[a]);
	
	for (a=0; a<num_aas; a++)
		sample.add_real_feature(r_idx++,c_aa_counts[a]);

	// fill aa flanking features N side
	if (cut_idx>0)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;
	if (cut_idx>1)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;
	if (cut_idx>2)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-3],cut_prop);
	r_idx+=num_aas;

	// fill aa flanking features C side
	if (cut_idx<length)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;
	if (cut_idx<length-1)
		sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;
	if (cut_idx<length-2)
		sample.add_real_feature(r_idx+amino_acids[cut_idx+2],cut_prop);
	r_idx+=num_aas;

	// fill cut pair features X-Y
	sample.add_real_feature(r_idx+(n_aa*num_aas+c_aa),cut_prop);
	r_idx+=(num_aas*num_aas);

	// fill cut pair features X-Y
	sample.add_real_feature(r_idx+(n_aa*num_aas+c_aa),rounded_peak_prop);
	r_idx+=(num_aas*num_aas);

	// fill flanking aa info with RKH_pair data
	// fill aa flanking features N side
	if (cut_idx>0)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],RKH_liniar_pair_idx);
	r_idx+=num_aas;
	if (cut_idx>1)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],RKH_liniar_pair_idx);
	r_idx+=num_aas;
	if (cut_idx>2)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-3],RKH_liniar_pair_idx);
	r_idx+=num_aas;

	// fill aa flanking features C side
	if (cut_idx<length)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],RKH_liniar_pair_idx);
	r_idx+=num_aas;
	if (cut_idx<length-1)
		sample.add_real_feature(r_idx+amino_acids[cut_idx+1],RKH_liniar_pair_idx);
	r_idx+=num_aas;
	if (cut_idx<length-2)
		sample.add_real_feature(r_idx+amino_acids[cut_idx+2],RKH_liniar_pair_idx);
	r_idx+=num_aas;
	

	// fill flanking aa info with peak prop data
	// fill aa flanking features N side
	if (cut_idx>0)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],rounded_peak_prop);
	r_idx+=num_aas;
	if (cut_idx>1)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],rounded_peak_prop);
	r_idx+=num_aas;
	if (cut_idx>2)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-3],rounded_peak_prop);
	r_idx+=num_aas;

	// fill aa flanking features C side
	if (cut_idx<length)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],rounded_peak_prop);
	r_idx+=num_aas;
	if (cut_idx<length-1)
		sample.add_real_feature(r_idx+amino_acids[cut_idx+1],rounded_peak_prop);
	r_idx+=num_aas;
	if (cut_idx<length-2)
		sample.add_real_feature(r_idx+amino_acids[cut_idx+2],rounded_peak_prop);
	r_idx+=num_aas;

	// add features for flanking pairs of amino acids
	if (cut_idx>1)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2]*num_aas+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas*num_aas;

	if (cut_idx<length-2)
		sample.add_real_feature(r_idx+amino_acids[cut_idx]*num_aas+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas*num_aas;

	
	// X != R
	// features of the form  |LXK   |LXXK   |LXXXK   |LXXXXK
	if (cut_idx < length-2 &&
		amino_acids[cut_idx+2]==LysIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-3 &&
		amino_acids[cut_idx+3]==LysIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-4 &&
		amino_acids[cut_idx+4]==LysIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-5 &&
		amino_acids[cut_idx+5]==LysIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	// features of the form  |XLXK  |XLXXK  |XLXXXK  |XLXXXXK
	if (cut_idx < length-3 &&
		amino_acids[cut_idx+3]==LysIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx 
		&& amino_acids[cut_idx] != LysIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-4 &&
		amino_acids[cut_idx+4]==LysIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != LysIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-5 &&
		amino_acids[cut_idx+5]==LysIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx 
		&& amino_acids[cut_idx] != LysIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-6 &&
		amino_acids[cut_idx+6]==LysIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx &&
		amino_acids[cut_idx+5] != ArgIdx && amino_acids[cut_idx] != LysIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;


	// features of the form L|XK   L|XXK   L|XXXK   L|XXXXK
	if (cut_idx>0 &&cut_idx < length-1 &&
		amino_acids[cut_idx+1]==LysIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>0 &&cut_idx < length-2 &&
		amino_acids[cut_idx+2]==LysIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>0 &&cut_idx < length-3 &&
		amino_acids[cut_idx+3]==LysIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>0 && cut_idx < length-4 &&
		amino_acids[cut_idx+4]==LysIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != ArgIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	
	// features of the form  |LXR   |LXXR   |LXXXR   |LXXXXR
		if (cut_idx < length-2 &&
		amino_acids[cut_idx+2]==ArgIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-3 &&
		amino_acids[cut_idx+3]==ArgIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-4 &&
		amino_acids[cut_idx+4]==ArgIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-5 &&
		amino_acids[cut_idx+5]==ArgIdx &&
		amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	// features of the form  |XLXR  |XLXXR  |XLXXXR  |XLXXXXR
		if (cut_idx < length-3 &&
		amino_acids[cut_idx+3]==ArgIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx] != LysIdx &&
		amino_acids[cut_idx+2] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-4 &&
		amino_acids[cut_idx+4]==ArgIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != LysIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-5 &&
		amino_acids[cut_idx+5]==ArgIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx &&
		amino_acids[cut_idx] != LysIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx < length-6 &&
		amino_acids[cut_idx+6]==ArgIdx &&
		amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx &&
		amino_acids[cut_idx+5] != ArgIdx && amino_acids[cut_idx] != LysIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop);
	r_idx+=num_aas;

	// features of the form L|XR   L|XXR   L|XXXR   L|XXXXR

	if (cut_idx>0 &&cut_idx < length-1 &&
		amino_acids[cut_idx+1]==ArgIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>0 &&cut_idx < length-2 &&
		amino_acids[cut_idx+2]==ArgIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>0 &&cut_idx < length-3 &&
		amino_acids[cut_idx+3]==ArgIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx] != ArgIdx)
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>0 && cut_idx < length-4 &&
		amino_acids[cut_idx+4]==ArgIdx &&
		amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx &&
		amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx &&
		amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != ArgIdx )
			sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;
	

	// features of the form KXF KXXF KXXXF KXXXXF
	if (cut_idx>2 && amino_acids[cut_idx-3] == LysIdx &&amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>3 && amino_acids[cut_idx-4] == LysIdx && amino_acids[cut_idx-3] != ArgIdx &&
		amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>4 && amino_acids[cut_idx-5] == LysIdx && amino_acids[cut_idx-4] != ArgIdx &&
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>5 && amino_acids[cut_idx-6] == LysIdx && amino_acids[cut_idx-5] != ArgIdx &&
		amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx &&
		amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	// features of the form KXFX KXXFX KXXXFX KXXXXFX
	if (cut_idx>3 && amino_acids[cut_idx-4] == LysIdx && amino_acids[cut_idx-3] != ArgIdx &&
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>4 && amino_acids[cut_idx-5] == LysIdx && amino_acids[cut_idx-4] != ArgIdx &&
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>5 && amino_acids[cut_idx-6] == LysIdx && amino_acids[cut_idx-5] != ArgIdx &&
		amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && 
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>6 && amino_acids[cut_idx-7] == LysIdx && amino_acids[cut_idx-6] != ArgIdx &&
		amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && 
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	// features of the form KX|F KXX|F KXXX|F KXXXX|F
	if (cut_idx>1 && cut_idx<length && amino_acids[cut_idx-2] == LysIdx && 
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>2 && cut_idx<length && amino_acids[cut_idx-3] == LysIdx && 
		amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>3 && cut_idx<length && amino_acids[cut_idx-4] == LysIdx && 
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx && 
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>4 && cut_idx<length && amino_acids[cut_idx-5] == LysIdx && 
		amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && 
		amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

		// features of the form RXF RXXF RXXXF RXXXXF
	if (cut_idx>2 && amino_acids[cut_idx-3] == ArgIdx &&amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>3 && amino_acids[cut_idx-4] == ArgIdx && amino_acids[cut_idx-3] != ArgIdx &&
		amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>4 && amino_acids[cut_idx-5] == ArgIdx && amino_acids[cut_idx-4] != ArgIdx &&
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>5 && amino_acids[cut_idx-6] == ArgIdx && amino_acids[cut_idx-5] != ArgIdx &&
		amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx &&
		amino_acids[cut_idx-2] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop);
	r_idx+=num_aas;

	// features of the form RXFX RXXFX RXXXFX RXXXXFX
	if (cut_idx>3 && amino_acids[cut_idx-4] == ArgIdx && amino_acids[cut_idx-3] != ArgIdx &&
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>4 && amino_acids[cut_idx-5] == ArgIdx && amino_acids[cut_idx-4] != ArgIdx &&
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>5 && amino_acids[cut_idx-6] == ArgIdx && amino_acids[cut_idx-5] != ArgIdx &&
		amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && 
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>6 && amino_acids[cut_idx-7] == ArgIdx && amino_acids[cut_idx-6] != ArgIdx &&
		amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && 
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop);
	r_idx+=num_aas;

	// features of the form RX|F RXX|F RXXX|F RXXXX|F
	if (cut_idx>1 && cut_idx<length && amino_acids[cut_idx-2] == ArgIdx && 
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>2 && cut_idx<length && amino_acids[cut_idx-3] == ArgIdx && 
		amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>3 && cut_idx<length && amino_acids[cut_idx-4] == ArgIdx && 
		amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx && 
		amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;

	if (cut_idx>4 && cut_idx<length && amino_acids[cut_idx-5] == ArgIdx && 
		amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && 
		amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx)
		sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop);
	r_idx+=num_aas;


	// Add sepcial C-terminal features
	const int c_cut_dis = length - cut_idx;
	if (c_cut_dis<=5 && amino_acids[length-1]==LysIdx)
	{
		if (c_aa_counts[LeuIdx]>0 &&
			(c_aa_counts[AspIdx]+c_aa_counts[GluIdx]>0 &&
			c_aa_counts[LeuIdx] + c_aa_counts[GluIdx] + +c_aa_counts[AspIdx] +
			c_aa_counts[LysIdx] == c_cut_dis) )
			sample.add_real_feature(r_idx,c_cut_dis);
		r_idx++;

		if (c_aa_counts[LeuIdx]>0 && c_aa_counts[AlaIdx]>0 &&
			c_aa_counts[LeuIdx] + c_aa_counts[AlaIdx] + c_aa_counts[LysIdx] == c_cut_dis)
			sample.add_real_feature(r_idx,c_cut_dis);
		r_idx++;

		if (c_cut_dis>=3 && c_aa_counts[LeuIdx] + c_aa_counts[LysIdx] + c_aa_counts[GluIdx] + 
			c_aa_counts[AspIdx] == c_cut_dis-1)
			sample.add_real_feature(r_idx,c_cut_dis);
		r_idx++;
	}
	else
		r_idx+=3;

	if (amino_acids[length-1]==ArgIdx && cut_idx<length-1)
	{
		if (c_aa_counts[LeuIdx] + c_aa_counts[ArgIdx] + c_aa_counts[GluIdx] + 
			c_aa_counts[AspIdx] == c_cut_dis)
		sample.add_real_feature(r_idx,c_cut_dis);
		
	}
	r_idx++;
}
예제 #2
0
/***********************************************************************
makes tables listing features and final scores
Only makes table if the predictions match
************************************************************************/
bool PeakRankModel::make_peak_prediction_table(
			const PeptideSolution& sol,
			const vector< vector<intensity_t> >& intens,
			int num_peaks) const
{
	PeptidePeakPrediction ppp;
	calc_peptide_predicted_scores(sol, ppp);

	// the ppp includes a table of rank scores (rows are actual frag idxs, not relative
	// position in the frag_type_idxs).

	// reduce intensities to the same dimensionality
	const int num_frags = ppp.frag_idxs.size();
	vector< vector< float> > observed_intens;
	observed_intens.resize(num_frags);

	int i,f;
	for (f=0; f<num_frags; f++)
	{
		const int frag_idx = ppp.frag_idxs[f];
		observed_intens[f]=intens[frag_idx]; 
	}

	// calculate the ranks and mapping between predicted and observed
	vector< vector<int> > observed_ranks, predicted_ranks;
	calc_combined_peak_ranks(observed_intens, observed_ranks);
	calc_combined_peak_ranks(ppp.rank_scores, predicted_ranks);

	vector<int> sel_frags, sel_idxs;
	vector< float > intensities;
	
	
	int rank;
	for (rank=0; rank<num_peaks; rank++)
	{
		bool good_pred=false;
		for (f=0; f<num_frags; f++)
		{
			int i;
			for (i=0; i<predicted_ranks[f].size(); i++)
			{
				if (predicted_ranks[f][i] == rank &&
					observed_ranks[f][i]  == rank)
				{
					good_pred=true;
					sel_frags.push_back(f);
					sel_idxs.push_back(i);
					intensities.push_back(intens[f][i]);
					break;
				}
			}
		}
		if (! good_pred)
			return false;
	}

//	cout << "#sel_frags: " << sel_frags.size() << endl;
	

	// calc specific peak vectors and collect data
	vector< vector< string> > feature_names;
	vector< vector< float > > feature_values;
	vector< vector< float > > feature_scores;
	vector< float > total_scores;


	feature_names.resize(num_peaks);
	feature_values.resize(num_peaks);
	feature_scores.resize(num_peaks);
	total_scores.resize(num_peaks,0);
	

	const Peptide& pep = sol.pep;
	const mass_t pm_with_19 = sol.pm_with_19;
	const int spec_charge = sol.charge;
	const int mobility = get_proton_mobility(pep,spec_charge);
	const int size_idx =  get_size_group(spec_charge,pm_with_19);
	
	if (! partition_models[spec_charge][size_idx][mobility])
	{
		cout << "Error: no rank partition model for " <<
			spec_charge << " " << size_idx << " " << mobility << endl;
		exit(1);
	}

	if (size_idx != 1 || mobility != 1)
		return false;

	const mass_t min_detected_mass = calc_min_detected_mass(pm_with_19, spec_charge);
	const mass_t max_detected_mass = get_max_detected_mass();


	const vector<int>& amino_acids = pep.get_amino_acids();
	vector<mass_t> exp_cuts;

	pep.calc_expected_breakage_masses(config,exp_cuts);

	const mass_t n_mass = pep.get_n_gap();

	// calculate a single set of ranks across the combined set of fragments
	const int start_cut_idx = (sol.reaches_n_terminal ? 1 : 0);
	const int last_cut_idx  = (sol.reaches_c_terminal ? exp_cuts.size()-1 : exp_cuts.size());
	const mass_t c_mass = exp_cuts[exp_cuts.size()-1];


	int max_l=0;
	for (i=0; i<sel_frags.size(); i++)
	{
		const int frag_idx=sel_frags[i];
		const int cut_idx = sel_idxs[i];
	
		const FragmentType& fragment = config->get_fragment(frag_idx);

		const mass_t cut_mass = exp_cuts[cut_idx];
		const mass_t peak_mass = fragment.calc_expected_mass(cut_mass,pm_with_19);
		
		RankBoostSample rbs;

		for (f=0; f<num_frags; f++)
			if (ppp.frag_idxs[f] == frag_idx)
				break;

	//	cout << "Frag: " << fragment.label << " fi:" << frag_idx << " f:" << f << endl;

		if (f==num_frags)
		{
			cout << "Error: bad frag!!!!" << endl;
			exit(1);
		}
		

		partition_models[spec_charge][size_idx][mobility]->fill_combined_simple_peak_features(
			this, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
				
//		partition_models[spec_charge][size_idx][mobility]->fill_combined_peak_features(	
//			this, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
			
		total_scores[i] = partition_models[spec_charge][size_idx][mobility]->combined_frag_boost_model.calc_rank_score_with_details(
									rbs,feature_names[i],feature_values[i],feature_scores[i]);
							
			
		if (feature_names[i].size()>max_l)
			max_l = feature_names[i].size();
	}


	cout << "Size: " << size_idx << " Mobility: " << mobility << endl;


	// print results
	for (i=0; i<num_peaks; i++)
	{
		cout << config->get_fragment(sel_frags[i]).label << " " <<
			sel_idxs[i];
		
		if (i<num_peaks-1)
		{
			cout << " & ";
		}
		else
			cout << "\\\\" << endl;
	}

	cout << setprecision(2) << fixed;
	for (i=0; i<num_peaks; i++)
	{
		cout << total_scores[i];
		if (i<num_peaks-1)
		{
			cout << " & ";
		}
		else
			cout << "\\\\" << endl;
	}

	for (i=0; i<num_peaks; i++)
	{
		cout << intensities[i];
		if (i<num_peaks-1)
		{
			cout << " & ";
		}
		else
			cout << "\\\\" << endl;
	}

	for (i=0; i<max_l; i++)
	{
		int j;
		for (j=0; j<num_peaks; j++)
		{
			if (feature_names[j].size()<=i)
			{
				cout << "           &  ";  
			}
			else
			{
				cout << feature_names[j][i] << " " << feature_values[j][i] << " & ";
				if (feature_scores[j][i]>0)
				{
					cout << "+";
				}
				cout << feature_scores[j][i];
			}

			if (j<num_peaks-1)
			{
				cout << " & ";
			}
			else
				cout << "\\\\" << endl;
		}
	}



	return true;
}