// Rebuilds rbs from an ME regression sample: rbs is cleared first and then
// every entry of me.f_vals is added as a real feature (f_idx -> val), in the
// order in which the entries are stored.
void convert_ME_to_RankBoostSample(const ME_Regression_Sample& me,
                                   RankBoostSample& rbs)
{
    rbs.clear();

    const int num_f_vals = static_cast<int>(me.f_vals.size());
    int f = 0;
    while (f < num_f_vals)
    {
        rbs.add_real_feature(me.f_vals[f].f_idx, me.f_vals[f].val);
        ++f;
    }
}
/********************************************************************** This fills in all possible sequence features that can be used in any of the models (these features are good for all fragments/mobility/ size/charge). The models can later choose to ignore some of these features by asigning the weight 0. All features are derived directly from the peptide sequence and concern the specified cut idx. ***********************************************************************/ void PeakRankModel::fill_advanced_peak_features( const vector<int>& org_amino_acids, int cut_idx, mass_t cut_mass, mass_t pm_with_19, int spec_charge, const FragmentType& fragment, RankBoostSample& sample) const { const int length = org_amino_acids.size(); const int num_aas = model_aa_labels.size(); int r_idx=0; int i; vector<int> amino_acids; convert_aas_to_model_aas(org_amino_acids, amino_acids); if (amino_acids.size() != org_amino_acids.size()) { cout << "Error: aa size mismatch!" << endl; exit(1); } if (cut_idx<=0 || cut_idx>=amino_acids.size()) { cout << "Error: cut_idx is bad!" 
<< endl; exit(1); } // need to use the special Idx variables and not the regular enumerations const int HisIdx = session_aas_to_model_aas[His]; const int LysIdx = session_aas_to_model_aas[Lys]; const int ArgIdx = session_aas_to_model_aas[Arg]; const int SerIdx = session_aas_to_model_aas[Ser]; const int ThrIdx = session_aas_to_model_aas[Thr]; const int ProIdx = session_aas_to_model_aas[Pro]; const int GlyIdx = session_aas_to_model_aas[Gly]; const int AlaIdx = session_aas_to_model_aas[Ala]; const int LeuIdx = session_aas_to_model_aas[Leu]; const int AsnIdx = session_aas_to_model_aas[Asn]; const int AspIdx = session_aas_to_model_aas[Asp]; const int GluIdx = session_aas_to_model_aas[Glu]; sample.clear(); // special N C side aa indicators int num_nH=0, num_cH=0; int num_nK=0, num_cK=0; int num_nR=0, num_cR=0; for (i=0; i<cut_idx; i++) { if (amino_acids[i] == HisIdx) num_nH++; if (amino_acids[i] == LysIdx) num_nK++; if (amino_acids[i] == ArgIdx) num_nR++; } for (i=cut_idx; i<length; i++) { if (amino_acids[i] == HisIdx) num_cH++; if (amino_acids[i] == LysIdx) num_cK++; if (amino_acids[i] == ArgIdx) num_cR++; } // MASS / LOCATION FEATURES (REAL + BINARY) const mass_t exp_peak_mass = fragment.calc_expected_mass(cut_mass,pm_with_19); const mass_t min_obs_mass = calc_min_detected_mass(pm_with_19,spec_charge); const mass_t max_obs_mass = (pm_with_19>max_detected_mass ? 
max_detected_mass : pm_with_19); const float peak_mass_prop = ((exp_peak_mass - min_obs_mass)/(max_obs_mass - min_obs_mass)); const float rounded_peak_prop = 0.1*floor(peak_mass_prop * 10.0); // give values within a resolution of 20 Da const mass_t dis_from_min = 25.0*floor((exp_peak_mass - min_obs_mass)*0.04); const mass_t dis_from_max = 25.0*floor((max_obs_mass - exp_peak_mass)*0.04); const int RKH_n_combo_idx = calc_RKH_combo_idx(num_nR,num_nK,num_nH); const int RKH_c_combo_idx = calc_RKH_combo_idx(num_cR,num_cK,num_cH); const int RKH_pair_idx = (RKH_n_combo_idx * num_RKH_combos) + RKH_c_combo_idx; const float RKH_liniar_pair_idx = RKH_pair_matrix[RKH_n_combo_idx][RKH_c_combo_idx]; const int n_aa = amino_acids[cut_idx-1]; const int c_aa = amino_acids[cut_idx]; // proportion of mass of the N/C fragments (special values are given to the first 3 // cuts on each side. If the cut is not in those regions, prop is assigned the // index of the fifth in which it falls int side_length=3; if (length>=12) side_length=4; if (length>=15) side_length=5; float cut_prop; if (cut_idx<=side_length) { cut_prop=(float)cut_idx; } else if (cut_idx>=length-side_length) { cut_prop=(float)(11+cut_idx-length); } else { cut_prop = 5.1+floor(3.0*(cut_mass/pm_with_19))*0.1; } // fill N RKH and C RKH sample.add_real_feature(r_idx++,RKH_n_combo_idx); sample.add_real_feature(r_idx++,RKH_c_combo_idx); // peak prop sample.add_real_feature(r_idx++,rounded_peak_prop); sample.add_real_feature(r_idx+RKH_pair_idx,rounded_peak_prop); r_idx+=num_RKH_pairs; // fill dis features if (dis_from_min<dis_from_max) { sample.add_real_feature(r_idx,dis_from_min); r_idx++; sample.add_real_feature(r_idx+RKH_pair_idx,dis_from_min); r_idx+=(2*num_RKH_pairs+1); } else { r_idx+=(num_RKH_pairs+1); sample.add_real_feature(r_idx,dis_from_max); r_idx++; sample.add_real_feature(r_idx+RKH_pair_idx,dis_from_max); r_idx+=num_RKH_pairs; } // fill prop features sample.add_real_feature(r_idx++,cut_prop); 
sample.add_real_feature(r_idx+RKH_pair_idx,cut_prop); r_idx+=num_RKH_pairs; // fill prop X dis features if (dis_from_min<dis_from_max) { if (dis_from_min<75.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; if (dis_from_min<150.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; if (dis_from_min<250.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; if (dis_from_min<=400.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; r_idx+=4; } else { r_idx+=4; if (dis_from_max<75.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; if (dis_from_max<150.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; if (dis_from_max<250.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; if (dis_from_max<=400.0) sample.add_real_feature(r_idx,cut_prop); r_idx++; } // fill aa count features (up top 3 aa's away from cut) vector<int> n_aa_counts, c_aa_counts; n_aa_counts.resize(num_aas+1,0); c_aa_counts.resize(num_aas+1,0); for (i=0; i<cut_idx-3; i++) n_aa_counts[amino_acids[i]]++; for (i=cut_idx+3; i<length; i++) c_aa_counts[amino_acids[i]]++; int a; for (a=0; a<num_aas; a++) sample.add_real_feature(r_idx++,n_aa_counts[a]); for (a=0; a<num_aas; a++) sample.add_real_feature(r_idx++,c_aa_counts[a]); // including the aas up top the count int start_cut = cut_idx-3; if (start_cut<0) start_cut=0; for (i=start_cut; i<cut_idx; i++) n_aa_counts[amino_acids[i]]++; int end_cut = cut_idx+3; if (end_cut>length) end_cut = length; for (i=cut_idx; i<end_cut; i++) c_aa_counts[amino_acids[i]]++; for (a=0; a<num_aas; a++) sample.add_real_feature(r_idx++,n_aa_counts[a]); for (a=0; a<num_aas; a++) sample.add_real_feature(r_idx++,c_aa_counts[a]); // fill aa flanking features N side if (cut_idx>0) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>1) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; if (cut_idx>2) sample.add_real_feature(r_idx+amino_acids[cut_idx-3],cut_prop); r_idx+=num_aas; // fill aa flanking features C side if 
(cut_idx<length) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx<length-1) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; if (cut_idx<length-2) sample.add_real_feature(r_idx+amino_acids[cut_idx+2],cut_prop); r_idx+=num_aas; // fill cut pair features X-Y sample.add_real_feature(r_idx+(n_aa*num_aas+c_aa),cut_prop); r_idx+=(num_aas*num_aas); // fill cut pair features X-Y sample.add_real_feature(r_idx+(n_aa*num_aas+c_aa),rounded_peak_prop); r_idx+=(num_aas*num_aas); // fill flanking aa info with RKH_pair data // fill aa flanking features N side if (cut_idx>0) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],RKH_liniar_pair_idx); r_idx+=num_aas; if (cut_idx>1) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],RKH_liniar_pair_idx); r_idx+=num_aas; if (cut_idx>2) sample.add_real_feature(r_idx+amino_acids[cut_idx-3],RKH_liniar_pair_idx); r_idx+=num_aas; // fill aa flanking features C side if (cut_idx<length) sample.add_real_feature(r_idx+amino_acids[cut_idx],RKH_liniar_pair_idx); r_idx+=num_aas; if (cut_idx<length-1) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],RKH_liniar_pair_idx); r_idx+=num_aas; if (cut_idx<length-2) sample.add_real_feature(r_idx+amino_acids[cut_idx+2],RKH_liniar_pair_idx); r_idx+=num_aas; // fill flanking aa info with peak prop data // fill aa flanking features N side if (cut_idx>0) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],rounded_peak_prop); r_idx+=num_aas; if (cut_idx>1) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],rounded_peak_prop); r_idx+=num_aas; if (cut_idx>2) sample.add_real_feature(r_idx+amino_acids[cut_idx-3],rounded_peak_prop); r_idx+=num_aas; // fill aa flanking features C side if (cut_idx<length) sample.add_real_feature(r_idx+amino_acids[cut_idx],rounded_peak_prop); r_idx+=num_aas; if (cut_idx<length-1) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],rounded_peak_prop); r_idx+=num_aas; if (cut_idx<length-2) 
sample.add_real_feature(r_idx+amino_acids[cut_idx+2],rounded_peak_prop); r_idx+=num_aas; // add features for flanking pairs of amino acids if (cut_idx>1) sample.add_real_feature(r_idx+amino_acids[cut_idx-2]*num_aas+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas*num_aas; if (cut_idx<length-2) sample.add_real_feature(r_idx+amino_acids[cut_idx]*num_aas+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas*num_aas; // X != R // features of the form |LXK |LXXK |LXXXK |LXXXXK if (cut_idx < length-2 && amino_acids[cut_idx+2]==LysIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx < length-3 && amino_acids[cut_idx+3]==LysIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx < length-4 && amino_acids[cut_idx+4]==LysIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx < length-5 && amino_acids[cut_idx+5]==LysIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; // features of the form |XLXK |XLXXK |XLXXXK |XLXXXXK if (cut_idx < length-3 && amino_acids[cut_idx+3]==LysIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx] != LysIdx ) 
sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; if (cut_idx < length-4 && amino_acids[cut_idx+4]==LysIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != LysIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; if (cut_idx < length-5 && amino_acids[cut_idx+5]==LysIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx && amino_acids[cut_idx] != LysIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; if (cut_idx < length-6 && amino_acids[cut_idx+6]==LysIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx && amino_acids[cut_idx+5] != ArgIdx && amino_acids[cut_idx] != LysIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; // features of the form L|XK L|XXK L|XXXK L|XXXXK if (cut_idx>0 &&cut_idx < length-1 && amino_acids[cut_idx+1]==LysIdx && amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && amino_acids[cut_idx] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>0 &&cut_idx < length-2 && amino_acids[cut_idx+2]==LysIdx && amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>0 &&cut_idx < length-3 && amino_acids[cut_idx+3]==LysIdx && amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && 
amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>0 && cut_idx < length-4 && amino_acids[cut_idx+4]==LysIdx && amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != ArgIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; // features of the form |LXR |LXXR |LXXXR |LXXXXR if (cut_idx < length-2 && amino_acids[cut_idx+2]==ArgIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx < length-3 && amino_acids[cut_idx+3]==ArgIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx < length-4 && amino_acids[cut_idx+4]==ArgIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx < length-5 && amino_acids[cut_idx+5]==ArgIdx && amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; // features of the form |XLXR |XLXXR |XLXXXR |XLXXXXR if (cut_idx < length-3 && amino_acids[cut_idx+3]==ArgIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && 
amino_acids[cut_idx] != LysIdx && amino_acids[cut_idx+2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; if (cut_idx < length-4 && amino_acids[cut_idx+4]==ArgIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != LysIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; if (cut_idx < length-5 && amino_acids[cut_idx+5]==ArgIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx && amino_acids[cut_idx] != LysIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; if (cut_idx < length-6 && amino_acids[cut_idx+6]==ArgIdx && amino_acids[cut_idx+1] != LysIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx+4] != ArgIdx && amino_acids[cut_idx+5] != ArgIdx && amino_acids[cut_idx] != LysIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx+1],cut_prop); r_idx+=num_aas; // features of the form L|XR L|XXR L|XXXR L|XXXXR if (cut_idx>0 &&cut_idx < length-1 && amino_acids[cut_idx+1]==ArgIdx && amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && amino_acids[cut_idx] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>0 &&cut_idx < length-2 && amino_acids[cut_idx+2]==ArgIdx && amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && amino_acids[cut_idx] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>0 &&cut_idx < length-3 && amino_acids[cut_idx+3]==ArgIdx && 
amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>0 && cut_idx < length-4 && amino_acids[cut_idx+4]==ArgIdx && amino_acids[cut_idx-1] != LysIdx && amino_acids[cut_idx-1] != ArgIdx && amino_acids[cut_idx+1] != ArgIdx && amino_acids[cut_idx+2] != ArgIdx && amino_acids[cut_idx+3] != ArgIdx && amino_acids[cut_idx] != ArgIdx ) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; // features of the form KXF KXXF KXXXF KXXXXF if (cut_idx>2 && amino_acids[cut_idx-3] == LysIdx &&amino_acids[cut_idx-2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>3 && amino_acids[cut_idx-4] == LysIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>4 && amino_acids[cut_idx-5] == LysIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>5 && amino_acids[cut_idx-6] == LysIdx && amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; // features of the form KXFX KXXFX KXXXFX KXXXXFX if (cut_idx>3 && amino_acids[cut_idx-4] == LysIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; if (cut_idx>4 && amino_acids[cut_idx-5] == LysIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) 
sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; if (cut_idx>5 && amino_acids[cut_idx-6] == LysIdx && amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; if (cut_idx>6 && amino_acids[cut_idx-7] == LysIdx && amino_acids[cut_idx-6] != ArgIdx && amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; // features of the form KX|F KXX|F KXXX|F KXXXX|F if (cut_idx>1 && cut_idx<length && amino_acids[cut_idx-2] == LysIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx>2 && cut_idx<length && amino_acids[cut_idx-3] == LysIdx && amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx>3 && cut_idx<length && amino_acids[cut_idx-4] == LysIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx>4 && cut_idx<length && amino_acids[cut_idx-5] == LysIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; // features of the form RXF RXXF RXXXF RXXXXF if (cut_idx>2 && amino_acids[cut_idx-3] == ArgIdx &&amino_acids[cut_idx-2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>3 && amino_acids[cut_idx-4] == ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx) 
sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>4 && amino_acids[cut_idx-5] == ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; if (cut_idx>5 && amino_acids[cut_idx-6] == ArgIdx && amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-1],cut_prop); r_idx+=num_aas; // features of the form RXFX RXXFX RXXXFX RXXXXFX if (cut_idx>3 && amino_acids[cut_idx-4] == ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; if (cut_idx>4 && amino_acids[cut_idx-5] == ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; if (cut_idx>5 && amino_acids[cut_idx-6] == ArgIdx && amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; if (cut_idx>6 && amino_acids[cut_idx-7] == ArgIdx && amino_acids[cut_idx-6] != ArgIdx && amino_acids[cut_idx-5] != ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx-2],cut_prop); r_idx+=num_aas; // features of the form RX|F RXX|F RXXX|F RXXXX|F if (cut_idx>1 && cut_idx<length && amino_acids[cut_idx-2] == ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx>2 && cut_idx<length && amino_acids[cut_idx-3] == ArgIdx && amino_acids[cut_idx-2] 
!= ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx>3 && cut_idx<length && amino_acids[cut_idx-4] == ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; if (cut_idx>4 && cut_idx<length && amino_acids[cut_idx-5] == ArgIdx && amino_acids[cut_idx-4] != ArgIdx && amino_acids[cut_idx-3] != ArgIdx && amino_acids[cut_idx-2] != ArgIdx && amino_acids[cut_idx-1] != ArgIdx) sample.add_real_feature(r_idx+amino_acids[cut_idx],cut_prop); r_idx+=num_aas; // Add sepcial C-terminal features const int c_cut_dis = length - cut_idx; if (c_cut_dis<=5 && amino_acids[length-1]==LysIdx) { if (c_aa_counts[LeuIdx]>0 && (c_aa_counts[AspIdx]+c_aa_counts[GluIdx]>0 && c_aa_counts[LeuIdx] + c_aa_counts[GluIdx] + +c_aa_counts[AspIdx] + c_aa_counts[LysIdx] == c_cut_dis) ) sample.add_real_feature(r_idx,c_cut_dis); r_idx++; if (c_aa_counts[LeuIdx]>0 && c_aa_counts[AlaIdx]>0 && c_aa_counts[LeuIdx] + c_aa_counts[AlaIdx] + c_aa_counts[LysIdx] == c_cut_dis) sample.add_real_feature(r_idx,c_cut_dis); r_idx++; if (c_cut_dis>=3 && c_aa_counts[LeuIdx] + c_aa_counts[LysIdx] + c_aa_counts[GluIdx] + c_aa_counts[AspIdx] == c_cut_dis-1) sample.add_real_feature(r_idx,c_cut_dis); r_idx++; } else r_idx+=3; if (amino_acids[length-1]==ArgIdx && cut_idx<length-1) { if (c_aa_counts[LeuIdx] + c_aa_counts[ArgIdx] + c_aa_counts[GluIdx] + c_aa_counts[AspIdx] == c_cut_dis) sample.add_real_feature(r_idx,c_cut_dis); } r_idx++; }