void AdvancedScoreModel::learn_prm_normalizer_values(const FileManager& fm) { const float step = 0.5; const float min_delta = -1.0; const float max_delta = 7.0; const float target_mid_ratio = 0.96; const float target_side_ratio = 0.94; config.set_use_spectrum_charge(1); regional_prm_normalizers.resize(regional_breakage_score_models.size()); int c; for (c=0; c<regional_breakage_score_models.size(); c++) { regional_prm_normalizers[c].resize(regional_breakage_score_models[c].size()); int s; for (s=0; s<regional_breakage_score_models[c].size(); s++) regional_prm_normalizers[c][s].resize(regional_breakage_score_models[c][s].size(),0); } const vector< vector<mass_t> >& mass_threshes = config.get_size_thresholds(); for (c=1; c<regional_prm_normalizers.size(); c++) { int s; for (s=0; s<regional_prm_normalizers[c].size(); s++) { const mass_t min_mass = (s == 0 ? 0 : mass_threshes[c][s-1]); const mass_t max_mass = mass_threshes[c][s]; const int num_regions = regional_prm_normalizers[c][s].size(); cout << "Finding normalizers for charge " << c << " size " << s << " (masses " << min_mass << " - " << max_mass << ")" << endl; FileSet fs; BasicSpecReader bsr; fs.select_files_in_mz_range(fm,min_mass/c,max_mass/c,c); fs.randomly_reduce_ssfs(2000); vector< vector< NodeType > > all_prms; const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers(); if (fs.get_total_spectra()<50) { cout << "Insufficient number of spectra... skipping" << endl; continue; } int sc; for (sc=0; sc<all_ssf.size(); sc++) { PrmGraph prm; static vector<QCPeak> peaks; SingleSpectrumFile *ssf = all_ssf[sc]; if (peaks.size()<ssf->num_peaks) { int new_size = ssf->num_peaks*2; if (new_size<2500) new_size=2500; peaks.resize(new_size); } const int num_peaks = bsr.read_basic_spec(&config,fm,ssf,&peaks[0]); if (num_peaks<5) continue; // convert peak list ot a spectrum with charge (if original charge ==0) // the spectrum gets charge 2, but the true charge is computed from the data Spectrum s; s.init_from_QCPeaks(&config,&peaks[0],num_peaks,ssf); vector<mass_t> pms_with_19; vector<int> charges; pms_with_19.clear(); charges.clear(); BasicSpectrum bs; bs.ssf = ssf; bs.peaks = &peaks[0]; bs.num_peaks = num_peaks; // output m/z and prob values for the different charge states select_pms_and_charges(&config,bs,pms_with_19,charges); if (pms_with_19.size()<=0) continue; s.set_charge(charges[0]); init_model_for_scoring_spectrum(&s); prm.create_graph_from_spectrum(this,&s,pms_with_19[0]); vector<NodeType> spec_prms; vector<mass_t> exp_masses; const mass_t true_mass_with_19 = s.get_true_mass_with_19(); s.get_peptide().calc_expected_breakage_masses(&config,exp_masses); int i; for (i=1; i<prm.get_num_nodes()-1; i++) { const Node& node = prm.get_node(i); if (node.score == 0) continue; NodeType nt; nt.type = 0; int j; for (j=0; j<exp_masses.size(); j++) if (fabs(exp_masses[j]-node.mass)<config.get_tolerance()) { nt.type=1; break; } if (nt.type<=0) { int j; for (j=0; j<exp_masses.size(); j++) if (fabs(true_mass_with_19 - exp_masses[j] -node.mass-MASS_PROTON)<config.get_tolerance()) { nt.type=2; break; } } nt.org_score = node.score; nt.mod_score = node.score; nt.region = node.breakage.region_idx; spec_prms.push_back(nt); } all_prms.push_back(spec_prms); } vector< vector< double > > per_pre, per_suf, per_covered; vector<float> deltas; per_pre.resize(num_regions); per_suf.resize(num_regions); per_covered.resize(num_regions); float delta; for (delta = min_delta; delta<=max_delta; delta+=step ) { // perform mods int a; for (a=0; a<all_prms.size(); a++) { int b; for (b=0; b<all_prms[a].size(); b++) { NodeType& nt = all_prms[a][b]; if (nt.org_score< -delta) { nt.mod_score = NEG_INF; continue; } /* if (nt.org_score>delta) { nt.mod_score = nt.org_score ; } else nt.mod_score = nt.org_score + (delta-nt.org_score)*0.5;*/ nt.mod_score = nt.org_score + delta; } } // compute stats (if score is negative treat as 0) vector<double> num_pre,num_suf; vector<double> num_pre_wpos, num_suf_wpos; vector<double> score_pre, score_suf, total_score; num_pre.resize(num_regions,0); num_suf.resize(num_regions,0); num_pre_wpos.resize(num_regions,0); num_suf_wpos.resize(num_regions,0); score_pre.resize(num_regions,0); score_suf.resize(num_regions,0); total_score.resize(num_regions,0); for (a=0; a<all_prms.size(); a++) { int b; for (b=0; b<all_prms[a].size(); b++) { const int type = all_prms[a][b].type; const float score = all_prms[a][b].mod_score; const int region = all_prms[a][b].region; if (type == 1) { num_pre[region]++; if (score>0) { num_pre_wpos[region]++; score_pre[region]+= score; } } if (type == 2) { num_suf[region]++; if (score>0) { num_suf_wpos[region]++; score_suf[region]+=score; } } if (score>0) total_score[region]+=score; } } deltas.push_back(delta); int r; for (r=0; r<num_regions; r++) { per_pre[r].push_back(num_pre_wpos[r]/num_pre[r]); per_suf[r].push_back(num_suf_wpos[r]/num_suf[r]); per_covered[r].push_back((score_pre[r]+score_suf[r])/total_score[r]); } } // report int r; for (r=0; r<num_regions; r++) { cout << endl << "Region " << r << endl; int d; for (d=0; d<deltas.size(); d++) cout << "\t" << deltas[d]; cout << endl << "% Pre"; for (d=0; d<per_pre[r].size(); d++) cout << "\t" << per_pre[r][d]; cout << endl << "% Suf"; for (d=0; d<per_suf[r].size(); d++) cout << "\t" << per_suf[r][d]; cout << endl << "% Cov"; for (d=0; d<per_covered[r].size(); d++) cout << "\t" << per_covered[r][d]; cout << endl; // select float target_val = target_mid_ratio; if (r==0 || r == num_regions-1) target_val = target_side_ratio; float best_val=POS_INF; float best_delta=0; for (d=0; d<deltas.size(); d++) if (fabs(per_pre[r][d]-target_val)<best_val) { best_val = fabs(per_pre[r][d]-target_val); best_delta = deltas[d]; } cout << "Chose delta = " << best_delta << endl << endl; regional_prm_normalizers[c][s][r]=best_delta; } } } }
void PrmNodeScoreModel::learnPrmNormalizerValue(void* allScoreModelsVoidPointer, const SpectraAggregator& sa) { AllScoreModels* allScoreModels = static_cast<AllScoreModels*>(allScoreModelsVoidPointer); const float step = 0.5; const float min_delta = -1.0; const float max_delta = 7.0; const float target_mid_ratio = 0.96; const float target_side_ratio = 0.94; config_->set_use_spectrum_charge(1); regional_prm_normalizers.resize(RegionalPrmNodeScoreModels_.size()); int c; for (c=0; c<RegionalPrmNodeScoreModels_.size(); c++) { regional_prm_normalizers[c].resize(RegionalPrmNodeScoreModels_[c].size()); int s; for (s=0; s<RegionalPrmNodeScoreModels_[c].size(); s++) regional_prm_normalizers[c][s].resize(RegionalPrmNodeScoreModels_[c][s].size(),0); } const vector< vector<mass_t> >& mass_threshes = config_->get_size_thresholds(); for (c=1; c<regional_prm_normalizers.size(); c++) { int s; for (s=0; s<regional_prm_normalizers[c].size(); s++) { const mass_t min_mass = (s == 0 ? 0 : mass_threshes[c][s-1]); const mass_t max_mass = mass_threshes[c][s]; const int num_regions = regional_prm_normalizers[c][s].size(); cout << "Finding normalizers for charge " << c << " size " << s << " (masses " << min_mass << " - " << max_mass << ")" << endl; SpectraList sl(sa); sl.selectHeaders(min_mass/c,max_mass/c,c,c); sl.randomlyReduceListToSize(2000); if (sl.getNumHeaders()<50) { cout << "Insufficient number of spectra... skipping" << endl; continue; } vector< vector< NodeType > > all_prms; int sc; for (sc=0; sc<sl.getNumHeaders(); sc++) { const SingleSpectrumHeader* header = sl.getSpectrumHeader(sc); PrmGraph prm; Spectrum s; if (! s.readSpectrum(sa, header)) continue; vector<mass_t> pms_with_19; vector<int> charges; // output m/z and prob values for the different charge states allScoreModels->selectPrecursorMassesAndCharges(config_, s, pms_with_19, charges); if (pms_with_19.size()<=0) continue; s.setCharge(charges[0]); prm.create_graph_from_spectrum(allScoreModels, &s,pms_with_19[0]); vector<NodeType> spec_prms; vector<mass_t> exp_masses; const mass_t true_mass_with_19 = s.get_true_mass_with_19(); s.getPeptide().calc_expected_breakage_masses(config_,exp_masses); int i; for (i=1; i<prm.get_num_nodes()-1; i++) { const Node& node = prm.get_node(i); if (node.score == 0) continue; NodeType nt; nt.type = 0; int j; for (j=0; j<exp_masses.size(); j++) if (fabs(exp_masses[j]-node.mass)<config_->getTolerance()) { nt.type=1; break; } if (nt.type<=0) { int j; for (j=0; j<exp_masses.size(); j++) if (fabs(true_mass_with_19 - exp_masses[j] -node.mass-MASS_PROTON)<config_->getTolerance()) { nt.type=2; break; } } nt.org_score = node.score; nt.mod_score = node.score; nt.region = node.breakage.region_idx; spec_prms.push_back(nt); } all_prms.push_back(spec_prms); } vector< vector< double > > per_pre, per_suf, per_covered; vector<float> deltas; per_pre.resize(num_regions); per_suf.resize(num_regions); per_covered.resize(num_regions); float delta; for (delta = min_delta; delta<=max_delta; delta+=step ) { // perform mods int a; for (a=0; a<all_prms.size(); a++) { int b; for (b=0; b<all_prms[a].size(); b++) { NodeType& nt = all_prms[a][b]; if (nt.org_score< -delta) { nt.mod_score = NEG_INF; continue; } nt.mod_score = nt.org_score + delta; } } // compute stats (if score is negative treat as 0) vector<double> num_pre,num_suf; vector<double> num_pre_wpos, num_suf_wpos; vector<double> score_pre, score_suf, total_score; num_pre.resize(num_regions,0); num_suf.resize(num_regions,0); num_pre_wpos.resize(num_regions,0); num_suf_wpos.resize(num_regions,0); score_pre.resize(num_regions,0); score_suf.resize(num_regions,0); total_score.resize(num_regions,0); for (a=0; a<all_prms.size(); a++) { int b; for (b=0; b<all_prms[a].size(); b++) { const int type = all_prms[a][b].type; const float score = all_prms[a][b].mod_score; const int region = all_prms[a][b].region; if (type == 1) { num_pre[region]++; if (score>0) { num_pre_wpos[region]++; score_pre[region]+= score; } } if (type == 2) { num_suf[region]++; if (score>0) { num_suf_wpos[region]++; score_suf[region]+=score; } } if (score>0) total_score[region]+=score; } } deltas.push_back(delta); int r; for (r=0; r<num_regions; r++) { per_pre[r].push_back(num_pre_wpos[r]/num_pre[r]); per_suf[r].push_back(num_suf_wpos[r]/num_suf[r]); per_covered[r].push_back((score_pre[r]+score_suf[r])/total_score[r]); } } // report int r; for (r=0; r<num_regions; r++) { cout << endl << "Region " << r << endl; int d; for (d=0; d<deltas.size(); d++) cout << "\t" << deltas[d]; cout << endl << "% Pre"; for (d=0; d<per_pre[r].size(); d++) cout << "\t" << per_pre[r][d]; cout << endl << "% Suf"; for (d=0; d<per_suf[r].size(); d++) cout << "\t" << per_suf[r][d]; cout << endl << "% Cov"; for (d=0; d<per_covered[r].size(); d++) cout << "\t" << per_covered[r][d]; cout << endl; // select float target_val = target_mid_ratio; if (r==0 || r == num_regions-1) target_val = target_side_ratio; float best_val=POS_INF; float best_delta=0; for (d=0; d<deltas.size(); d++) if (fabs(per_pre[r][d]-target_val)<best_val) { best_val = fabs(per_pre[r][d]-target_val); best_delta = deltas[d]; } cout << "Chose delta = " << best_delta << endl << endl; regional_prm_normalizers[c][s][r]=best_delta; } } } indNormzlizersInitialized_ = true; write_prm_normalizer_values(); }