Exemplo n.º 1
0
void AdvancedScoreModel::learn_prm_normalizer_values(const FileManager& fm)
{
	const float step = 0.5;
	const float min_delta = -1.0;
	const float max_delta = 7.0;
	const float target_mid_ratio = 0.96;
	const float target_side_ratio = 0.94;


	config.set_use_spectrum_charge(1);

	regional_prm_normalizers.resize(regional_breakage_score_models.size());
	int c;
	for (c=0; c<regional_breakage_score_models.size(); c++)
	{
		regional_prm_normalizers[c].resize(regional_breakage_score_models[c].size());
		int s;
		for (s=0; s<regional_breakage_score_models[c].size(); s++)
			regional_prm_normalizers[c][s].resize(regional_breakage_score_models[c][s].size(),0);
	}
	

	const vector< vector<mass_t> >& mass_threshes = config.get_size_thresholds();
	for (c=1; c<regional_prm_normalizers.size(); c++)
	{
		int s;
		for (s=0; s<regional_prm_normalizers[c].size(); s++)
		{
			const mass_t min_mass = (s == 0 ? 0 : mass_threshes[c][s-1]);
			const mass_t max_mass =  mass_threshes[c][s];
			const int num_regions = regional_prm_normalizers[c][s].size();
			
			cout << "Finding normalizers for charge " << c << " size " << s << "  (masses " << min_mass << " - " <<
				max_mass << ")" << endl;

			FileSet fs;
			BasicSpecReader bsr;

			fs.select_files_in_mz_range(fm,min_mass/c,max_mass/c,c);
			fs.randomly_reduce_ssfs(2000);

			vector< vector< NodeType > > all_prms;
			const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();

			if (fs.get_total_spectra()<50)
			{
				cout << "Insufficient number of spectra... skipping" << endl;
				continue;
			}

			int sc;
			for (sc=0; sc<all_ssf.size(); sc++)
			{
				PrmGraph prm;
				static vector<QCPeak> peaks;
				SingleSpectrumFile *ssf = all_ssf[sc];
				if (peaks.size()<ssf->num_peaks)
				{
					int new_size = ssf->num_peaks*2;
					if (new_size<2500)
						new_size=2500;
					peaks.resize(new_size); 
				}

				const int num_peaks = bsr.read_basic_spec(&config,fm,ssf,&peaks[0]);	
				if (num_peaks<5)
					continue;

				// convert peak list ot a spectrum with charge (if original charge ==0)
				// the spectrum gets charge 2, but the true charge is computed from the data
			
				Spectrum s;
				s.init_from_QCPeaks(&config,&peaks[0],num_peaks,ssf);

				vector<mass_t> pms_with_19;
				vector<int>    charges;

				pms_with_19.clear();
				charges.clear();		
				
				BasicSpectrum bs;
				bs.ssf = ssf;
				bs.peaks = &peaks[0];
				bs.num_peaks = num_peaks;

				// output m/z and prob values for the different charge states
				
				select_pms_and_charges(&config,bs,pms_with_19,charges);
				if (pms_with_19.size()<=0)
					continue;
			
				s.set_charge(charges[0]);
				init_model_for_scoring_spectrum(&s);
				prm.create_graph_from_spectrum(this,&s,pms_with_19[0]);

				vector<NodeType> spec_prms;
				vector<mass_t>   exp_masses;
				const mass_t true_mass_with_19 = s.get_true_mass_with_19();
				s.get_peptide().calc_expected_breakage_masses(&config,exp_masses);

				int i;
				for (i=1; i<prm.get_num_nodes()-1; i++)
				{
					const Node& node = prm.get_node(i);
					if (node.score == 0)
						continue;
					
					NodeType nt;

					nt.type = 0;
					int j;
					for (j=0; j<exp_masses.size(); j++)
						if (fabs(exp_masses[j]-node.mass)<config.get_tolerance())
						{
							nt.type=1;
							break;
						}
					
					if (nt.type<=0)
					{
						int j;
						for (j=0; j<exp_masses.size(); j++)
							if (fabs(true_mass_with_19 - exp_masses[j] -node.mass-MASS_PROTON)<config.get_tolerance())
							{
								nt.type=2;
								break;
							}
					}

					nt.org_score = node.score;
					nt.mod_score = node.score;
					nt.region = node.breakage.region_idx;
					spec_prms.push_back(nt);
				}
				all_prms.push_back(spec_prms);
			}
		
	
			vector< vector< double > > per_pre, per_suf, per_covered;
			vector<float> deltas;

			per_pre.resize(num_regions);
			per_suf.resize(num_regions);
			per_covered.resize(num_regions);

			float delta;
			for (delta = min_delta; delta<=max_delta; delta+=step )
			{
				// perform mods
				int a;
				for (a=0; a<all_prms.size(); a++)
				{
					int b;
					for (b=0; b<all_prms[a].size(); b++)
					{
						NodeType& nt = all_prms[a][b];
						if (nt.org_score< -delta)
						{
							nt.mod_score = NEG_INF;
							continue;
						}
						
					/*	if (nt.org_score>delta)
						{
							nt.mod_score = nt.org_score ;
						}
						else
							nt.mod_score = nt.org_score + (delta-nt.org_score)*0.5;*/
						nt.mod_score = nt.org_score + delta;
					}
				}

				// compute stats (if score is negative treat as 0)
				vector<double> num_pre,num_suf;
				vector<double> num_pre_wpos, num_suf_wpos;
				vector<double> score_pre, score_suf, total_score;
			

				num_pre.resize(num_regions,0);
				num_suf.resize(num_regions,0);
				num_pre_wpos.resize(num_regions,0);
				num_suf_wpos.resize(num_regions,0);
				score_pre.resize(num_regions,0);
				score_suf.resize(num_regions,0);
				total_score.resize(num_regions,0);
				
				for (a=0; a<all_prms.size(); a++)
				{
					int b;
					for (b=0; b<all_prms[a].size(); b++)
					{
						const int   type =    all_prms[a][b].type;
						const float score =   all_prms[a][b].mod_score;
						const int   region =  all_prms[a][b].region;

						if (type == 1)
						{
							num_pre[region]++;
							if (score>0)
							{
								num_pre_wpos[region]++;
								score_pre[region]+= score;
							}
						}

						if (type == 2)
						{
							num_suf[region]++;
							if (score>0)
							{
								num_suf_wpos[region]++;
								score_suf[region]+=score;
							}
						}

						if (score>0)
							total_score[region]+=score;
					}
				}

				
				deltas.push_back(delta);
				int r;
				for (r=0; r<num_regions; r++)
				{
					per_pre[r].push_back(num_pre_wpos[r]/num_pre[r]);
					per_suf[r].push_back(num_suf_wpos[r]/num_suf[r]);
					per_covered[r].push_back((score_pre[r]+score_suf[r])/total_score[r]);
				}
			}

			// report
			int r;
			for (r=0; r<num_regions; r++)
			{
				cout << endl << "Region " << r << endl;
				int d;
				for (d=0; d<deltas.size(); d++)
					cout << "\t" << deltas[d];
				cout << endl << "% Pre";
				for (d=0; d<per_pre[r].size(); d++)
					cout << "\t" << per_pre[r][d];
				cout << endl << "% Suf";
				for (d=0; d<per_suf[r].size(); d++)
					cout << "\t" << per_suf[r][d];
				cout << endl << "% Cov";
				for (d=0; d<per_covered[r].size(); d++)
					cout << "\t" << per_covered[r][d];
				cout << endl;

				// select
				float target_val = target_mid_ratio;
				if (r==0 || r == num_regions-1)
					target_val = target_side_ratio;

				float best_val=POS_INF;
				float best_delta=0;

				for (d=0; d<deltas.size(); d++)
					if (fabs(per_pre[r][d]-target_val)<best_val)
					{
						best_val = fabs(per_pre[r][d]-target_val);
						best_delta = deltas[d];
					}
				
				cout << "Chose delta = " << best_delta << endl << endl;
				regional_prm_normalizers[c][s][r]=best_delta;
			}	
		}
	}
}
Exemplo n.º 2
0
void PrmNodeScoreModel::learnPrmNormalizerValue(void* allScoreModelsVoidPointer, const SpectraAggregator& sa)
{
	AllScoreModels* allScoreModels = static_cast<AllScoreModels*>(allScoreModelsVoidPointer);
	const float step = 0.5;
	const float min_delta = -1.0;
	const float max_delta = 7.0;
	const float target_mid_ratio = 0.96;
	const float target_side_ratio = 0.94;


	config_->set_use_spectrum_charge(1);

	regional_prm_normalizers.resize(RegionalPrmNodeScoreModels_.size());
	int c;
	for (c=0; c<RegionalPrmNodeScoreModels_.size(); c++)
	{
		regional_prm_normalizers[c].resize(RegionalPrmNodeScoreModels_[c].size());
		int s;
		for (s=0; s<RegionalPrmNodeScoreModels_[c].size(); s++)
			regional_prm_normalizers[c][s].resize(RegionalPrmNodeScoreModels_[c][s].size(),0);
	}
	

	const vector< vector<mass_t> >& mass_threshes = config_->get_size_thresholds();
	for (c=1; c<regional_prm_normalizers.size(); c++)
	{
		int s;
		for (s=0; s<regional_prm_normalizers[c].size(); s++)
		{
			const mass_t min_mass = (s == 0 ? 0 : mass_threshes[c][s-1]);
			const mass_t max_mass =  mass_threshes[c][s];
			const int num_regions = regional_prm_normalizers[c][s].size();
			
			cout << "Finding normalizers for charge " << c << " size " << s << "  (masses " << min_mass << " - " <<
				max_mass << ")" << endl;

			SpectraList sl(sa);
			sl.selectHeaders(min_mass/c,max_mass/c,c,c);
			sl.randomlyReduceListToSize(2000);

			if (sl.getNumHeaders()<50)
			{
				cout << "Insufficient number of spectra... skipping" << endl;
				continue;
			}

			vector< vector< NodeType > > all_prms;
			int sc;
			for (sc=0; sc<sl.getNumHeaders(); sc++)
			{
				const SingleSpectrumHeader* header = sl.getSpectrumHeader(sc);
				PrmGraph prm;

				Spectrum s;
				if (! s.readSpectrum(sa, header))
					continue;
				
				vector<mass_t> pms_with_19;
				vector<int>    charges;

				// output m/z and prob values for the different charge states
				allScoreModels->selectPrecursorMassesAndCharges(config_, s, pms_with_19, charges);
				if (pms_with_19.size()<=0)
					continue;
			
				s.setCharge(charges[0]);
				prm.create_graph_from_spectrum(allScoreModels, &s,pms_with_19[0]);

				vector<NodeType> spec_prms;
				vector<mass_t>   exp_masses;
				const mass_t true_mass_with_19 = s.get_true_mass_with_19();
				s.getPeptide().calc_expected_breakage_masses(config_,exp_masses);

				int i;
				for (i=1; i<prm.get_num_nodes()-1; i++)
				{
					const Node& node = prm.get_node(i);
					if (node.score == 0)
						continue;
					
					NodeType nt;

					nt.type = 0;
					int j;
					for (j=0; j<exp_masses.size(); j++)
						if (fabs(exp_masses[j]-node.mass)<config_->getTolerance())
						{
							nt.type=1;
							break;
						}
					
					if (nt.type<=0)
					{
						int j;
						for (j=0; j<exp_masses.size(); j++)
							if (fabs(true_mass_with_19 - exp_masses[j] -node.mass-MASS_PROTON)<config_->getTolerance())
							{
								nt.type=2;
								break;
							}
					}

					nt.org_score = node.score;
					nt.mod_score = node.score;
					nt.region = node.breakage.region_idx;
					spec_prms.push_back(nt);
				}
				all_prms.push_back(spec_prms);
			}
		
	
			vector< vector< double > > per_pre, per_suf, per_covered;
			vector<float> deltas;

			per_pre.resize(num_regions);
			per_suf.resize(num_regions);
			per_covered.resize(num_regions);

			float delta;
			for (delta = min_delta; delta<=max_delta; delta+=step )
			{
				// perform mods
				int a;
				for (a=0; a<all_prms.size(); a++)
				{
					int b;
					for (b=0; b<all_prms[a].size(); b++)
					{
						NodeType& nt = all_prms[a][b];
						if (nt.org_score< -delta)
						{
							nt.mod_score = NEG_INF;
							continue;
						}
						nt.mod_score = nt.org_score + delta;
					}
				}

				// compute stats (if score is negative treat as 0)
				vector<double> num_pre,num_suf;
				vector<double> num_pre_wpos, num_suf_wpos;
				vector<double> score_pre, score_suf, total_score;
			

				num_pre.resize(num_regions,0);
				num_suf.resize(num_regions,0);
				num_pre_wpos.resize(num_regions,0);
				num_suf_wpos.resize(num_regions,0);
				score_pre.resize(num_regions,0);
				score_suf.resize(num_regions,0);
				total_score.resize(num_regions,0);
				
				for (a=0; a<all_prms.size(); a++)
				{
					int b;
					for (b=0; b<all_prms[a].size(); b++)
					{
						const int   type =    all_prms[a][b].type;
						const float score =   all_prms[a][b].mod_score;
						const int   region =  all_prms[a][b].region;

						if (type == 1)
						{
							num_pre[region]++;
							if (score>0)
							{
								num_pre_wpos[region]++;
								score_pre[region]+= score;
							}
						}

						if (type == 2)
						{
							num_suf[region]++;
							if (score>0)
							{
								num_suf_wpos[region]++;
								score_suf[region]+=score;
							}
						}

						if (score>0)
							total_score[region]+=score;
					}
				}

				
				deltas.push_back(delta);
				int r;
				for (r=0; r<num_regions; r++)
				{
					per_pre[r].push_back(num_pre_wpos[r]/num_pre[r]);
					per_suf[r].push_back(num_suf_wpos[r]/num_suf[r]);
					per_covered[r].push_back((score_pre[r]+score_suf[r])/total_score[r]);
				}
			}

			// report
			int r;
			for (r=0; r<num_regions; r++)
			{
				cout << endl << "Region " << r << endl;
				int d;
				for (d=0; d<deltas.size(); d++)
					cout << "\t" << deltas[d];
				cout << endl << "% Pre";
				for (d=0; d<per_pre[r].size(); d++)
					cout << "\t" << per_pre[r][d];
				cout << endl << "% Suf";
				for (d=0; d<per_suf[r].size(); d++)
					cout << "\t" << per_suf[r][d];
				cout << endl << "% Cov";
				for (d=0; d<per_covered[r].size(); d++)
					cout << "\t" << per_covered[r][d];
				cout << endl;

				// select
				float target_val = target_mid_ratio;
				if (r==0 || r == num_regions-1)
					target_val = target_side_ratio;

				float best_val=POS_INF;
				float best_delta=0;

				for (d=0; d<deltas.size(); d++)
					if (fabs(per_pre[r][d]-target_val)<best_val)
					{
						best_val = fabs(per_pre[r][d]-target_val);
						best_delta = deltas[d];
					}
				
				cout << "Chose delta = " << best_delta << endl << endl;
				regional_prm_normalizers[c][s][r]=best_delta;
			}	
		}
	}

	indNormzlizersInitialized_ = true;
	write_prm_normalizer_values();
}