// Example no. 1
// 0
/******************************************************************************
Train PMC models from positive example files
*******************************************************************************/
void PMCSQS_Scorer::train_pmc_rank_models(Config *config, const FileManager& fm, 
										  int sel_charge, bool overwrite)
{	
	const bool sample_diagnostic = false;
	const vector<int>& spectra_counts = fm.get_spectra_counts();
	
	max_model_charge=0;

	int charge;
	for (charge=1; charge<spectra_counts.size(); charge++)
	{
		if (spectra_counts[charge]>=MIN_SPECTRA_FOR_PMCSQS_MODEL)
			max_model_charge=charge;
	}

	const int max_to_read_per_file = 40000;
	
	vector<string> real_names;
	init_PMC_feature_names(real_names);


	// try and read existing pmc model, otherwise init a new one
	string pmc_path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt";
	ifstream model_stream(pmc_path.c_str());
	if (model_stream.is_open() && model_stream.good())
	{
		model_stream.close();
		string pmcr_name = config->get_model_name() + "_PMCR.txt";
		const char *path = pmc_path.c_str();
		this->read_pmc_rank_models(config,(char *)pmcr_name.c_str());
	}
	else
	{
		set_pmc_mass_thresholds();
	
		this->set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19
		this->set_bin_increment(0.1);
		pmc_rank_models.resize(pmc_rank_mass_thresholds.size());
		pmc_charge_mz_biases.resize(pmc_rank_mass_thresholds.size());
	}
	
	const double prop_train = 0.5;


	// It is assumed that the mass thresholds were set according to the training data
	// (this is done manually with values encoded in the set_mass_threhsolds function)
	for (charge=1; charge<=max_model_charge; charge++)
	{
		if (sel_charge>0 && charge != sel_charge)
			continue;

		const int num_sizes = pmc_rank_mass_thresholds[charge].size();
		pmc_rank_models[charge].resize(num_sizes+1,NULL);
		pmc_charge_mz_biases[charge].resize(num_sizes+1,0);

		
		int size_idx;
		for (size_idx=0; size_idx<=num_sizes; size_idx++)
		{
			if (pmc_rank_models[charge][size_idx] && ! overwrite)
				continue;

			vector<SingleSpectrumFile *> test_ssfs;
			BasicSpecReader bsr;
			static QCPeak peaks[5000];
			RankBoostDataset train_ds, test_ds, pos_ds, neg_ds;

			mass_t min_mass =0;
			mass_t max_mass = POS_INF;

			if (size_idx>0)
				min_mass = pmc_rank_mass_thresholds[charge][size_idx-1];

			if (size_idx<num_sizes)
				max_mass = pmc_rank_mass_thresholds[charge][size_idx];

			// these ranges are given according to pm_with_19
			// so files should be selected through select_files and not
			// select_file_in_mz_range
			FileSet fs;		
			fs.select_files(fm,min_mass,max_mass,-1,-1,charge);

			if (fs.get_total_spectra()<500)
				continue;

			
			int num_groups_in_train=0;
			int num_groups_in_test=0;

			cout << "TRAINING charge " << charge << " size " << size_idx << "  (" <<
				min_mass << "-" << max_mass << ")" << endl;

			fs.randomly_reduce_ssfs(max_to_read_per_file);
			const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
			const int num_samples = all_ssf.size();
			
			// first find the bias in number of bins between the true m/z bin and
			// the optimal m/z bin
			vector<bool> skipped_idxs;
			skipped_idxs.resize(num_samples,false);
			int skipped_bad_mz=0;
			mass_t total_bias=0;
			int i;
			for (i=0; i<num_samples; i++)
			{
				SingleSpectrumFile* ssf = all_ssf[i];
				BasicSpectrum bs;
			
				bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
				bs.peaks = peaks;
				bs.ssf = ssf;

				ssf->peptide.calc_mass(config);
				
				const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge;

				if (fabs(true_mz - bs.ssf->m_over_z)>2.5)
				{
					//cout << setprecision(2) << true_mz << " <---> " << bs.ssf->m_over_z << " skipping" << endl;
					skipped_bad_mz++;
					skipped_idxs[i]=true;
					continue;
				} 

				init_for_current_spec(config,bs);
				calculate_curr_spec_pmc_values(bs, bin_increment);

				// find the true_mz_bin_idx
				
				const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge];
				int true_mz_bin_idx=0;
				while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz)
					true_mz_bin_idx++;

				if (true_mz_bin_idx == pmc_stats.size())
					true_mz_bin_idx--;

				if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z)
					true_mz_bin_idx--;

				int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge);

				if (opt_bin_idx <=0 || opt_bin_idx == pmc_stats.size()-1)
				{
					skipped_bad_mz++;
					skipped_idxs[i]=true;
					continue;
				}

				total_bias += (pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z);

				if (fabs(pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z)>4.0)
				{
					cout << "opt bin: " << opt_bin_idx << " (" << pmc_stats[opt_bin_idx].m_over_z << ")  ";
					cout << "tru bin: " << true_mz_bin_idx << " ("<< pmc_stats[true_mz_bin_idx].m_over_z << ")" << endl;
				}
			} 

			mass_t mz_bias = total_bias / (mass_t)(num_samples-skipped_bad_mz);
			pmc_charge_mz_biases[charge][size_idx]=mz_bias;

			cout << "m/z bias: " << setprecision(4) << mz_bias << endl;
			cout << "skipped " << skipped_bad_mz << "/" << num_samples <<
				"  because of m/z more than 2.5 away from observed..." << endl; 

		//	pmc_charge_mz_biases[charge][size_idx] = 0;

			for (i=0; i<num_samples; i++)
			{
				if (skipped_idxs[i])
					continue;

				SingleSpectrumFile* ssf = all_ssf[i];
				BasicSpectrum bs;
			
				bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
				bs.peaks = peaks;
				bs.ssf = ssf;
				const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge;

				init_for_current_spec(config,bs);
				calculate_curr_spec_pmc_values(bs, bin_increment);

				// find the true_mz_bin_idx
				
				const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge];
				int true_mz_bin_idx=0;
				while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz)
					true_mz_bin_idx++;

				if (true_mz_bin_idx == pmc_stats.size())
					true_mz_bin_idx--;

				if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z)
					true_mz_bin_idx--;

				int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge);

				
				static vector<RankBoostSample> spec_samples;
				fill_RankBoost_smaples_with_PMC(bs, charge, spec_samples);

				// select samples and add them to pmc_ds
				int good_idx;
				vector<int> bad_idxs;
				select_training_sample_idxs(charge,spec_samples,bs,good_idx,bad_idxs);

				const bool ind_add_to_train = (my_random()<prop_train);
				int group_idx;
				
				if (ind_add_to_train)
				{
					group_idx= num_groups_in_train++;	
				}
				else
				{
					group_idx= num_groups_in_test++;
					test_ssfs.push_back(ssf);
				}
				
				
				RankBoostDataset& ds = (ind_add_to_train ? train_ds : test_ds);

				const int pos_index  = ds.get_num_samples();
				spec_samples[good_idx].group_idx = group_idx;
				spec_samples[good_idx].rank_in_group=0;

				ds.add_sample(spec_samples[good_idx]);
				if (sample_diagnostic)
					pos_ds.add_sample(spec_samples[good_idx]);

				int j;
				for (j=0; j<bad_idxs.size(); j++)
				{
					const int bad_idx = bad_idxs[j];
					if (bad_idx < 0 || bad_idx>= spec_samples.size())
						continue;
		
					spec_samples[bad_idx].group_idx=group_idx;
					spec_samples[bad_idx].rank_in_group=1;

					ds.add_to_phi_vector(ds.get_num_samples(),pos_index);
					ds.add_sample(spec_samples[bad_idx]);

					if (sample_diagnostic)
						neg_ds.add_sample(spec_samples[bad_idx]);
				}						   
			}

			train_ds.set_num_groups(num_groups_in_train);
			test_ds.set_num_groups(num_groups_in_test);
			
			train_ds.compute_total_phi_weight();
			train_ds.initialize_potenital_lists();
			train_ds.initialzie_real_feature_table(real_names.size());

			test_ds.compute_total_phi_weight();

			if (pmc_rank_models[charge][size_idx])
				delete pmc_rank_models[charge][size_idx];
			
			pmc_rank_models[charge][size_idx] = new RankBoostModel;
		

			RankBoostModel* boost = pmc_rank_models[charge][size_idx];

			vector<string> empty;
			empty.clear();
			boost->init_rankboost_model_feature_names(empty,real_names);
			boost->init_rankboost_model_for_training(train_ds,100,25);

			train_ds.initialize_real_vote_lists(*boost);

			if (sample_diagnostic)
			{
				boost->summarize_features_pos_neg(pos_ds.get_samples(),neg_ds.get_samples());
			}
			else
				boost->summarize_features(train_ds.get_samples());

			boost->train_rankboost_model(train_ds,4000,NULL,&test_ds);
			
			boost->ouput_ranked_feature_list();

		//	output_pmc_rank_results(fm,charge,test_ssfs);

		//	exit(0);

			ind_initialized_pmcr = true;
		//	string path;
		//	path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCRtt.txt";
		//	this->write_pmc_rank_models(path.c_str());
			
		}
	}

	string path;
	path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt";
	this->write_pmc_rank_models(path.c_str());
	ind_initialized_pmcr = true;
}
// Example no. 2
// 0
/******************************************************************************
Train SQS models from positive spectra (fm_pos) and negative spectra (neg_list)

config         - passed to the spectrum reader and scoring helpers; also
                 supplies the size thresholds (when pmcMassThresholds_ is
                 empty), the resource directory and the model name.
fm_pos         - file manager with the spectra used for charges 1 and up.
neg_list       - list file from which the file manager of the charge 0
                 samples is initialized.
specificCharge - if > 0, spectra are loaded only for this charge (the charge 0
                 spectra are always loaded).
inputWeights   - optional weights indexed [charge][sizeIndex]; the number of
                 rows sets the maximal charge. When NULL the maximal charge is
                 3 and all weights are 1.0.

Samples of charge 0 get label 1 and all other samples label 0. For every size a
model of each charge against the charge 0 samples is stored in
sqs_models[charge][0][size], and a model of each pair charge1>charge2 in
sqs_models[charge1][charge2][size]. A classification report is printed and the
models are written to <resource_dir>/<model_name>_SQS.txt.
*******************************************************************************/
void PMCSQS_Scorer::train_sqs_models(Config *config, 
									 const FileManager& fm_pos, 
									 const char *neg_list,
									 int specificCharge, 
									 vector<vector<float> > *inputWeights)
{
	vector< vector< vector<ME_Regression_Sample> > > samples; //  neg, p1, p2, p3 / sizeIndex
	FileManager fm_neg;

	maximalChargeWithModels_ = (inputWeights ? (int)inputWeights->size()-1 : 3);
	int charge;

	set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19
	set_bin_increment(0.1);
	this->set_sqs_mass_thresholds();

	if (this->pmcMassThresholds_.size() == 0)
	{
		pmcMassThresholds_=config->get_size_thresholds();
	}

	const int numSizes = (int)this->sqsMassThresholds_.size();
	cout << "NUM SIZE MODELS: " << numSizes+1 << endl;

	// The weights are read below as classWeights[charge][sizeIndex], so every
	// row needs numSizes+1 entries; missing entries get the neutral weight 1.0
	vector<vector<float> > classWeights;
	if (inputWeights)
		classWeights = *inputWeights;

	classWeights.resize(maximalChargeWithModels_+1);
	int w;
	for (w=0; w<(int)classWeights.size(); w++)
	{
		if ((int)classWeights[w].size() < numSizes+1)
			classWeights[w].resize(numSizes+1,1.0);
	}

	// Size the sample tables of all charges up front: charges that are skipped
	// because of specificCharge must still have (empty) per-size sample sets,
	// since the training and report loops below index every charge
	samples.resize(maximalChargeWithModels_+1);
	for (charge=0; charge<=maximalChargeWithModels_; charge++)
		samples[charge].resize(numSizes+1);

	fm_neg.init_from_list_file(config, neg_list);
	const int max_to_read_per_file = 8000;

	// read the spectra and compute the feature vector of each one
	for (charge=0; charge<=maximalChargeWithModels_; charge++)
	{
		if (charge>0 && specificCharge>0 && charge != specificCharge)
			continue; 

		int sizeIndex;
		for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
		{	
			const mass_t minMass = (sizeIndex == 0 ? 0 : sqsMassThresholds_[sizeIndex-1]);
			const mass_t maxMass = (sizeIndex == numSizes ? POS_INF : sqsMassThresholds_[sizeIndex]);

			BasicSpecReader bsr;
			// NOTE(review): read_basic_spec is given no buffer size here, so it is
			// assumed that no spectrum has more than 5000 peaks - confirm
			QCPeak peaks[5000]; 

			FileSet fs;
			if (charge == 0)
			{
				fs.select_files_in_mz_range(fm_neg,minMass, maxMass,0);	
			}
			else
			{
				fs.select_files_in_mz_range(fm_pos, minMass, maxMass, charge);
			}

			cout << "Found " << fs.get_total_spectra() << " for charge " << charge << " ranges:" <<
				minMass << " - " << maxMass << endl;

			fs.randomly_reduce_ssfs(max_to_read_per_file);
			const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
			const int label = (charge == 0 ? 1 : 0);
			const int num_samples =  (int)all_ssf.size();
						
			samples[charge][sizeIndex].resize(num_samples);

			
			int i;
			for (i=0; i<num_samples; i++)
			{
				SingleSpectrumFile* ssf = all_ssf[i];
				BasicSpectrum bs;

				bs.peaks = peaks;
				bs.ssf = ssf;
			
				if (charge==0)
				{
					bs.num_peaks = bsr.read_basic_spec(config,fm_neg,ssf,peaks);
					bs.ssf->charge=0;
				}
				else
					bs.num_peaks = bsr.read_basic_spec(config,fm_pos,ssf,peaks);

				init_for_current_spec(config,bs);
				calculate_curr_spec_pmc_values(bs, bin_increment);
			
				fill_fval_vector_with_SQS(bs, samples[charge][sizeIndex][i]);
				
				samples[charge][sizeIndex][i].label = label;
			}
		}
	}

	// cout sample composition
	cout << "Sample composition:" << endl;
	for (charge=0; charge<=maximalChargeWithModels_; charge++)
	{
		cout << charge;
		int i;
		for (i=0; i<(int)samples[charge].size(); i++)
			cout << "\t" << samples[charge][i].size();
		cout << endl;
	}

	// create SQS models
	this->sqs_models.resize(maximalChargeWithModels_+1);
	for (charge =0; charge<=maximalChargeWithModels_; charge++)
	{
		sqs_models[charge].resize(maximalChargeWithModels_+1);
		int j;
		for (j=0; j<(int)sqs_models[charge].size(); j++)
			sqs_models[charge][j].resize(numSizes+1,NULL);
	}


	////////////////////////////////////////////
	// train model of each charge vs. the charge 0 samples
	for (charge=1; charge<=maximalChargeWithModels_; charge++)
	{
		int sizeIndex;
		for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
		{
			ME_Regression_DataSet ds;

			cout << endl << "CHARGE " << charge << " SIZE " << sizeIndex << endl;
			ds.num_classes=2;
			ds.num_features=SQS_NUM_FIELDS;
			ds.add_samples(samples[0][sizeIndex]);
			ds.add_samples(samples[charge][sizeIndex]);
			ds.tally_samples();

			if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001)
			{
				cout << "Warning: insufficient number of samples, not trianing model for this charge " << charge <<
					" size " << sizeIndex << endl;
				continue;
			}

			const double pos_weight = 0.2 + classWeights[charge][sizeIndex]*0.3;

			ds.randomly_remove_samples_with_activated_feature(1,SQS_IND_MAX_TAG_LENGTH_ABOVE_4,0.5);

			ds.calibrate_class_weights(pos_weight); // charge vs bad spectra
			ds.print_feature_summary(cout,SQS_var_names);

			sqs_models[charge][0][sizeIndex]=new ME_Regression_Model;

			sqs_models[charge][0][sizeIndex]->train_cg(ds,250);

			sqs_models[charge][0][sizeIndex]->print_ds_probs(ds);

		}
	}

		
	////////////////////////////////////////////
	// train model vs. model if charge1>charge2
	if (1)
	{
		int charge1,charge2;
		for (charge1=2; charge1<=maximalChargeWithModels_; charge1++)
		{
			for (charge2=1; charge2<charge1; charge2++)
			{
				int sizeIndex;
				for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
				{
					ME_Regression_DataSet ds;

					ds.num_classes=2;
					ds.num_features=SQS_NUM_FIELDS;

					ds.add_samples(samples[charge1][sizeIndex]);

					// the samples of the lower charge play the role of class 1 in
					// this dataset only, so their label is restored after the copy
					int i;
					for (i=0; i<(int)samples[charge2][sizeIndex].size(); i++)
					{
						samples[charge2][sizeIndex][i].label=1;
						ds.add_sample(samples[charge2][sizeIndex][i]);
						samples[charge2][sizeIndex][i].label=0;
					}

					float relative_weight = classWeights[charge1][sizeIndex]/
						(classWeights[charge1][sizeIndex]+classWeights[charge2][sizeIndex]);

					ds.tally_samples();

					if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001)
					{
						cout << "Warning: insufficient number of samples, not trianing model for charge " << charge1 <<
							" vs charge " << charge2<< " (size " << sizeIndex << ")" << endl;
						continue;
					}

					ds.calibrate_class_weights(relative_weight);

					sqs_models[charge1][charge2][sizeIndex] = new ME_Regression_Model;

					cout << endl << "CHARGE " << charge1 << " vs " << charge2 << "  size " << sizeIndex << endl;
					cout << "Relative weights: " << charge1 << "/(" << charge1 << "+" <<
						charge2 << "): " << relative_weight << endl;
				
					ds.print_feature_summary(cout,SQS_var_names);

					sqs_models[charge1][charge2][sizeIndex]->train_cg(ds,300);
					sqs_models[charge1][charge2][sizeIndex]->print_ds_probs(ds);
				}
			}
		}
	}

	init_sqs_correct_factors(maximalChargeWithModels_, sqsMassThresholds_.size());

	////////////////////////////////////////////
	// final report on datasets
	cout << endl;

	int sizeIndex;
	for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
	{
		cout << endl << "SIZE: " << sizeIndex << endl;
		cout << "--------" << endl;
		float p_thresh = 0.05;
		int d;
		for (d=0; d<=maximalChargeWithModels_; d++)
		{
			// counts[c]     - samples that model c scores above p_thresh (counts[0]: none did)
			// max_counts[c] - samples for which model c gave the highest such score
			vector<int> counts;
			vector<int> max_counts;
			counts.resize(maximalChargeWithModels_+1,0);
			max_counts.resize(maximalChargeWithModels_+1,0);

			const int numSetSamples = (int)samples[d][sizeIndex].size();

			int i;
			for (i=0; i<numSetSamples; i++)
			{
				bool above_thresh=false;
				float max_prob=0;
				int   max_class=0;
				int c;
				for (c=1; c<=maximalChargeWithModels_; c++)
				{
					if (! sqs_models[c][0][sizeIndex])
						continue;

					float prob = sqs_models[c][0][sizeIndex]->p_y_given_x(0,samples[d][sizeIndex][i]);
					if (prob>p_thresh)
					{
						counts[c]++;
						above_thresh=true;
						if (prob>max_prob)
						{
							max_prob=prob;
							max_class=c;
						}
					}
				}
				max_counts[max_class]++;

				if (! above_thresh)
					counts[0]++;
			}

			// report fractions; an empty sample set is reported as zeros rather
			// than dividing by zero
			cout << d << "\t";
			for (i=0; i<=maximalChargeWithModels_; i++)
			{
				const float fraction = (numSetSamples>0 ? max_counts[i]/(float)numSetSamples : 0);
				cout << fixed << setprecision(4) << fraction << "\t";
			}
			cout << endl;
		}
	}



	ind_initialized_sqs = true;

	string path;
	path = config->get_resource_dir() + "/" + config->get_model_name() + "_SQS.txt";
	write_sqs_models(path.c_str());
}