Beispiel #1
0
// Reads the peaks of the spectrum described by `header` into a buffer that
// this PeakList allocates itself (sized by header->getOriginalNumPeaks()).
//
// sa     - aggregator used to read the peak list and to resolve the spectra
//          file path for error messages.
// header - header of the spectrum to read; it is stored in header_.
//
// Returns the number of peaks that were stored. Returns 0 when, for a file
// type other than IFT_MZXML, the number of peaks read or the mass of the
// first peak disagrees with the header (the spectrum is reported and skipped).
// NOTE: in that case numPeaks_ still holds the number of peaks actually read.
int	PeakList::readPeaksToLocalAllocation(const SpectraAggregator& sa,
        const SingleSpectrumHeader* header)
{
    header_ = header;

    // Release a previous local allocation. The pointer is cleared so that it
    // does not dangle if the allocation below throws.
    if (localAllocationSize_>0 && peaks_)
        delete [] peaks_;
    peaks_ = NULL;

    localAllocationSize_ = header->getOriginalNumPeaks();
    peaks_ = new Peak[localAllocationSize_];
    if (! peaks_)
    {
        // only reachable if operator new[] is configured not to throw
        cout << "Error: couldn't allocate memory for spectrum!" << endl;
        exit(1);
    }

    const int numPeaksRead = sa.readPeakList(header, peaks_);
    numPeaks_ = numPeaksRead;

    // mzXML files are exempt from the consistency checks against the header
    if (header->getFileType() == IFT_MZXML)
        return numPeaks_;

    const bool countMismatch = (numPeaksRead != header->getOriginalNumPeaks());

    // peaks_[0] only holds data if at least one peak was read; never touch it
    // for an empty read (the buffer may have zero elements).
    const bool firstPeakMismatch = (numPeaksRead > 0 &&
                                    peaks_[0].mass != header->getFirstPeakMass());

    if (countMismatch || firstPeakMismatch)
    {
        cout << "Error reading scan " << header->getScanNumber() << ": " << header_->getTitle()
             << " in file " << sa.getSpectraFile(header_->getSpectraFileIndexInList()).getFilePath() << endl;
        if (countMismatch)
            cout << "Num peaks read " << numPeaksRead << ", expecting " <<  header->getOriginalNumPeaks() << endl;
        if (firstPeakMismatch)
            cout << setprecision(5) << "First peak mass " << peaks_[0].mass
                 <<", expecting " << header->getFirstPeakMass() << endl;
        cout << "Could possibly be a dos/unix problem with the files, try running dos2unix (or unix2dos)..." << endl;
        cout <<"Skipping spectrum..." << endl;

        return 0;
    }

    return numPeaks_;
}
Beispiel #2
0
// function assumes that the buffer is sufficently large for all peaks being read
// returns number of peaks that were stored
int	PeakList::readPeaksToBuffer(const SpectraAggregator& sa,
                                const SingleSpectrumHeader* header,
                                Peak*	peakBuffer)
{
    // create a basic PeakList read function
    header_ = header;

    if (localAllocationSize_>0 && peaks_)
        delete [] peaks_;

    localAllocationSize_ = 0;

    peaks_ = peakBuffer;
    const int numPeaksRead = sa.readPeakList(header, peaks_);
    numPeaks_ = numPeaksRead;

    return numPeaks_;
}
Beispiel #3
0
bool Spectrum::readSpectrum(const SpectraAggregator& sa,	
							const SingleSpectrumHeader* header, 
							bool indFilterSpectrum)
{

	config_ = sa.getConfig();

	const int numPeaksRead = readPeaksToLocalAllocation(sa, header);
	if (numPeaksRead<5)
		return false;

	copyHeaderInformation();

	initializePeakList(config_, indFilterSpectrum);

	if (! sanityCheck())
		return false;

	initializeSpectrum();

	return (numPeaks_>0);
}
/********************************************************************************
Due to the limit on the number of different open file descriptors, the DAT
creation is done in two stages. First we convert the data into DAT files
using a large mz increment (MAJOR_MZ_INCREMENT_FOR_DAT, e.g. 25 Da).
Spectra are only filtered by quality at this stage if params->sqsThreshold>0
(or, when params->gotCorrectPM is set, if no charge could be assigned).

The input spectra files come either from a plain list of paths
(params->spectraListToLoad, falling back to params->list) or, if no such list
was given, from a meta list (params->metaList).

Returns the path to the list of created dat files (the result of
writeDatPaths()), or an empty string if there were no input paths.
*********************************************************************************/
string DatFileWriter::convertDataToDatFirstPass(const MsParameterStruct* params)
{
	const string& orgList  = (params->spectraListToLoad.length()>0 ? params->spectraListToLoad : params->list);
	const string& metaList = params->metaList;
	const string& datDir   = params->tmpDir;
	const string& datName  = params->outputName;
	float sqsThreshold     = params->sqsThreshold;
	size_t  fileStartIdx   = params->startFileIdx;
	int     verboseLevel   = params->verboseLevel;

	// optional whitelist of spectrum titles (only used when creating an archive from mgfs)
	map<string,int> idTitles;
	if (params->gotCreateArchiveFromMgfs)
		readIdsTitleFromIdFile(params, idTitles);

	datDir_  = datDir;
	datName_ = datName + "_R1";
	verboseLevel_ = verboseLevel;

	init(MAJOR_MZ_INCREMENT_FOR_DAT);

	cout << endl << "Pass 1: reading spectra files and writing to dat with " 
			<< MAJOR_MZ_INCREMENT_FOR_DAT << " Da increments." << endl;
	cout         << "----------------------------------------------------------------------" << endl << endl;

	// The model pointer may be null (it is checked again before scoring below),
	// so it must not be dereferenced unconditionally here.
	PMCSQS_Scorer* pmcsqsModel = const_cast<PMCSQS_Scorer*>(model_->get_pmcsqs_ptr());
	if (sqsThreshold>0.0 && (! pmcsqsModel || ! pmcsqsModel->getIndInitializedSqs()))
		error("Sqs model not initialized!, need a valid sqs model if using a filtering threshold!");

	if (sqsThreshold>0.0)
		cout << "Filtering spectra with SQS threshold of " << sqsThreshold << endl;

	// collect the input paths, either from a regular list or from a meta list
	vector<SinglePath> paths;
	size_t firstFileIdxInList = 0;
	if (orgList.length())
	{
		vector<string> regularPaths;
		firstFileIdxInList  = readListOfPaths(orgList.c_str(), regularPaths);
		if (fileStartIdx == 0 && firstFileIdxInList>0)
			fileStartIdx = firstFileIdxInList;

		numOriginalPaths_ = regularPaths.size();
		if (verboseLevel_>0)
		{
			// report the paths that were actually read (paths is still empty at this point)
			cout << "Read " << regularPaths.size() << " paths to spectra files." << endl;
			cout << "Converting data to DAT, using m/z increment of " << fixed << setprecision(2) << mzIncrement_ << endl;
		}

		paths.resize(regularPaths.size());
		for (size_t i=0; i<regularPaths.size(); i++)
		{
			paths[i].path = regularPaths[i];
			paths[i].datasetIdx = (params->datasetIdx == MAX_INT ? 0 : params->datasetIdx);
			paths[i].idxInList = fileStartIdx + i;
		}

		// keep a copy of the list of spectra files next to the output
		createDirIfDoesNotExist(params->outDir.c_str());
		ostringstream oss;
		oss << params->outputStub << "_" << (params->datasetIdx == MAX_INT ? 0 : params->datasetIdx) << "_spec_list.txt";
		writeListOfPaths(oss.str().c_str(), regularPaths);
	}
	else
	{
		assert( metaList.length()>0);
		MetaList ml;
		ml.readMetaList(metaList.c_str());
		ml.writeLists(params->outputName.c_str(), "_spec_list.txt");
		paths = ml.getSinglePaths();
		
	}


	// optional list of scans that should be excluded
	ScanListManager sem;
	if (params->exclusionList.length()>0)
	{
		const size_t numExclusions = sem.initialize(params->exclusionList.c_str(), 
				    								params->minMz - 5.0, 
													params->maxMz + 5.0);
		if (verboseLevel_>0)
			cout << "Read " << numExclusions << " from " << params->exclusionList << endl;
	}

	if (paths.size() == 0)
		return (std::string(""));

	// one peak buffer is shared by all spectra; it is grown on demand below
	int numFilesWithoutSpectra = 0;
	size_t peakBufferSize = 10000;
	Peak*  peakBuffer = new Peak[peakBufferSize];
	
	numSpectraWrittenFirstPass_      = 0;
	numSpectraReadFromOriginalFiles_ = 0;
	
	// counts how many times each title was encountered
	map<string,int> numTimes;

	for (size_t i=0; i<paths.size(); i++)
	{
		const double fileStartTime = time(NULL);

		if (verboseLevel_>0)
			cout << i << "\tExtracting from: " << paths[i].path << " ["
				 << paths[i].datasetIdx << " : " << paths[i].idxInList << "]" << endl;
		SpectraAggregator sa;
		sa.initializeFromSpectraFilePath(paths[i].path.c_str(), config_, 
			paths[i].datasetIdx ,
			paths[i].idxInList, 
			params->gotOverwriteLocations);

		SpectraList sl(sa);
		sl.selectAllAggregatorHeaders();
		sl.removeExcludedScans(sem);

		if (verboseLevel_>0)
			cout << "\tFound " << sl.getNumHeaders() << " spectra...";

		if (sl.getNumHeaders() == 0)
		{
			numFilesWithoutSpectra++;
			cout << endl << endl;
			cout.flush();
			continue;
		}

		size_t numExtracted =0;
		for (size_t j=0; j<sl.getNumHeaders(); j++)
		{
			const SingleSpectrumHeader* header = sl.getSpectrumHeader(j);

			// ignore spectra that claim an unreasonable number of peaks
			if (header->getOriginalNumPeaks()>1e6)
				continue;

			// grow the shared buffer (with slack) if this spectrum does not fit
			if (header->getOriginalNumPeaks()>= peakBufferSize)
			{
				delete [] peakBuffer;
				peakBufferSize = header->getOriginalNumPeaks()*2;
				peakBuffer = new Peak[peakBufferSize];
			}

			if (params->gotCreateArchiveFromMgfs && idTitles.size()>0)
			{
				if (idTitles.find(header->getTitle()) == idTitles.end())
					continue; // don't write spectra if the id file was supplied and the title is not there
			}

			numTimes[header->getTitle()]++;
		//	if (numTimes[header->getTitle()]>1 && header->getTitle().length()>0)
		//		cout << endl << "Warning: header appears multiple times: " << header->getTitle() << endl;

			// HACK (bad design)
			// save original generation idx and index in list
			// The problem I am trying to solve is how to keep the indexes written
			// in dat files (generation and index in file), yet still be able to read 
			// the file in the current list of paths (which has a different index)
			// solution (for next version)
			// have separate attributes (originaldatasetIdx, originalFileIndex)
			// these never change no matter where the spectrum gets moved!
			int originalDatasetIdx = header->getDatasetIndex();
			int originalIndexInList= header->getSpectraFileIndexInList();
			
			// temporarily point the header at file index 0 for the read below
			SingleSpectrumHeader* nonConstHeader = const_cast<SingleSpectrumHeader*>(header);
			nonConstHeader->setSpectraFileIndexInList(0);

			PeakList pl;
			pl.setPeaksPtr( peakBuffer );

			if (pl.readPeaksToBuffer(sa, header, peakBuffer) < 7) // if not enough peaks read, skip this spectrum
				continue;

			// decide which dataset / file indexes get written with the spectrum
			if (! params->gotOverwriteLocations)
			{
				nonConstHeader->setDatasetIndex(originalDatasetIdx);
				nonConstHeader->setSpectraFileIndexInList(originalIndexInList);
			}
			else
			{
				nonConstHeader->setDatasetIndex(paths[i].datasetIdx);
				nonConstHeader->setSpectraFileIndexInList(paths[i].idxInList);
			}

			pl.initializePeakList(config_, true);
			numSpectraReadFromOriginalFiles_++;

			if (pl.getNumPeaks()<7) // don't bother with spectra with too few peaks
				continue;

			if (pl.getNumPeaks()>100000)
			{
				header->printStats();
				cout << "num peaks: " << pl.getNumPeaks() << endl;
				error("Too many peaks in spectrum, something went wrong!");
			}

			// quality filtering and/or precursor mass correction
			if (pmcsqsModel && (sqsThreshold>0.0 || params->gotCorrectPM ))
			{
				size_t maxCharge=0;
				const float sqs = pmcsqsModel->calculateSqsScore(config_, pl, &maxCharge);
				if (sqs<sqsThreshold || maxCharge == 0)
					continue;
				header->setSqs(sqs);

				if (params->gotCorrectPM)
				{
					PmcSqsChargeRes res;
					pmcsqsModel->computeBestMzValuesForCharge(pl, maxCharge, config_->get_pm_tolerance(), res);

					//cout << header->getMOverZ() << " : " << maxCharge << "\t" << res.mz1 << "\t" << res.score1 << "\t" << res.mz2 << "\t" << res.score2 << endl;
					nonConstHeader->setOriginalPmWith19(header->getMOverZ());
					
					// this is a wrong charge assignment, use original m/z
					if (fabs(res.mz1-header->getMOverZ())>8.0)
					{	
						nonConstHeader->setMOverZ(header->getMOverZ());
						nonConstHeader->setCharge(header->getCharge());
					}
					else
					{
						nonConstHeader->setMOverZ(res.mz1);
						nonConstHeader->setCharge(maxCharge);
					}
				}
			}
			addPeakListToDat(pl);

			numExtracted++;
		}

		numSpectraWrittenFirstPass_ += numExtracted;

		if (verboseLevel_>0)
		{
			const double fileEndTime = time(NULL);
			cout << " Wrote " << numExtracted << " to dat files (this took " 
				<< fileEndTime-fileStartTime << " sec.)" << endl << endl;
			cout.flush();
		}
	}

	closeAllOpenDats();

	delete [] peakBuffer;

	// summary
	if (verboseLevel_>0)
	{
		cout << endl << "SUMMARY (first pass):" << endl;
		cout         << "---------------------" << endl;
		cout << "Wrote " << datPaths_.size() << " dat files to " << datDir_ << endl;
		cout << "These files contain " << numSpectraWrittenFirstPass_ 
			 << " spectra (from a total of " << numSpectraReadFromOriginalFiles_ << " that were read)" << endl;
	}

	if (numFilesWithoutSpectra>0)
	{
		cout << endl << "Warning: encountered " << numFilesWithoutSpectra 
			 << " spectra files for which no spectra were read." << endl << endl;
	}
	
	if (numSpectraWrittenFirstPass_ == 0)
		error("Did not write any spectra in first pass! Exiting.");

	// returns the path to the list of created dat files
	return (writeDatPaths());
}
Beispiel #5
0
// Trains the SQS (spectrum quality) regression models and writes them to
// <resource dir>/<model name>_SQS.txt.
//
// config                  - configuration (size thresholds, resource dir, model name).
// positiveSpectra         - spectra used as positive examples, selected per charge.
// pathNegativeSpectraList - text file listing the spectra used as negative examples.
// specificCharge          - if >0, positive samples are only read for this charge.
// inputWeights            - optional class weights indexed [charge][sizeIndex]; if NULL
//                           the weights are derived from positiveSpectra. When given,
//                           its size also determines the maximal charge with models.
//
// Two kinds of models are trained for every size bucket:
//   sqs_models[charge][0][sizeIndex]        - spectra of `charge` vs. negative spectra
//   sqs_models[charge1][charge2][sizeIndex] - charge1 vs. charge2 (charge1>charge2)
// A model is left NULL when one of its two classes has (almost) no samples.
void PMCSQS_Scorer::trainSqsModels(const Config* config, 
								   const SpectraAggregator& positiveSpectra,
								   const char* pathNegativeSpectraList,
								   int specificCharge,
								   vector< vector<float> >* inputWeights)
{
	// TODO add weight file that can be read from outside to set the weights... ?

	vector< vector< vector<ME_Regression_Sample> > > samples; //  first dim: neg, +1, +2, +3
															  // second dim: sizeIndex


	
	maximalChargeWithModels_ = (inputWeights ? inputWeights->size()-1 : 3);

	set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19
	set_bin_increment(0.1);

	set_sqs_mass_thresholds();
	if (pmcMassThresholds_.size() == 0)
	{
		pmcMassThresholds_=config->get_size_thresholds();
	}

	vector<vector<float> > classWeights;
	if (inputWeights)
	{
		classWeights = *inputWeights;
	}
	else
		setClassWeightsAccordingToData(positiveSpectra, classWeights);


	// numSizes thresholds split the mass range into numSizes+1 buckets
	const int numSizes = sqsMassThresholds_.size();
	cout << "number of sizes for SQS models " << numSizes+1 << endl;

	samples.resize(maximalChargeWithModels_+1);
	
	SpectraAggregator negativeSpectra;
	negativeSpectra.initializeFromTextFile(pathNegativeSpectraList, config);
	const int maxHeadersPerModel = 8000;
	
	// read all samples
	size_t charge;
	for (charge=0; charge<=maximalChargeWithModels_; charge++)
	{
		// Every charge gets its (possibly empty) size buckets, also the charges
		// skipped below: the training and report loops index
		// samples[charge][sizeIndex] for all charges.
		samples[charge].resize(numSizes+1);

		if (charge>0 && specificCharge>0 && charge != specificCharge)
			continue; 

		size_t sizeIndex;
		for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
		{	
			const mass_t minMass = (sizeIndex == 0 ? 0 : sqsMassThresholds_[sizeIndex-1]);
			const mass_t maxMass = (sizeIndex == numSizes ? POS_INF : sqsMassThresholds_[sizeIndex]);

			// index 0 holds the negative spectra, indexes >0 the positive ones of that charge
			const SpectraAggregator& sa = (charge == 0 ? negativeSpectra : positiveSpectra);
			SpectraList sl(sa);

			if (charge == 0)
			{
				sl.selectHeaders(minMass, maxMass);
			}
			else
				sl.selectHeaders(minMass, maxMass, charge, charge);

			cout << "Found " << sl.getNumHeaders() << " for charge " << charge << " ranges:" <<
				minMass << " - " << maxMass << endl;

			sl.randomlyReduceListToSize(maxHeadersPerModel);

			
			// negative spectra are labeled 1, positive spectra 0
			const int label = (charge == 0 ? 1 : 0);	
			samples[charge][sizeIndex].resize(sl.getNumHeaders());
			int i;
			for (i=0; i<sl.getNumHeaders(); i++)
			{
				const SingleSpectrumHeader* header = sl.getSpectrumHeader(i);
				PeakList pl;

				pl.readPeaksToLocalAllocation(sa,header);
				pl.initializePeakList(config, true);
			
				initializeForCurrentSpectrum(config, pl);

				calculateCurrentSpectrumPmcValues(pl, bin_increment);
			
				fillSqsMeSample(pl, samples[charge][sizeIndex][i]);
				samples[charge][sizeIndex][i].label = label;
			}
		}
	}

	// cout sample composition
	cout << "Sample composition:" << endl;
	for (charge=0; charge<=maximalChargeWithModels_; charge++)
	{
		cout << charge;
		size_t i;
		for (i=0; i<samples[charge].size(); i++)
			cout << "\t" << samples[charge][i].size();
		cout << endl;
	}

	// create SQS models
	sqs_models.resize(maximalChargeWithModels_+1);
	for (charge =0; charge<=maximalChargeWithModels_; charge++)
	{
		sqs_models[charge].resize(maximalChargeWithModels_+1);
		int j;
		for (j=0; j<sqs_models[charge].size(); j++)
			sqs_models[charge][j].resize(numSizes+1,NULL);
	}



	////////////////////////////////////////////
	// train model of each charge vs. the negative spectra
	for (charge=1; charge<=maximalChargeWithModels_; charge++)
	{
		int sizeIndex;
		for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
		{
			cout << endl << "CHARGE " << charge << " SIZE " << sizeIndex << endl;

			
			ME_Regression_DataSet ds;
			ds.num_classes=2;
			ds.num_features=SQS_NUM_FIELDS;
			ds.add_samples(samples[0][sizeIndex]);
			ds.add_samples(samples[charge][sizeIndex]);
			ds.tally_samples();

			// both classes must be represented, otherwise no model is trained
			if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001)
			{
				cout << "Warning: insufficient number of samples, not trianing model for this charge " << charge <<
					" size " << sizeIndex << endl;
				continue;
			}

			const double pos_weight = 0.2 + classWeights[charge][sizeIndex]*0.3;

			ds.randomly_remove_samples_with_activated_feature(1,SQS_IND_MAX_TAG_LENGTH_ABOVE_4,0.5);

			ds.calibrate_class_weights(pos_weight); // charge vs bad spectra
			ds.print_feature_summary(cout,SQS_var_names);

			sqs_models[charge][0][sizeIndex]=new ME_Regression_Model;
			sqs_models[charge][0][sizeIndex]->train_cg(ds,250);
			sqs_models[charge][0][sizeIndex]->print_ds_probs(ds);
		
		}
	}

		
	////////////////////////////////////////////
	// train model vs. model if charge1>charge2
	if (1)
	{
		int charge1,charge2;
		for (charge1=2; charge1<=maximalChargeWithModels_; charge1++)
		{
			for (charge2=1; charge2<charge1; charge2++)
			{
				int sizeIndex;
				for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
				{
					ME_Regression_DataSet ds;

					ds.num_classes=2;
					ds.num_features=SQS_NUM_FIELDS;

					ds.add_samples(samples[charge1][sizeIndex]);

					// the samples of charge2 act as class 1 for this model only;
					// their label is restored to 0 right after they were added
					int i;
					for (i=0; i<samples[charge2][sizeIndex].size(); i++)
					{
						samples[charge2][sizeIndex][i].label=1;
						ds.add_sample(samples[charge2][sizeIndex][i]);
						samples[charge2][sizeIndex][i].label=0;
					}

					float relative_weight = classWeights[charge1][sizeIndex]/
						(classWeights[charge1][sizeIndex]+classWeights[charge2][sizeIndex]);

					ds.tally_samples();

					if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001)
					{
						cout << "Warning: insufficient number of samples, not trianing model for charge " << charge1 <<
							" vs charge " << charge2<< " (size " << sizeIndex << ")" << endl;
						continue;
					}

					ds.calibrate_class_weights(relative_weight);

					sqs_models[charge1][charge2][sizeIndex] = new ME_Regression_Model;

					cout << endl << "CHARGE " << charge1 << " vs " << charge2 << "  size " << sizeIndex << endl;
					cout << "Relative weights: " << charge1 << "/(" << charge1 << "+" <<
						charge2 << "): " << relative_weight << endl;
				
					ds.print_feature_summary(cout,SQS_var_names);

					sqs_models[charge1][charge2][sizeIndex]->train_cg(ds,300);
					sqs_models[charge1][charge2][sizeIndex]->print_ds_probs(ds);
				}
			}
		}
	}

	init_sqs_correct_factors(maximalChargeWithModels_, sqsMassThresholds_.size());

	////////////////////////////////////////////
	// final report on datasets
	// For every size bucket and every sample set d, print the fraction of
	// samples whose most probable charge model (with probability above
	// p_thresh) is c; column 0 collects the samples for which no model
	// exceeded the threshold.
	cout << endl;

	int sizeIndex;
	for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++)
	{
		cout << endl << "SIZE: " << sizeIndex << endl;
		cout << "--------" << endl;
		float p_thresh = 0.05;
		int d;
		for (d=0; d<=maximalChargeWithModels_; d++)
		{
			vector<int> counts;
			vector<int> max_counts;
			counts.resize(maximalChargeWithModels_+1,0);
			max_counts.resize(maximalChargeWithModels_+1,0);

			int i;
			for (i=0; i<samples[d][sizeIndex].size(); i++)
			{
				bool above_thresh=false;
				float max_prob=0;
				int   max_class=0;
				int c;
				for (c=1; c<=maximalChargeWithModels_; c++)
				{
					if (! sqs_models[c][0][sizeIndex])
						continue;

					float prob = sqs_models[c][0][sizeIndex]->p_y_given_x(0,samples[d][sizeIndex][i]);
					if (prob>p_thresh)
					{
						counts[c]++;
						above_thresh=true;
						if (prob>max_prob)
						{
							max_prob=prob;
							max_class=c;
						}
					}
				}
				max_counts[max_class]++;

				if (! above_thresh)
					counts[0]++;
			}

			// an empty sample set is reported as all zeros instead of dividing by zero
			const size_t numSamples = samples[d][sizeIndex].size();
			cout << d << "\t";
			for (i=0; i<=maximalChargeWithModels_; i++)
			{
				const float fraction = (numSamples>0 ? max_counts[i]/(float)numSamples : 0.0f);
				cout << fixed << setprecision(4) << fraction << "\t";
			}
			cout << endl;
		}
	}



	ind_initialized_sqs = true;

	string path;
	path = config->get_resource_dir() + "/" + config->get_model_name() + "_SQS.txt";
	write_sqs_models(path.c_str());
}
Beispiel #6
0
int main(int argc, char **argv) 
{ 
	AllScoreModels model;

	int i;
	char ann_file[256];
	char out_file[256];
	char input_file[256];
	char inspect_results_file[256];
	char list_file[256];
	char model_file[256];
	char initial_model[256];
	char model_dir[256];
	char PTM_string[256];
	char mgf_out_dir[256];
	char neg_spec_list[256];
	char tag_string[64];
	char tag_suffix[64];
	
	bool got_input_file=false,got_model_file=false, got_list_file=false;
	bool got_model_dir=false, got_initial_model=false, got_PTM_string = false, got_neg_spec_list=false;
	bool prm_only=false;
	bool prm_norm=false;
	bool pmcsqs_only = false;
	bool sqs_only = false;
	bool got_filter_spectra = false;
	bool pmcsqs_and_prm = false;
	bool train_flag = false;
	bool correct_pm = false;
	bool use_spectrum_charge = false;
	bool use_spectrum_mz     = false;
	bool perform_filter		 = true;
	bool output_aa_probs	 = false;
	bool output_cumulative_probs = false;
	bool make_inspect_tags   = false;
	bool make_training_fa	 = false;
	bool test_tags			 = false;
	bool got_make_ann_mgf	 = false;
	bool got_make_training_mgf = false;
	bool got_rescore_inspect = false;
	bool got_recalibrate_inspect = false;
	bool got_make_peak_examples  = false;

	int start_train_idx=0;
	int end_train_idx = POS_INF;
	int specific_charge=-1;
	int specific_size=-1;
	int specific_region=-1;

	int specific_idx = -1;
	
	int file_start_idx =0;
	int tag_length = 0;
	int num_solutions = 20;
	int digest_type = TRYPSIN_DIGEST;
	mass_t train_tolerance;
	float min_pmcsqs_prob = -1.0;
	mass_t fragment_tolerance = -1.0;
	mass_t pm_tolerance = -1.0;
	float sqs_filter_thresh = 0.0;
	float min_filter_prob = 0.0;
	int   num_test_cases=-1;
	int	  num_training_spectra=-1;

	seedRandom(112233);
	strcpy(tag_suffix,"tags");

	// read command line arguments
	i=1;
	while (i<argc)
	{

		if (! strcmp(argv[i],"-make_ann_mgf"))
		{
			if (++i == argc)
				print_help("Missing file ann file!");

			strcpy(ann_file,argv[i]);	

			if (++i == argc)
				print_help("Missing file out file!");

			strcpy(out_file,argv[i]);	

			got_make_ann_mgf=true;
		}
		else
		if (! strcmp(argv[i],"-make_training_mgf"))
		{
			if (++i == argc)
				print_help("Missing file out file!");

			strcpy(out_file,argv[i]);	

			if (++i == argc)
				print_help("Missing num training spectra!");

			num_training_spectra = atoi(argv[i]);
			if (num_training_spectra<=0)
				print_help("Error: -make_training_mgf [out_file] [num spectra>0]\n");
			
			got_make_training_mgf=true;
		}
		else if (!strcmp(argv[i],"-file"))
		{
			if (++i == argc)
				print_help("Missing file name!");

			strcpy(input_file,argv[i]);
			got_input_file=true;
		}
		else
		if (!strcmp(argv[i],"-list"))
		{
			if (++i == argc)
				print_help("Missing list name!");

			strcpy(list_file,argv[i]);
			got_list_file=true;
		}
		else if  (!strcmp(argv[i],"-file_start_idx"))
		{
			if (++i == argc)
				print_help("Missing file start idx!");

			file_start_idx = atoi(argv[i]);
		}
		else if (!strcmp(argv[i],"-model")) 
		{
			if (++i == argc)
				print_help("Missing model name!");

			strcpy(model_file,argv[i]);
			got_model_file=true;
		}
		else if (! strcmp(argv[i],"-model_dir"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			strcpy(model_dir,argv[i]);
			got_model_dir=true;
		}
		else if (! strcmp(argv[i],"-fragment_tolerance"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			fragment_tolerance = atof(argv[i]);
			if (fragment_tolerance<0 || fragment_tolerance>0.75)
				print_help("Error: -fragment_toelerance should be 0-0.75\n");
		}
		else if (! strcmp(argv[i],"-pm_tolerance"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			pm_tolerance = atof(argv[i]);
			if (pm_tolerance<0 || pm_tolerance>5.0)
				print_help("Error: -pm_toelerance should be 0-5.0\n");
		}
		else if  (!strcmp(argv[i],"-num_solutions"))
		{
			if (++i == argc)
				print_help("Missing number of solutions!");

			num_solutions = atoi(argv[i]);
			if (num_solutions<=0 || num_solutions> 2000)
				print_help("Error: -num_solutions should be 1-2000\n");
		}
		else if (!strcmp(argv[i],"-tag_length"))
		{
			if (++i == argc)
				print_help("Missing minimum length parameter!");

			tag_length = atoi(argv[i]);
			if (tag_length<3 || tag_length>6)
				print_help("Error: -tag_length value must be 3-6\n");

		}
		else if (!strcmp(argv[i],"-digest"))
		{
			if (++i == argc)
				print_help("Missing digest type parameter : NON_SPECIFIC, TRYPSIN\n");

			if (! strcmp(argv[i],"NON_SPECIFIC"))
			{
				digest_type = NON_SPECIFIC_DIGEST;
			}
			else if (! strcmp(argv[i],"TRYPSIN"))
			{
				digest_type = TRYPSIN_DIGEST;
			}
			else
			{
				printf("Error: bad digest type: %s\n",argv[i]);
				print_help("Supported digest types: NON_SPECIFIC, TRYPSIN.");
			}
		}
		else if (! strcmp(argv[i],"-use_spectrum_charge"))
		{
			use_spectrum_charge = true;
		}
		else if (! strcmp(argv[i],"-use_spectrum_mz"))
		{
			use_spectrum_mz = true;
		}
		else if (! strcmp(argv[i],"-no_quality_filter"))
		{
			perform_filter = false;
		}
		else if (! strcmp(argv[i],"-correct_pm"))
		{
			correct_pm = true;
		}
		else if (! strcmp(argv[i],"-prm")) 
		{
			prm_only = true;
		}
		else if (! strcmp(argv[i],"-prm_norm")) 
		{
			prm_norm = true;
			prm_only = true;
		}
		else if (! strcmp(argv[i],"-output_aa_probs"))
		{
			output_aa_probs=true;
		}
		else if (! strcmp(argv[i],"-output_cumulative_probs"))
		{
			output_cumulative_probs=true;
		}
		else if (! strcmp(argv[i],"-pmcsqs_only"))
		{
			pmcsqs_only = true;
		}
		else if (! strcmp(argv[i],"-sqs_only"))
		{
			sqs_only = true;
		}
		else if (! strcmp(argv[i],"-min_filter_prob"))
		{
			if (++i == argc)
				print_help("Missing minimum probability parmater after -min_filter_prob !\n");

			min_filter_prob = -1.0;
			min_filter_prob = atof(argv[i]);
			if (min_filter_prob<0.0 || min_filter_prob>=1.0 || argv[i][0] != '0')
			{
				print_help("The flag -min_filter_prob should be followed by a minimal probability value [0-1.0]\n");
				exit(1);
			}
		}
		else if ( ! strcmp(argv[i],"-filter_spectra"))
		{
			got_filter_spectra = true;
			if (++i == argc)
				print_help("Missing minimum probability parmater after -filter_spectra !\n");
			
			sqs_filter_thresh=atof(argv[i]);

			if (sqs_filter_thresh <0 || sqs_filter_thresh>1.0)
				print_help("Error: the sqs threshold should be in the range 0-1 (recommended below 0.1)\n");
			
			if (++i == argc)
				print_help("Missing output directory for MGF files (second argument after -filter_spectra)!\n");
		
			strcpy(mgf_out_dir,argv[i]);
		}
		else if (! strcmp(argv[i],"-specific_idx"))
		{
			if (++i == argc)
				print_help("Missing idx!");
			specific_idx=atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-train_model"))
		{
			train_flag = true;
			if (++i == argc)
				print_help("Missing training tolerance!");

			train_tolerance = atof(argv[i]);
			if (train_tolerance<0.001 || train_tolerance>1.0)
				print_help("Error: training tolerance should be in the range 0.001 - 1.0\n");
		}
		else if (! strcmp(argv[i],"-start_train_idx"))
		{
			if (++i == argc)
				print_help("Missing start_train_idx!");

			start_train_idx = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-end_train_idx"))
		{
			if (++i == argc)
				print_help("end_train_idx!");

			end_train_idx = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-specific_reigon_model"))
		{
			if (++i == argc)
				print_help("specific_reigon_model!");

			specific_charge = atoi(argv[i++]);
			specific_size	= atoi(argv[i++]);
			specific_region = atoi(argv[i]);

		}
		else if (! strcmp(argv[i],"-specific_charge"))
		{
			if (++i == argc)
				print_help("specific_charge!");

			specific_charge = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-specific_size"))
		{
			if (++i == argc)
				print_help("specific_size!");

			specific_size = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-initial_model"))
		{
			got_initial_model = true;
			if (++i == argc)
				print_help("Missing initial model name!");
			strcpy(initial_model,argv[i]);
		}
		else if (! strcmp(argv[i],"-neg_spec_list"))
		{
			got_neg_spec_list = true;
			if (++i == argc)
				print_help("Missing neg spec list!");
			strcpy(neg_spec_list,argv[i]);
		}
		else if (! strcmp(argv[i],"-PTMs"))
		{
			got_PTM_string = true;
			if (++i == argc)
				print_help("Missing PTM list!");
			strcpy(PTM_string,argv[i]);
		}
		else if (! strcmp(argv[i],"-inspect_tags"))
		{
			make_inspect_tags=true;
			if (++i == argc)
				print_help("inspect_tags!");

			strcpy(tag_string,argv[i]);
		}
		else if (! strcmp(argv[i],"-rescore_inspect"))
		{
			got_rescore_inspect = true;
			if (++i == argc)
				print_help("Missing results file!");

			strcpy(inspect_results_file,argv[i]);

			if (++i == argc)
				print_help("Missing new results file!");

			strcpy(out_file,argv[i]);
		}
		else if (! strcmp(argv[i],"-recalibrate_inspect"))
		{
			got_recalibrate_inspect = true;
			if (++i == argc)
				print_help("Missing results file!");

			strcpy(inspect_results_file,argv[i]);

			if (++i == argc)
				print_help("Missing new results file!");

			strcpy(out_file,argv[i]); 		
		}
		else if ( ! strcmp(argv[i],"-make_peak_examples"))
		{
			got_make_peak_examples=true;
		}
		else if (! strcmp(argv[i],"-make_training_fa"))
		{
			make_training_fa=true;
		}
		else if (! strcmp(argv[i],"-test_tags"))
		{
			test_tags=true;
			if (++i == argc)
				print_help("test_tags!");

			strcpy(tag_string,argv[i]);
		}
		else if (! strcmp(argv[i],"-num_test_cases"))
		{
			if (++i == argc)
				print_help("num_test_cases!");

			num_test_cases = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-tag_suffix"))
		{
			if (++i == argc)
				print_help("tag suffix!");
			strcpy(tag_suffix,argv[i]);
		}
		else
		{
			printf("**********************************************************\n");
			printf("\nError: Unkown command line option: %s\n\n",argv[i]);
			print_help("");
			exit(0); 
		}
		i++;
	}


	if (! got_model_file) 
		print_help("Error: Missing model name!");


	if (!got_input_file && ! got_list_file)
		print_help("Error: missing input file (either -file or -list must be used).");

	Config *config = model.get_config();

	if (got_model_dir)
	{
		config->set_resource_dir(string(model_dir));
	}

	

	//////////////////////////////////////////////////////////////////
	// Model Training
	if (train_flag)
	{	
		if (got_initial_model)
		{
			model.read_model(initial_model);
			if (got_PTM_string)
				config->apply_selected_PTMs(PTM_string);
			model.read_rank_models(initial_model,true);
			model.read_cum_seq_prob_models(initial_model,true);
		}
		else
		{
			config->init_with_defaults();
			config->set_tolerance(train_tolerance);
			config->set_digest_type(digest_type);
			if (got_PTM_string)
				config->apply_selected_PTMs(PTM_string);
		}

		model.set_model_name(string(model_file));
	
		SpectraAggregator sa;
		if (! got_list_file)
		{
			if (got_input_file)
			{
		//		fm.init_from_mgf(config,input_file);
				sa.initializeFromSpectraFilePath(input_file, config);
			}
			else
			{
				printf("Must supply a list of annotated spectra for training!\n");
				exit(0);
			}
		}
		else
		{
		//	fm.init_from_list_file(config,list_file);
			sa.initializeFromTextFile(list_file, config);
		}
		
		
		model.trainModelsInStages(model_file, 
								  sa,
									train_tolerance, 
									start_train_idx, 
									end_train_idx,
									specific_charge, 
									specific_size, 
									specific_region,
									(got_neg_spec_list ? neg_spec_list : NULL));

	

		model.write_model();
		exit(0);
	}
	
	///////////////////////////////////////////////////////////////////
	// Model initializing (running some sort of de novo, need a model)
	// 
	const time_t start_time = time(NULL);

	cout << "PepNovo V3. Build " << build_name << endl;
	cout << "Copyright 2008, The Regents of the University of California. All Rights Reserved." << endl;
	cout << "Created by Ari Frank ([email protected])" << endl << endl;
	cout << "Initializing models (this might take a few seconds)... " << flush;

	// TODO: incorporate PTM line into the model reading and also the other model stuff below
	model.read_model(model_file,true); 
	if (got_PTM_string)
		config->apply_selected_PTMs(PTM_string);
	model.getPeptideCompositionAssigner().init_aa_translations();
	model.read_rank_models(model_file,true);
	model.read_cum_seq_prob_models(model_file,true);

	cout << "Done." << endl;

	config = model.get_config();
	config->set_digest_type(digest_type);

	if (fragment_tolerance>0)
		config->set_tolerance(fragment_tolerance);

	if (pm_tolerance>0)
		config->setPrecursorMassTolerance(pm_tolerance);

	if (correct_pm)
		config->set_need_to_estimate_pm(1);

	if (use_spectrum_mz)
		config->set_use_spectrum_mz(1);

	if (use_spectrum_charge)
		config->set_use_spectrum_charge(1);

	if (! perform_filter)
		config->set_filter_flag(0);

	if (config->get_pm_tolerance()<0.1)
		config->set_need_to_estimate_pm(0);

	cout << setprecision(4) << fixed;
	cout << "Fragment tolerance : " << config->getTolerance() << endl;
	cout << "PM tolernace       : " << config->get_pm_tolerance() << endl;
	cout << "PTMs considered    : " ;
	if (got_PTM_string)
	{
		cout << PTM_string << endl;
	}
	else
	{
		cout << "None" << endl;
	}
	


	///////////////////////////////////////////////////////////////////
	// Training fa
	if (make_training_fa)
	{
		make_denovo_training_fa(model,input_file);
		exit(0);
	}

	///////////////////////////////////////////////////////////////////
	// Inspect tags

	if (make_inspect_tags)
	{
		create_tag_file_for_inspect(model,input_file,tag_string,tag_suffix);
		exit(0);
	}

	if (test_tags)
	{
		benchmark_tags(model,list_file,tag_string,num_test_cases);
		exit(0);
	}


	////////////////////////////////////////////////////////////////////
	// Rescore InsPecT
	if (got_rescore_inspect)
	{
		PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0);
		db_score->rescore_inspect_results(input_file,inspect_results_file,out_file);
		exit(0);
	}

	if (got_recalibrate_inspect)
	{
		cout << "Recalibrating delta scores in " << input_file << endl;
		PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0);
		db_score->recalibrate_inspect_delta_scores(input_file,inspect_results_file,out_file);
		exit(0);
	}

	if (got_make_peak_examples)
	{
		cout << "Making peak examples " << input_file << endl;
		PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0);
		//db_score->make_peak_table_examples(input_file);
		exit(0);
	}



	///////////////////////////////////////////////////////////////////
	// Make input file list
	vector<string> list_vector;
	if (got_list_file)
	{
		readListOfPaths(list_file, list_vector);
	}
	else
		list_vector.push_back(input_file);

	int correct_benchmark =0;
	int total_benchmark =0;
	int counter=0;

	if (got_make_training_mgf)
	{
	//	make_training_mgf(config,list_file,num_training_spectra,out_file);
		exit(0);
	}


	if (sqs_only)
	{
		PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr();
		if (! pmcsqs ||  ! pmcsqs->getIndInitializedSqs())
		{
			cout << "Error: no spectrum quality score (SQS) for this model!" << endl;
			exit(1);
		}
	}
	else
	if (got_filter_spectra ||  pmcsqs_only)
	{
		PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr();
		if (! pmcsqs || ! pmcsqs->getIndInitializedPmc() || ! pmcsqs->getIndInitializedSqs())
		{
			cout << "Error: no parent mass correction (PMC) and/or quality score (SQS) for this model!" << endl;
			exit(1);
		}
	}




	///////////////////////////////////////////////////////////////////
	// FILTER SPECTRA
	if (got_filter_spectra)
	{
		int num_written =0;
		int num_read = 0;
		PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr();

	//	pmcsqs->output_filtered_spectra_to_mgfs(config, list_vector, mgf_out_dir, sqs_filter_thresh, num_written, num_read);
		
		time_t curr_time = time(NULL);
		double elapsed_time = (curr_time - start_time);
		cout << "Processed " << list_vector.size() << " (" << num_read << " spectra)." << endl;
		cout << "Wrote " << num_written << " spectra to mgfs in " << mgf_out_dir << endl;
		cout << "Elapsed time " << fixed << elapsed_time << " seconds." << endl;
		return 0;
	}

	//////////////////////////////////////////////////////////////////
	// PRM
	if (prm_only)
	{
		

		perform_prm_on_list_of_files(model, list_vector, min_filter_prob, file_start_idx, prm_norm);
	//	prm_benchmark(model, list_vector, min_pmcsqs_prob, file_start_idx);

	//	FileManager fm;
	//	fm.init_from_list(config,list_vector);
	//	model.learn_prm_normalizer_values(fm);
	//	model.write_prm_normalizer_values();
		return 0;
	}

	if (fabs(config->get_aa2mass()[Cys]-103.0)<1)
	{
		cout << endl <<"*** Warning: searching with unmodified cystine, usually the PTM C+57 should be included ***" << endl << endl;
	}
	cout << endl;

	//////////////////////////////////////////////////////////////////
	// PMCSQS
	if (pmcsqs_only)
	{
	//	perform_pmcsqs_on_list_of_files(model, list_vector, file_start_idx);
		return 0;
	}
 
	//////////////////////////////////////////////////////////////////
	// SQS
	if (sqs_only)
	{
	//	perform_sqs_on_list_of_files(model, list_vector, file_start_idx);
		return 0;
	}  
	
	//////////////////////////////////////////////////////////////////
	// DENOVO AND TAGS

	if (tag_length<=0)
	{
	//	perform_denovo_on_list_of_files(model, list_vector, file_start_idx, num_solutions, 7, 16, 
	//		false, min_filter_prob, output_aa_probs,  output_cumulative_probs, cout);
		new_perform_denovo_on_list_of_files(model, list_vector, file_start_idx, num_solutions, 7, 16, 
			false, min_filter_prob, output_aa_probs,  output_cumulative_probs, cout);
	}
	else
	{
		perform_tags_on_list_of_files(model,list_vector,file_start_idx,num_solutions,tag_length,
			false, min_filter_prob, output_aa_probs, output_cumulative_probs, cout);	
	}
	

#ifdef WIN32
	system("pause");
#endif

	return 0;
}
Beispiel #7
0
/***************************************************************************************
This function touches up inspect search results by rescoring the sequences returned by
inspect. The function produces a new inspect results file with the scores (and delta scores)
replaced.
****************************************************************************************/
void PeptideRankScorer::rescore_inspect_results(char *spectra_file, 
											   char *inspect_res, 
											   char *new_res_file) const
{
	AllScoreModels* allScoreModels = static_cast<AllScoreModels*>(this->allScoreModelsPtr_);
	Config *config = allScoreModels->get_config();

	ifstream org_res(inspect_res);

	if (!  org_res.is_open() || ! org_res.good())
	{
		cout << "Error: couldn't open original inspect results file for reading:" << inspect_res << endl;
		exit(1);
	}

	ofstream new_res(new_res_file);
	if (! new_res.is_open() || ! new_res.good())
	{
			cout << "Error: couldn't open new inspect results file for writing:" << new_res << endl;
		exit(1);
	}

	char line_buff[1024];
	org_res.getline(line_buff,1024);

	bool read_line  = true;
	vector<string> field_names;
	if (line_buff[0] != '#')
	{
		read_line = false;
	}
	else
	{
		string header = string(line_buff);
		split_string(header,field_names);

	//	int i;
	//	for (i=0; i<field_names.size(); i++)
	//		cout << i << "\t" << field_names[i] << endl;
		cout << "Header:" << endl << line_buff << endl;
	}


	vector<ScanCandidateSet> cand_sets;
	vector<int> scan_mapping;
	cand_sets.clear();
	scan_mapping.resize(100000,-1);
	
	while (! org_res.eof())
	{
		vector<string> fields;

		if (read_line)
		{
			org_res.getline(line_buff,1024);
			if (org_res.gcount() < 5)
				continue;
		}
		else
		{
			read_line = true;
		}

		split_string(line_buff,fields);
		InspectResultsLine res;

		res.parse_from_fields(config,fields);

		if (cand_sets.size()==0 || ! cand_sets[cand_sets.size()-1].add_new_line(res))
		{
			ScanCandidateSet new_set;
			new_set.add_new_line(res);
			
			if (new_set.scan>=scan_mapping.size())
				scan_mapping.resize(2*scan_mapping.size(),-1);

			scan_mapping[new_set.scan]=cand_sets.size();
			cand_sets.push_back(new_set);
		}
	}
	org_res.close();

	cout << "Read results for " << cand_sets.size() << " scans..." << endl;


	SpectraAggregator sa;
	sa.initializeFromSpectraFilePath(spectra_file, config);

	SpectraList sl(sa);
	sl.selectAllAggregatorHeaders();

	cout << "Read " <<  sl.getNumHeaders() << " spectra headers." << endl;

	if (sl.getNumHeaders() == 0)
	{
		cout << "Error: read not spectra headers from " << spectra_file << endl;
		return;
	}

	
	vector<bool> spectrum_indicators;
	spectrum_indicators.resize(cand_sets.size(),false);

	int num_found =0;
	int sc;
	for (sc=0; sc<sl.getNumHeaders(); sc++)
	{
		const SingleSpectrumHeader* header = sl.getSpectrumHeader(sc);		
		int scan_number = (header->getScanNumber() >=0 ? header->getScanNumber() : header->getIndexInFile());
		
		if (header->getFileType() == IFT_MGF)
			scan_number = header->getIndexInFile();

		assert(scan_number>=0);
		if (scan_mapping[scan_number]<0)
			continue;

		AnnotatedSpectrum as;
		if (! as.readSpectrum(sa, header))
		{
			continue;
		}

		spectrum_indicators[scan_mapping[scan_number]]=true;
		num_found++;

		ScanCandidateSet& cand_set = cand_sets[scan_mapping[scan_number]];
		
		vector<PeptideSolution> peptide_sols;
		peptide_sols.resize(cand_set.results.size());

		int j;
		for (j=0; j<cand_set.results.size(); j++)
		{
			InspectResultsLine& inspect_res = cand_set.results[j];
			PeptideSolution& sol = peptide_sols[j];

			sol.pep = inspect_res.pep;
			sol.pm_with_19 = sol.pep.get_mass_with_19();
			sol.charge = inspect_res.Charge;
			sol.reaches_n_terminal = true;
			sol.reaches_c_terminal = true;
		}

		vector<score_pair> scores;
	//	score_complete_sequences(peptide_sols,ssf,peaks,num_peaks,scores);
		scoreCompleteSequences(peptide_sols, as, scores);

		for (j=0; j<scores.size(); j++)
			cand_set.results[j].Score = scores[j].score;

		cand_set.recalbirate_scores(config);

		vector<string> pep_strings;
		pep_strings.resize(scores.size());
		int max_len =0;
		for (j=0; j<cand_set.results.size(); j++)
		{
			pep_strings[j]=cand_set.results[j].pep.as_string(config);
			if (pep_strings[j].length()>max_len)
				max_len = pep_strings[j].length();
		}

		if (1)
		{
			cand_set.output_to_stream(new_res,10);
		}
		else
		{
			for (j=0; j<cand_set.results.size(); j++)
			{
				cout << cand_set.scan << " " << cand_set.results[j].Charge << "\t";

				cout << cand_set.results[j].Protein.substr(0,3) << " " << pep_strings[j];
				if (pep_strings[j].length()<max_len)
				{
					int k;
					for (k=pep_strings[j].length(); k<max_len; k++)
						cout << " ";
				}
				cout << "\t" << cand_set.results[j].MQScore << "\t" << cand_set.results[j].Score << "\t" <<
				cand_set.results[j].DeltaScore << "\t" << cand_set.results[j].DeltaScoreOther << endl;
			}
			cout << endl;
		}
	}

	if (num_found<cand_sets.size())
	{
		cout << "Warning: found only " << num_found << "/" << cand_sets.size() << " of the scans scored by InsPecT!" << endl;
	}
	else
	{
		cout << "All scored scans found in spectrum file." << endl;
	}
}
/***************************************************************************************
Trains the regional PRM node score models from the spectra in the aggregator.

The model table is first resized to match the regional fragment sets in the config
(indexed by charge, size index and region index), and any model that was not yet
initialized is initialized. Models are then trained for charges 1 and up; a charge is
skipped (with a warning) when the aggregator holds too few spectra of that charge.

Parameters:
  allScoreModelsVoidPointer - pointer to the owning AllScoreModels object (passed as
                              void*); used to obtain the config and forwarded to the
                              regional trainers.
  name           - forwarded unchanged to trainRegionalScoreModel().
  sa             - aggregator holding the training spectra.
  specificCharge - if > 0, only this charge is trained.
  specificSize   - if >= 0, only this size index is trained.
  specificRegion - if >= 0, only this region index is trained.
****************************************************************************************/
void PrmNodeScoreModel::trainNodeScoreModels(void* allScoreModelsVoidPointer,
											 const char *name, 
											 const SpectraAggregator& sa,
											 int specificCharge, 
											 int specificSize, 
											 int specificRegion)
{
	// a charge with fewer spectra than this is not trained
	const int minSpectraPerCharge = 200;

	AllScoreModels* allScoreModels = static_cast<AllScoreModels*>(allScoreModelsVoidPointer);
	config_ = allScoreModels->get_config();

	// resize regional breakage score models according to regional fragment sets
	const vector< vector< vector< RegionalFragments > > >& all_rfs = config_->get_regional_fragment_sets();

	int c;
	RegionalPrmNodeScoreModels_.resize(all_rfs.size());
	for (c=0; c<static_cast<int>(all_rfs.size()); c++)
	{
		RegionalPrmNodeScoreModels_[c].resize(all_rfs[c].size());
		int s;
		for (s=0; s<static_cast<int>(all_rfs[c].size()); s++)
		{
			RegionalPrmNodeScoreModels_[c][s].resize(all_rfs[c][s].size());
			int r;
			for (r=0; r<static_cast<int>(RegionalPrmNodeScoreModels_[c][s].size()); r++)
				if (! RegionalPrmNodeScoreModels_[c][s][r].get_was_initialized())
					RegionalPrmNodeScoreModels_[c][s][r].init(config_,c,s,r);
		}
	}


	// train models (charge 0 is never trained)
	for (c=1; c<static_cast<int>(RegionalPrmNodeScoreModels_.size()); c++)
	{
		// nothing to train for this charge, or the caller asked for a different charge
		if (RegionalPrmNodeScoreModels_[c].empty() || (specificCharge>0 && specificCharge != c))
			continue;

		const int numSpectraWithCharge = sa.getNumSpectraWithCharge(c);
		if (numSpectraWithCharge<minSpectraPerCharge)
		{
			cout << "WARNING: insufficient number of spectra to train breakage model for charge " << c << endl;
			cout <<	"		  only " << numSpectraWithCharge << " spectra were found so this charge is being skipped!" << endl << endl;
			continue;
		}

		int s;
		for (s=0; s<static_cast<int>(RegionalPrmNodeScoreModels_[c].size()); s++)
		{
			if (specificSize>=0 && s != specificSize)
				continue;

			int r;
			for (r=0; r<static_cast<int>(RegionalPrmNodeScoreModels_[c][s].size()); r++)
			{
				if (specificRegion>=0 && r != specificRegion)
					continue;
				
				RegionalPrmNodeScoreModels_[c][s][r].trainRegionalScoreModel(allScoreModelsVoidPointer, name, sa);
			}
		}
	}

	// train PRM normalizer values
//	cout << endl << "Training PRM normalizer vlaues..." << endl;

// TODO fix this issue, it needs to use the AllScoreModels class
//	learn_prm_normalizer_values(fm);

	ind_was_initialized=true;
}