// returns number of peaks that were stored int PeakList::readPeaksToLocalAllocation(const SpectraAggregator& sa, const SingleSpectrumHeader* header) { // create a basic PeakList read function header_ = header; if (localAllocationSize_>0 && peaks_) delete [] peaks_; localAllocationSize_ = header->getOriginalNumPeaks(); peaks_ = new Peak[localAllocationSize_]; if (! peaks_) { cout << "Error: couldn't allocate memory for spectrum!" << endl; exit(1); } const int numPeaksRead = sa.readPeakList(header, peaks_); numPeaks_ = numPeaksRead; if (header->getFileType() != IFT_MZXML && ( numPeaksRead != header->getOriginalNumPeaks() || peaks_[0].mass != header->getFirstPeakMass()) ) { cout << "Error reading scan " << header->getScanNumber() << ": " << header_->getTitle() << " in file " << sa.getSpectraFile(header_->getSpectraFileIndexInList()).getFilePath() << endl; if (numPeaksRead != header->getOriginalNumPeaks()) cout << "Num peaks read " << numPeaksRead << ", expecting " << header->getScanNumber() << endl; if (peaks_[0].mass != header->getFirstPeakMass()) cout << setprecision(5) << "First peak mass " << peaks_[0].mass <<", expecting " << header->getFirstPeakMass() << endl; cout << "Could possibly be a dos/unix problem with the files, try running dos2unix (or unix2dos)..." << endl; cout <<"Skipping spectrum..." << endl; const int numPeaksRead = sa.readPeakList(header, peaks_); return 0; } return numPeaks_; }
// function assumes that the buffer is sufficently large for all peaks being read // returns number of peaks that were stored int PeakList::readPeaksToBuffer(const SpectraAggregator& sa, const SingleSpectrumHeader* header, Peak* peakBuffer) { // create a basic PeakList read function header_ = header; if (localAllocationSize_>0 && peaks_) delete [] peaks_; localAllocationSize_ = 0; peaks_ = peakBuffer; const int numPeaksRead = sa.readPeakList(header, peaks_); numPeaks_ = numPeaksRead; return numPeaks_; }
bool Spectrum::readSpectrum(const SpectraAggregator& sa, const SingleSpectrumHeader* header, bool indFilterSpectrum) { config_ = sa.getConfig(); const int numPeaksRead = readPeaksToLocalAllocation(sa, header); if (numPeaksRead<5) return false; copyHeaderInformation(); initializePeakList(config_, indFilterSpectrum); if (! sanityCheck()) return false; initializeSpectrum(); return (numPeaks_>0); }
/********************************************************************************
Due to the limit on the number of different open file descriptors, the DAT
creation is done in two stages. First we convert the data into DAT files using
a large mz increment (e.g. 25 Da). No quality filtration is performed at this
stage.

The list of spectra files is taken from params->spectraListToLoad (or
params->list if the former is empty); if both are empty the meta list in
params->metaList is used instead. For every file the spectra headers are
selected, scans in the exclusion list are removed, and each remaining spectrum
is read into a shared peak buffer and added to the dat files. Spectra are
skipped if:
  - the header reports more than 1e6 peaks,
  - an id file was supplied and the spectrum's title is not in it,
  - fewer than 7 peaks were read / remain after peak list initialization,
  - the SQS score is below params->sqsThreshold or no charge was assigned
    (only when SQS scoring is active).

Returns the path to the list of created dat files (the value of
writeDatPaths()), or an empty string if there are no input paths. Calls error()
if no spectra at all were written.
*********************************************************************************/
string DatFileWriter::convertDataToDatFirstPass(const MsParameterStruct* params)
{
	const string& orgList = (params->spectraListToLoad.length()>0 ? params->spectraListToLoad : params->list);
	const string& metaList = params->metaList;
	const string& datDir = params->tmpDir;
	const string& datName = params->outputName;
	float sqsThreshold = params->sqsThreshold;
	size_t fileStartIdx = params->startFileIdx;
	int verboseLevel = params->verboseLevel;

	// titles of the spectra that should be kept (only used when creating an archive from mgfs)
	map<string,int> idTitles;
	if (params->gotCreateArchiveFromMgfs)
		readIdsTitleFromIdFile(params, idTitles);

	datDir_ = datDir;
	datName_ = datName + "_R1";
	verboseLevel_ = verboseLevel;

	init(MAJOR_MZ_INCREMENT_FOR_DAT);

	cout << endl << "Pass 1: reading spectra files and writing to dat with " << MAJOR_MZ_INCREMENT_FOR_DAT << " Da increments." << endl;
	cout << "----------------------------------------------------------------------" << endl << endl;

	PMCSQS_Scorer* pmcsqsModel = const_cast<PMCSQS_Scorer*>(model_->get_pmcsqs_ptr());
	if (sqsThreshold>0.0 && ! pmcsqsModel->getIndInitializedSqs())
		error("Sqs model not initialized!, need a valid sqs model if using a filtering threshold!");

	if (sqsThreshold>0.0)
		cout << "Filtering spectra with SQS threshold of " << sqsThreshold << endl;

	// collect the paths of the spectra files that need to be converted
	vector<SinglePath> paths;
	size_t firstFileIdxInList = 0;
	if (orgList.length())
	{
		vector<string> regularPaths;
		firstFileIdxInList = readListOfPaths(orgList.c_str(), regularPaths);
		if (fileStartIdx == 0 && firstFileIdxInList>0)
			fileStartIdx = firstFileIdxInList;

		numOriginalPaths_ = regularPaths.size();
		if (verboseLevel_>0)
		{
			// report the number of paths that were just read; `paths` is only
			// filled below so its size is still 0 at this point
			cout << "Read " << regularPaths.size() << " paths to spectra files." << endl;
			cout << "Converting data to DAT, using m/z increment of " << fixed << setprecision(2) << mzIncrement_ << endl;
		}

		paths.resize(regularPaths.size());
		for (size_t i=0; i<regularPaths.size(); i++)
		{
			paths[i].path = regularPaths[i];
			paths[i].datasetIdx = (params->datasetIdx == MAX_INT ? 0 : params->datasetIdx);
			paths[i].idxInList = fileStartIdx + i;
		}

		createDirIfDoesNotExist(params->outDir.c_str());
		ostringstream oss;
		oss << params->outputStub << "_" << (params->datasetIdx == MAX_INT ? 0 : params->datasetIdx) << "_spec_list.txt";
		writeListOfPaths(oss.str().c_str(), regularPaths);
	}
	else
	{
		assert( metaList.length()>0);
		MetaList ml;
		ml.readMetaList(metaList.c_str());
		ml.writeLists(params->outputName.c_str(), "_spec_list.txt");
		paths = ml.getSinglePaths();
	}

	// scans that should be excluded from the conversion
	ScanListManager sem;
	if (params->exclusionList.length()>0)
	{
		const size_t numExclusions = sem.initialize(params->exclusionList.c_str(), params->minMz - 5.0, params->maxMz + 5.0);
		if (verboseLevel_>0)
			cout << "Read " << numExclusions << " from " << params->exclusionList << endl;
	}

	if (paths.size() == 0)
		return (std::string(""));

	int numFilesWithoutSpectra = 0;

	// a single peak buffer is shared by all spectra, it grows when needed
	size_t peakBufferSize = 10000;
	Peak* peakBuffer = new Peak[peakBufferSize];
	numSpectraWrittenFirstPass_ = 0;
	numSpectraReadFromOriginalFiles_ = 0;
	map<string,int> numTimes;
	for (size_t i=0; i<paths.size(); i++)
	{
		const double fileStartTime = time(NULL);
		if (verboseLevel_>0)
			cout << i << "\tExtracting from: " << paths[i].path << " [" << paths[i].datasetIdx << " : " << paths[i].idxInList << "]" << endl;

		SpectraAggregator sa;
		sa.initializeFromSpectraFilePath(paths[i].path.c_str(), config_, paths[i].datasetIdx , paths[i].idxInList, params->gotOverwriteLocations);

		SpectraList sl(sa);
		sl.selectAllAggregatorHeaders();
		sl.removeExcludedScans(sem);

		if (verboseLevel_>0)
			cout << "\tFound " << sl.getNumHeaders() << " spectra...";

		if (sl.getNumHeaders() == 0)
		{
			numFilesWithoutSpectra++;
			cout << endl << endl;
			cout.flush();
			continue;
		}

		size_t numExtracted =0;
		for (size_t j=0; j<sl.getNumHeaders(); j++)
		{
			const SingleSpectrumHeader* header = sl.getSpectrumHeader(j);

			if (header->getOriginalNumPeaks()>1e6)
				continue;

			// make sure the shared buffer can hold this spectrum
			if (header->getOriginalNumPeaks()>= peakBufferSize)
			{
				delete [] peakBuffer;
				peakBufferSize = header->getOriginalNumPeaks()*2;
				peakBuffer = new Peak[peakBufferSize];
			}

			if (params->gotCreateArchiveFromMgfs && idTitles.size()>0)
			{
				if (idTitles.find(header->getTitle()) == idTitles.end())
					continue; // don't write spectra if the id file was supplied and the title is not there
			}

			numTimes[header->getTitle()]++;
		//	if (numTimes[header->getTitle()]>1 && header->getTitle().length()>0)
		//		cout << endl << "Warning: header appears multiple times: " << header->getTitle() << endl;

			// HACK (bad design)
			// save original generation idx and index in list
			// The problem I am trying to solve is how to keep the indexes written
			// in dat files (generation and index in file), yet still be able to read
			// the file in the current list of paths (which has a different index)
			// solution (for next version)
			// have separate attributes (originaldatasetIdx, originalFileIndex)
			// these never change no matter where the spectrum gets moved!
			int originalDatasetIdx = header->getDatasetIndex();
			int originalIndexInList= header->getSpectraFileIndexInList();
			SingleSpectrumHeader* nonConstHeader = const_cast<SingleSpectrumHeader*>(header);
			nonConstHeader->setSpectraFileIndexInList(0);

			PeakList pl;
			pl.setPeaksPtr( peakBuffer );
			if (pl.readPeaksToBuffer(sa, header, peakBuffer) < 7) // if not enough peaks read, skip this spectrum
				continue;

			// either restore the indexes that were stored with the spectrum or
			// overwrite them with the location in the current list of paths
			if (! params->gotOverwriteLocations)
			{
				nonConstHeader->setDatasetIndex(originalDatasetIdx);
				nonConstHeader->setSpectraFileIndexInList(originalIndexInList);
			}
			else
			{
				nonConstHeader->setDatasetIndex(paths[i].datasetIdx);
				nonConstHeader->setSpectraFileIndexInList(paths[i].idxInList);
			}

			pl.initializePeakList(config_, true);
			numSpectraReadFromOriginalFiles_++;

			if (pl.getNumPeaks()<7) // don't bother with spectra with too few peaks
				continue;

			if (pl.getNumPeaks()>100000)
			{
				header->printStats();
				cout << "num peaks: " << pl.getNumPeaks() << endl;
				error("Too many peaks in spectrum, something went wrong!");
			}

			if (pmcsqsModel && (sqsThreshold>0.0 || params->gotCorrectPM ))
			{
				size_t maxCharge=0;
				const float sqs = pmcsqsModel->calculateSqsScore(config_, pl, &maxCharge);
				if (sqs<sqsThreshold || maxCharge == 0)
					continue;

				header->setSqs(sqs);
				if (params->gotCorrectPM)
				{
					PmcSqsChargeRes res;
					pmcsqsModel->computeBestMzValuesForCharge(pl, maxCharge, config_->get_pm_tolerance(), res);
					//cout << header->getMOverZ() << " : " << maxCharge << "\t" << res.mz1 << "\t" << res.score1 << "\t" << res.mz2 << "\t" << res.score2 << endl;

					nonConstHeader->setOriginalPmWith19(header->getMOverZ());

					// this is a wrong charge assignment, use original m/z
					if (fabs(res.mz1-header->getMOverZ())>8.0)
					{
						nonConstHeader->setMOverZ(header->getMOverZ());
						nonConstHeader->setCharge(header->getCharge());
					}
					else
					{
						nonConstHeader->setMOverZ(res.mz1);
						nonConstHeader->setCharge(maxCharge);
					}
				}
			}

			addPeakListToDat(pl);
			numExtracted++;
		}

		numSpectraWrittenFirstPass_ += numExtracted;

		if (verboseLevel_>0)
		{
			const double fileEndTime = time(NULL);
			cout << " Wrote " << numExtracted << " to dat files (this took " << fileEndTime-fileStartTime << " sec.)" << endl << endl;
			cout.flush();
		}
	}

	closeAllOpenDats();

	if (peakBuffer)
		delete [] peakBuffer;

	// summary
	if (verboseLevel_>0)
	{
		cout << endl << "SUMMARY (first pass):" << endl;
		cout << "---------------------" << endl;
		cout << "Wrote " << datPaths_.size() << " dat files to " << datDir_ << endl;
		cout << "These files contain " << numSpectraWrittenFirstPass_ << " spectra (from a total of "
			 << numSpectraReadFromOriginalFiles_ << " that were read)" << endl;
	}

	if (numFilesWithoutSpectra>0)
	{
		cout << endl << "Warning: encountered " << numFilesWithoutSpectra << " spectra files for which no spectra were read." << endl << endl;
	}

	if (numSpectraWrittenFirstPass_ == 0)
		error("Did not write any spectra in first pass! Exiting.");

	// returns the path to the list of created dat files
	return (writeDatPaths());
}
void PMCSQS_Scorer::trainSqsModels(const Config* config, const SpectraAggregator& positiveSpectra, const char* pathNegativeSpectraList, int specificCharge, vector< vector<float> >* inputWeights) { // TODO add weight file that can be read from outside to set the weights... ? vector< vector< vector<ME_Regression_Sample> > > samples; // first dim: neg, +1, +2, +3 // second dim: sizeIndex maximalChargeWithModels_ = (inputWeights ? inputWeights->size()-1 : 3); set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19 set_bin_increment(0.1); set_sqs_mass_thresholds(); if (pmcMassThresholds_.size() == 0) { pmcMassThresholds_=config->get_size_thresholds(); } vector<vector<float> > classWeights; if (inputWeights) { classWeights = *inputWeights; } else setClassWeightsAccordingToData(positiveSpectra, classWeights); const int numSizes = sqsMassThresholds_.size(); cout << "number of sizes for SQS models " << numSizes+1 << endl; samples.resize(maximalChargeWithModels_+1); SpectraAggregator negativeSpectra; negativeSpectra.initializeFromTextFile(pathNegativeSpectraList, config); const int maxHeadersPerModel = 8000; // read all samples size_t charge; for (charge=0; charge<=maximalChargeWithModels_; charge++) { if (charge>0 && specificCharge>0 && charge != specificCharge) continue; samples[charge].resize(numSizes+1); size_t sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { const mass_t minMass = (sizeIndex == 0 ? 0 : sqsMassThresholds_[sizeIndex-1]); const mass_t maxMass = (sizeIndex == numSizes ? POS_INF : sqsMassThresholds_[sizeIndex]); const SpectraAggregator& sa = (charge == 0 ? negativeSpectra : positiveSpectra); SpectraList sl(sa); if (charge == 0) { sl.selectHeaders(minMass, maxMass); } else sl.selectHeaders(minMass, maxMass, charge, charge); cout << "Found " << sl.getNumHeaders() << " for charge " << charge << " ranges:" << minMass << " - " << maxMass << endl; sl.randomlyReduceListToSize(maxHeadersPerModel); const int label = (charge == 0 ? 
1 : 0); samples[charge][sizeIndex].resize(sl.getNumHeaders()); int i; for (i=0; i<sl.getNumHeaders(); i++) { const SingleSpectrumHeader* header = sl.getSpectrumHeader(i); PeakList pl; pl.readPeaksToLocalAllocation(sa,header); pl.initializePeakList(config, true); initializeForCurrentSpectrum(config, pl); calculateCurrentSpectrumPmcValues(pl, bin_increment); fillSqsMeSample(pl, samples[charge][sizeIndex][i]); samples[charge][sizeIndex][i].label = label; } } } // cout sample composition cout << "Sample composition:" << endl; for (charge=0; charge<=maximalChargeWithModels_; charge++) { cout << charge; size_t i; for (i=0; i<samples[charge].size(); i++) cout << "\t" << samples[charge][i].size(); cout << endl; } // create SQS models sqs_models.resize(maximalChargeWithModels_+1); for (charge =0; charge<=maximalChargeWithModels_; charge++) { sqs_models[charge].resize(maximalChargeWithModels_+1); int j; for (j=0; j<sqs_models[charge].size(); j++) sqs_models[charge][j].resize(numSizes+1,NULL); } for (charge=1; charge<=maximalChargeWithModels_; charge++) { int sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { cout << endl << "CHARGE " << charge << " SIZE " << sizeIndex << endl; ME_Regression_DataSet ds; ds.num_classes=2; ds.num_features=SQS_NUM_FIELDS; ds.add_samples(samples[0][sizeIndex]); ds.add_samples(samples[charge][sizeIndex]); ds.tally_samples(); if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001) { cout << "Warning: insufficient number of samples, not trianing model for this charge " << charge << " size " << sizeIndex << endl; continue; } const double pos_weight = 0.2 + classWeights[charge][sizeIndex]*0.3; ds.randomly_remove_samples_with_activated_feature(1,SQS_IND_MAX_TAG_LENGTH_ABOVE_4,0.5); ds.calibrate_class_weights(pos_weight); // charge vs bad spectra ds.print_feature_summary(cout,SQS_var_names); sqs_models[charge][0][sizeIndex]=new ME_Regression_Model; sqs_models[charge][0][sizeIndex]->train_cg(ds,250); 
sqs_models[charge][0][sizeIndex]->print_ds_probs(ds); } } //////////////////////////////////////////// // train model vs. model if charge1>charge2 if (1) { int charge1,charge2; for (charge1=2; charge1<=maximalChargeWithModels_; charge1++) { for (charge2=1; charge2<charge1; charge2++) { int sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { ME_Regression_DataSet ds; ds.num_classes=2; ds.num_features=SQS_NUM_FIELDS; ds.add_samples(samples[charge1][sizeIndex]); int i; for (i=0; i<samples[charge2][sizeIndex].size(); i++) { samples[charge2][sizeIndex][i].label=1; ds.add_sample(samples[charge2][sizeIndex][i]); samples[charge2][sizeIndex][i].label=0; } float relative_weight = classWeights[charge1][sizeIndex]/ (classWeights[charge1][sizeIndex]+classWeights[charge2][sizeIndex]); ds.tally_samples(); if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001) { cout << "Warning: insufficient number of samples, not trianing model for charge " << charge1 << " vs charge " << charge2<< " (size " << sizeIndex << ")" << endl; continue; } ds.calibrate_class_weights(relative_weight); sqs_models[charge1][charge2][sizeIndex] = new ME_Regression_Model; cout << endl << "CHARGE " << charge1 << " vs " << charge2 << " size " << sizeIndex << endl; cout << "Relative weights: " << charge1 << "/(" << charge1 << "+" << charge2 << "): " << relative_weight << endl; ds.print_feature_summary(cout,SQS_var_names); sqs_models[charge1][charge2][sizeIndex]->train_cg(ds,300); sqs_models[charge1][charge2][sizeIndex]->print_ds_probs(ds); } } } } init_sqs_correct_factors(maximalChargeWithModels_, sqsMassThresholds_.size()); //////////////////////////////////////////// // final report on datasets cout << endl; int sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { cout << endl << "SIZE: " << sizeIndex << endl; cout << "--------" << endl; float p_thresh = 0.05; int d; for (d=0; d<=maximalChargeWithModels_; d++) { vector<int> counts; vector<int> max_counts; 
counts.resize(maximalChargeWithModels_+1,0); max_counts.resize(maximalChargeWithModels_+1,0); int i; for (i=0; i<samples[d][sizeIndex].size(); i++) { bool above_thresh=false; float max_prob=0; int max_class=0; int c; for (c=1; c<=maximalChargeWithModels_; c++) { if (! sqs_models[c][0][sizeIndex]) continue; float prob = sqs_models[c][0][sizeIndex]->p_y_given_x(0,samples[d][sizeIndex][i]); if (prob>p_thresh) { counts[c]++; above_thresh=true; if (prob>max_prob) { max_prob=prob; max_class=c; } } } max_counts[max_class]++; if (! above_thresh) counts[0]++; } cout << d << "\t"; for (i=0; i<=maximalChargeWithModels_; i++) cout << fixed << setprecision(4) << max_counts[i]/(float)samples[d][sizeIndex].size() << "\t"; cout << endl; } } ind_initialized_sqs = true; string path; path = config->get_resource_dir() + "/" + config->get_model_name() + "_SQS.txt"; write_sqs_models(path.c_str()); }
int main(int argc, char **argv) { AllScoreModels model; int i; char ann_file[256]; char out_file[256]; char input_file[256]; char inspect_results_file[256]; char list_file[256]; char model_file[256]; char initial_model[256]; char model_dir[256]; char PTM_string[256]; char mgf_out_dir[256]; char neg_spec_list[256]; char tag_string[64]; char tag_suffix[64]; bool got_input_file=false,got_model_file=false, got_list_file=false; bool got_model_dir=false, got_initial_model=false, got_PTM_string = false, got_neg_spec_list=false; bool prm_only=false; bool prm_norm=false; bool pmcsqs_only = false; bool sqs_only = false; bool got_filter_spectra = false; bool pmcsqs_and_prm = false; bool train_flag = false; bool correct_pm = false; bool use_spectrum_charge = false; bool use_spectrum_mz = false; bool perform_filter = true; bool output_aa_probs = false; bool output_cumulative_probs = false; bool make_inspect_tags = false; bool make_training_fa = false; bool test_tags = false; bool got_make_ann_mgf = false; bool got_make_training_mgf = false; bool got_rescore_inspect = false; bool got_recalibrate_inspect = false; bool got_make_peak_examples = false; int start_train_idx=0; int end_train_idx = POS_INF; int specific_charge=-1; int specific_size=-1; int specific_region=-1; int specific_idx = -1; int file_start_idx =0; int tag_length = 0; int num_solutions = 20; int digest_type = TRYPSIN_DIGEST; mass_t train_tolerance; float min_pmcsqs_prob = -1.0; mass_t fragment_tolerance = -1.0; mass_t pm_tolerance = -1.0; float sqs_filter_thresh = 0.0; float min_filter_prob = 0.0; int num_test_cases=-1; int num_training_spectra=-1; seedRandom(112233); strcpy(tag_suffix,"tags"); // read command line arguments i=1; while (i<argc) { if (! strcmp(argv[i],"-make_ann_mgf")) { if (++i == argc) print_help("Missing file ann file!"); strcpy(ann_file,argv[i]); if (++i == argc) print_help("Missing file out file!"); strcpy(out_file,argv[i]); got_make_ann_mgf=true; } else if (! 
strcmp(argv[i],"-make_training_mgf")) { if (++i == argc) print_help("Missing file out file!"); strcpy(out_file,argv[i]); if (++i == argc) print_help("Missing num training spectra!"); num_training_spectra = atoi(argv[i]); if (num_training_spectra<=0) print_help("Error: -make_training_mgf [out_file] [num spectra>0]\n"); got_make_training_mgf=true; } else if (!strcmp(argv[i],"-file")) { if (++i == argc) print_help("Missing file name!"); strcpy(input_file,argv[i]); got_input_file=true; } else if (!strcmp(argv[i],"-list")) { if (++i == argc) print_help("Missing list name!"); strcpy(list_file,argv[i]); got_list_file=true; } else if (!strcmp(argv[i],"-file_start_idx")) { if (++i == argc) print_help("Missing file start idx!"); file_start_idx = atoi(argv[i]); } else if (!strcmp(argv[i],"-model")) { if (++i == argc) print_help("Missing model name!"); strcpy(model_file,argv[i]); got_model_file=true; } else if (! strcmp(argv[i],"-model_dir")) { if (++i == argc) print_help("Missing model dir name!"); strcpy(model_dir,argv[i]); got_model_dir=true; } else if (! strcmp(argv[i],"-fragment_tolerance")) { if (++i == argc) print_help("Missing model dir name!"); fragment_tolerance = atof(argv[i]); if (fragment_tolerance<0 || fragment_tolerance>0.75) print_help("Error: -fragment_toelerance should be 0-0.75\n"); } else if (! 
strcmp(argv[i],"-pm_tolerance")) { if (++i == argc) print_help("Missing model dir name!"); pm_tolerance = atof(argv[i]); if (pm_tolerance<0 || pm_tolerance>5.0) print_help("Error: -pm_toelerance should be 0-5.0\n"); } else if (!strcmp(argv[i],"-num_solutions")) { if (++i == argc) print_help("Missing number of solutions!"); num_solutions = atoi(argv[i]); if (num_solutions<=0 || num_solutions> 2000) print_help("Error: -num_solutions should be 1-2000\n"); } else if (!strcmp(argv[i],"-tag_length")) { if (++i == argc) print_help("Missing minimum length parameter!"); tag_length = atoi(argv[i]); if (tag_length<3 || tag_length>6) print_help("Error: -tag_length value must be 3-6\n"); } else if (!strcmp(argv[i],"-digest")) { if (++i == argc) print_help("Missing digest type parameter : NON_SPECIFIC, TRYPSIN\n"); if (! strcmp(argv[i],"NON_SPECIFIC")) { digest_type = NON_SPECIFIC_DIGEST; } else if (! strcmp(argv[i],"TRYPSIN")) { digest_type = TRYPSIN_DIGEST; } else { printf("Error: bad digest type: %s\n",argv[i]); print_help("Supported digest types: NON_SPECIFIC, TRYPSIN."); } } else if (! strcmp(argv[i],"-use_spectrum_charge")) { use_spectrum_charge = true; } else if (! strcmp(argv[i],"-use_spectrum_mz")) { use_spectrum_mz = true; } else if (! strcmp(argv[i],"-no_quality_filter")) { perform_filter = false; } else if (! strcmp(argv[i],"-correct_pm")) { correct_pm = true; } else if (! strcmp(argv[i],"-prm")) { prm_only = true; } else if (! strcmp(argv[i],"-prm_norm")) { prm_norm = true; prm_only = true; } else if (! strcmp(argv[i],"-output_aa_probs")) { output_aa_probs=true; } else if (! strcmp(argv[i],"-output_cumulative_probs")) { output_cumulative_probs=true; } else if (! strcmp(argv[i],"-pmcsqs_only")) { pmcsqs_only = true; } else if (! strcmp(argv[i],"-sqs_only")) { sqs_only = true; } else if (! 
strcmp(argv[i],"-min_filter_prob")) { if (++i == argc) print_help("Missing minimum probability parmater after -min_filter_prob !\n"); min_filter_prob = -1.0; min_filter_prob = atof(argv[i]); if (min_filter_prob<0.0 || min_filter_prob>=1.0 || argv[i][0] != '0') { print_help("The flag -min_filter_prob should be followed by a minimal probability value [0-1.0]\n"); exit(1); } } else if ( ! strcmp(argv[i],"-filter_spectra")) { got_filter_spectra = true; if (++i == argc) print_help("Missing minimum probability parmater after -filter_spectra !\n"); sqs_filter_thresh=atof(argv[i]); if (sqs_filter_thresh <0 || sqs_filter_thresh>1.0) print_help("Error: the sqs threshold should be in the range 0-1 (recommended below 0.1)\n"); if (++i == argc) print_help("Missing output directory for MGF files (second argument after -filter_spectra)!\n"); strcpy(mgf_out_dir,argv[i]); } else if (! strcmp(argv[i],"-specific_idx")) { if (++i == argc) print_help("Missing idx!"); specific_idx=atoi(argv[i]); } else if (! strcmp(argv[i],"-train_model")) { train_flag = true; if (++i == argc) print_help("Missing training tolerance!"); train_tolerance = atof(argv[i]); if (train_tolerance<0.001 || train_tolerance>1.0) print_help("Error: training tolerance should be in the range 0.001 - 1.0\n"); } else if (! strcmp(argv[i],"-start_train_idx")) { if (++i == argc) print_help("Missing start_train_idx!"); start_train_idx = atoi(argv[i]); } else if (! strcmp(argv[i],"-end_train_idx")) { if (++i == argc) print_help("end_train_idx!"); end_train_idx = atoi(argv[i]); } else if (! strcmp(argv[i],"-specific_reigon_model")) { if (++i == argc) print_help("specific_reigon_model!"); specific_charge = atoi(argv[i++]); specific_size = atoi(argv[i++]); specific_region = atoi(argv[i]); } else if (! strcmp(argv[i],"-specific_charge")) { if (++i == argc) print_help("specific_charge!"); specific_charge = atoi(argv[i]); } else if (! 
strcmp(argv[i],"-specific_size")) { if (++i == argc) print_help("specific_size!"); specific_size = atoi(argv[i]); } else if (! strcmp(argv[i],"-initial_model")) { got_initial_model = true; if (++i == argc) print_help("Missing initial model name!"); strcpy(initial_model,argv[i]); } else if (! strcmp(argv[i],"-neg_spec_list")) { got_neg_spec_list = true; if (++i == argc) print_help("Missing neg spec list!"); strcpy(neg_spec_list,argv[i]); } else if (! strcmp(argv[i],"-PTMs")) { got_PTM_string = true; if (++i == argc) print_help("Missing PTM list!"); strcpy(PTM_string,argv[i]); } else if (! strcmp(argv[i],"-inspect_tags")) { make_inspect_tags=true; if (++i == argc) print_help("inspect_tags!"); strcpy(tag_string,argv[i]); } else if (! strcmp(argv[i],"-rescore_inspect")) { got_rescore_inspect = true; if (++i == argc) print_help("Missing results file!"); strcpy(inspect_results_file,argv[i]); if (++i == argc) print_help("Missing new results file!"); strcpy(out_file,argv[i]); } else if (! strcmp(argv[i],"-recalibrate_inspect")) { got_recalibrate_inspect = true; if (++i == argc) print_help("Missing results file!"); strcpy(inspect_results_file,argv[i]); if (++i == argc) print_help("Missing new results file!"); strcpy(out_file,argv[i]); } else if ( ! strcmp(argv[i],"-make_peak_examples")) { got_make_peak_examples=true; } else if (! strcmp(argv[i],"-make_training_fa")) { make_training_fa=true; } else if (! strcmp(argv[i],"-test_tags")) { test_tags=true; if (++i == argc) print_help("test_tags!"); strcpy(tag_string,argv[i]); } else if (! strcmp(argv[i],"-num_test_cases")) { if (++i == argc) print_help("num_test_cases!"); num_test_cases = atoi(argv[i]); } else if (! strcmp(argv[i],"-tag_suffix")) { if (++i == argc) print_help("tag suffix!"); strcpy(tag_suffix,argv[i]); } else { printf("**********************************************************\n"); printf("\nError: Unkown command line option: %s\n\n",argv[i]); print_help(""); exit(0); } i++; } if (! 
got_model_file) print_help("Error: Missing model name!"); if (!got_input_file && ! got_list_file) print_help("Error: missing input file (either -file or -list must be used)."); Config *config = model.get_config(); if (got_model_dir) { config->set_resource_dir(string(model_dir)); } ////////////////////////////////////////////////////////////////// // Model Training if (train_flag) { if (got_initial_model) { model.read_model(initial_model); if (got_PTM_string) config->apply_selected_PTMs(PTM_string); model.read_rank_models(initial_model,true); model.read_cum_seq_prob_models(initial_model,true); } else { config->init_with_defaults(); config->set_tolerance(train_tolerance); config->set_digest_type(digest_type); if (got_PTM_string) config->apply_selected_PTMs(PTM_string); } model.set_model_name(string(model_file)); SpectraAggregator sa; if (! got_list_file) { if (got_input_file) { // fm.init_from_mgf(config,input_file); sa.initializeFromSpectraFilePath(input_file, config); } else { printf("Must supply a list of annotated spectra for training!\n"); exit(0); } } else { // fm.init_from_list_file(config,list_file); sa.initializeFromTextFile(list_file, config); } model.trainModelsInStages(model_file, sa, train_tolerance, start_train_idx, end_train_idx, specific_charge, specific_size, specific_region, (got_neg_spec_list ? neg_spec_list : NULL)); model.write_model(); exit(0); } /////////////////////////////////////////////////////////////////// // Model initializing (running some sort of de novo, need a model) // const time_t start_time = time(NULL); cout << "PepNovo V3. Build " << build_name << endl; cout << "Copyright 2008, The Regents of the University of California. All Rights Reserved." << endl; cout << "Created by Ari Frank ([email protected])" << endl << endl; cout << "Initializing models (this might take a few seconds)... 
" << flush; // TODO: incorporate PTM line into the model reading and also the other model stuff below model.read_model(model_file,true); if (got_PTM_string) config->apply_selected_PTMs(PTM_string); model.getPeptideCompositionAssigner().init_aa_translations(); model.read_rank_models(model_file,true); model.read_cum_seq_prob_models(model_file,true); cout << "Done." << endl; config = model.get_config(); config->set_digest_type(digest_type); if (fragment_tolerance>0) config->set_tolerance(fragment_tolerance); if (pm_tolerance>0) config->setPrecursorMassTolerance(pm_tolerance); if (correct_pm) config->set_need_to_estimate_pm(1); if (use_spectrum_mz) config->set_use_spectrum_mz(1); if (use_spectrum_charge) config->set_use_spectrum_charge(1); if (! perform_filter) config->set_filter_flag(0); if (config->get_pm_tolerance()<0.1) config->set_need_to_estimate_pm(0); cout << setprecision(4) << fixed; cout << "Fragment tolerance : " << config->getTolerance() << endl; cout << "PM tolernace : " << config->get_pm_tolerance() << endl; cout << "PTMs considered : " ; if (got_PTM_string) { cout << PTM_string << endl; } else { cout << "None" << endl; } /////////////////////////////////////////////////////////////////// // Training fa if (make_training_fa) { make_denovo_training_fa(model,input_file); exit(0); } /////////////////////////////////////////////////////////////////// // Inspect tags if (make_inspect_tags) { create_tag_file_for_inspect(model,input_file,tag_string,tag_suffix); exit(0); } if (test_tags) { benchmark_tags(model,list_file,tag_string,num_test_cases); exit(0); } //////////////////////////////////////////////////////////////////// // Rescore InsPecT if (got_rescore_inspect) { PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0); db_score->rescore_inspect_results(input_file,inspect_results_file,out_file); exit(0); } if (got_recalibrate_inspect) { cout << "Recalibrating delta scores in " << input_file << endl; PeptideRankScorer *db_score = 
(PeptideRankScorer *)model.get_rank_model_ptr(0); db_score->recalibrate_inspect_delta_scores(input_file,inspect_results_file,out_file); exit(0); } if (got_make_peak_examples) { cout << "Making peak examples " << input_file << endl; PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0); //db_score->make_peak_table_examples(input_file); exit(0); } /////////////////////////////////////////////////////////////////// // Make input file list vector<string> list_vector; if (got_list_file) { readListOfPaths(list_file, list_vector); } else list_vector.push_back(input_file); int correct_benchmark =0; int total_benchmark =0; int counter=0; if (got_make_training_mgf) { // make_training_mgf(config,list_file,num_training_spectra,out_file); exit(0); } if (sqs_only) { PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr(); if (! pmcsqs || ! pmcsqs->getIndInitializedSqs()) { cout << "Error: no spectrum quality score (SQS) for this model!" << endl; exit(1); } } else if (got_filter_spectra || pmcsqs_only) { PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr(); if (! pmcsqs || ! pmcsqs->getIndInitializedPmc() || ! pmcsqs->getIndInitializedSqs()) { cout << "Error: no parent mass correction (PMC) and/or quality score (SQS) for this model!" << endl; exit(1); } } /////////////////////////////////////////////////////////////////// // FILTER SPECTRA if (got_filter_spectra) { int num_written =0; int num_read = 0; PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr(); // pmcsqs->output_filtered_spectra_to_mgfs(config, list_vector, mgf_out_dir, sqs_filter_thresh, num_written, num_read); time_t curr_time = time(NULL); double elapsed_time = (curr_time - start_time); cout << "Processed " << list_vector.size() << " (" << num_read << " spectra)." << endl; cout << "Wrote " << num_written << " spectra to mgfs in " << mgf_out_dir << endl; cout << "Elapsed time " << fixed << elapsed_time << " seconds." 
<< endl; return 0; } ////////////////////////////////////////////////////////////////// // PRM if (prm_only) { perform_prm_on_list_of_files(model, list_vector, min_filter_prob, file_start_idx, prm_norm); // prm_benchmark(model, list_vector, min_pmcsqs_prob, file_start_idx); // FileManager fm; // fm.init_from_list(config,list_vector); // model.learn_prm_normalizer_values(fm); // model.write_prm_normalizer_values(); return 0; } if (fabs(config->get_aa2mass()[Cys]-103.0)<1) { cout << endl <<"*** Warning: searching with unmodified cystine, usually the PTM C+57 should be included ***" << endl << endl; } cout << endl; ////////////////////////////////////////////////////////////////// // PMCSQS if (pmcsqs_only) { // perform_pmcsqs_on_list_of_files(model, list_vector, file_start_idx); return 0; } ////////////////////////////////////////////////////////////////// // SQS if (sqs_only) { // perform_sqs_on_list_of_files(model, list_vector, file_start_idx); return 0; } ////////////////////////////////////////////////////////////////// // DENOVO AND TAGS if (tag_length<=0) { // perform_denovo_on_list_of_files(model, list_vector, file_start_idx, num_solutions, 7, 16, // false, min_filter_prob, output_aa_probs, output_cumulative_probs, cout); new_perform_denovo_on_list_of_files(model, list_vector, file_start_idx, num_solutions, 7, 16, false, min_filter_prob, output_aa_probs, output_cumulative_probs, cout); } else { perform_tags_on_list_of_files(model,list_vector,file_start_idx,num_solutions,tag_length, false, min_filter_prob, output_aa_probs, output_cumulative_probs, cout); } #ifdef WIN32 system("pause"); #endif return 0; }
/*************************************************************************************** This function touches up inspect search results by rescoring the sequences returned by inspect. The function produces a new inspect results file with the scores (and delta scores) replaced. ****************************************************************************************/ void PeptideRankScorer::rescore_inspect_results(char *spectra_file, char *inspect_res, char *new_res_file) const { AllScoreModels* allScoreModels = static_cast<AllScoreModels*>(this->allScoreModelsPtr_); Config *config = allScoreModels->get_config(); ifstream org_res(inspect_res); if (! org_res.is_open() || ! org_res.good()) { cout << "Error: couldn't open original inspect results file for reading:" << inspect_res << endl; exit(1); } ofstream new_res(new_res_file); if (! new_res.is_open() || ! new_res.good()) { cout << "Error: couldn't open new inspect results file for writing:" << new_res << endl; exit(1); } char line_buff[1024]; org_res.getline(line_buff,1024); bool read_line = true; vector<string> field_names; if (line_buff[0] != '#') { read_line = false; } else { string header = string(line_buff); split_string(header,field_names); // int i; // for (i=0; i<field_names.size(); i++) // cout << i << "\t" << field_names[i] << endl; cout << "Header:" << endl << line_buff << endl; } vector<ScanCandidateSet> cand_sets; vector<int> scan_mapping; cand_sets.clear(); scan_mapping.resize(100000,-1); while (! org_res.eof()) { vector<string> fields; if (read_line) { org_res.getline(line_buff,1024); if (org_res.gcount() < 5) continue; } else { read_line = true; } split_string(line_buff,fields); InspectResultsLine res; res.parse_from_fields(config,fields); if (cand_sets.size()==0 || ! 
cand_sets[cand_sets.size()-1].add_new_line(res)) { ScanCandidateSet new_set; new_set.add_new_line(res); if (new_set.scan>=scan_mapping.size()) scan_mapping.resize(2*scan_mapping.size(),-1); scan_mapping[new_set.scan]=cand_sets.size(); cand_sets.push_back(new_set); } } org_res.close(); cout << "Read results for " << cand_sets.size() << " scans..." << endl; SpectraAggregator sa; sa.initializeFromSpectraFilePath(spectra_file, config); SpectraList sl(sa); sl.selectAllAggregatorHeaders(); cout << "Read " << sl.getNumHeaders() << " spectra headers." << endl; if (sl.getNumHeaders() == 0) { cout << "Error: read not spectra headers from " << spectra_file << endl; return; } vector<bool> spectrum_indicators; spectrum_indicators.resize(cand_sets.size(),false); int num_found =0; int sc; for (sc=0; sc<sl.getNumHeaders(); sc++) { const SingleSpectrumHeader* header = sl.getSpectrumHeader(sc); int scan_number = (header->getScanNumber() >=0 ? header->getScanNumber() : header->getIndexInFile()); if (header->getFileType() == IFT_MGF) scan_number = header->getIndexInFile(); assert(scan_number>=0); if (scan_mapping[scan_number]<0) continue; AnnotatedSpectrum as; if (! 
as.readSpectrum(sa, header)) { continue; } spectrum_indicators[scan_mapping[scan_number]]=true; num_found++; ScanCandidateSet& cand_set = cand_sets[scan_mapping[scan_number]]; vector<PeptideSolution> peptide_sols; peptide_sols.resize(cand_set.results.size()); int j; for (j=0; j<cand_set.results.size(); j++) { InspectResultsLine& inspect_res = cand_set.results[j]; PeptideSolution& sol = peptide_sols[j]; sol.pep = inspect_res.pep; sol.pm_with_19 = sol.pep.get_mass_with_19(); sol.charge = inspect_res.Charge; sol.reaches_n_terminal = true; sol.reaches_c_terminal = true; } vector<score_pair> scores; // score_complete_sequences(peptide_sols,ssf,peaks,num_peaks,scores); scoreCompleteSequences(peptide_sols, as, scores); for (j=0; j<scores.size(); j++) cand_set.results[j].Score = scores[j].score; cand_set.recalbirate_scores(config); vector<string> pep_strings; pep_strings.resize(scores.size()); int max_len =0; for (j=0; j<cand_set.results.size(); j++) { pep_strings[j]=cand_set.results[j].pep.as_string(config); if (pep_strings[j].length()>max_len) max_len = pep_strings[j].length(); } if (1) { cand_set.output_to_stream(new_res,10); } else { for (j=0; j<cand_set.results.size(); j++) { cout << cand_set.scan << " " << cand_set.results[j].Charge << "\t"; cout << cand_set.results[j].Protein.substr(0,3) << " " << pep_strings[j]; if (pep_strings[j].length()<max_len) { int k; for (k=pep_strings[j].length(); k<max_len; k++) cout << " "; } cout << "\t" << cand_set.results[j].MQScore << "\t" << cand_set.results[j].Score << "\t" << cand_set.results[j].DeltaScore << "\t" << cand_set.results[j].DeltaScoreOther << endl; } cout << endl; } } if (num_found<cand_sets.size()) { cout << "Warning: found only " << num_found << "/" << cand_sets.size() << " of the scans scored by InsPecT!" << endl; } else { cout << "All scored scans found in spectrum file." << endl; } }
void PrmNodeScoreModel::trainNodeScoreModels(void* allScoreModelsVoidPointer, const char *name, const SpectraAggregator& sa, int specificCharge, int specificSize, int specificRegion) { AllScoreModels* allScoreModels = static_cast<AllScoreModels*>(allScoreModelsVoidPointer); config_ = allScoreModels->get_config(); // resize regional breakage score models according to regional fragment sets const vector< vector< vector< RegionalFragments > > >& all_rfs = config_->get_regional_fragment_sets(); int c; RegionalPrmNodeScoreModels_.resize(all_rfs.size()); for (c=0; c<all_rfs.size(); c++) { RegionalPrmNodeScoreModels_[c].resize(all_rfs[c].size()); int s; for (s=0; s<all_rfs[c].size(); s++) { RegionalPrmNodeScoreModels_[c][s].resize(all_rfs[c][s].size()); int r; for (r=0; r<RegionalPrmNodeScoreModels_[c][s].size(); r++) if (! RegionalPrmNodeScoreModels_[c][s][r].get_was_initialized()) RegionalPrmNodeScoreModels_[c][s][r].init(config_,c,s,r); } } // train models for (c=1; c<RegionalPrmNodeScoreModels_.size(); c++) { if (RegionalPrmNodeScoreModels_.size() == 0 || (specificCharge>0 && specificCharge != c)) continue; if (sa.getNumSpectraWithCharge(c)<200) { cout << "WARNING: insufficient number of spectra to train breakage model for charge " << c << endl; cout << " only " << sa.getNumSpectraWithCharge(c) << " spectra were found so this charge is being skipped!" << endl << endl; continue; } int s; for (s=0; s<RegionalPrmNodeScoreModels_[c].size(); s++) { if (specificSize>=0 && s != specificSize) continue; int r; for (r=0; r<RegionalPrmNodeScoreModels_[c][s].size(); r++) { if (specificRegion>=0 && r != specificRegion) continue; RegionalPrmNodeScoreModels_[c][s][r].trainRegionalScoreModel(allScoreModelsVoidPointer, name, sa); } } } // train PRM normalizer values // cout << endl << "Training PRM normalizer vlaues..." << endl; // TODO fix this issue, it needs to use the AllScoreModels class // learn_prm_normalizer_values(fm); ind_was_initialized=true; }