/****************************************************************************** Train PMC models from positive example files *******************************************************************************/ void PMCSQS_Scorer::train_pmc_rank_models(Config *config, const FileManager& fm, int sel_charge, bool overwrite) { const bool sample_diagnostic = false; const vector<int>& spectra_counts = fm.get_spectra_counts(); max_model_charge=0; int charge; for (charge=1; charge<spectra_counts.size(); charge++) { if (spectra_counts[charge]>=MIN_SPECTRA_FOR_PMCSQS_MODEL) max_model_charge=charge; } const int max_to_read_per_file = 40000; vector<string> real_names; init_PMC_feature_names(real_names); // try and read existing pmc model, otherwise init a new one string pmc_path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt"; ifstream model_stream(pmc_path.c_str()); if (model_stream.is_open() && model_stream.good()) { model_stream.close(); string pmcr_name = config->get_model_name() + "_PMCR.txt"; const char *path = pmc_path.c_str(); this->read_pmc_rank_models(config,(char *)pmcr_name.c_str()); } else { set_pmc_mass_thresholds(); this->set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19 this->set_bin_increment(0.1); pmc_rank_models.resize(pmc_rank_mass_thresholds.size()); pmc_charge_mz_biases.resize(pmc_rank_mass_thresholds.size()); } const double prop_train = 0.5; // It is assumed that the mass thresholds were set according to the training data // (this is done manually with values encoded in the set_mass_threhsolds function) for (charge=1; charge<=max_model_charge; charge++) { if (sel_charge>0 && charge != sel_charge) continue; const int num_sizes = pmc_rank_mass_thresholds[charge].size(); pmc_rank_models[charge].resize(num_sizes+1,NULL); pmc_charge_mz_biases[charge].resize(num_sizes+1,0); int size_idx; for (size_idx=0; size_idx<=num_sizes; size_idx++) { if (pmc_rank_models[charge][size_idx] && ! overwrite) continue; vector<SingleSpectrumFile *> test_ssfs; BasicSpecReader bsr; static QCPeak peaks[5000]; RankBoostDataset train_ds, test_ds, pos_ds, neg_ds; mass_t min_mass =0; mass_t max_mass = POS_INF; if (size_idx>0) min_mass = pmc_rank_mass_thresholds[charge][size_idx-1]; if (size_idx<num_sizes) max_mass = pmc_rank_mass_thresholds[charge][size_idx]; // these ranges are given according to pm_with_19 // so files should be selected through select_files and not // select_file_in_mz_range FileSet fs; fs.select_files(fm,min_mass,max_mass,-1,-1,charge); if (fs.get_total_spectra()<500) continue; int num_groups_in_train=0; int num_groups_in_test=0; cout << "TRAINING charge " << charge << " size " << size_idx << " (" << min_mass << "-" << max_mass << ")" << endl; fs.randomly_reduce_ssfs(max_to_read_per_file); const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers(); const int num_samples = all_ssf.size(); // first find the bias in number of bins between the true m/z bin and // the optimal m/z bin vector<bool> skipped_idxs; skipped_idxs.resize(num_samples,false); int skipped_bad_mz=0; mass_t total_bias=0; int i; for (i=0; i<num_samples; i++) { SingleSpectrumFile* ssf = all_ssf[i]; BasicSpectrum bs; bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks); bs.peaks = peaks; bs.ssf = ssf; ssf->peptide.calc_mass(config); const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge; if (fabs(true_mz - bs.ssf->m_over_z)>2.5) { //cout << setprecision(2) << true_mz << " <---> " << bs.ssf->m_over_z << " skipping" << endl; skipped_bad_mz++; skipped_idxs[i]=true; continue; } init_for_current_spec(config,bs); calculate_curr_spec_pmc_values(bs, bin_increment); // find the true_mz_bin_idx const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge]; int true_mz_bin_idx=0; while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz) true_mz_bin_idx++; if (true_mz_bin_idx == pmc_stats.size()) true_mz_bin_idx--; if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z) true_mz_bin_idx--; int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge); if (opt_bin_idx <=0 || opt_bin_idx == pmc_stats.size()-1) { skipped_bad_mz++; skipped_idxs[i]=true; continue; } total_bias += (pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z); if (fabs(pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z)>4.0) { cout << "opt bin: " << opt_bin_idx << " (" << pmc_stats[opt_bin_idx].m_over_z << ") "; cout << "tru bin: " << true_mz_bin_idx << " ("<< pmc_stats[true_mz_bin_idx].m_over_z << ")" << endl; } } mass_t mz_bias = total_bias / (mass_t)(num_samples-skipped_bad_mz); pmc_charge_mz_biases[charge][size_idx]=mz_bias; cout << "m/z bias: " << setprecision(4) << mz_bias << endl; cout << "skipped " << skipped_bad_mz << "/" << num_samples << " because of m/z more than 2.5 away from observed..." << endl; // pmc_charge_mz_biases[charge][size_idx] = 0; for (i=0; i<num_samples; i++) { if (skipped_idxs[i]) continue; SingleSpectrumFile* ssf = all_ssf[i]; BasicSpectrum bs; bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks); bs.peaks = peaks; bs.ssf = ssf; const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge; init_for_current_spec(config,bs); calculate_curr_spec_pmc_values(bs, bin_increment); // find the true_mz_bin_idx const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge]; int true_mz_bin_idx=0; while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz) true_mz_bin_idx++; if (true_mz_bin_idx == pmc_stats.size()) true_mz_bin_idx--; if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z) true_mz_bin_idx--; int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge); static vector<RankBoostSample> spec_samples; fill_RankBoost_smaples_with_PMC(bs, charge, spec_samples); // select samples and add them to pmc_ds int good_idx; vector<int> bad_idxs; select_training_sample_idxs(charge,spec_samples,bs,good_idx,bad_idxs); const bool ind_add_to_train = (my_random()<prop_train); int group_idx; if (ind_add_to_train) { group_idx= num_groups_in_train++; } else { group_idx= num_groups_in_test++; test_ssfs.push_back(ssf); } RankBoostDataset& ds = (ind_add_to_train ? train_ds : test_ds); const int pos_index = ds.get_num_samples(); spec_samples[good_idx].group_idx = group_idx; spec_samples[good_idx].rank_in_group=0; ds.add_sample(spec_samples[good_idx]); if (sample_diagnostic) pos_ds.add_sample(spec_samples[good_idx]); int j; for (j=0; j<bad_idxs.size(); j++) { const int bad_idx = bad_idxs[j]; if (bad_idx < 0 || bad_idx>= spec_samples.size()) continue; spec_samples[bad_idx].group_idx=group_idx; spec_samples[bad_idx].rank_in_group=1; ds.add_to_phi_vector(ds.get_num_samples(),pos_index); ds.add_sample(spec_samples[bad_idx]); if (sample_diagnostic) neg_ds.add_sample(spec_samples[bad_idx]); } } train_ds.set_num_groups(num_groups_in_train); test_ds.set_num_groups(num_groups_in_test); train_ds.compute_total_phi_weight(); train_ds.initialize_potenital_lists(); train_ds.initialzie_real_feature_table(real_names.size()); test_ds.compute_total_phi_weight(); if (pmc_rank_models[charge][size_idx]) delete pmc_rank_models[charge][size_idx]; pmc_rank_models[charge][size_idx] = new RankBoostModel; RankBoostModel* boost = pmc_rank_models[charge][size_idx]; vector<string> empty; empty.clear(); boost->init_rankboost_model_feature_names(empty,real_names); boost->init_rankboost_model_for_training(train_ds,100,25); train_ds.initialize_real_vote_lists(*boost); if (sample_diagnostic) { boost->summarize_features_pos_neg(pos_ds.get_samples(),neg_ds.get_samples()); } else boost->summarize_features(train_ds.get_samples()); boost->train_rankboost_model(train_ds,4000,NULL,&test_ds); boost->ouput_ranked_feature_list(); // output_pmc_rank_results(fm,charge,test_ssfs); // exit(0); ind_initialized_pmcr = true; // string path; // path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCRtt.txt"; // this->write_pmc_rank_models(path.c_str()); } } string path; path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt"; this->write_pmc_rank_models(path.c_str()); ind_initialized_pmcr = true; }
void PMCSQS_Scorer::train_sqs_models(Config *config, const FileManager& fm_pos, const char *neg_list, int specificCharge, vector<vector<float> > *inputWeights) { vector< vector< vector<ME_Regression_Sample> > > samples; // neg, p1, p2, p3 / sizeIndex FileManager fm_neg; const vector<int>& spectra_counts = fm_pos.get_spectra_counts(); maximalChargeWithModels_ = (inputWeights ? inputWeights->size()-1 : 3); int charge; set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19 set_bin_increment(0.1); this->set_sqs_mass_thresholds(); if (this->pmcMassThresholds_.size() == 0) { pmcMassThresholds_=config->get_size_thresholds(); } vector<vector<float> > classWeights; if (inputWeights) { classWeights = *inputWeights; } else { classWeights.resize(maximalChargeWithModels_+1); int i; for (i=0; i<classWeights.size(); i++) classWeights[i].resize(maximalChargeWithModels_+1,1.0); } const int numSizes = this->sqsMassThresholds_.size(); cout << "NUM SIZE MODELS: " << numSizes+1 << endl; samples.resize(maximalChargeWithModels_+1); fm_neg.init_from_list_file(config, neg_list); const int max_to_read_per_file = 8000; for (charge=0; charge<=maximalChargeWithModels_; charge++) { if (charge>0 && specificCharge>0 && charge != specificCharge) continue; int sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { const mass_t minMass = (sizeIndex == 0 ? 0 : sqsMassThresholds_[sizeIndex-1]); const mass_t maxMass = (sizeIndex == numSizes ? POS_INF : sqsMassThresholds_[sizeIndex]); samples[charge].resize(numSizes+1); BasicSpecReader bsr; QCPeak peaks[5000]; FileSet fs; if (charge == 0) { fs.select_files_in_mz_range(fm_neg,minMass, maxMass,0); } else { fs.select_files_in_mz_range(fm_pos, minMass, maxMass, charge); } cout << "Found " << fs.get_total_spectra() << " for charge " << charge << " ranges:" << minMass << " - " << maxMass << endl; fs.randomly_reduce_ssfs(max_to_read_per_file); const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers(); const int label = (charge == 0 ? 1 : 0); const int num_samples = all_ssf.size(); samples[charge][sizeIndex].resize(num_samples); int i; for (i=0; i<num_samples; i++) { SingleSpectrumFile* ssf = all_ssf[i]; BasicSpectrum bs; bs.peaks = peaks; bs.ssf = ssf; if (charge==0) { bs.num_peaks = bsr.read_basic_spec(config,fm_neg,ssf,peaks); bs.ssf->charge=0; } else bs.num_peaks = bsr.read_basic_spec(config,fm_pos,ssf,peaks); init_for_current_spec(config,bs); calculate_curr_spec_pmc_values(bs, bin_increment); fill_fval_vector_with_SQS(bs, samples[charge][sizeIndex][i]); samples[charge][sizeIndex][i].label = label; } } } // cout sample composition cout << "Sample composition:" << endl; for (charge=0; charge<=maximalChargeWithModels_; charge++) { cout << charge; int i; for (i=0; i<samples[charge].size(); i++) cout << "\t" << samples[charge][i].size(); cout << endl; } // create SQS models this->sqs_models.resize(maximalChargeWithModels_+1); for (charge =0; charge<=maximalChargeWithModels_; charge++) { sqs_models[charge].resize(maximalChargeWithModels_+1); int j; for (j=0; j<sqs_models[charge].size(); j++) sqs_models[charge][j].resize(numSizes+1,NULL); } for (charge=1; charge<=maximalChargeWithModels_; charge++) { int sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { ME_Regression_DataSet ds; cout << endl << "CHARGE " << charge << " SIZE " << sizeIndex << endl; ds.num_classes=2; ds.num_features=SQS_NUM_FIELDS; ds.add_samples(samples[0][sizeIndex]); ds.add_samples(samples[charge][sizeIndex]); ds.tally_samples(); if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001) { cout << "Warning: insufficient number of samples, not trianing model for this charge " << charge << " size " << sizeIndex << endl; continue; } const double pos_weight = 0.2 + classWeights[charge][sizeIndex]*0.3; ds.randomly_remove_samples_with_activated_feature(1,SQS_IND_MAX_TAG_LENGTH_ABOVE_4,0.5); ds.calibrate_class_weights(pos_weight); // charge vs bad spectra ds.print_feature_summary(cout,SQS_var_names); sqs_models[charge][0][sizeIndex]=new ME_Regression_Model; sqs_models[charge][0][sizeIndex]->train_cg(ds,250); sqs_models[charge][0][sizeIndex]->print_ds_probs(ds); } } //////////////////////////////////////////// // train model vs. model if charge1>charge2 if (1) { int charge1,charge2; for (charge1=2; charge1<=maximalChargeWithModels_; charge1++) { for (charge2=1; charge2<charge1; charge2++) { int sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { ME_Regression_DataSet ds; ds.num_classes=2; ds.num_features=SQS_NUM_FIELDS; ds.add_samples(samples[charge1][sizeIndex]); int i; for (i=0; i<samples[charge2][sizeIndex].size(); i++) { samples[charge2][sizeIndex][i].label=1; ds.add_sample(samples[charge2][sizeIndex][i]); samples[charge2][sizeIndex][i].label=0; } float relative_weight = classWeights[charge1][sizeIndex]/ (classWeights[charge1][sizeIndex]+classWeights[charge2][sizeIndex]); ds.tally_samples(); if (ds.class_weights[0]<0.0001 || ds.class_weights[1]<0.0001) { cout << "Warning: insufficient number of samples, not trianing model for charge " << charge1 << " vs charge " << charge2<< " (size " << sizeIndex << ")" << endl; continue; } ds.calibrate_class_weights(relative_weight); sqs_models[charge1][charge2][sizeIndex] = new ME_Regression_Model; cout << endl << "CHARGE " << charge1 << " vs " << charge2 << " size " << sizeIndex << endl; cout << "Relative weights: " << charge1 << "/(" << charge1 << "+" << charge2 << "): " << relative_weight << endl; ds.print_feature_summary(cout,SQS_var_names); sqs_models[charge1][charge2][sizeIndex]->train_cg(ds,300); sqs_models[charge1][charge2][sizeIndex]->print_ds_probs(ds); } } } } init_sqs_correct_factors(maximalChargeWithModels_, sqsMassThresholds_.size()); //////////////////////////////////////////// // final report on datasets cout << endl; int sizeIndex; for (sizeIndex=0; sizeIndex<=numSizes; sizeIndex++) { cout << endl << "SIZE: " << sizeIndex << endl; cout << "--------" << endl; float p_thresh = 0.05; int d; for (d=0; d<=maximalChargeWithModels_; d++) { vector<int> counts; vector<int> max_counts; counts.resize(maximalChargeWithModels_+1,0); max_counts.resize(maximalChargeWithModels_+1,0); int i; for (i=0; i<samples[d][sizeIndex].size(); i++) { bool above_thresh=false; float max_prob=0; int max_class=0; int c; for (c=1; c<=maximalChargeWithModels_; c++) { if (! sqs_models[c][0][sizeIndex]) continue; float prob = sqs_models[c][0][sizeIndex]->p_y_given_x(0,samples[d][sizeIndex][i]); if (prob>p_thresh) { counts[c]++; above_thresh=true; if (prob>max_prob) { max_prob=prob; max_class=c; } } } max_counts[max_class]++; if (! above_thresh) counts[0]++; } cout << d << "\t"; for (i=0; i<=maximalChargeWithModels_; i++) cout << fixed << setprecision(4) << max_counts[i]/(float)samples[d][sizeIndex].size() << "\t"; cout << endl; } } ind_initialized_sqs = true; string path; path = config->get_resource_dir() + "/" + config->get_model_name() + "_SQS.txt"; write_sqs_models(path.c_str()); }