void DetectabilitySimulation::predictDetectabilities(vector<String>& peptides_vector, vector<DoubleReal>& labels, vector<DoubleReal>& detectabilities) { // The support vector machine SVMWrapper svm; // initialize support vector machine LibSVMEncoder encoder; UInt k_mer_length = 0; DoubleReal sigma = 0.0; UInt border_length = 0; if (File::readable(dt_model_file_)) { svm.loadModel(dt_model_file_); } else { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation got invalid parameter. 'dt_model_file' " + dt_model_file_ + " is not readable"); } // load additional parameters if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO) { String add_paramfile = dt_model_file_ + "_additional_parameters"; if (!File::readable(add_paramfile)) { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: SVM parameter file " + add_paramfile + " is not readable"); } Param additional_parameters; ParamXMLFile paramFile; paramFile.load(add_paramfile, additional_parameters); if (additional_parameters.getValue("border_length") == DataValue::EMPTY && svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO) { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: No border length defined in additional parameters file."); } border_length = ((String)additional_parameters.getValue("border_length")).toInt(); if (additional_parameters.getValue("k_mer_length") == DataValue::EMPTY && svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO) { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: No k-mer length defined in additional parameters file."); } k_mer_length = ((String)additional_parameters.getValue("k_mer_length")).toInt(); if (additional_parameters.getValue("sigma") == DataValue::EMPTY && svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO) { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: No sigma defined in additional parameters file."); } sigma = ((String)additional_parameters.getValue("sigma")).toFloat(); } if (File::readable(dt_model_file_)) { svm.setParameter(SVMWrapper::BORDER_LENGTH, (Int) border_length); svm.setParameter(SVMWrapper::SIGMA, sigma); // to obtain probabilities svm.setParameter(SVMWrapper::PROBABILITY, 1); } // loading training data String sample_file = dt_model_file_ + "_samples"; svm_problem* training_data = NULL; if (File::readable(sample_file)) { training_data = encoder.loadLibSVMProblem(sample_file); svm.setTrainingSample(training_data); } else { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: SVM sample file " + sample_file + " is not readable"); } LOG_INFO << "Predicting peptide detectabilities.. " << endl; String allowed_amino_acid_characters = "ACDEFGHIKLMNPQRSTVWY"; // Encoding test data vector<DoubleReal> probs; probs.resize(peptides_vector.size(), 0); svm_problem* prediction_data = encoder.encodeLibSVMProblemWithOligoBorderVectors(peptides_vector, probs, k_mer_length, allowed_amino_acid_characters, svm.getIntParameter(SVMWrapper::BORDER_LENGTH)); svm.getSVCProbabilities(prediction_data, detectabilities, labels); // clean up when finished with prediction delete prediction_data; delete training_data; }
ExitCodes main_(Int, const char**) { vector<ProteinIdentification> protein_identifications; vector<PeptideIdentification> identifications; vector<ProteinIdentification> protein_identifications_negative; vector<PeptideIdentification> identifications_negative; vector<String> training_peptides; vector<DoubleReal> training_labels; PeptideHit temp_peptide_hit; SVMWrapper svm; LibSVMEncoder encoder; svm_problem* encoded_training_sample = 0; String allowed_amino_acid_characters = "ACDEFGHIKLMNPQRSTVWY"; map<SVMWrapper::SVM_parameter_type, DoubleReal> start_values; map<SVMWrapper::SVM_parameter_type, DoubleReal> step_sizes; map<SVMWrapper::SVM_parameter_type, DoubleReal> end_values; DoubleReal sigma_start = 0; DoubleReal sigma_step_size = 0; DoubleReal sigma_stop = 0; UInt number_of_partitions = 0; UInt number_of_runs = 0; map<SVMWrapper::SVM_parameter_type, DoubleReal> optimized_parameters; map<SVMWrapper::SVM_parameter_type, DoubleReal>::iterator parameters_iterator; bool additive_cv = true; Param additional_parameters; Int temp_type = POLY; String debug_string = ""; DoubleReal sigma = 0.1; UInt k_mer_length = 1; Int border_length = 0; bool non_redundant = false; bool skip_cv = getFlag_("cv:skip_cv"); svm.setParameter(SVMWrapper::PROBABILITY, 1); //------------------------------------------------------------- // parsing parameters //------------------------------------------------------------- String inputfile_positives = getStringOption_("in_positive"); String inputfile_negatives = getStringOption_("in_negative"); String temp_string = ""; String outputfile_name = getStringOption_("out"); UInt max_positive_count = getIntOption_("max_positive_count"); UInt max_negative_count = getIntOption_("max_negative_count"); //SVM type String type = getStringOption_("svm_type"); if (type == "NU_SVC") { svm.setParameter(SVMWrapper::SVM_TYPE, NU_SVC); } else if (type == "C_SVC") { svm.setParameter(SVMWrapper::SVM_TYPE, C_SVC); } else { writeLog_("Illegal svm type given. Svm type has to be either " + String("NU_SVC or C_SVC. Aborting!")); printUsage_(); return ILLEGAL_PARAMETERS; } //Kernel type type = getStringOption_("kernel_type"); if (type == "POLY") { svm.setParameter(SVMWrapper::KERNEL_TYPE, POLY); temp_type = POLY; } else if (type == "LINEAR") { svm.setParameter(SVMWrapper::KERNEL_TYPE, LINEAR); temp_type = LINEAR; } else if (type == "RBF") { svm.setParameter(SVMWrapper::KERNEL_TYPE, RBF); temp_type = RBF; } else if (type == "OLIGO") { svm.setParameter(SVMWrapper::KERNEL_TYPE, SVMWrapper::OLIGO); temp_type = SVMWrapper::OLIGO; } else if (type == "SIGMOID") { svm.setParameter(SVMWrapper::KERNEL_TYPE, SIGMOID); temp_type = SIGMOID; } else { writeLog_("Unknown kernel type given. Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } //parameters svm.setParameter(SVMWrapper::C, getDoubleOption_("c")); svm.setParameter(SVMWrapper::DEGREE, getIntOption_("degree")); if (svm.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC) { svm.setParameter(SVMWrapper::NU, getDoubleOption_("nu")); } //grid search parameters if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == POLY) { svm.setParameter(SVMWrapper::DEGREE, getIntOption_("degree")); if (!skip_cv) { DoubleReal degree_start = getIntOption_("cv:degree_start"); DoubleReal degree_step_size = getIntOption_("cv:degree_step_size"); if (!additive_cv && degree_step_size <= 1) { writeLog_("Step size of degree <= 1 and additive_cv is false. Aborting!"); return ILLEGAL_PARAMETERS; } DoubleReal degree_stop = getIntOption_("cv:degree_stop"); start_values.insert(make_pair(SVMWrapper::DEGREE, degree_start)); step_sizes.insert(make_pair(SVMWrapper::DEGREE, degree_step_size)); end_values.insert(make_pair(SVMWrapper::DEGREE, degree_stop)); } } if (svm.getIntParameter(SVMWrapper::SVM_TYPE) == C_SVC && !skip_cv) { DoubleReal c_start = getDoubleOption_("cv:c_start"); DoubleReal c_step_size = getDoubleOption_("cv:c_step_size"); if (!additive_cv && c_step_size <= 1) { writeLog_("Step size of c <= 1 and additive_cv is false. Aborting!"); return ILLEGAL_PARAMETERS; } DoubleReal c_stop = getDoubleOption_("cv:c_stop"); start_values.insert(make_pair(SVMWrapper::C, c_start)); step_sizes.insert(make_pair(SVMWrapper::C, c_step_size)); end_values.insert(make_pair(SVMWrapper::C, c_stop)); } if (svm.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC && !skip_cv) { DoubleReal nu_start = getDoubleOption_("cv:nu_start"); DoubleReal nu_step_size = getDoubleOption_("cv:nu_step_size"); if (!additive_cv && nu_step_size <= 1) { writeLog_("Step size of nu <= 1 and additive_cv is false. Aborting!"); return ILLEGAL_PARAMETERS; } DoubleReal nu_stop = getDoubleOption_("cv:nu_stop"); start_values.insert(make_pair(SVMWrapper::NU, nu_start)); step_sizes.insert(make_pair(SVMWrapper::NU, nu_step_size)); end_values.insert(make_pair(SVMWrapper::NU, nu_stop)); } border_length = getIntOption_("border_length"); svm.setParameter(SVMWrapper::BORDER_LENGTH, border_length); sigma = getDoubleOption_("sigma"); svm.setParameter(SVMWrapper::SIGMA, sigma); k_mer_length = getIntOption_("k_mer_length"); sigma_start = 0.; sigma_step_size = 0.; sigma_stop = 0.; if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO && !skip_cv) { sigma_start = getDoubleOption_("cv:sigma_start"); sigma_step_size = getDoubleOption_("cv:sigma_step_size"); if (!additive_cv && sigma_step_size <= 1) { writeLog_("Step size of sigma <= 1 and additive_cv is false. Aborting!"); return ILLEGAL_PARAMETERS; } sigma_stop = getDoubleOption_("cv:sigma_stop"); start_values.insert(make_pair(SVMWrapper::SIGMA, sigma_start)); step_sizes.insert(make_pair(SVMWrapper::SIGMA, sigma_step_size)); end_values.insert(make_pair(SVMWrapper::SIGMA, sigma_stop)); debug_string = "CV from sigma = " + String(sigma_start) + " to sigma = " + String(sigma_stop) + " with step size " + String(sigma_step_size); writeDebug_(debug_string, 1); } if (!skip_cv && !start_values.empty()) { number_of_runs = getIntOption_("cv:number_of_runs"); writeDebug_(String("Number of CV runs: ") + String(number_of_runs), 1); number_of_partitions = getIntOption_("cv:number_of_partitions"); writeDebug_(String("Number of CV partitions: ") + String(number_of_partitions), 1); additive_cv = getFlag_("additive_cv"); } Int debug_level = getIntOption_("debug"); non_redundant = !(getFlag_("redundant")); //------------------------------------------------------------- // reading input //------------------------------------------------------------- String document_id; IdXMLFile().load(inputfile_positives, protein_identifications, identifications, document_id); IdXMLFile().load(inputfile_negatives, protein_identifications_negative, identifications_negative, document_id); //------------------------------------------------------------- // calculations //------------------------------------------------------------- for (Size i = 0; i < identifications.size(); i++) { const vector<PeptideHit>& temp_peptide_hits = identifications[i].getHits(); Size temp_size = temp_peptide_hits.size(); if (temp_size > 0) { for (Size j = 0; j < temp_size; ++j) { temp_peptide_hit = temp_peptide_hits[j]; temp_string = temp_peptide_hit.getSequence().toUnmodifiedString(); if (!non_redundant || find(training_peptides.begin(), training_peptides.end(), temp_string) == training_peptides.end()) { training_peptides.push_back(temp_peptide_hit.getSequence().toUnmodifiedString()); } } } } training_labels.resize(training_peptides.size(), 1.0); debug_string = String(training_labels.size()) + " positive sequences read"; writeDebug_(debug_string, 1); if (training_peptides.size() > max_positive_count) { random_shuffle(training_peptides.begin(), training_peptides.end()); training_peptides.resize(max_positive_count, ""); training_labels.resize(max_positive_count, 1.); } debug_string = String(training_peptides.size()) + " positive sequences for training"; writeDebug_(debug_string, 1); UInt counter = 0; vector<String> temp_training_peptides; for (Size i = 0; i < identifications_negative.size(); i++) { const vector<PeptideHit>& temp_peptide_hits = identifications_negative[i].getHits(); Size temp_size = temp_peptide_hits.size(); if (temp_size > 0) { for (Size j = 0; j < temp_size; ++j) { temp_peptide_hit = temp_peptide_hits[j]; temp_string = temp_peptide_hit.getSequence().toUnmodifiedString(); if (find(training_peptides.begin(), training_peptides.end(), temp_string) != training_peptides.end()) { writeLog_("Peptides are not allowed to occur in the positive and the negative set. Example: '" + temp_string + "'"); return ILLEGAL_PARAMETERS; } if (!non_redundant || find(training_peptides.begin(), training_peptides.end(), temp_string) == training_peptides.end()) { temp_training_peptides.push_back(temp_peptide_hit.getSequence().toUnmodifiedString()); training_labels.push_back(-1.0); ++counter; } } } } if (non_redundant) { debug_string = String(counter) + " non redundant negative sequences read"; } else { debug_string = String(counter) + " negative sequences read"; } writeDebug_(debug_string, 1); if (temp_training_peptides.size() > max_negative_count) { random_shuffle(temp_training_peptides.begin(), temp_training_peptides.end()); temp_training_peptides.resize(max_negative_count, ""); training_labels.resize(training_peptides.size() + max_negative_count, -1.); } training_peptides.insert(training_peptides.end(), temp_training_peptides.begin(), temp_training_peptides.end()); debug_string = String(temp_training_peptides.size()) + " negative sequences for training"; writeDebug_(debug_string, 1); temp_training_peptides.clear(); if (temp_type == LINEAR || temp_type == POLY || temp_type == RBF) { UInt maximum_sequence_length = 50; encoded_training_sample = encoder.encodeLibSVMProblemWithCompositionAndLengthVectors(training_peptides, training_labels, allowed_amino_acid_characters, maximum_sequence_length); } else if (temp_type == SVMWrapper::OLIGO) { encoded_training_sample = encoder.encodeLibSVMProblemWithOligoBorderVectors(training_peptides, training_labels, k_mer_length, allowed_amino_acid_characters, svm.getIntParameter(SVMWrapper::BORDER_LENGTH)); } if (!start_values.empty()) { String digest = ""; bool output_flag = false; if (debug_level >= 1) { output_flag = true; vector<String> parts; outputfile_name.split('/', parts); if (parts.empty()) { digest = outputfile_name; } else { digest = parts[parts.size() - 1]; } } SVMData dummy; DoubleReal cv_quality = svm.performCrossValidation(encoded_training_sample, dummy, false, start_values, step_sizes, end_values, number_of_partitions, number_of_runs, optimized_parameters, additive_cv, output_flag, "performances_" + digest + ".txt"); String debug_string = "Best parameters found in cross validation:"; for (parameters_iterator = optimized_parameters.begin(); parameters_iterator != optimized_parameters.end(); ++parameters_iterator) { svm.setParameter(parameters_iterator->first, parameters_iterator->second); if (parameters_iterator->first == SVMWrapper::DEGREE) { debug_string += " degree: " + String(parameters_iterator->second); } else if (parameters_iterator->first == SVMWrapper::C) { debug_string += " C: " + String(parameters_iterator->second); } else if (parameters_iterator->first == SVMWrapper::NU) { debug_string += " nu: " + String(parameters_iterator->second); } else if (parameters_iterator->first == SVMWrapper::SIGMA) { debug_string += " sigma: " + String(parameters_iterator->second); } } debug_string += " with performance " + String(cv_quality); writeDebug_(debug_string, 1); } svm.train(encoded_training_sample); //------------------------------------------------------------- // writing output //------------------------------------------------------------- svm.saveModel(outputfile_name); // If the oligo-border kernel is used some additional information has to be stored if (temp_type == SVMWrapper::OLIGO) { encoder.storeLibSVMProblem(outputfile_name + "_samples", encoded_training_sample); additional_parameters.setValue("kernel_type", temp_type); if (temp_type == SVMWrapper::OLIGO) { additional_parameters.setValue("border_length", svm.getIntParameter(SVMWrapper::BORDER_LENGTH)); additional_parameters.setValue("k_mer_length", k_mer_length); additional_parameters.setValue("sigma", svm.getDoubleParameter(SVMWrapper::SIGMA)); } ParamXMLFile paramFile; paramFile.store(outputfile_name + "_additional_parameters", additional_parameters); } return EXECUTION_OK; }