Ejemplo n.º 1
0
  /**
    @brief Scores peptides with a previously stored SVM model.

    The model is loaded from dt_model_file_ and its training samples from
    "<dt_model_file_>_samples". If the loaded model uses the oligo kernel,
    border length, k-mer length and sigma are additionally read from
    "<dt_model_file_>_additional_parameters". The peptides are then encoded
    with oligo border vectors and handed to SVMWrapper::getSVCProbabilities().

    @param peptides_vector Peptide sequences to encode and score.
    @param labels Passed to SVMWrapper::getSVCProbabilities() as its last argument
                  (presumably receives the predicted class labels - confirm against SVMWrapper).
    @param detectabilities Passed to SVMWrapper::getSVCProbabilities() as its second argument
                           (presumably receives the predicted probabilities - confirm against SVMWrapper).

    @throws Exception::InvalidParameter if the model file, the sample file or the
            additional parameter file is not readable, or if the additional
            parameter file lacks "border_length", "k_mer_length" or "sigma".
  */
  void DetectabilitySimulation::predictDetectabilities(vector<String>& peptides_vector, vector<DoubleReal>& labels,
                                                       vector<DoubleReal>& detectabilities)
  {
    // The support vector machine and the encoder producing its input format
    SVMWrapper svm;
    LibSVMEncoder encoder;

    // Stay at 0 unless the model uses the oligo kernel
    UInt k_mer_length = 0;
    DoubleReal sigma = 0.0;
    UInt border_length = 0;

    if (!File::readable(dt_model_file_))
    {
      throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation got invalid parameter. 'dt_model_file' " + dt_model_file_ + " is not readable");
    }
    svm.loadModel(dt_model_file_);

    // The oligo kernel needs parameters that are kept in a side file next to the model
    if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
    {
      const String add_paramfile = dt_model_file_ + "_additional_parameters";
      if (!File::readable(add_paramfile))
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: SVM parameter file " + add_paramfile + " is not readable");
      }

      Param additional_parameters;
      ParamXMLFile paramFile;
      paramFile.load(add_paramfile, additional_parameters);

      if (additional_parameters.getValue("border_length") == DataValue::EMPTY)
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: No border length defined in additional parameters file.");
      }
      border_length = ((String)additional_parameters.getValue("border_length")).toInt();

      if (additional_parameters.getValue("k_mer_length") == DataValue::EMPTY)
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: No k-mer length defined in additional parameters file.");
      }
      k_mer_length = ((String)additional_parameters.getValue("k_mer_length")).toInt();

      if (additional_parameters.getValue("sigma") == DataValue::EMPTY)
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: No sigma defined in additional parameters file.");
      }
      sigma = ((String)additional_parameters.getValue("sigma")).toFloat();
    }

    // The model file is known to be readable here (checked above), so the
    // kernel parameters can be applied unconditionally.
    svm.setParameter(SVMWrapper::BORDER_LENGTH, static_cast<Int>(border_length));
    svm.setParameter(SVMWrapper::SIGMA, sigma);
    // to obtain probabilities
    svm.setParameter(SVMWrapper::PROBABILITY, 1);

    // loading training data
    const String sample_file = dt_model_file_ + "_samples";
    if (!File::readable(sample_file))
    {
      throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "DetectibilitySimulation: SVM sample file " + sample_file + " is not readable");
    }

    // Both problems are raw pointers released by this function; everything that
    // may throw while they are alive is wrapped so that they are released on
    // the error path as well.
    // NOTE(review): as before, only the svm_problem objects themselves are
    // deleted here; whether the encoder expects further cleanup of their
    // contents should be confirmed against LibSVMEncoder.
    svm_problem* training_data = encoder.loadLibSVMProblem(sample_file);
    svm_problem* prediction_data = NULL;
    try
    {
      svm.setTrainingSample(training_data);

      LOG_INFO << "Predicting peptide detectabilities..    " << endl;

      String allowed_amino_acid_characters = "ACDEFGHIKLMNPQRSTVWY";

      // Encoding test data; the labels of the prediction problem are dummies (0)
      vector<DoubleReal> probs;
      probs.resize(peptides_vector.size(), 0);

      prediction_data = encoder.encodeLibSVMProblemWithOligoBorderVectors(peptides_vector, probs,
                                                                          k_mer_length,
                                                                          allowed_amino_acid_characters,
                                                                          svm.getIntParameter(SVMWrapper::BORDER_LENGTH));

      svm.getSVCProbabilities(prediction_data, detectabilities, labels);
    }
    catch (...)
    {
      delete prediction_data;
      delete training_data;
      throw;
    }

    // clean up when finished with prediction
    delete prediction_data;
    delete training_data;
  }
Ejemplo n.º 2
0
  /**
    @brief Trains an SVM classifier from a positive and a negative idXML file and stores the model.

    Steps performed:
    - configure SVM type, kernel type and kernel parameters from the tool options,
    - unless "cv:skip_cv" is set, collect the grid (start / step size / stop) for
      every parameter that is to be optimised by cross validation,
    - read the peptide hits of both input files; positives get label 1.0,
      negatives get label -1.0; both sets are capped (after shuffling) at
      "max_positive_count" / "max_negative_count",
    - encode the sequences (composition/length vectors for LINEAR, POLY and RBF,
      oligo border vectors for OLIGO),
    - optionally run the cross validation and apply the best parameters,
    - train, then write the model to "out"; for the oligo kernel the training
      samples ("<out>_samples") and the kernel parameters
      ("<out>_additional_parameters") are written as well.

    @return ILLEGAL_PARAMETERS if the SVM or kernel type is unknown, if a step
            size <= 1 is combined with non-additive cross validation, if a
            peptide occurs in both the positive and the negative set, or if no
            training sample could be encoded for the chosen kernel;
            EXECUTION_OK otherwise.
  */
  ExitCodes main_(Int, const char**)
  {
    vector<ProteinIdentification> protein_identifications;
    vector<PeptideIdentification> identifications;
    vector<ProteinIdentification> protein_identifications_negative;
    vector<PeptideIdentification> identifications_negative;
    vector<String> training_peptides;
    vector<DoubleReal> training_labels;
    SVMWrapper svm;
    LibSVMEncoder encoder;
    svm_problem* encoded_training_sample = 0;
    String allowed_amino_acid_characters = "ACDEFGHIKLMNPQRSTVWY";
    map<SVMWrapper::SVM_parameter_type, DoubleReal> start_values;
    map<SVMWrapper::SVM_parameter_type, DoubleReal> step_sizes;
    map<SVMWrapper::SVM_parameter_type, DoubleReal> end_values;
    UInt number_of_partitions = 0;
    UInt number_of_runs = 0;
    map<SVMWrapper::SVM_parameter_type, DoubleReal> optimized_parameters;
    map<SVMWrapper::SVM_parameter_type, DoubleReal>::iterator parameters_iterator;
    Param additional_parameters;
    Int temp_type = POLY;
    String debug_string = "";

    const bool skip_cv = getFlag_("cv:skip_cv");

    // The step size checks below depend on this flag, so it has to be read
    // before them; reading it afterwards would leave the checks comparing
    // against the default value only.
    bool additive_cv = true;
    if (!skip_cv)
    {
      additive_cv = getFlag_("additive_cv");
    }

    svm.setParameter(SVMWrapper::PROBABILITY, 1);
    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------
    String inputfile_positives = getStringOption_("in_positive");
    String inputfile_negatives = getStringOption_("in_negative");

    String outputfile_name = getStringOption_("out");

    UInt max_positive_count = getIntOption_("max_positive_count");
    UInt max_negative_count = getIntOption_("max_negative_count");

    //SVM type
    String type = getStringOption_("svm_type");
    if (type == "NU_SVC")
    {
      svm.setParameter(SVMWrapper::SVM_TYPE, NU_SVC);
    }
    else if (type == "C_SVC")
    {
      svm.setParameter(SVMWrapper::SVM_TYPE, C_SVC);
    }
    else
    {
      writeLog_("Illegal svm type given. Svm type has to be either "
                + String("NU_SVC or C_SVC. Aborting!"));
      printUsage_();
      return ILLEGAL_PARAMETERS;
    }

    //Kernel type
    type = getStringOption_("kernel_type");
    if (type == "POLY")
    {
      temp_type = POLY;
    }
    else if (type == "LINEAR")
    {
      temp_type = LINEAR;
    }
    else if (type == "RBF")
    {
      temp_type = RBF;
    }
    else if (type == "OLIGO")
    {
      temp_type = SVMWrapper::OLIGO;
    }
    else if (type == "SIGMOID")
    {
      temp_type = SIGMOID;
    }
    else
    {
      writeLog_("Unknown kernel type given. Aborting!");
      printUsage_();
      return ILLEGAL_PARAMETERS;
    }
    svm.setParameter(SVMWrapper::KERNEL_TYPE, temp_type);

    //parameters
    svm.setParameter(SVMWrapper::C, getDoubleOption_("c"));
    svm.setParameter(SVMWrapper::DEGREE, getIntOption_("degree"));
    if (svm.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC)
    {
      svm.setParameter(SVMWrapper::NU, getDoubleOption_("nu"));
    }

    //grid search parameters
    if (temp_type == POLY && !skip_cv)
    {
      DoubleReal degree_start = getIntOption_("cv:degree_start");
      DoubleReal degree_step_size = getIntOption_("cv:degree_step_size");
      if (!additive_cv && degree_step_size <= 1)
      {
        writeLog_("Step size of degree <= 1 and additive_cv is false. Aborting!");
        return ILLEGAL_PARAMETERS;
      }
      DoubleReal degree_stop = getIntOption_("cv:degree_stop");

      start_values.insert(make_pair(SVMWrapper::DEGREE, degree_start));
      step_sizes.insert(make_pair(SVMWrapper::DEGREE, degree_step_size));
      end_values.insert(make_pair(SVMWrapper::DEGREE, degree_stop));
    }

    if (svm.getIntParameter(SVMWrapper::SVM_TYPE) == C_SVC && !skip_cv)
    {
      DoubleReal c_start = getDoubleOption_("cv:c_start");
      DoubleReal c_step_size = getDoubleOption_("cv:c_step_size");
      if (!additive_cv && c_step_size <= 1)
      {
        writeLog_("Step size of c <= 1 and additive_cv is false. Aborting!");
        return ILLEGAL_PARAMETERS;
      }
      DoubleReal c_stop = getDoubleOption_("cv:c_stop");

      start_values.insert(make_pair(SVMWrapper::C, c_start));
      step_sizes.insert(make_pair(SVMWrapper::C, c_step_size));
      end_values.insert(make_pair(SVMWrapper::C, c_stop));
    }

    if (svm.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC && !skip_cv)
    {
      DoubleReal nu_start = getDoubleOption_("cv:nu_start");
      DoubleReal nu_step_size = getDoubleOption_("cv:nu_step_size");
      if (!additive_cv && nu_step_size <= 1)
      {
        writeLog_("Step size of nu <= 1 and additive_cv is false. Aborting!");
        return ILLEGAL_PARAMETERS;
      }
      DoubleReal nu_stop = getDoubleOption_("cv:nu_stop");

      start_values.insert(make_pair(SVMWrapper::NU, nu_start));
      step_sizes.insert(make_pair(SVMWrapper::NU, nu_step_size));
      end_values.insert(make_pair(SVMWrapper::NU, nu_stop));
    }

    Int border_length = getIntOption_("border_length");
    svm.setParameter(SVMWrapper::BORDER_LENGTH, border_length);

    DoubleReal sigma = getDoubleOption_("sigma");
    svm.setParameter(SVMWrapper::SIGMA, sigma);

    UInt k_mer_length = getIntOption_("k_mer_length");

    if (temp_type == SVMWrapper::OLIGO && !skip_cv)
    {
      DoubleReal sigma_start = getDoubleOption_("cv:sigma_start");
      DoubleReal sigma_step_size = getDoubleOption_("cv:sigma_step_size");
      if (!additive_cv && sigma_step_size <= 1)
      {
        writeLog_("Step size of sigma <= 1 and additive_cv is false. Aborting!");
        return ILLEGAL_PARAMETERS;
      }
      DoubleReal sigma_stop = getDoubleOption_("cv:sigma_stop");

      start_values.insert(make_pair(SVMWrapper::SIGMA, sigma_start));
      step_sizes.insert(make_pair(SVMWrapper::SIGMA, sigma_step_size));
      end_values.insert(make_pair(SVMWrapper::SIGMA, sigma_stop));

      debug_string = "CV from sigma = " + String(sigma_start) +
                     " to sigma = " + String(sigma_stop) + " with step size " +
                     String(sigma_step_size);
      writeDebug_(debug_string, 1);
    }

    if (!skip_cv && !start_values.empty())
    {
      number_of_runs = getIntOption_("cv:number_of_runs");
      writeDebug_(String("Number of CV runs: ") + String(number_of_runs), 1);

      number_of_partitions = getIntOption_("cv:number_of_partitions");
      writeDebug_(String("Number of CV partitions: ") + String(number_of_partitions), 1);
    }

    Int debug_level = getIntOption_("debug");
    const bool non_redundant = !(getFlag_("redundant"));

    //-------------------------------------------------------------
    // reading input
    //-------------------------------------------------------------
    String document_id;
    IdXMLFile().load(inputfile_positives, protein_identifications, identifications, document_id);
    IdXMLFile().load(inputfile_negatives, protein_identifications_negative, identifications_negative, document_id);

    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------

    // positive set: every (optionally: every distinct) unmodified hit sequence
    for (Size i = 0; i < identifications.size(); i++)
    {
      const vector<PeptideHit>& hits = identifications[i].getHits();
      for (Size j = 0; j < hits.size(); ++j)
      {
        const String sequence = hits[j].getSequence().toUnmodifiedString();
        if (!non_redundant
           || find(training_peptides.begin(), training_peptides.end(), sequence) == training_peptides.end())
        {
          training_peptides.push_back(sequence);
        }
      }
    }
    training_labels.resize(training_peptides.size(), 1.0);
    debug_string = String(training_labels.size()) + " positive sequences read";
    writeDebug_(debug_string, 1);

    if (training_peptides.size() > max_positive_count)
    {
      // keep a random subset of the requested size
      random_shuffle(training_peptides.begin(), training_peptides.end());
      training_peptides.resize(max_positive_count, "");
      training_labels.resize(max_positive_count, 1.);
    }
    debug_string = String(training_peptides.size()) + " positive sequences for training";
    writeDebug_(debug_string, 1);

    // negative set: collected separately so it can be capped independently
    UInt counter = 0;
    vector<String> temp_training_peptides;
    for (Size i = 0; i < identifications_negative.size(); i++)
    {
      const vector<PeptideHit>& hits = identifications_negative[i].getHits();
      for (Size j = 0; j < hits.size(); ++j)
      {
        const String sequence = hits[j].getSequence().toUnmodifiedString();
        if (find(training_peptides.begin(), training_peptides.end(), sequence) != training_peptides.end())
        {
          writeLog_("Peptides are not allowed to occur in the positive and the negative set. Example: '" + sequence + "'");
          return ILLEGAL_PARAMETERS;
        }

        // Redundancy has to be checked against the negatives collected so far;
        // the positives were already excluded by the check above.
        if (!non_redundant
           || find(temp_training_peptides.begin(), temp_training_peptides.end(), sequence) == temp_training_peptides.end())
        {
          temp_training_peptides.push_back(sequence);
          training_labels.push_back(-1.0);
          ++counter;
        }
      }
    }
    if (non_redundant)
    {
      debug_string = String(counter) + " non redundant negative sequences read";
    }
    else
    {
      debug_string = String(counter) + " negative sequences read";
    }
    writeDebug_(debug_string, 1);
    if (temp_training_peptides.size() > max_negative_count)
    {
      random_shuffle(temp_training_peptides.begin(), temp_training_peptides.end());
      temp_training_peptides.resize(max_negative_count, "");
      // training_peptides still holds the positives only at this point
      training_labels.resize(training_peptides.size() + max_negative_count, -1.);
    }
    training_peptides.insert(training_peptides.end(),
                             temp_training_peptides.begin(),
                             temp_training_peptides.end());

    debug_string = String(temp_training_peptides.size()) + " negative sequences for training";
    writeDebug_(debug_string, 1);
    temp_training_peptides.clear();

    // encode the sequences in the representation the chosen kernel works on
    if (temp_type == LINEAR || temp_type == POLY || temp_type == RBF)
    {
      UInt maximum_sequence_length = 50;
      encoded_training_sample =
        encoder.encodeLibSVMProblemWithCompositionAndLengthVectors(training_peptides,
                                                                   training_labels,
                                                                   allowed_amino_acid_characters,
                                                                   maximum_sequence_length);
    }
    else if (temp_type == SVMWrapper::OLIGO)
    {
      encoded_training_sample =
        encoder.encodeLibSVMProblemWithOligoBorderVectors(training_peptides,
                                                          training_labels,
                                                          k_mer_length,
                                                          allowed_amino_acid_characters,
                                                          svm.getIntParameter(SVMWrapper::BORDER_LENGTH));
    }

    // Kernel types without an encoding branch above (SIGMOID) would otherwise
    // hand a null problem to cross validation and training.
    if (encoded_training_sample == 0)
    {
      writeLog_("No training sample could be encoded for the given kernel type. Aborting!");
      return ILLEGAL_PARAMETERS;
    }

    if (!start_values.empty())
    {
      String digest = "";
      bool output_flag = false;
      if (debug_level >= 1)
      {
        // name the performance file after the last path component of the output file
        output_flag = true;
        vector<String> parts;
        outputfile_name.split('/', parts);
        if (parts.empty())
        {
          digest = outputfile_name;
        }
        else
        {
          digest = parts[parts.size() - 1];
        }
      }
      SVMData dummy;
      DoubleReal cv_quality = svm.performCrossValidation(encoded_training_sample,
                                                         dummy,
                                                         false,
                                                         start_values,
                                                         step_sizes,
                                                         end_values,
                                                         number_of_partitions,
                                                         number_of_runs,
                                                         optimized_parameters,
                                                         additive_cv,
                                                         output_flag,
                                                         "performances_" + digest + ".txt");

      debug_string = "Best parameters found in cross validation:";

      // apply the optimised parameters and report them
      for (parameters_iterator = optimized_parameters.begin();
           parameters_iterator != optimized_parameters.end();
           ++parameters_iterator)
      {
        svm.setParameter(parameters_iterator->first,
                         parameters_iterator->second);
        if (parameters_iterator->first == SVMWrapper::DEGREE)
        {
          debug_string += " degree: " + String(parameters_iterator->second);
        }
        else if (parameters_iterator->first == SVMWrapper::C)
        {
          debug_string += " C: " + String(parameters_iterator->second);
        }
        else if (parameters_iterator->first == SVMWrapper::NU)
        {
          debug_string += " nu: " + String(parameters_iterator->second);
        }
        else if (parameters_iterator->first == SVMWrapper::SIGMA)
        {
          debug_string += " sigma: " + String(parameters_iterator->second);
        }
      }
      debug_string += " with performance " + String(cv_quality);
      writeDebug_(debug_string, 1);
    }

    svm.train(encoded_training_sample);

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    svm.saveModel(outputfile_name);

    // If the oligo-border kernel is used some additional information has to be stored
    if (temp_type == SVMWrapper::OLIGO)
    {
      encoder.storeLibSVMProblem(outputfile_name + "_samples", encoded_training_sample);
      additional_parameters.setValue("kernel_type", temp_type);
      additional_parameters.setValue("border_length", svm.getIntParameter(SVMWrapper::BORDER_LENGTH));
      additional_parameters.setValue("k_mer_length", k_mer_length);
      additional_parameters.setValue("sigma", svm.getDoubleParameter(SVMWrapper::SIGMA));

      ParamXMLFile paramFile;
      paramFile.store(outputfile_name + "_additional_parameters", additional_parameters);
    }

    return EXECUTION_OK;
  }