Ejemplo n.º 1
0
  void RTSimulation::wrapSVM(std::vector<AASequence>& peptide_sequences, std::vector<DoubleReal>& predicted_retention_times)
  {
    String allowed_amino_acid_characters = "ACDEFGHIKLMNPQRSTVWY";
    SVMWrapper svm;
    LibSVMEncoder encoder;
    svm_problem* training_data = NULL;
    SVMData prediction_samples;
    SVMData training_samples;
    UInt k_mer_length = 0;
    DoubleReal sigma = 0.0;
    UInt border_length = 0;
    Size max_number_of_peptides(2000); // hard coding pediction bins; larger values only take more memory, result is not affected

    LOG_INFO << "Predicting RT ... ";

    svm.loadModel(rt_model_file_);

    // load additional parameters
    if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
    {
      String add_paramfile = rt_model_file_ + "_additional_parameters";
      if (!File::readable(add_paramfile))
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "RTSimulation: SVM parameter file " + add_paramfile + " is not readable");
      }

      Param additional_parameters;
      ParamXMLFile paramFile;
      paramFile.load(add_paramfile, additional_parameters);

      if (additional_parameters.getValue("border_length") == DataValue::EMPTY
         && svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "RTSimulation: No border length defined in additional parameters file.");
      }
      border_length = ((String)additional_parameters.getValue("border_length")).toInt();
      if (additional_parameters.getValue("k_mer_length") == DataValue::EMPTY
         && svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "RTSimulation: No k-mer length defined in additional parameters file.");
      }
      k_mer_length = ((String)additional_parameters.getValue("k_mer_length")).toInt();

      if (additional_parameters.getValue("sigma") == DataValue::EMPTY
         && svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
      {
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "RTSimulation: No sigma defined in additional parameters file.");
      }

      sigma = ((String)additional_parameters.getValue("sigma")).toFloat();
    }

    svm.setParameter(SVMWrapper::BORDER_LENGTH, (Int) border_length);
    svm.setParameter(SVMWrapper::SIGMA, sigma);

    // loading model data
    String sample_file = rt_model_file_ + "_samples";
    if (!File::readable(sample_file))
    {
      throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "RTSimulation: SVM sample file " + sample_file + " is not readable");
    }
    training_samples.load(sample_file);
    svm.setTrainingSample(training_samples);
    svm.setTrainingSample(training_data);

    // use maximally max_number_of_peptides peptide sequence at once
    Size tmp_count = 0;
    Size count = 0;
    std::vector<AASequence>::iterator pep_iter_start = peptide_sequences.begin();
    std::vector<AASequence>::iterator pep_iter_stop = peptide_sequences.begin();
    while (count < peptide_sequences.size())
    {
      while (pep_iter_stop != peptide_sequences.end() && tmp_count < max_number_of_peptides)
      {
        ++tmp_count;
        ++pep_iter_stop;
      }
      std::vector<AASequence> tmp_peptide_seqs;
      tmp_peptide_seqs.insert(tmp_peptide_seqs.end(), pep_iter_start, pep_iter_stop);
      std::vector<DoubleReal> tmp_rts(tmp_peptide_seqs.size(), 0);
      std::vector<DoubleReal> tmp_pred_rts;
      // Encoding test data
      encoder.encodeProblemWithOligoBorderVectors(tmp_peptide_seqs, k_mer_length, allowed_amino_acid_characters, border_length, prediction_samples.sequences);
      prediction_samples.labels = tmp_rts;

      svm.predict(prediction_samples, tmp_pred_rts);
      predicted_retention_times.insert(predicted_retention_times.end(), tmp_pred_rts.begin(), tmp_pred_rts.end());
      pep_iter_start = pep_iter_stop;
      count += tmp_count;
      tmp_count = 0;
    }
    LibSVMEncoder::destroyProblem(training_data);

    LOG_INFO << "done" << endl;
  }