void DetectabilitySimulation::svmFilter_(FeatureMapSim& features)
  {
    // Gather the predictor input: for every feature, the unmodified sequence
    // of the first hit of its first peptide identification.
    vector<String> sequences;
    sequences.reserve(features.size());
    for (Size idx = 0; idx < features.size(); ++idx)
    {
      sequences.push_back(features[idx].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString());
    }

    // 'labels' is handed to the predictor but not used afterwards in this method;
    // 'detectabilities' is read back below, one entry per sequence.
    vector<DoubleReal> labels;
    vector<DoubleReal> detectabilities;
    predictDetectabilities(sequences, labels, detectabilities);

    // Start from a copy of the input map and empty it via clear(false)
    // (presumably this keeps the map-level meta data while dropping the
    // features -- see FeatureMapSim::clear to confirm).
    FeatureMapSim retained(features);
    retained.clear(false);

    const Size n_sequences = sequences.size();
    for (Size idx = 0; idx < n_sequences; ++idx)
    {
      const DoubleReal score = detectabilities[idx];

#ifdef DEBUG_SIM
      cout << score << " " << min_detect_ << endl;
#endif

      // only features scoring strictly above the threshold survive
      if (!(score > min_detect_))
      {
        continue;
      }

      // annotate the surviving feature with its score before keeping it
      features[idx].setMetaValue("detectability", score);
      retained.push_back(features[idx]);
    }

    // hand the filtered map back to the caller
    features.swap(retained);
  }
Example #2
0
  /**
   @brief Assigns a retention time and EGH shape parameters to the features of a map

   Depending on the "rt_column" parameter the work is delegated to noRTColumn_()
   (value "none", returns immediately afterwards), calculateMT_() (value "CE")
   or wrapSVM() (value "HPLC"). Each predicted value is then optionally scaled by
   the total gradient time, perturbed by an affine transformation plus gaussian
   noise, and may be overridden by the feature meta values "rt" (before the
   perturbation) or "RT" (after it). Features whose final RT is negative or
   outside [gradient_min_, gradient_max_] are dropped and reported via LOG_WARN.
   The retained features receive the meta values "RT_egh_variance" and
   "RT_egh_tau"; finally the map is sorted by position and its ranges are updated.

   @param features Feature map that is modified in place
   */
  void RTSimulation::predictRT(FeatureMapSim& features)
  {
    LOG_INFO << "RT Simulation ... started" << std::endl;

    vector<DoubleReal> rts;
    const bool scale_to_gradient = (param_.getValue("auto_scale") == "true");

    // without a column there is nothing to predict here
    if (param_.getValue("rt_column") == "none")
    {
      noRTColumn_(features);
      return;
    }

    if (param_.getValue("rt_column") == "CE")
    {
      calculateMT_(features, rts);
    }
    else if (param_.getValue("rt_column") == "HPLC")
    {
      // the SVM wrapper works on the sequences of the first hit of the first identification
      vector<AASequence> sequences;
      sequences.reserve(features.size());
      for (Size k = 0; k < features.size(); ++k)
      {
        sequences.push_back(features[k].getPeptideIdentifications()[0].getHits()[0].getSequence());
      }
      wrapSVM(sequences, rts);
    }

    // parameters of the RT perturbation
    SimCoordinateType rt_offset = param_.getValue("variation:affine_offset");
    SimCoordinateType rt_scale  = param_.getValue("variation:affine_scale");
    SimCoordinateType rt_ft_stddev = param_.getValue("variation:feature_stddev");

    // receives the surviving features; starts as an emptied copy of the input map
    FeatureMapSim retained(features);
    retained.clear(false);
    StringList dropped;

    const Size n_predictions = rts.size();
    for (Size i = 0; i < n_predictions; ++i)
    {
      DoubleReal& rt = rts[i];

      // relative -> absolute RT
      if (is_relative_placeholder_guard_(scale_to_gradient))
      {
        rt *= total_gradient_time_;
      }

      // a user-supplied "rt" replaces the prediction, but is still perturbed below
      if (features[i].metaValueExists("rt"))
      {
        rt = features[i].getMetaValue("rt");
      }

      // affine transformation plus gaussian noise
      // (the random number is drawn for every feature, also for those dropped below)
      SimCoordinateType rt_error = gsl_ran_gaussian(rnd_gen_->technical_rng, rt_ft_stddev) + rt_offset;
      rt = rt * rt_scale + rt_error;

      // a user-supplied "RT" replaces the result and is therefore not randomized
      if (features[i].metaValueExists("RT"))
      {
        rt = features[i].getMetaValue("RT");
      }

      // drop features with a negative RT or an RT outside of the scan window
      const bool outside_window = (rt < 0.0) || (rt > gradient_max_) || (rt < gradient_min_);
      if (outside_window)
      {
        dropped.push_back(features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString() + " [" +
                          String::number(rt, 2)
                          + "]");
        continue;
      }

      features[i].setRT(rt);

      // initial draw of the EGH shape parameters; a scale of 0 disables the noise term
      DoubleReal variance = egh_variance_location_ + (egh_variance_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_));
      DoubleReal tau = egh_tau_location_ + (egh_tau_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_));

      // A variance is rejected if it is not positive or deviates from the location
      // by more than 10 times the scale. At most 9 redraws are made (10 samples in
      // total) to avoid an endless loop in case of a bad parameter combination.
      bool variance_rejected = (variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_));
      for (Size attempt = 0; variance_rejected && attempt < 9; ++attempt)
      {
        variance = egh_variance_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_);
        variance_rejected = (variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_));
      }

      if (variance_rejected)
      {
        LOG_ERROR << "Sigma^2 was negative, resulting in a feature with width=0. Tried to resample 10 times and then stopped. Setting it to the user defined width value of " << egh_variance_location_ << "!" << std::endl;
        variance = egh_variance_location_;
      }

      // Tau is rejected if it deviates from the location by more than 10 times
      // the scale; the same bounded redraw scheme as for the variance is used.
      bool tau_rejected = (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_);
      for (Size attempt = 0; tau_rejected && attempt < 9; ++attempt)
      {
        tau = egh_tau_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_);
        tau_rejected = (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_);
      }

      if (tau_rejected)
      {
        LOG_ERROR << "Tau is to big for a reasonable feature. Tried to resample 10 times and then stopped. Setting it to the user defined skewness value of " << egh_tau_location_ << "!" << std::endl;
        tau = egh_tau_location_;
      }

      features[i].setMetaValue("RT_egh_variance", variance);
      features[i].setMetaValue("RT_egh_tau", tau);

      retained.push_back(features[i]);
    }

    // report the dropped features (the list itself only if it is short enough)
    if (!dropped.empty())
    {
      LOG_WARN << "RT prediction gave 'invalid' results for " << dropped.size() << " peptide(s), making them unobservable.\n";
      if (dropped.size() < 100)
      {
        LOG_WARN << "  " << ListUtils::concatenate(dropped, "\n  ") << std::endl;
      }
      else
      {
        LOG_WARN << "  (List is too big to show)" << std::endl;
      }
    }

    // only the valid features are handed back
    features.swap(retained);

    features.sortByPosition();
    features.updateRanges();

  }