void DetectabilitySimulation::svmFilter_(FeatureMapSim& features)
  {
    // Collect the plain (unmodified) peptide sequence of every feature;
    // the SVM-based predictor operates on sequence strings.
    vector<String> sequences;
    sequences.reserve(features.size());
    for (Size f = 0; f < features.size(); ++f)
    {
      sequences.push_back(features[f].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString());
    }

    // Predict a detectability score for each peptide in one batch call.
    vector<DoubleReal> labels;
    vector<DoubleReal> detectabilities;
    predictDetectabilities(sequences, labels, detectabilities);

    // Keep the map-level meta data but drop the content; features that
    // pass the threshold are re-inserted below.
    FeatureMapSim filtered(features);
    filtered.clear(false);

    for (Size f = 0; f < sequences.size(); ++f)
    {
#ifdef DEBUG_SIM
      cout << detectabilities[f] << " " << min_detect_ << endl;
#endif
      // Only features above the minimum detectability survive the filter;
      // survivors get their score annotated as meta value.
      if (detectabilities[f] > min_detect_)
      {
        features[f].setMetaValue("detectability", detectabilities[f]);
        filtered.push_back(features[f]);
      }
    }

    features.swap(filtered);
  }
// Example #2
 void RTSimulation::noRTColumn_(FeatureMapSim& features)
 {
   // Without a chromatographic column there is no retention information:
   // flag every feature with the sentinel RT value of -1.
   for (Size i = 0; i < features.size(); ++i)
   {
     features[i].setRT(-1);
   }
 }
  void DetectabilitySimulation::noFilter_(FeatureMapSim& features)
  {
    // No filtering requested: mark every peptide as fully detectable so
    // downstream steps can still rely on the "detectability" meta value.
    const DoubleReal full_detectability = 1.0;

    for (Size i = 0; i < features.size(); ++i)
    {
      features[i].setMetaValue("detectability", full_detectability);
    }
  }
// Example #4
  void RTSimulation::predictContaminantsRT(FeatureMapSim& contaminants)
  {
    // Contaminants have no predictable elution behavior: assign each one a
    // retention time drawn uniformly from the full gradient window.
    for (FeatureMapSim::iterator it = contaminants.begin(); it != contaminants.end(); ++it)
    {
      SimCoordinateType random_rt = gsl_ran_flat(rnd_gen_->technical_rng, 0, total_gradient_time_);
      it->setRT(random_rt);
    }
  }
// Example #5
// Rebuilds the labeling consensus map (consensus_) against the current
// feature map: only consensus features whose constituent features all still
// exist in simulated_features are kept, regrouped by their adduct composition.
void BaseLabeler::recomputeConsensus_(const FeatureMapSim & simulated_features)
{
    // iterate over all given features stored in the labeling consensus and try to find the corresponding feature in
    // in the feature map

    // build index for faster access
    // parent feature unique id (as String) -> indices of derived features in simulated_features
    Map<String, IntList> id_map;
    // labeled sub-map index -> number of features it contributes
    Map<UInt64, Size> features_per_labeled_map;
    for (Size i = 0; i < simulated_features.size(); ++i)
    {
        if (simulated_features[i].metaValueExists("parent_feature"))
        {
            LOG_DEBUG << "Checking [" << i << "]: " << simulated_features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toString()
                      << " with charge " << simulated_features[i].getCharge() << " (" << simulated_features[i].getMetaValue("charge_adducts") << ")"
                      << " parent was " << simulated_features[i].getMetaValue("parent_feature") << std::endl;
            id_map[simulated_features[i].getMetaValue("parent_feature")].push_back((Int)i);

            // features without a "map_index" meta value are counted under map 0
            UInt64 map_index = 0;
            if (simulated_features[i].metaValueExists("map_index"))
            {
                map_index = simulated_features[i].getMetaValue("map_index");
            }
            ++features_per_labeled_map[map_index];
        }
    }

    for (Map<String, IntList>::iterator it = id_map.begin(); it != id_map.end(); ++it)
    {
        LOG_DEBUG << it->first << " " << it->second << std::endl;
    }

    // new consensus map
    ConsensusMap new_cm;

    // initialize submaps in consensus map
    for (Map<UInt64, Size>::Iterator it = features_per_labeled_map.begin(); it != features_per_labeled_map.end(); ++it)
    {
        new_cm.getFileDescriptions()[it->first].size = it->second;
        new_cm.getFileDescriptions()[it->first].unique_id = simulated_features.getUniqueId();
    }

    for (ConsensusMap::iterator cm_iter = consensus_.begin(); cm_iter != consensus_.end(); ++cm_iter)
    {
        bool complete = true;

        LOG_DEBUG << "Checking consensus feature containing: " << std::endl;

        // check if we have all elements of current CF in the new feature map (simulated_features)
        for (ConsensusFeature::iterator cf_iter = (*cm_iter).begin(); cf_iter != (*cm_iter).end(); ++cf_iter)
        {
            complete &= id_map.has(String((*cf_iter).getUniqueId()));
            LOG_DEBUG << "\t" << String((*cf_iter).getUniqueId()) << std::endl;
        }

        // consensus features with any missing member are dropped entirely
        if (complete)
        {
            // get all elements sorted by charge state; since the same charge can be achieved by different
            // adduct compositions we use the adduct-string as indicator to find the groups
            Map<String, std::set<FeatureHandle, FeatureHandle::IndexLess> > charge_mapping;

            for (ConsensusFeature::iterator cf_iter = (*cm_iter).begin(); cf_iter != (*cm_iter).end(); ++cf_iter)
            {
                IntList feature_indices = id_map[String((*cf_iter).getUniqueId())];

                for (IntList::iterator it = feature_indices.begin(); it != feature_indices.end(); ++it)
                {
                    // default to sub-map 0 when no "map_index" is annotated (mirrors index build above)
                    UInt64 map_index = 0;
                    if (simulated_features[*it].metaValueExists("map_index"))
                    {
                        map_index = simulated_features[*it].getMetaValue("map_index");
                    }

                    if (charge_mapping.has(simulated_features[*it].getMetaValue("charge_adducts")))
                    {
                        charge_mapping[simulated_features[*it].getMetaValue("charge_adducts")].insert(FeatureHandle(map_index, simulated_features[*it]));
                    }
                    else
                    {
                        LOG_DEBUG << "Create new set with charge composition " << simulated_features[*it].getMetaValue("charge_adducts") << std::endl;
                        std::set<FeatureHandle, FeatureHandle::IndexLess> fh_set;

                        fh_set.insert(FeatureHandle(map_index, simulated_features[*it]));
                        charge_mapping.insert(std::make_pair(simulated_features[*it].getMetaValue("charge_adducts"), fh_set));
                    }
                }
            }

            // create new consensus feature from derived features (separated by charge, if charge != 0)
            for (Map<String, std::set<FeatureHandle, FeatureHandle::IndexLess> >::const_iterator charge_group_it = charge_mapping.begin();
                    charge_group_it != charge_mapping.end();
                    ++charge_group_it)
            {
                ConsensusFeature cf;
                // all handles in one group share the charge/adduct composition;
                // take the charge from the first handle
                cf.setCharge((*(*charge_group_it).second.begin()).getCharge());
                cf.setMetaValue("charge_adducts", charge_group_it->first);

                std::vector<PeptideIdentification> ids;
                for (std::set<FeatureHandle, FeatureHandle::IndexLess>::const_iterator fh_it = (charge_group_it->second).begin(); fh_it != (charge_group_it->second).end(); ++fh_it)
                {
                    cf.insert(*fh_it);
                    // append identifications
                    Size f_index = simulated_features.uniqueIdToIndex(fh_it->getUniqueId());
                    std::vector<PeptideIdentification> ids_feature = simulated_features[f_index].getPeptideIdentifications();
                    ids.insert(ids.end(), ids_feature.begin(), ids_feature.end());
                }

                cf.computeMonoisotopicConsensus();
                cf.setPeptideIdentifications(ids);

                new_cm.push_back(cf);
            }

        }
    }

    new_cm.setProteinIdentifications(simulated_features.getProteinIdentifications());

    // replace the old consensus and make sure all new elements carry a unique id
    consensus_.swap(new_cm);
    consensus_.applyMemberFunction(&UniqueIdInterface::ensureUniqueId);
}
// Example #6
  /// Predicts CE migration times (MT) for all features based on peptide
  /// charge and mass (mu = q / MW^alpha + mu_eo) and stores them in
  /// @p predicted_retention_times (same indexing as @p features).
  /// Additionally annotates each feature with an "RT_CE_width_factor"
  /// meta value (peak widening, 1.0 at MT=0 up to 2.0 at the maximum MT).
  void RTSimulation::calculateMT_(FeatureMapSim& features, std::vector<DoubleReal>& predicted_retention_times)
  {
    Map<String, double> q_cterm, q_nterm, q_aa_basic, q_aa_acidic;
    getChargeContribution_(q_cterm, q_nterm, q_aa_basic, q_aa_acidic);

    DoubleReal alpha = param_.getValue("CE:alpha");
    bool auto_scale = (param_.getValue("auto_scale") == "true");
    // NOTE(review): "CE:lenght_d" is a misspelled parameter key; it has to
    // match the name under which the parameter is registered, so it cannot
    // be corrected here alone -- fix together with the parameter definition.
    DoubleReal c = (auto_scale ? 1 : (DoubleReal)param_.getValue("CE:lenght_d") * (DoubleReal)param_.getValue("CE:length_total") / (DoubleReal)param_.getValue("CE:voltage"));

    predicted_retention_times.resize(features.size());

    // Guard: with no features, rt_sorted.back() below would be undefined
    // behavior (back() on an empty vector).
    if (features.empty())
    {
      return;
    }

    for (Size i = 0; i < features.size(); ++i)
    {
      String seq = features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString();

      // ** determine charge of peptide **

      DoubleReal charge = 0;
      // C&N term charge contribution
      if (q_nterm.has(seq[0]))
        charge +=  q_nterm[seq[0]];
      if (q_cterm.has(seq.suffix(1)))
        charge +=  q_cterm[seq.suffix(1)];

      // sidechains: sum the basic/acidic contribution per residue frequency
      Map<String, Size> frequency_table;
      features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().getAAFrequencies(frequency_table);
      for (Map<String, Size>::const_iterator it = frequency_table.begin(); it != frequency_table.end(); ++it)
      {
        if (q_aa_basic.has(it->first))
          charge +=  q_aa_basic[it->first] * it->second;
        if (q_aa_acidic.has(it->first))
          charge +=  q_aa_acidic[it->first] * it->second;
      }

      // ** determine mass of peptide
      DoubleReal mass = features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().getFormula().getAverageWeight();

      // ** mobility (mu = mu_ep + mu_eo = (q/MW^alpha) + mu_eo
      DoubleReal mu = (charge / std::pow(mass, alpha)) + (auto_scale ? 0 : (DoubleReal)param_.getValue("CE:mu_eo"));
      predicted_retention_times[i] = c / mu; // this is L_d*L_t / (mu * V)  as "c = L_d*L_t/V"
    }

    // ** only when Auto-Scaling is active ** /
    std::vector<DoubleReal> rt_sorted(predicted_retention_times);
    std::sort(rt_sorted.begin(), rt_sorted.end());

    DoubleReal max_rt = rt_sorted.back();

    if (auto_scale)
    {
      max_rt = 1; // highest will be scaled to 1

      //std::cerr << "minRT: " << rt_sorted[0] << "   max: " << rt_sorted.back() << "\n";
      // normalize to 5th - 95th percentile (we want to avoid that few outliers with huge/small MT can compress the others to a small MT range):
      DoubleReal mt_5p = rt_sorted[rt_sorted.size() * 5 / 100];
      DoubleReal mt_95p = rt_sorted[rt_sorted.size() * 95 / 100];
      // ... assume 95% MT range at 95th percentile
      DoubleReal range = std::max(1.0, (mt_95p - mt_5p) * 0.9);

      //std::cerr << " 5% MT: " << mt_5p << ",   95% MT: " << mt_95p << " Range: " << range << "\n";

      DoubleReal new_offset = mt_5p - range * 0.05;

      // scale MT's between 0 and 1 (except for outliers --> which will get <0 or >1)
      for (Size i = 0; i < features.size(); ++i)
      {
        predicted_retention_times[i] = (predicted_retention_times[i] - new_offset) / range;
      }
    }

    // the width factor is 1.0 at MT=0 and reaches its max (default 2.0) at MT=max
    DoubleReal rt_widening_max = 2.0;
    for (Size i = 0; i < features.size(); ++i)
    {
      features[i].setMetaValue("RT_CE_width_factor", (predicted_retention_times[i] / max_rt * (rt_widening_max - 1) + 1));
    }

  }
// Example #7
  /**
   @brief Gets a feature map containing the peptides and predicts for those the retention times

   Depending on the "rt_column" parameter the RT is either disabled ("none"),
   predicted as CE migration time ("CE") or via SVM-based HPLC prediction
   ("HPLC"). Afterwards user-supplied RTs ("rt"/"RT" meta values) are applied,
   technical variation is added, features outside the scan window are removed,
   and EGH peak-shape parameters (variance, tau) are diced and annotated.
   */
  void RTSimulation::predictRT(FeatureMapSim& features)
  {
    LOG_INFO << "RT Simulation ... started" << std::endl;

    vector<DoubleReal>  predicted_retention_times;
    bool is_relative = (param_.getValue("auto_scale") == "true");
    if (param_.getValue("rt_column") == "none")
    {
      noRTColumn_(features);
      return;
    }
    // CE or HPLC:
    else if (param_.getValue("rt_column") == "CE")
    {
      calculateMT_(features, predicted_retention_times);
    }
    else if (param_.getValue("rt_column") == "HPLC")
    {
      // SVM prediction works on full AASequence objects
      vector<AASequence> peptides_aa_vector(features.size());
      for (Size i = 0; i < features.size(); ++i)
      {
        peptides_aa_vector[i] = features[i].getPeptideIdentifications()[0].getHits()[0].getSequence();
      }
      wrapSVM(peptides_aa_vector, predicted_retention_times);
    }

    // rt error dicing
    SimCoordinateType rt_offset = param_.getValue("variation:affine_offset");
    SimCoordinateType rt_scale  = param_.getValue("variation:affine_scale");
    SimCoordinateType rt_ft_stddev = param_.getValue("variation:feature_stddev");

    // keep the map's meta data; only valid features are re-added below
    FeatureMapSim fm_tmp(features);
    fm_tmp.clear(false);
    StringList deleted_features;
    for (Size i = 0; i < predicted_retention_times.size(); ++i)
    {
      // relative -> absolute RT's (with border)
      if (is_relative)
      {
        predicted_retention_times[i] *= total_gradient_time_;
      }

      //overwrite RT (if given by user)
      if (features[i].metaValueExists("rt"))
      {
        predicted_retention_times[i] = features[i].getMetaValue("rt");
      }
      // add variation
      SimCoordinateType rt_error = gsl_ran_gaussian(rnd_gen_->technical_rng, rt_ft_stddev) + rt_offset;
      predicted_retention_times[i] = predicted_retention_times[i] * rt_scale + rt_error;
      //overwrite RT [no randomization] (if given by user)
      if (features[i].metaValueExists("RT"))
      {
        predicted_retention_times[i] = features[i].getMetaValue("RT");
      }

      // remove invalid peptides & (later) display removed ones
      if (
        (predicted_retention_times[i] < 0.0) || // check for invalid RT
        (predicted_retention_times[i] > gradient_max_) || // check if RT is not in scan window
        (predicted_retention_times[i] < gradient_min_) // check if RT is not in scan window
        )
      {
        deleted_features.push_back(features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString() + " [" +
                                   String::number(predicted_retention_times[i], 2)
                                   + "]");
        continue;
      }

      features[i].setRT(predicted_retention_times[i]);

      // determine shape parameters for EGH
      DoubleReal variance = egh_variance_location_ + (egh_variance_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_));
      DoubleReal tau = egh_tau_location_ + (egh_tau_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_));

      // resample variance if it is below 0
      // try this only 10 times to avoid endless loop in case of
      // a bad parameter combination
      Size retry_variance_sampling = 0;
      while ((variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_)) && retry_variance_sampling < 9)
      {
        variance = egh_variance_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_);
        ++retry_variance_sampling;
      }

      if (variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_))
      {
        LOG_ERROR << "Sigma^2 was negative, resulting in a feature with width=0. Tried to resample 10 times and then stopped. Setting it to the user defined width value of " << egh_variance_location_ << "!" << std::endl;
        variance = egh_variance_location_;
      }

      // resample tau if the value is too big
      // try this only 10 times to avoid endless loop in case of
      // a bad parameter combination
      Size retry_tau_sampling = 0;
      while (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_  && retry_tau_sampling < 9)
      {
        tau = egh_tau_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_);
        ++retry_tau_sampling;
      }

      if (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_)
      {
        // fixed typo: "to big" -> "too big"
        LOG_ERROR << "Tau is too big for a reasonable feature. Tried to resample 10 times and then stopped. Setting it to the user defined skewness value of " << egh_tau_location_ << "!" << std::endl;
        tau = egh_tau_location_;
      }

      features[i].setMetaValue("RT_egh_variance", variance);
      features[i].setMetaValue("RT_egh_tau", tau);

      fm_tmp.push_back(features[i]);
    }

    // print invalid features:
    if (deleted_features.size() > 0)
    {
      LOG_WARN << "RT prediction gave 'invalid' results for " << deleted_features.size() << " peptide(s), making them unobservable.\n";
      if (deleted_features.size() < 100)
        LOG_WARN << "  " << ListUtils::concatenate(deleted_features, "\n  ") << std::endl;
      else
        LOG_WARN << "  (List is too big to show)" << std::endl;
    }
    // only retain valid features:
    features.swap(fm_tmp);

    features.sortByPosition();
    features.updateRanges();

  }