// Merges the input files of every experimental setting into a single map and
// runs peptide and protein quantification on it.
//
// quantifier: receives the merged map (quantifyPeptides) and the first protein
//             identification run of that map (quantifyProteins).
// file:       experimental design file, passed on to mapFiles2Design_.
// file_paths: input files; the type of the first entry selects the featureXML
//             path, every other type takes the ConsensusMap path.
//
// NOTE(review): file_paths.front() and getProteinIdentifications()[0] are
// accessed unchecked - presumably callers guarantee a non-empty file list and
// at least one protein identification run; confirm.
void QuantitativeExperimentalDesign::applyDesign2Quantifier(PeptideAndProteinQuant & quantifier, TextFile & file, StringList & file_paths)
{
    typedef map<String, StringList> DesignMap;

    // experimental setting -> all file names listed for that setting
    DesignMap design2FileBaseName;
    mapFiles2Design_(design2FileBaseName, file);

    // same mapping, reduced to files that actually exist (filled by findRelevantFilePaths_)
    DesignMap design2FilePath;
    findRelevantFilePaths_(design2FileBaseName, design2FilePath, file_paths);

    // the first input file decides which kind of map is built
    const FileTypes::Type in_type = FileHandler::getType(file_paths.front());

    if (in_type != FileTypes::FEATUREXML)
    {
        ConsensusMap consensus;

        DesignMap::iterator setting = design2FilePath.begin();
        while (setting != design2FilePath.end())
        {
            mergeConsensusMaps_(consensus, setting->first, setting->second);
            ++setting;
        }

        LOG_INFO << "Number of proteinIdentifications: " << consensus.getProteinIdentifications().size() << endl;
        // bind the first run before quantifying the peptides (same order as ever)
        ProteinIdentification & proteins = consensus.getProteinIdentifications()[0];

        quantifier.quantifyPeptides(consensus);
        quantifier.quantifyProteins(proteins);
        return;
    }

    FeatureMap<> features;

    DesignMap::iterator setting = design2FilePath.begin();
    while (setting != design2FilePath.end())
    {
        mergeFeatureMaps_(features, setting->first, setting->second);
        ++setting;
    }

    LOG_INFO << "Number of proteinIdentifications: " << features.getProteinIdentifications().size() << endl;
    // bind the first run before quantifying the peptides (same order as ever)
    ProteinIdentification & proteins = features.getProteinIdentifications()[0];

    quantifier.quantifyPeptides(features);
    quantifier.quantifyProteins(proteins);
}
Exemplo n.º 2
0
  // Clusters the features of all input maps with QTClusterFinder and gathers
  // the identification data of the inputs in the result.
  //
  // maps: input maps; fewer than two raise Exception::IllegalArgument.
  // out:  receives the clustering result. The protein identifications and the
  //       unassigned peptide identifications of every input map are appended
  //       in input order, then the map is sorted by quality, maps and size.
  void FeatureGroupingAlgorithmQT::group_(const vector<MapType>& maps,
                                          ConsensusMap& out)
  {
    // check that the number of maps is ok:
    if (maps.size() < 2)
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
                                       "At least two maps must be given!");
    }

    QTClusterFinder cluster_finder;
    cluster_finder.setParameters(param_.copy("", true));

    cluster_finder.run(maps, out);

    // add protein IDs and unassigned peptide IDs to the result map here,
    // to keep the same order as the input maps (useful for output later):
    for (typename vector<MapType>::const_iterator map_it = maps.begin();
         map_it != maps.end(); ++map_it)
    {
      // add protein identifications to result map:
      out.getProteinIdentifications().insert(
        out.getProteinIdentifications().end(),
        map_it->getProteinIdentifications().begin(),
        map_it->getProteinIdentifications().end());

      // add unassigned peptide identifications to result map:
      out.getUnassignedPeptideIdentifications().insert(
        out.getUnassignedPeptideIdentifications().end(),
        map_it->getUnassignedPeptideIdentifications().begin(),
        map_it->getUnassignedPeptideIdentifications().end());
    }

    // canonical ordering for checking the results:
    out.sortByQuality();
    out.sortByMaps();
    out.sortBySize();
  }
Exemplo n.º 3
0
  // Adds a consensus map to the browser: first its document identifier, then
  // every protein identification, every unassigned peptide identification and
  // finally its meta info. Afterwards the first tree item whose column 1
  // matches "0" exactly is expanded.
  void MetaDataBrowser::add(ConsensusMap & map)
  {
    // identifier part of the map
    add(static_cast<DocumentIdentifier &>(map));

    // one entry per protein identification
    Size protein_index = 0;
    while (protein_index < map.getProteinIdentifications().size())
    {
      add(map.getProteinIdentifications()[protein_index]);
      ++protein_index;
    }

    // one entry per unassigned peptide identification
    Size peptide_index = 0;
    while (peptide_index < map.getUnassignedPeptideIdentifications().size())
    {
      add(map.getUnassignedPeptideIdentifications()[peptide_index]);
      ++peptide_index;
    }

    // meta info part of the map
    add(static_cast<MetaInfoInterface &>(map));

    treeview_->expandItem(
      treeview_->findItems(QString::number(0), Qt::MatchExactly, 1).first());
  }
Exemplo n.º 4
0
 // Runs the inference once per protein identification run stored in the map.
 // Runs are handled independently of each other; combine them beforehand if a
 // joint inference is wanted. Each run is augmented with the quantification
 // where possible (see infer_).
 void ProteinInference::infer(ConsensusMap & consensus_map, const UInt reference_map)
 {
   size_t run_index = 0;
   while (run_index < consensus_map.getProteinIdentifications().size())
   {
     infer_(consensus_map, run_index, reference_map);
     ++run_index;
   }
 }
Exemplo n.º 5
0
  // Assigns peptide identifications to the consensus features of a map.
  //
  // map:         consensus map to annotate. protein_ids are appended to its
  //              protein identifications; every matching entry of ids is copied
  //              into the peptide identifications of the matching consensus
  //              feature; entries that matched nothing are appended to the
  //              unassigned peptide identifications.
  // ids:         peptide identifications to assign. Entries without hits are
  //              skipped during matching and therefore end up unassigned.
  // protein_ids: protein identifications appended to the map unchanged.
  // measure_from_subelements:
  //              false - compare against RT, m/z and charge of the consensus
  //                      feature itself;
  //              true  - compare against the feature handles of the consensus
  //                      feature (first matching handle wins).
  //
  // A summary (unassigned / assigned once / assigned several times) is written
  // to LOG_INFO at the end.
  void IDMapper::annotate(ConsensusMap & map, const std::vector<PeptideIdentification> & ids, const std::vector<ProteinIdentification> & protein_ids, bool measure_from_subelements)
  {
    // validate "RT" and "MZ" metavalues exist
    checkHits_(ids);

    //append protein identifications to Map
    map.getProteinIdentifications().insert(map.getProteinIdentifications().end(), protein_ids.begin(), protein_ids.end());

    //keep track of assigned/unassigned peptide identifications
    // (index into ids -> number of consensus features the ID was added to)
    std::map<Size, Size> assigned;

    // store which peptides fit which feature (and avoid double entries)
    // consensusMap -> {peptide_index}
    // (only consulted in the sub-element branch below)
    std::vector<std::set<size_t> > mapping(map.size());

    // filled by getIDDetails_ for the current peptide identification
    DoubleList mz_values;
    DoubleReal rt_pep;
    IntList charges;

    //iterate over the peptide IDs
    for (Size i = 0; i < ids.size(); ++i)
    {
      if (ids[i].getHits().empty())
        continue;

      getIDDetails_(ids[i], rt_pep, mz_values, charges);

      //iterate over the features
      for (Size cm_index = 0; cm_index < map.size(); ++cm_index)
      {
        // if set to TRUE, we leave the i_mz-loop as we added the whole ID with all hits
        bool was_added = false;       // was current pep-m/z matched?!

        // iterate over m/z values of pepIds
        for (Size i_mz = 0; i_mz < mz_values.size(); ++i_mz)
        {
          DoubleReal mz_pep = mz_values[i_mz];

          // charge states to use for checking:
          IntList current_charges;
          if (!ignore_charge_)
          {
            // if "mz_ref." is "precursor", we have only one m/z value to check,
            // but still one charge state per peptide hit that could match:
            if (mz_values.size() == 1)
            {
              current_charges = charges;
            }
            else
            {
              // NOTE(review): assumes charges has one entry per m/z value - confirm in getIDDetails_
              current_charges.push_back(charges[i_mz]);
            }
            current_charges.push_back(0);             // "not specified" always matches
          }

          //check if we compare distance from centroid or subelements
          if (!measure_from_subelements)
          {
            // the ID is added at most once per consensus feature here, because
            // was_added ends the m/z loop right after the first match
            if (isMatch_(rt_pep - map[cm_index].getRT(), mz_pep, map[cm_index].getMZ()) && (ignore_charge_ || ListUtils::contains(current_charges, map[cm_index].getCharge())))
            {
              was_added = true;
              map[cm_index].getPeptideIdentifications().push_back(ids[i]);
              ++assigned[i];
            }
          }
          else
          {
            for (ConsensusFeature::HandleSetType::const_iterator it_handle = map[cm_index].getFeatures().begin();
                 it_handle != map[cm_index].getFeatures().end();
                 ++it_handle)
            {
              if (isMatch_(rt_pep - it_handle->getRT(), mz_pep, it_handle->getMZ())  && (ignore_charge_ || ListUtils::contains(current_charges, it_handle->getCharge())))
              {
                was_added = true;
                // mapping guards against adding the same ID twice to one consensus feature
                if (mapping[cm_index].count(i) == 0)
                {
                  map[cm_index].getPeptideIdentifications().push_back(ids[i]);
                  ++assigned[i];
                  mapping[cm_index].insert(i);
                }
                break;                 // we added this peptide already.. no need to check other handles
              }
            }
            // continue to here
          }

          if (was_added)
            break;

        }         // m/z values to check

        // break to here

      }       // features
    }     // Identifications


    Size matches_none(0);
    Size matches_single(0);
    Size matches_multi(0);

    //append unassigned peptide identifications
    // (assigned[i] default-inserts 0 for IDs that never matched, including
    // those skipped above because they have no hits)
    for (Size i = 0; i < ids.size(); ++i)
    {
      if (assigned[i] == 0)
      {
        map.getUnassignedPeptideIdentifications().push_back(ids[i]);
        ++matches_none;
      }
      else if (assigned[i] == 1)
      {
        ++matches_single;
      }
      else if (assigned[i] > 1)
      {
        ++matches_multi;
      }
    }

    //some statistics output
    LOG_INFO << "Unassigned peptides: " << matches_none << "\n"
             << "Peptides assigned to exactly one feature: "
             << matches_single << "\n"
             << "Peptides assigned to multiple features: "
             << matches_multi << std::endl;

  }
  // Tool entry point: groups the features of all input files ("in") into one
  // consensus map and stores it in the "out" file.
  //
  // featureXML input: the file with the most features becomes the reference;
  // all files are then fed to the algorithm one by one, and the meta data
  // (file descriptions, protein IDs, unassigned peptide IDs) is collected in
  // input order and copied to the result.
  // Any other input type is loaded as consensusXML and grouped in one call.
  //
  // Returns ILLEGAL_PARAMETERS if no input file is given or if the input files
  // are not all of the same type, EXECUTION_OK otherwise.
  ExitCodes main_(int, const char **)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    StringList ins;
    ins = getStringList_("in");
    String out = getStringOption_("out");

    //-------------------------------------------------------------
    // check for valid input
    //-------------------------------------------------------------
    // ins[0] is read below, so an empty list must be rejected first
    if (ins.empty())
    {
      writeLog_("Error: No input files given!");
      return ILLEGAL_PARAMETERS;
    }

    // check if all input files have the correct type
    FileTypes::Type file_type = FileHandler::getType(ins[0]);
    for (Size i = 0; i < ins.size(); ++i)
    {
      if (FileHandler::getType(ins[i]) != file_type)
      {
        writeLog_("Error: All input files must be of the same type!");
        return ILLEGAL_PARAMETERS;
      }
    }

    //-------------------------------------------------------------
    // set up algorithm
    //-------------------------------------------------------------
    // automatic storage: released on every exit path, including the early
    // returns above and exceptions thrown while loading or storing files
    FeatureGroupingAlgorithmUnlabeled algorithm;

    Param algorithm_param = getParam_().copy("algorithm:", true);
    writeDebug_("Used algorithm parameters", algorithm_param, 3);
    algorithm.setParameters(algorithm_param);

    Size reference_index(0);
    //-------------------------------------------------------------
    // perform grouping
    //-------------------------------------------------------------
    // load input
    ConsensusMap out_map;
    StringList ms_run_locations;
    if (file_type == FileTypes::FEATUREXML)
    {
      // use map with highest number of features as reference:
      Size max_count(0);
      FeatureXMLFile f;
      for (Size i = 0; i < ins.size(); ++i)
      {
        Size s = f.loadSize(ins[i]);
        if (s > max_count)
        {
          max_count = s;
          reference_index = i;
        }
      }

      // Load reference map and input it to the algorithm
      UInt64 ref_id;
      Size ref_size;
      std::vector<PeptideIdentification> ref_pepids;
      std::vector<ProteinIdentification> ref_protids;
      {
        // scoped so that the reference map is freed before the other maps are loaded
        FeatureMap map_ref;
        FeatureXMLFile f_fxml_tmp;
        f_fxml_tmp.getOptions().setLoadConvexHull(false);
        f_fxml_tmp.getOptions().setLoadSubordinates(false);
        f_fxml_tmp.load(ins[reference_index], map_ref);
        algorithm.setReference(reference_index, map_ref);
        ref_id = map_ref.getUniqueId();
        ref_size = map_ref.size();
        ref_pepids = map_ref.getUnassignedPeptideIdentifications();
        ref_protids = map_ref.getProteinIdentifications();
      }

      ConsensusMap dummy;
      // go through all input files and add them to the result one by one
      for (Size i = 0; i < ins.size(); ++i)
      {

        FeatureXMLFile f_fxml_tmp;
        FeatureMap tmp_map;
        f_fxml_tmp.getOptions().setLoadConvexHull(false);
        f_fxml_tmp.getOptions().setLoadSubordinates(false);
        f_fxml_tmp.load(ins[i], tmp_map);

        // copy over information on the primary MS run
        StringList ms_runs;
        tmp_map.getPrimaryMSRunPath(ms_runs);
        ms_run_locations.insert(ms_run_locations.end(), ms_runs.begin(), ms_runs.end());

        if (i != reference_index)
        {
          algorithm.addToGroup(i, tmp_map);

          // store some meta-data about the maps in the "dummy" object -> try to
          // keep the same order as they were given in the input independent of
          // which map is the reference.

          dummy.getFileDescriptions()[i].filename = ins[i];
          dummy.getFileDescriptions()[i].size = tmp_map.size();
          dummy.getFileDescriptions()[i].unique_id = tmp_map.getUniqueId();

          // add protein identifications to result map
          dummy.getProteinIdentifications().insert(
            dummy.getProteinIdentifications().end(),
            tmp_map.getProteinIdentifications().begin(),
            tmp_map.getProteinIdentifications().end());

          // add unassigned peptide identifications to result map
          dummy.getUnassignedPeptideIdentifications().insert(
            dummy.getUnassignedPeptideIdentifications().end(),
            tmp_map.getUnassignedPeptideIdentifications().begin(),
            tmp_map.getUnassignedPeptideIdentifications().end());
        }
        else
        {
          // copy the meta-data from the reference map
          dummy.getFileDescriptions()[i].filename = ins[i];
          dummy.getFileDescriptions()[i].size = ref_size;
          dummy.getFileDescriptions()[i].unique_id = ref_id;

          // add protein identifications to result map
          dummy.getProteinIdentifications().insert(
            dummy.getProteinIdentifications().end(),
            ref_protids.begin(),
            ref_protids.end());

          // add unassigned peptide identifications to result map
          dummy.getUnassignedPeptideIdentifications().insert(
            dummy.getUnassignedPeptideIdentifications().end(),
            ref_pepids.begin(),
            ref_pepids.end());
        }
      }

      // get the resulting map
      out_map = algorithm.getResultMap();

      //
      // Copy back meta-data (Protein / Peptide ids / File descriptions)
      //

      // add protein identifications to result map
      out_map.getProteinIdentifications().insert(
        out_map.getProteinIdentifications().end(),
        dummy.getProteinIdentifications().begin(),
        dummy.getProteinIdentifications().end());

      // add unassigned peptide identifications to result map
      out_map.getUnassignedPeptideIdentifications().insert(
        out_map.getUnassignedPeptideIdentifications().end(),
        dummy.getUnassignedPeptideIdentifications().begin(),
        dummy.getUnassignedPeptideIdentifications().end());

      out_map.setFileDescriptions(dummy.getFileDescriptions());

      // canonical ordering for checking the results, and the ids have no real meaning anyway
      // the way this was done in DelaunayPairFinder and StablePairFinder
      // -> the same ordering as FeatureGroupingAlgorithmUnlabeled::group applies!
      out_map.sortByMZ();
      out_map.updateRanges();
    }
    else
    {
      vector<ConsensusMap> maps(ins.size());
      ConsensusXMLFile f;
      for (Size i = 0; i < ins.size(); ++i)
      {
        f.load(ins[i], maps[i]);
        StringList ms_runs;
        maps[i].getPrimaryMSRunPath(ms_runs);
        ms_run_locations.insert(ms_run_locations.end(), ms_runs.begin(), ms_runs.end());
      }
      // group (explicitly the base class overload for consensus maps)
      algorithm.FeatureGroupingAlgorithm::group(maps, out_map);

      // set file descriptions:
      bool keep_subelements = getFlag_("keep_subelements");
      if (!keep_subelements)
      {
        for (Size i = 0; i < ins.size(); ++i)
        {
          out_map.getFileDescriptions()[i].filename = ins[i];
          out_map.getFileDescriptions()[i].size = maps[i].size();
          out_map.getFileDescriptions()[i].unique_id = maps[i].getUniqueId();
        }
      }
      else
      {
        // components of the output map are not the input maps themselves, but
        // the components of the input maps:
        algorithm.transferSubelements(maps, out_map);
      }
    }

    // assign unique ids
    out_map.applyMemberFunction(&UniqueIdInterface::setUniqueId);

    // annotate output with data processing info
    addDataProcessing_(out_map, getProcessingInfo_(DataProcessing::FEATURE_GROUPING));

    out_map.setPrimaryMSRunPath(ms_run_locations);
    // write output
    ConsensusXMLFile().store(out, out_map);

    // some statistics: number of consensus features per group size
    map<Size, UInt> num_consfeat_of_size;
    for (ConsensusMap::const_iterator cmit = out_map.begin(); cmit != out_map.end(); ++cmit)
    {
      ++num_consfeat_of_size[cmit->size()];
    }

    // report the largest groups first
    LOG_INFO << "Number of consensus features:" << endl;
    for (map<Size, UInt>::reverse_iterator i = num_consfeat_of_size.rbegin(); i != num_consfeat_of_size.rend(); ++i)
    {
      LOG_INFO << "  of size " << setw(2) << i->first << ": " << setw(6) << i->second << endl;
    }
    LOG_INFO << "  total:      " << setw(6) << out_map.size() << endl;

    return EXECUTION_OK;
  }
  void FeatureGroupingAlgorithmUnlabeled::group(const std::vector<FeatureMap> & maps, ConsensusMap & out)
  {
    // check that the number of maps is ok
    if (maps.size() < 2)
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "At least two maps must be given!");
    }

    // define reference map (the one with most peaks)
    Size reference_map_index = 0;
    Size max_count = 0;
    for (Size m = 0; m < maps.size(); ++m)
    {
      if (maps[m].size() > max_count)
      {
        max_count = maps[m].size();
        reference_map_index = m;
      }
    }

    std::vector<ConsensusMap> input(2);

    // build a consensus map of the elements of the reference map (contains only singleton consensus elements)
    MapConversion::convert(reference_map_index, maps[reference_map_index],
                          input[0]);

    // loop over all other maps, extend the groups
    StablePairFinder pair_finder;
    pair_finder.setParameters(param_.copy("", true));

    for (Size i = 0; i < maps.size(); ++i)
    {
      if (i != reference_map_index)
      {
        MapConversion::convert(i, maps[i], input[1]);
        // compute the consensus of the reference map and map i
        ConsensusMap result;
        pair_finder.run(input, result);
        input[0].swap(result);
      }
    }

    // replace result with temporary map
    out.swap(input[0]);
    // copy back the input maps (they have been deleted while swapping)
    out.getFileDescriptions() = input[0].getFileDescriptions();

    // add protein IDs and unassigned peptide IDs to the result map here,
    // to keep the same order as the input maps (useful for output later)
    for (std::vector<FeatureMap>::const_iterator map_it = maps.begin();
         map_it != maps.end(); ++map_it)
    {
      // add protein identifications to result map
      out.getProteinIdentifications().insert(
        out.getProteinIdentifications().end(),
        map_it->getProteinIdentifications().begin(),
        map_it->getProteinIdentifications().end());

      // add unassigned peptide identifications to result map
      out.getUnassignedPeptideIdentifications().insert(
        out.getUnassignedPeptideIdentifications().end(),
        map_it->getUnassignedPeptideIdentifications().begin(),
        map_it->getUnassignedPeptideIdentifications().end());
    }

    // canonical ordering for checking the results, and the ids have no real meaning anyway
#if 1 // the way this was done in DelaunayPairFinder and StablePairFinder
    out.sortByMZ();
#else
    out.sortByQuality();
    out.sortByMaps();
    out.sortBySize();
#endif

    return;
  }
Exemplo n.º 8
0
  // Quantifies the protein hits of one identification run from the consensus
  // features whose peptide identifications reference them.
  //
  // consensus_map:              map whose consensus features carry the peptide
  //                             identifications; the selected protein
  //                             identification run is modified in place.
  // protein_idenfication_index: index of the run inside
  //                             consensus_map.getProteinIdentifications().
  // reference_map:              map index whose handle intensity serves as
  //                             denominator of all ratios.
  //
  // For every protein hit that has at least one matching consensus feature,
  // these meta values are set on the hit:
  //   "ratio_<map index>" - median of the intensity ratios (map / reference),
  //   "coverage"          - summed peptide lengths in percent of the protein length,
  //   "hits"              - number of consensus features with a matching peptide.
  void ProteinInference::infer_(ConsensusMap & consensus_map,
                                const size_t protein_idenfication_index,
                                const UInt reference_map)
  {

    ProteinIdentification & protein_ident = consensus_map.getProteinIdentifications()[protein_idenfication_index];
    for (size_t i = 0; i < protein_ident.getHits().size(); ++i)
    {
      // Protein Accession
      String accession = protein_ident.getHits()[i].getAccession();

      // consensus feature -> peptide hit
      Map<size_t, PeptideHit> consensus_to_peptide;

      // search for it in consensus elements:
      for (size_t i_cm = 0; i_cm < consensus_map.size(); ++i_cm)
      {
        // candidate hits for this consensus feature, one per accepted peptide identification
        std::vector<PeptideHit> peptide_hits;
        for (std::vector<PeptideIdentification>::iterator it_pepid = consensus_map[i_cm].getPeptideIdentifications().begin();
             it_pepid != consensus_map[i_cm].getPeptideIdentifications().end();
             ++it_pepid)
        {
          // are Protein- and PeptideIdentification from the same search engine run?
          if (it_pepid->getIdentifier() != protein_ident.getIdentifier())
            continue;

          std::vector<PeptideHit> peptide_hits_local;

          it_pepid->getReferencingHits(accession, peptide_hits_local);

          if (peptide_hits_local.empty())
            continue;

          // the hit at index 0 is taken when sortByUnique_ returns true
          if (sortByUnique_(peptide_hits_local, it_pepid->isHigherScoreBetter())) // we found a unique peptide
          {
            peptide_hits.push_back(peptide_hits_local[0]);
          }

        }

        // if several PeptideIdentifications (==Spectra) were assigned to current ConsensusElement
        // --> take the best (as above), e.g. in SILAC this could happen
        // TODO: better idea?
        if (!peptide_hits.empty())
        {
          // score orientation is taken from the first peptide identification of the feature
          if (sortByUnique_(peptide_hits, consensus_map[i_cm].getPeptideIdentifications()[0].isHigherScoreBetter())) //found a unique peptide for current ConsensusElement
          {
            consensus_to_peptide[i_cm] = peptide_hits[0];
#ifdef DEBUG_INFERENCE
            std::cout << "assign peptide " <<  peptide_hits[0].getSequence() << " to Protein " << accession << std::endl;
#endif
          }
        }

      }       // ! ConsensusMap loop

      // no peptides found that match current Protein
      if (consensus_to_peptide.empty())
        continue;

      // Use all matching ConsensusElements to derive a quantitation for current protein
      // build up ratios for every map vs reference
      double coverage = 0;
      // map index -> intensity ratios (handle / reference handle)
      Map<Size, std::vector<IntensityType> > ratios;

      // number of unique peptides pointing to current protein
      UInt coverage_count = (UInt)consensus_to_peptide.size();

      for (Map<size_t, PeptideHit>::iterator it_pephits = consensus_to_peptide.begin();
           it_pephits != consensus_to_peptide.end();
           ++it_pephits)
      {
        // peptide lengths are simply summed up; features without a reference
        // handle still count towards the coverage
        coverage += it_pephits->second.getSequence().size();
        const ConsensusFeature::HandleSetType & handles = consensus_map[it_pephits->first].getFeatures();
        //search if reference is present
        ConsensusFeature::HandleSetType::const_iterator it_ref = handles.end();
        for (ConsensusFeature::HandleSetType::const_iterator it = handles.begin();
             it != handles.end();
             ++it)
        {
          if (it->getMapIndex() == reference_map)
          {
            it_ref = it;
            break;
          }
        }

        // did not find a reference
        // TODO assume intensity==0 instead??
        if (it_ref == handles.end())
          continue;

        // NOTE(review): a reference intensity of 0 is not checked before the
        // division - confirm whether that can occur
        for (ConsensusFeature::HandleSetType::const_iterator it = handles.begin();
             it != handles.end();
             ++it)
        {
          ratios[it->getMapIndex()].push_back(it->getIntensity() / it_ref->getIntensity());
        }

      }

      // sort ratios map-wise and take median
      for (ConsensusMap::FileDescriptions::const_iterator it_file = consensus_map.getFileDescriptions().begin();
           it_file != consensus_map.getFileDescriptions().end();
           ++it_file)
      {
        if (ratios.has(it_file->first))
        {
          //sort intensity ratios for map #it_file->first
          std::sort(ratios[it_file->first].begin(), ratios[it_file->first].end());
          //take median (element at size/2, i.e. the upper middle one for an even count)
          IntensityType protein_ratio = ratios[it_file->first][ratios[it_file->first].size() / 2];

          //TODO if ratios have high variance emit a warning!

          protein_ident.getHits()[i].setMetaValue(String("ratio_") + String(it_file->first), protein_ratio);
        }

      }       // ! map loop

      // % coverage of protein by peptides
      // NOTE(review): an empty protein sequence makes the divisor 0 - confirm
      // whether hits without sequence can reach this point
      coverage /= DoubleReal(protein_ident.getHits()[i].getSequence().size()) / 100;

      protein_ident.getHits()[i].setMetaValue("coverage", coverage);
      protein_ident.getHits()[i].setMetaValue("hits", coverage_count);

    }     // ! Protein loop



    // NOTE(review): the original comment here was left unfinished and referred
    // to a "protein_to_peptides" mapping that does not exist in this function.

  }
Exemplo n.º 9
0
  void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps, ConsensusMap& result_map)
  {
    if (input_maps.size() != 1)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly one input map required");
    if (result_map.getFileDescriptions().size() != 2)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "two file descriptions required");
    if (result_map.getFileDescriptions().begin()->second.filename != result_map.getFileDescriptions().rbegin()->second.filename)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the two file descriptions have to contain the same file name");
    checkIds_(input_maps);

    //look up the light and heavy index
    Size light_index = numeric_limits<Size>::max();
    Size heavy_index = numeric_limits<Size>::max();
    for (ConsensusMap::FileDescriptions::const_iterator it = result_map.getFileDescriptions().begin();
         it != result_map.getFileDescriptions().end();
         ++it)
    {
      if (it->second.label == "heavy")
      {
        heavy_index = it->first;
      }
      else if (it->second.label == "light")
      {
        light_index = it->first;
      }
    }
    if (light_index == numeric_limits<Size>::max() || heavy_index == numeric_limits<Size>::max())
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the input maps have to be labeled 'light' and 'heavy'");
    }

    result_map.clear(false);

    // sort consensus features by RT (and MZ) to speed up searching afterwards
    typedef ConstRefVector<ConsensusMap> RefMap;
    RefMap model_ref(input_maps[0].begin(), input_maps[0].end());
    model_ref.sortByPosition();

    //calculate matches
    ConsensusMap matches;
    //settings
    double rt_pair_dist = param_.getValue("rt_pair_dist");
    double rt_dev_low = param_.getValue("rt_dev_low");
    double rt_dev_high = param_.getValue("rt_dev_high");
    double mz_dev = param_.getValue("mz_dev");
    DoubleList mz_pair_dists = param_.getValue("mz_pair_dists");
    bool mrm = param_.getValue("mrm").toBool();

    //estimate RT parameters
    if (param_.getValue("rt_estimate") == "true")
    {
      //find all possible RT distances of features with the same charge and a good m/z distance
      vector<double> dists;
      dists.reserve(model_ref.size());
      for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
      {
        for (RefMap::const_iterator it2 = model_ref.begin(); it2 != model_ref.end(); ++it2)
        {
          for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
          {
            double mz_pair_dist = *dist_it;
            if (it2->getCharge() == it->getCharge()
               && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
               && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev)
            {
              dists.push_back(it2->getRT() - it->getRT());
            }
          }
        }
      }
      if (dists.empty())
      {
        cout << "Warning: Could not find pairs for RT distance estimation. The manual settings are used!" << endl;
      }
      else
      {
        if (dists.size() < 50)
        {
          cout << "Warning: Found only " << dists.size() << " pairs. The estimated shift and std deviation are probably not reliable!" << endl;
        }
        //--------------------------- estimate initial parameters of fit ---------------------------
        GaussFitter::GaussFitResult result(-1, -1, -1);
        //first estimate of the optimal shift: median of the distances
        sort(dists.begin(), dists.end());
        Size median_index = dists.size() / 2;
        result.x0 = dists[median_index];
        //create histogram of distances
        //consider only the maximum of pairs, centered around the optimal shift
        Size max_pairs = model_ref.size() / 2;
        Size start_index = (Size) max((SignedSize)0, (SignedSize)(median_index - max_pairs / 2));
        Size end_index = (Size) min((SignedSize)(dists.size() - 1), (SignedSize)(median_index + max_pairs / 2));
        double start_value = dists[start_index];
        double end_value = dists[end_index];
        double bin_step = fabs(end_value - start_value) / 99.999; //ensure that we have 100 bins
        Math::Histogram<> hist(start_value, end_value, bin_step);
        //std::cout << "HIST from " << start_value << " to " << end_value << " (bin size " << bin_step << ")" << endl;
        for (Size i = start_index; i <= end_index; ++i)
        {
          hist.inc(dists[i]);
        }
        //cout << hist << endl;
        dists.clear();
        //determine median of bins (uniform background distribution)
        vector<Size> bins(hist.begin(), hist.end());
        sort(bins.begin(), bins.end());
        Size bin_median = bins[bins.size() / 2];
        bins.clear();
        //estimate scale A: maximum of the histogram
        Size max_value = hist.maxValue();
        result.A = max_value - bin_median;
        //overwrite estimate of x0 with the position of the highest bin
        for (Size i = 0; i < hist.size(); ++i)
        {
          if (hist[i] == max_value)
          {
            result.x0 = hist.centerOfBin(i);
            break;
          }
        }
        //estimate sigma: first time the count is less or equal the median count in the histogram
        double pos = result.x0;
        while (pos > start_value && hist.binValue(pos) > bin_median)
        {
          pos -= bin_step;
        }
        double sigma_low =  result.x0 - pos;
        pos = result.x0;
        while (pos<end_value&& hist.binValue(pos)> bin_median)
        {
          pos += bin_step;
        }
        double sigma_high = pos - result.x0;
        result.sigma = (sigma_high + sigma_low) / 6.0;
        //cout << "estimated optimal RT distance (before fit): " << result.x0 << endl;
        //cout << "estimated allowed deviation (before fit): " << result.sigma*3.0 << endl;
        //--------------------------- do gauss fit ---------------------------
        vector<DPosition<2> > points(hist.size());
        for (Size i = 0; i < hist.size(); ++i)
        {
          points[i][0] = hist.centerOfBin(i);
          points[i][1] = max(0u, hist[i]);
        }
        GaussFitter fitter;
        fitter.setInitialParameters(result);
        result = fitter.fit(points);
        cout << "estimated optimal RT distance: " << result.x0 << endl;
        cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl;
        rt_pair_dist = result.x0;
        rt_dev_low = fabs(result.sigma) * 3.0;
        rt_dev_high = fabs(result.sigma) * 3.0;
      }
    }


    // check each feature
    for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
    {
      for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
      {
        double mz_pair_dist = *dist_it;
        RefMap::const_iterator it2 = lower_bound(model_ref.begin(), model_ref.end(), it->getRT() + rt_pair_dist - rt_dev_low, ConsensusFeature::RTLess());
        while (it2 != model_ref.end() && it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high)
        {
          // if in mrm mode, we need to compare precursor mass difference and fragment mass difference, charge remains the same

          double prec_mz_diff(0);
          if (mrm)
          {
            prec_mz_diff = fabs((double)it2->getMetaValue("MZ") - (double)it->getMetaValue("MZ"));
            if (it->getCharge() != 0)
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge());
            }
            else
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist);
            }
          }

          bool mrm_correct_dist(false);
          double frag_mz_diff = fabs(it->getMZ() - it2->getMZ());

          //cerr << it->getRT() << " charge1=" << it->getCharge() << ", charge2=" << it2->getCharge() << ", prec_diff=" << prec_mz_diff << ", frag_diff=" << frag_mz_diff << endl;

          if (mrm &&
              it2->getCharge() == it->getCharge() &&
              prec_mz_diff < mz_dev &&
              (frag_mz_diff < mz_dev || fabs(frag_mz_diff - mz_pair_dist) < mz_dev))
          {
            mrm_correct_dist = true;
            //cerr << "mrm_correct_dist" << endl;
          }

          if ((mrm && mrm_correct_dist) || (!mrm &&
                                            it2->getCharge() == it->getCharge() &&
                                            it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev &&
                                            it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev
                                            ))
          {
            //cerr << "dist correct" << endl;
            double score = sqrt(
              PValue_(it2->getMZ() - it->getMZ(), mz_pair_dist / it->getCharge(), mz_dev, mz_dev) *
              PValue_(it2->getRT() - it->getRT(), rt_pair_dist, rt_dev_low, rt_dev_high)
              );

            // Note: we used to copy the id from the light feature here, but that strategy does not generalize to more than two labels.
            // We might want to report consensus features where the light one is missing but more than one heavier variant was found.
            // Also, the old strategy is inconsistent with what was done in the unlabeled case.  Thus now we assign a new unique id here.
            matches.push_back(ConsensusFeature());
            matches.back().setUniqueId();

            matches.back().insert(light_index, *it);
            matches.back().clearMetaInfo();
            matches.back().insert(heavy_index, *it2);
            matches.back().setQuality(score);
            matches.back().setCharge(it->getCharge());
            matches.back().computeMonoisotopicConsensus();
          }
          ++it2;
        }
      }
    }

    //compute best pairs
    // - sort matches by quality
    // - take highest-quality matches first (greedy) and mark them as used
    set<Size> used_features;
    matches.sortByQuality(true);
    for (ConsensusMap::const_iterator match = matches.begin(); match != matches.end(); ++match)
    {
      //check if features are not used yet
      if (used_features.find(match->begin()->getUniqueId()) == used_features.end() &&
          used_features.find(match->rbegin()->getUniqueId()) == used_features.end()
          )
      {
        //if unused, add it to the final set of elements
        result_map.push_back(*match);
        used_features.insert(match->begin()->getUniqueId());
        used_features.insert(match->rbegin()->getUniqueId());
      }
    }

    //Add protein identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getProteinIdentifications().insert(result_map.getProteinIdentifications().end(), input_maps[i].getProteinIdentifications().begin(), input_maps[i].getProteinIdentifications().end());
    }

    //Add unassigned peptide identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getUnassignedPeptideIdentifications().insert(result_map.getUnassignedPeptideIdentifications().end(), input_maps[i].getUnassignedPeptideIdentifications().begin(), input_maps[i].getUnassignedPeptideIdentifications().end());
    }

    // Very useful for checking the results, and the ids have no real meaning anyway
    result_map.sortByMZ();
  }
Exemplo n.º 10
0
  void IBSpectraFile::store(const String& filename, const ConsensusMap& cm)
  {
    // typdefs for shorter code
    typedef std::vector<ProteinHit>::iterator ProtHitIt;

    // general settings .. do we need to expose these?
    // ----------------------------------------------------------------------
    /// Allow also non-unique peptides to be exported
    bool allow_non_unique = true;
    /// Intensities below this value will be set to 0.0 to avoid numerical problems when quantifying
    double intensity_threshold = 0.00001;
    // ----------------------------------------------------------------------


    // guess experiment type
    boost::shared_ptr<IsobaricQuantitationMethod> quantMethod = guessExperimentType_(cm);

    // we need the protein identifications to reference the protein names
    ProteinIdentification protIdent;
    bool has_proteinIdentifications = false;
    if (cm.getProteinIdentifications().size() > 0)
    {
      protIdent = cm.getProteinIdentifications()[0];
      has_proteinIdentifications = true;
    }

    // start the file by adding the tsv header
    TextFile textFile;
    textFile.addLine(ListUtils::concatenate(constructHeader_(*quantMethod), "\t"));

    for (ConsensusMap::ConstIterator cm_iter = cm.begin();
         cm_iter != cm.end();
         ++cm_iter)
    {
      const ConsensusFeature& cFeature = *cm_iter;
      std::vector<IdCSV> entries;

      /// 1st we extract the identification information from the consensus feature
      if (cFeature.getPeptideIdentifications().size() == 0 || !has_proteinIdentifications)
      {
        // we store unidentified hits anyway, because the iTRAQ quant is still helpful for normalization
        entries.push_back(IdCSV());
      }
      else
      {
        // protein name:
        const PeptideHit& peptide_hit = cFeature.getPeptideIdentifications()[0].getHits()[0];
        std::set<String> protein_accessions = peptide_hit.extractProteinAccessions();
        if (protein_accessions.size() != 1)
        {
          if (!allow_non_unique) continue; // we only want unique peptides
        }

        for (std::set<String>::const_iterator prot_ac = protein_accessions.begin(); prot_ac != protein_accessions.end(); ++prot_ac)
        {
          IdCSV entry;
          entry.charge = cFeature.getPeptideIdentifications()[0].getHits()[0].getCharge();
          entry.peptide = cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString();
          entry.theo_mass = cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence().getMonoWeight(Residue::Full, cFeature.getPeptideIdentifications()[0].getHits()[0].getCharge());

          // write modif
          entry.modif = getModifString_(cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence());

          ProtHitIt proteinHit = protIdent.findHit(*prot_ac);
          if (proteinHit == protIdent.getHits().end())
          {
            std::cerr << "Protein referenced in peptide not found...\n";
            continue; // protein not found
          }

          entry.accession = proteinHit->getAccession();
          entries.push_back(entry);
        }
      }

      // 2nd we add the quantitative information of the channels

      // .. skip features with 0 intensity
      if (cFeature.getIntensity() == 0)
      {
        continue;
      }

      for (std::vector<IdCSV>::iterator entry = entries.begin();
           entry != entries.end();
           ++entry)
      {
        // set parent intensity
        entry->parent_intens = cFeature.getIntensity();
        entry->retention_time = cFeature.getRT();
        entry->spectrum = cFeature.getUniqueId();
        entry->exp_mass = cFeature.getMZ();

        // create output line
        StringList currentLine;

        // add entry to currentLine
        entry->toStringList(currentLine);

        // extract channel intensities and positions
        std::map<Int, double> intensityMap;
        ConsensusFeature::HandleSetType features = cFeature.getFeatures();

        for (ConsensusFeature::HandleSetType::const_iterator fIt = features.begin();
             fIt != features.end();
             ++fIt)
        {
          intensityMap[Int(fIt->getMZ())] = (fIt->getIntensity() > intensity_threshold ? fIt->getIntensity() : 0.0);
        }
        for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
             it != quantMethod->getChannelInformation().end();
             ++it)
        {
          currentLine.push_back(String(it->center));
        }
        for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
             it != quantMethod->getChannelInformation().end();
             ++it)
        {
          currentLine.push_back(String(intensityMap[int(it->center)]));
        }

        textFile.addLine(ListUtils::concatenate(currentLine, "\t"));
      }
    }

    // write to file
    textFile.store(filename);
  }
Exemplo n.º 11
0
  /// Tool entry point: loads the file given by "in", moves all peptide and
  /// protein identifications out of the loaded data, optionally writes the
  /// stripped data to "out" and optionally writes the collected
  /// identifications to "id_out". At least one of "out"/"id_out" must be set.
  ExitCodes main_(int, const char **)
  {
    String in = getStringOption_("in");
    String out = getStringOption_("out");
    String id_out = getStringOption_("id_out");

    const bool write_data = !out.empty();
    const bool write_ids = !id_out.empty();

    if (!write_data && !write_ids)
    {
      throw Exception::RequiredParameterNotGiven(__FILE__, __LINE__,
                                                 __PRETTY_FUNCTION__,
                                                 "out/id_out");
    }

    // identifications collected from the input, in the order they are encountered
    vector<ProteinIdentification> proteins;
    vector<PeptideIdentification> peptides;

    const FileTypes::Type in_type = FileHandler::getType(in);

    switch (in_type)
    {
      case FileTypes::MZML:
      {
        MSExperiment<> experiment;
        MzMLFile().load(in, experiment);
        // what about unassigned peptide IDs?
        for (MSExperiment<>::Iterator spectrum = experiment.begin();
             spectrum != experiment.end(); ++spectrum)
        {
          vector<PeptideIdentification>& spectrum_ids =
            spectrum->getPeptideIdentifications();
          peptides.insert(peptides.end(), spectrum_ids.begin(),
                          spectrum_ids.end());
          spectrum_ids.clear();
        }
        experiment.getProteinIdentifications().swap(proteins);
        if (write_data)
        {
          addDataProcessing_(experiment,
                             getProcessingInfo_(DataProcessing::FILTERING));
          MzMLFile().store(out, experiment);
        }
        break;
      }

      case FileTypes::FEATUREXML:
      {
        FeatureMap features;
        FeatureXMLFile().load(in, features);
        // unassigned IDs come first, IDs attached to features are appended
        features.getUnassignedPeptideIdentifications().swap(peptides);
        for (FeatureMap::Iterator feature = features.begin();
             feature != features.end(); ++feature)
        {
          vector<PeptideIdentification>& feature_ids =
            feature->getPeptideIdentifications();
          peptides.insert(peptides.end(), feature_ids.begin(),
                          feature_ids.end());
          feature_ids.clear();
        }
        features.getProteinIdentifications().swap(proteins);
        if (write_data)
        {
          addDataProcessing_(features,
                             getProcessingInfo_(DataProcessing::FILTERING));
          FeatureXMLFile().store(out, features);
        }
        break;
      }

      default: // consensusXML
      {
        ConsensusMap consensus;
        ConsensusXMLFile().load(in, consensus);
        // unassigned IDs come first, IDs attached to consensus features are appended
        consensus.getUnassignedPeptideIdentifications().swap(peptides);
        for (ConsensusMap::Iterator cons_feature = consensus.begin();
             cons_feature != consensus.end(); ++cons_feature)
        {
          vector<PeptideIdentification>& cons_ids =
            cons_feature->getPeptideIdentifications();
          peptides.insert(peptides.end(), cons_ids.begin(), cons_ids.end());
          cons_ids.clear();
        }
        consensus.getProteinIdentifications().swap(proteins);
        if (write_data)
        {
          addDataProcessing_(consensus,
                             getProcessingInfo_(DataProcessing::FILTERING));
          ConsensusXMLFile().store(out, consensus);
        }
        break;
      }
    }

    if (write_ids)
    {
      // IDMapper can match a peptide ID to several overlapping features,
      // resulting in duplicates; this shouldn't be the case for peak data
      if (in_type != FileTypes::MZML)
      {
        removeDuplicates_(peptides);
      }
      IdXMLFile().store(id_out, proteins, peptides);
    }

    return EXECUTION_OK;
  }
Exemplo n.º 12
0
TEST_EQUAL(map.getFileDescriptions()[1].getMetaValue("name5") == DataValue("value5"), true)
TEST_EQUAL(map.getFileDescriptions()[1].getMetaValue("name6") == DataValue(6.0), true)
//data processing
TEST_EQUAL(map.getDataProcessing().size(), 2)
TEST_STRING_EQUAL(map.getDataProcessing()[0].getSoftware().getName(), "Software1")
TEST_STRING_EQUAL(map.getDataProcessing()[0].getSoftware().getVersion(), "0.91a")
TEST_EQUAL(map.getDataProcessing()[0].getProcessingActions().size(), 1)
TEST_EQUAL(map.getDataProcessing()[0].getProcessingActions().count(DataProcessing::DEISOTOPING), 1)
TEST_STRING_EQUAL(map.getDataProcessing()[0].getMetaValue("name"), "dataProcessing")
TEST_STRING_EQUAL(map.getDataProcessing()[1].getSoftware().getName(), "Software2")
TEST_STRING_EQUAL(map.getDataProcessing()[1].getSoftware().getVersion(), "0.92a")
TEST_EQUAL(map.getDataProcessing()[1].getProcessingActions().size(), 2)
TEST_EQUAL(map.getDataProcessing()[1].getProcessingActions().count(DataProcessing::SMOOTHING), 1)
TEST_EQUAL(map.getDataProcessing()[1].getProcessingActions().count(DataProcessing::BASELINE_REDUCTION), 1)
//protein identifications
TEST_EQUAL(map.getProteinIdentifications().size(), 2)
TEST_EQUAL(map.getProteinIdentifications()[0].getHits().size(), 2)
TEST_EQUAL(map.getProteinIdentifications()[0].getHits()[0].getSequence(), "ABCDEFG")
TEST_EQUAL(map.getProteinIdentifications()[0].getHits()[1].getSequence(), "HIJKLMN")
TEST_EQUAL(map.getProteinIdentifications()[1].getHits().size(), 1)
TEST_EQUAL(map.getProteinIdentifications()[1].getHits()[0].getSequence(), "OPQREST")
//peptide identifications
TEST_EQUAL(map[0].getPeptideIdentifications().size(), 2)
TEST_EQUAL(map[0].getPeptideIdentifications()[0].getHits().size(), 1)
TEST_EQUAL(map[0].getPeptideIdentifications()[0].getHits()[0].getSequence(), "A")
TEST_EQUAL(map[0].getPeptideIdentifications()[1].getHits().size(), 2)
TEST_EQUAL(map[0].getPeptideIdentifications()[1].getHits()[0].getSequence(), "C")
TEST_EQUAL(map[0].getPeptideIdentifications()[1].getHits()[1].getSequence(), "D")
TEST_EQUAL(map[1].getPeptideIdentifications().size(), 1)
TEST_EQUAL(map[1].getPeptideIdentifications()[0].getHits().size(), 1)
TEST_EQUAL(map[1].getPeptideIdentifications()[0].getHits()[0].getSequence(), "E")