Ejemplo n.º 1
0
  /**
    @brief Tool entry point: loads identifications, applies every requested
           filter, cleans up the result and writes it to the output file.

    Input and output paths are taken from the "in" and "out" options. Each
    filter below is only applied when its option/flag is set (see the guard
    in front of each step). Peptide-level filters run first, then
    protein-level filters, followed by a clean-up stage that removes
    unreferenced entries, refreshes hit ranks and updates protein groupings.

    Both parameters are unused (unnamed).

    @return ILLEGAL_PARAMETERS if a negative bound is given for 'length' or
            'best:n_to_m_peptide_hits'; EXECUTION_OK otherwise.
  */
  ExitCodes main_(int, const char**)
  {
    String inputfile_name = getStringOption_("in");
    String outputfile_name = getStringOption_("out");

    vector<ProteinIdentification> proteins;
    vector<PeptideIdentification> peptides;
    IdXMLFile().load(inputfile_name, proteins, peptides);

    // remember the initial counts for the before/after summary at the end:
    Size n_prot_ids = proteins.size();
    Size n_prot_hits = IDFilter::countHits(proteins);
    Size n_pep_ids = peptides.size();
    Size n_pep_hits = IDFilter::countHits(peptides);


    // Filtering peptide identification according to set criteria

    // default range is unbounded on both sides:
    double rt_high = numeric_limits<double>::infinity();
    double rt_low = -rt_high;
    if (parseRange_(getStringOption_("precursor:rt"), rt_low, rt_high))
    {
      LOG_INFO << "Filtering peptide IDs by precursor RT..." << endl;
      IDFilter::filterPeptidesByRT(peptides, rt_low, rt_high);
    }

    double mz_high = numeric_limits<double>::infinity();
    double mz_low = -mz_high;
    if (parseRange_(getStringOption_("precursor:mz"), mz_low, mz_high))
    {
      // terminate the line like every other progress message does (the
      // missing "endl" made the next log entry continue on this line):
      LOG_INFO << "Filtering peptide IDs by precursor m/z..." << endl;
      IDFilter::filterPeptidesByMZ(peptides, mz_low, mz_high);
    }


    // Filtering peptide hits according to set criteria

    if (getFlag_("unique"))
    {
      LOG_INFO << "Removing duplicate peptide hits..." << endl;
      IDFilter::removeDuplicatePeptideHits(peptides);
    }

    if (getFlag_("unique_per_protein"))
    {
      LOG_INFO << "Filtering peptides by unique match to a protein..." << endl;
      IDFilter::keepUniquePeptidesPerProtein(peptides);
    }

    double peptide_significance = getDoubleOption_("thresh:pep");
    if (peptide_significance > 0)
    {
      LOG_INFO << "Filtering by peptide significance threshold..." << endl;
      IDFilter::filterHitsBySignificance(peptides, peptide_significance);
    }

    double pred_rt_pv = getDoubleOption_("rt:p_value");
    if (pred_rt_pv > 0)
    {
      LOG_INFO << "Filtering by RT prediction p-value..." << endl;
      IDFilter::filterPeptidesByRTPredictPValue(
        peptides, "predicted_RT_p_value", pred_rt_pv);
    }

    double pred_rt_pv_1d = getDoubleOption_("rt:p_value_1st_dim");
    if (pred_rt_pv_1d > 0)
    {
      LOG_INFO << "Filtering by RT prediction p-value (first dim.)..." << endl;
      IDFilter::filterPeptidesByRTPredictPValue(
        peptides, "predicted_RT_p_value_first_dim", pred_rt_pv_1d);
    }

    String whitelist_fasta = getStringOption_("whitelist:proteins").trim();
    if (!whitelist_fasta.empty())
    {
      LOG_INFO << "Filtering by protein whitelisting (FASTA input)..." << endl;
      // load protein accessions from FASTA file:
      vector<FASTAFile::FASTAEntry> fasta;
      FASTAFile().load(whitelist_fasta, fasta);
      set<String> accessions;
      for (vector<FASTAFile::FASTAEntry>::iterator it = fasta.begin();
           it != fasta.end(); ++it)
      {
        accessions.insert(it->identifier);
      }
      IDFilter::keepHitsMatchingProteins(peptides, accessions);
      IDFilter::keepHitsMatchingProteins(proteins, accessions);
    }

    vector<String> whitelist_accessions =
      getStringList_("whitelist:protein_accessions");
    if (!whitelist_accessions.empty())
    {
      LOG_INFO << "Filtering by protein whitelisting (accessions input)..."
               << endl;
      set<String> accessions(whitelist_accessions.begin(),
                             whitelist_accessions.end());
      IDFilter::keepHitsMatchingProteins(peptides, accessions);
      IDFilter::keepHitsMatchingProteins(proteins, accessions);
    }

    String whitelist_peptides = getStringOption_("whitelist:peptides").trim();
    if (!whitelist_peptides.empty())
    {
      LOG_INFO << "Filtering by inclusion peptide whitelisting..." << endl;
      vector<PeptideIdentification> inclusion_peptides;
      vector<ProteinIdentification> inclusion_proteins; // ignored
      IdXMLFile().load(whitelist_peptides, inclusion_proteins,
                       inclusion_peptides);
      bool ignore_mods = getFlag_("whitelist:ignore_modifications");
      IDFilter::keepPeptidesWithMatchingSequences(peptides, inclusion_peptides,
                                                  ignore_mods);
    }

    vector<String> whitelist_mods = getStringList_("whitelist:modifications");
    if (!whitelist_mods.empty())
    {
      LOG_INFO << "Filtering peptide IDs by modification whitelisting..."
               << endl;
      set<String> good_mods(whitelist_mods.begin(), whitelist_mods.end());
      IDFilter::keepPeptidesWithMatchingModifications(peptides, good_mods);
    }

    String blacklist_fasta = getStringOption_("blacklist:proteins").trim();
    if (!blacklist_fasta.empty())
    {
      LOG_INFO << "Filtering by protein blacklisting (FASTA input)..." << endl;
      // load protein accessions from FASTA file:
      vector<FASTAFile::FASTAEntry> fasta;
      FASTAFile().load(blacklist_fasta, fasta);
      set<String> accessions;
      for (vector<FASTAFile::FASTAEntry>::iterator it = fasta.begin();
           it != fasta.end(); ++it)
      {
        accessions.insert(it->identifier);
      }
      IDFilter::removeHitsMatchingProteins(peptides, accessions);
      IDFilter::removeHitsMatchingProteins(proteins, accessions);
    }

    vector<String> blacklist_accessions =
      getStringList_("blacklist:protein_accessions");
    if (!blacklist_accessions.empty())
    {
      LOG_INFO << "Filtering by protein blacklisting (accessions input)..."
               << endl;
      set<String> accessions(blacklist_accessions.begin(),
                             blacklist_accessions.end());
      IDFilter::removeHitsMatchingProteins(peptides, accessions);
      IDFilter::removeHitsMatchingProteins(proteins, accessions);
    }

    String blacklist_peptides = getStringOption_("blacklist:peptides").trim();
    if (!blacklist_peptides.empty())
    {
      LOG_INFO << "Filtering by exclusion peptide blacklisting..." << endl;
      vector<PeptideIdentification> exclusion_peptides;
      vector<ProteinIdentification> exclusion_proteins; // ignored
      IdXMLFile().load(blacklist_peptides, exclusion_proteins,
                       exclusion_peptides);
      bool ignore_mods = getFlag_("blacklist:ignore_modifications");
      IDFilter::removePeptidesWithMatchingSequences(
        peptides, exclusion_peptides, ignore_mods);
    }

    vector<String> blacklist_mods = getStringList_("blacklist:modifications");
    if (!blacklist_mods.empty())
    {
      LOG_INFO << "Filtering peptide IDs by modification blacklisting..."
               << endl;
      set<String> bad_mods(blacklist_mods.begin(), blacklist_mods.end());
      IDFilter::removePeptidesWithMatchingModifications(peptides, bad_mods);
    }


    if (getFlag_("best:strict"))
    {
      LOG_INFO << "Filtering by best peptide hits..." << endl;
      IDFilter::keepBestPeptideHits(peptides, true);
    }


    // the range is parsed into signed values so that negative bounds can be
    // rejected explicitly before converting to the unsigned "Size":
    Int min_length = 0;
    Int max_length = 0;
    if (parseRange_(getStringOption_("length"), min_length, max_length))
    {
      LOG_INFO << "Filtering by peptide length..." << endl;
      if ((min_length < 0) || (max_length < 0))
      {
        LOG_ERROR << "Fatal error: negative values are not allowed for parameter 'length'" << endl;
        return ILLEGAL_PARAMETERS;
      }
      IDFilter::filterPeptidesByLength(peptides, Size(min_length),
                                       Size(max_length));
    }

    // Filter by digestion enzyme product

    String protein_fasta = getStringOption_("digest:fasta").trim();
    if (!protein_fasta.empty())
    {
      LOG_INFO << "Filtering peptides by digested protein (FASTA input)..." << endl;
      // load protein sequences from FASTA file:
      vector<FASTAFile::FASTAEntry> fasta;
      FASTAFile().load(protein_fasta, fasta);

      // Configure Enzymatic digestion
      EnzymaticDigestion digestion;
      String enzyme = getStringOption_("digest:enzyme").trim();
      if (!enzyme.empty())
      {
        digestion.setEnzyme(enzyme);
      }

      String specificity = getStringOption_("digest:specificity").trim();
      if (!specificity.empty())
      {
        digestion.setSpecificity(digestion.getSpecificityByName(specificity));
      }

      // a value of -1 (or lower) means "do not check missed cleavages":
      Int missed_cleavages = getIntOption_("digest:missed_cleavages");
      bool ignore_missed_cleavages = true;
      if (missed_cleavages > -1)
      {
        ignore_missed_cleavages = false;
        // warn only when the specificity is NOT full, as the message states
        // (the check used to be inverted and warned for full specificity):
        if (digestion.getSpecificity() != EnzymaticDigestion::SPEC_FULL)
        {
          LOG_WARN << "Specificity not full, missed_cleavages option is redundant" << endl;
        }
        digestion.setMissedCleavages(missed_cleavages);
      }

      bool methionine_cleavage = getFlag_("digest:methionine_cleavage");

      // Build the digest filter function
      IDFilter::DigestionFilter filter(fasta,
                                       digestion,
                                       ignore_missed_cleavages,
                                       methionine_cleavage);
      // Filter peptides
      filter.filterPeptideEvidences(peptides);
    }


    if (getFlag_("var_mods"))
    {
      LOG_INFO << "Filtering for variable modifications..." << endl;
      // gather possible variable modifications from search parameters:
      set<String> var_mods;
      for (vector<ProteinIdentification>::iterator prot_it = proteins.begin();
           prot_it != proteins.end(); ++prot_it)
      {
        const ProteinIdentification::SearchParameters& params =
          prot_it->getSearchParameters();
        for (vector<String>::const_iterator mod_it =
               params.variable_modifications.begin(); mod_it !=
               params.variable_modifications.end(); ++mod_it)
        {
          var_mods.insert(*mod_it);
        }
      }
      IDFilter::keepPeptidesWithMatchingModifications(peptides, var_mods);
    }

    double pep_score = getDoubleOption_("score:pep");
    // @TODO: what if 0 is a reasonable cut-off for some score?
    if (pep_score != 0)
    {
      LOG_INFO << "Filtering by peptide score..." << endl;
      IDFilter::filterHitsByScore(peptides, pep_score);
    }

    Int min_charge = numeric_limits<Int>::min();
    Int max_charge = numeric_limits<Int>::max();
    if (parseRange_(getStringOption_("charge"), min_charge, max_charge))
    {
      LOG_INFO << "Filtering by peptide charge..." << endl;
      IDFilter::filterPeptidesByCharge(peptides, min_charge, max_charge);
    }

    // Keep the option value signed until it has been checked: assigning a
    // negative Int directly to Size would wrap around to a huge value and
    // wrongly pass the "> 0" test.
    Int best_n_pep = getIntOption_("best:n_peptide_hits");
    if (best_n_pep > 0)
    {
      LOG_INFO << "Filtering by best n peptide hits..." << endl;
      IDFilter::keepNBestHits(peptides, Size(best_n_pep));
    }

    Int min_rank = 0;
    Int max_rank = 0;
    if (parseRange_(getStringOption_("best:n_to_m_peptide_hits"), min_rank,
                    max_rank))
    {
      LOG_INFO << "Filtering by peptide hit ranks..." << endl;
      if ((min_rank < 0) || (max_rank < 0))
      {
        LOG_ERROR << "Fatal error: negative values are not allowed for parameter 'best:n_to_m_peptide_hits'" << endl;
        return ILLEGAL_PARAMETERS;
      }
      IDFilter::filterHitsByRank(peptides, Size(min_rank), Size(max_rank));
    }

    double mz_error = getDoubleOption_("mz:error");
    if (mz_error > 0)
    {
      LOG_INFO << "Filtering by mass error..." << endl;
      bool unit_ppm = (getStringOption_("mz:unit") == "ppm");
      IDFilter::filterPeptidesByMZError(peptides, mz_error, unit_ppm);
    }


    // Filtering protein identifications according to set criteria

    double protein_significance = getDoubleOption_("thresh:prot");
    if (protein_significance > 0)
    {
      LOG_INFO << "Filtering by protein significance threshold..." << endl;
      IDFilter::filterHitsBySignificance(proteins, protein_significance);
    }

    double prot_score = getDoubleOption_("score:prot");
    // @TODO: what if 0 is a reasonable cut-off for some score?
    if (prot_score != 0)
    {
      LOG_INFO << "Filtering by protein score..." << endl;
      IDFilter::filterHitsByScore(proteins, prot_score);
    }

    // signed until checked, for the same wrap-around reason as for
    // "best:n_peptide_hits" above:
    Int best_n_prot = getIntOption_("best:n_protein_hits");
    if (best_n_prot > 0)
    {
      LOG_INFO << "Filtering by best n protein hits..." << endl;
      IDFilter::keepNBestHits(proteins, Size(best_n_prot));
    }

    if (getFlag_("remove_decoys"))
    {
      LOG_INFO << "Removing decoy hits..." << endl;
      IDFilter::removeDecoyHits(peptides);
      IDFilter::removeDecoyHits(proteins);
    }


    // Clean-up:

    if (!getFlag_("keep_unreferenced_protein_hits"))
    {
      LOG_INFO << "Removing unreferenced protein hits..." << endl;
      IDFilter::removeUnreferencedProteins(proteins, peptides);
    }

    IDFilter::updateHitRanks(proteins);
    IDFilter::updateHitRanks(peptides);

    // remove non-existent protein references from peptides (and optionally:
    // remove peptides with no proteins):
    bool rm_pep = getFlag_("delete_unreferenced_peptide_hits");
    if (rm_pep) LOG_INFO << "Removing peptide hits without protein references..." << endl;
    IDFilter::updateProteinReferences(peptides, proteins, rm_pep);

    IDFilter::removeEmptyIdentifications(peptides);
    // we want to keep "empty" protein IDs because they contain search meta data

    // update protein groupings if necessary:
    for (vector<ProteinIdentification>::iterator prot_it = proteins.begin();
         prot_it != proteins.end(); ++prot_it)
    {
      bool valid = IDFilter::updateProteinGroups(prot_it->getProteinGroups(),
                                                 prot_it->getHits());
      if (!valid)
      {
        LOG_WARN << "Warning: While updating protein groups, some proteins were removed from groups that are still present. The new grouping (especially the group probabilities) may not be completely valid any more." << endl;
      }

      valid = IDFilter::updateProteinGroups(
        prot_it->getIndistinguishableProteins(), prot_it->getHits());
      if (!valid)
      {
        LOG_WARN << "Warning: While updating indistinguishable proteins, some proteins were removed from groups that are still present. The new grouping (especially the group probabilities) may not be completely valid any more." << endl;
      }
    }

    // some stats
    LOG_INFO << "Before filtering:\n"
             << n_prot_ids << " protein identification(s) with "
             << n_prot_hits << " protein hit(s),\n"
             << n_pep_ids << " peptide identification(s) with "
             << n_pep_hits << " peptides hit(s).\n"
             << "After filtering:\n"
             << proteins.size() << " protein identification(s) with "
             << IDFilter::countHits(proteins) << " protein hit(s),\n"
             << peptides.size() << " peptide identification(s) with "
             << IDFilter::countHits(peptides) << " peptides hit(s)." << endl;

    IdXMLFile().store(outputfile_name, proteins, peptides);

    return EXECUTION_OK;
  }
START_SECTION([EXTRA] ~EnzymaticDigestion())
    // Destructor check: destroys the object behind e_ptr. The pointer is
    // declared and allocated outside this section (presumably by the
    // constructor test that precedes it -- TODO confirm).
    delete e_ptr;
END_SECTION

START_SECTION((EnzymaticDigestion(const EnzymaticDigestion &rhs)))
    // Copy-constructor check: set explicit values for all three properties
    // on the source object so the comparison below is against known settings.
    EnzymaticDigestion ed;
    ed.setMissedCleavages(1234);
    ed.setEnzyme("no cleavage");
    ed.setSpecificity(EnzymaticDigestion::SPEC_SEMI);
    
    // direct initialization -> invokes the copy constructor
    EnzymaticDigestion ed2(ed);
    
    // the copy must report the same settings as the source
    TEST_EQUAL(ed.getMissedCleavages(), ed2.getMissedCleavages());
    TEST_EQUAL(ed.getEnzymeName(), ed2.getEnzymeName());
    TEST_EQUAL(ed.getSpecificity(), ed2.getSpecificity());

END_SECTION

START_SECTION((EnzymaticDigestion & operator=(const EnzymaticDigestion &rhs)))
    EnzymaticDigestion ed;
    ed.setMissedCleavages(1234);
    ed.setEnzyme("no cleavage");
    ed.setSpecificity(EnzymaticDigestion::SPEC_SEMI);
    
    EnzymaticDigestion ed2 = ed;
    
    TEST_EQUAL(ed.getMissedCleavages(), ed2.getMissedCleavages());
    TEST_EQUAL(ed.getEnzymeName(), ed2.getEnzymeName());
    TEST_EQUAL(ed.getSpecificity(), ed2.getSpecificity());