コード例 #1
0
  void IDDecoyProbability::apply_(vector<PeptideIdentification> & ids, const vector<double> & rev_scores, const vector<double> & fwd_scores, const vector<double> & all_scores)
  {
    Size number_of_bins(param_.getValue("number_of_bins"));



    // normalize distribution to [0, 1]
    vector<double> fwd_scores_normalized(number_of_bins, 0.0), rev_scores_normalized(number_of_bins, 0.0), diff_scores(number_of_bins, 0.0), all_scores_normalized(number_of_bins, 0.0);
    Transformation_ rev_trafo, fwd_trafo, all_trafo;
    normalizeBins_(rev_scores, rev_scores_normalized, rev_trafo);
    normalizeBins_(fwd_scores, fwd_scores_normalized, fwd_trafo);
    normalizeBins_(all_scores, all_scores_normalized, all_trafo);

    // rev scores fitting
    vector<DPosition<2> > rev_data;

    for (Size i = 0; i < number_of_bins; ++i)
    {
      DPosition<2> pos;
      pos.setX(((double)i) / (double)number_of_bins + 0.0001);    // necessary????
      pos.setY(rev_scores_normalized[i]);
      rev_data.push_back(pos);
#ifdef IDDECOYPROBABILITY_DEBUG
      cerr << pos.getX() << " " << pos.getY() << endl;
#endif
    }

    Math::GammaDistributionFitter gdf;
    Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma_1st (1.0, 3.0);
    gdf.setInitialParameters(result_gamma_1st);
    // TODO heuristic for good start parameters
    Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma = gdf.fit(rev_data);

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << gdf.getGnuplotFormula() << endl;
    String rev_filename = param_.getValue("rev_filename");
    generateDistributionImage_(rev_scores_normalized, gdf.getGnuplotFormula(), rev_filename);
#endif

    // generate diffs of distributions
    // get the fwd and rev distribution, apply all_trafo and calculate the diff
    vector<Size> fwd_bins(number_of_bins, 0), rev_bins(number_of_bins, 0);
    double min(all_trafo.min_score), diff(all_trafo.diff_score);
    Size max_bin(0);
    for (vector<double>::const_iterator it = fwd_scores.begin(); it != fwd_scores.end(); ++it)
    {
      Size bin = (Size)((*it - min) / diff * (double)(number_of_bins - 1));
      ++fwd_bins[bin];
      if (fwd_bins[bin] > max_bin)
      {
        max_bin = fwd_bins[bin];
      }
    }

    Size max_reverse_bin(0), max_reverse_bin_value(0);
    //min = rev_trafo.min_score;
    //diff = rev_trafo.diff_score;
    for (vector<double>::const_iterator it = rev_scores.begin(); it != rev_scores.end(); ++it)
    {
      Size bin = (Size)((*it - min) / diff * (double)number_of_bins);
      ++rev_bins[bin];
      if (rev_bins[bin] > max_bin)
      {
        max_bin = rev_bins[bin];
      }
      if (rev_bins[bin] > max_reverse_bin_value)
      {
        max_reverse_bin = bin;
        max_reverse_bin_value = rev_bins[bin];
      }
    }

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Trying to get diff scores" << endl;
#endif

    // get diff of fwd and rev
    for (Size i = 0; i < number_of_bins; ++i)
    {
      Size fwd(0), rev(0);
      fwd = fwd_bins[i];
      rev = rev_bins[i];
      if ((double)fwd > (double)(1.3 * rev) && max_reverse_bin < i)
      {
        diff_scores[i] = (double)(fwd - rev) / (double)max_bin;
      }
      else
      {
        diff_scores[i] = 0.0;
      }
    }
#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Gauss Fitting values size of diff scores=" << diff_scores.size() << endl;
#endif
    // diff scores fitting
    vector<DPosition<2> > diff_data;
    double gauss_A(0), gauss_x0(0), norm_factor(0);
    for (Size i = 0; i < number_of_bins; ++i)
    {
      DPosition<2> pos;
      pos.setX((double)i / (double)number_of_bins);
      pos.setY(diff_scores[i]);

      if (pos.getY() > gauss_A)
      {
        gauss_A = pos.getY();
      }
      gauss_x0 += pos.getX() * pos.getY();
      norm_factor += pos.getY();


      diff_data.push_back(pos);
    }

    double gauss_sigma(0);
    gauss_x0 /= (double)diff_data.size();
    gauss_x0 /= norm_factor;

    for (Size i = 0; i <= number_of_bins; ++i)
    {
      gauss_sigma += fabs(gauss_x0 - (double)i / (double)number_of_bins);
    }

    gauss_sigma /= (double)diff_data.size();



#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "setting initial parameters: " << endl;
#endif
    Math::GaussFitter gf;
    Math::GaussFitter::GaussFitResult result_1st(gauss_A, gauss_x0, gauss_sigma);
    gf.setInitialParameters(result_1st);
#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Initial Gauss guess: A=" << gauss_A << ", x0=" << gauss_x0 << ", sigma=" << gauss_sigma << endl;
#endif

    //TODO: fail-to-fit correction was done using the GNUPlotFormula. Seemed to be a hack.
    //Changed it to try-catch-block but I am not sure if this correction should be made
    //at all. Can someone please verify?
    Math::GaussFitter::GaussFitResult result_gauss (gauss_A, gauss_x0, gauss_sigma);
    try{
        result_gauss = gf.fit(diff_data);
    }
    catch(Exception::UnableToFit& /* e */)
    {
      result_gauss.A = gauss_A;
      result_gauss.x0 = gauss_x0;
      result_gauss.sigma = gauss_sigma;
    }

//    // fit failed?
//    if (gf.getGnuplotFormula() == "")
//    {
//      result_gauss.A = gauss_A;
//      result_gauss.x0 = gauss_x0;
//      result_gauss.sigma = gauss_sigma;
//    }

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << gf.getGnuplotFormula() << endl;
    String fwd_filename = param_.getValue("fwd_filename");
    if (gf.getGnuplotFormula() == "")
    {
      String formula("f(x)=" + String(gauss_A) + " * exp(-(x - " + String(gauss_x0) + ") ** 2 / 2 / (" + String(gauss_sigma) + ") ** 2)");
      generateDistributionImage_(diff_scores, formula, fwd_filename);
    }
    else
    {
      generateDistributionImage_(diff_scores, gf.getGnuplotFormula(), fwd_filename);
    }
#endif

#ifdef IDDECOYPROBABILITY_DEBUG
    //all_trafo.diff_score + all_trafo.min_score
    String gauss_formula("f(x)=" + String(result_gauss.A / all_trafo.max_intensity) + " * exp(-(x - " + String(result_gauss.x0 * all_trafo.diff_score + all_trafo.min_score) + ") ** 2 / 2 / (" + String(result_gauss.sigma * all_trafo.diff_score)   + ") ** 2)");

    String b_str(result_gamma.b), p_str(result_gamma.p);
    String gamma_formula = "g(x)=(" + b_str + " ** " + p_str + ") / gamma(" + p_str + ") * x ** (" + p_str + " - 1) * exp(- " + b_str + " * x)";

    generateDistributionImage_(all_scores_normalized, all_trafo, gauss_formula, gamma_formula, (String)param_.getValue("fwd_filename"));
#endif

    vector<PeptideIdentification> new_prob_ids;
    // calculate the probabilities and write them to the IDs
    for (vector<PeptideIdentification>::const_iterator it = ids.begin(); it != ids.end(); ++it)
    {
      if (it->getHits().size() > 0)
      {
        vector<PeptideHit> hits;
        String score_type = it->getScoreType() + "_score";
        for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
        {
          PeptideHit hit = *pit;
          double score = hit.getScore();
          if (!it->isHigherScoreBetter())
          {
            score = -log10(score);
          }
          hit.setMetaValue(score_type, hit.getScore());
          hit.setScore(getProbability_(result_gamma, rev_trafo, result_gauss, fwd_trafo, score));
          hits.push_back(hit);
        }
        PeptideIdentification id = *it;
        id.setHigherScoreBetter(true);
        id.setScoreType(id.getScoreType() + "_DecoyProbability");
        id.setHits(hits);

        new_prob_ids.push_back(id);
      }
    }
    ids = new_prob_ids;
  }
コード例 #2
0
  ExitCodes main_(int, const char**)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------

    StringList in_spec = getStringList_("in");
    StringList out = getStringList_("out");
    String in_lib = getStringOption_("lib");
    String compare_function = getStringOption_("compare_function");
    Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer");
    float precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance");
    //Int min_precursor_charge = getIntOption_("min_precursor_charge");
    //Int max_precursor_charge = getIntOption_("max_precursor_charge");
    float remove_peaks_below_threshold = getDoubleOption_("filter:remove_peaks_below_threshold");
    UInt min_peaks = getIntOption_("filter:min_peaks");
    UInt max_peaks = getIntOption_("filter:max_peaks");
    Int cut_peaks_below = getIntOption_("filter:cut_peaks_below");
    StringList fixed_modifications = getStringList_("fixed_modifications");
    StringList variable_modifications = getStringList_("variable_modifications");
    Int top_hits  = getIntOption_("top_hits");
    if (top_hits < -1)
    {
      writeLog_("top_hits (should be  >= -1 )");
      return ILLEGAL_PARAMETERS;
    }
    //-------------------------------------------------------------
    // loading input
    //-------------------------------------------------------------
    if (out.size() != in_spec.size())
    {
      writeLog_("out (should be as many as input files)");
      return ILLEGAL_PARAMETERS;
    }

    time_t prog_time = time(NULL);
    MSPFile spectral_library;
    RichPeakMap query, library;
    //spectrum which will be identified
    MzMLFile spectra;
    spectra.setLogType(log_type_);

    time_t start_build_time = time(NULL);
    //-------------------------------------------------------------
    //building map for faster search
    //-------------------------------------------------------------

    //library containing already identified peptide spectra
    vector<PeptideIdentification> ids;
    spectral_library.load(in_lib, ids, library);

    map<Size, vector<PeakSpectrum> > MSLibrary;
    {
      RichPeakMap::iterator s;
      vector<PeptideIdentification>::iterator i;
      ModificationsDB* mdb = ModificationsDB::getInstance();
      for (s = library.begin(), i = ids.begin(); s < library.end(); ++s, ++i)
      {
        double precursor_MZ = (*s).getPrecursors()[0].getMZ();
        Size MZ_multi = (Size)precursor_MZ * precursor_mass_multiplier;
        map<Size, vector<PeakSpectrum> >::iterator found;
        found = MSLibrary.find(MZ_multi);

        PeakSpectrum librar;
        bool variable_modifications_ok = true;
        bool fixed_modifications_ok = true;
        const AASequence& aaseq = i->getHits()[0].getSequence();
        //variable fixed modifications
        if (!fixed_modifications.empty())
        {
          for (Size i = 0; i < aaseq.size(); ++i)
          {
            const   Residue& mod  = aaseq.getResidue(i);
            for (Size s = 0; s < fixed_modifications.size(); ++s)
            {
              if (mod.getOneLetterCode() == mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification())
              {
                fixed_modifications_ok = false;
                break;
              }
            }
          }
        }
        //variable modifications
        if (aaseq.isModified() && (!variable_modifications.empty()))
        {
          for (Size i = 0; i < aaseq.size(); ++i)
          {
            if (aaseq.isModified(i))
            {
              const   Residue& mod  = aaseq.getResidue(i);
              for (Size s = 0; s < variable_modifications.size(); ++s)
              {
                if (mod.getOneLetterCode() == mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification())
                {
                  variable_modifications_ok = false;
                  break;
                }
              }
            }
          }
        }
        if (variable_modifications_ok && fixed_modifications_ok)
        {
          PeptideIdentification& translocate_pid = *i;
          librar.getPeptideIdentifications().push_back(translocate_pid);
          librar.setPrecursors(s->getPrecursors());
          //library entry transformation
          for (UInt l = 0; l < s->size(); ++l)
          {
            Peak1D peak;
            if ((*s)[l].getIntensity() >  remove_peaks_below_threshold)
            {
              const String& info = (*s)[l].getMetaValue("MSPPeakInfo");
              if (info[0] == '?')
              {
                peak.setIntensity(sqrt(0.2 * (*s)[l].getIntensity()));
              }
              else
              {
                peak.setIntensity(sqrt((*s)[l].getIntensity()));
              }

              peak.setMZ((*s)[l].getMZ());
              peak.setPosition((*s)[l].getPosition());
              librar.push_back(peak);
            }
          }
          if (found != MSLibrary.end())
          {
            found->second.push_back(librar);
          }
          else
          {
            vector<PeakSpectrum> tmp;
            tmp.push_back(librar);
            MSLibrary.insert(make_pair(MZ_multi, tmp));
          }
        }
      }
    }
    time_t end_build_time = time(NULL);
    cout << "Time needed for preprocessing data: " << (end_build_time - start_build_time) << "\n";
    //compare function
    PeakSpectrumCompareFunctor* comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function);
    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------
    double score;
    StringList::iterator in, out_file;
    for (in  = in_spec.begin(), out_file  = out.begin(); in < in_spec.end(); ++in, ++out_file)
    {
      time_t start_time = time(NULL);
      spectra.load(*in, query);
      //Will hold valuable hits
      vector<PeptideIdentification> peptide_ids;
      vector<ProteinIdentification> protein_ids;
      // Write parameters to ProteinIdentifcation
      ProteinIdentification prot_id;
      //Parameters of identificaion
      prot_id.setIdentifier("test");
      prot_id.setSearchEngineVersion("SpecLibSearcher");
      prot_id.setDateTime(DateTime::now());
      prot_id.setScoreType(compare_function);
      ProteinIdentification::SearchParameters searchparam;
      searchparam.precursor_tolerance = precursor_mass_tolerance;
      prot_id.setSearchParameters(searchparam);
      /***********SEARCH**********/
      for (UInt j = 0; j < query.size(); ++j)
      {
        //Set identifier for each identifications
        PeptideIdentification pid;
        pid.setIdentifier("test");
        pid.setScoreType(compare_function);
        ProteinHit pr_hit;
        pr_hit.setAccession(j);
        prot_id.insertHit(pr_hit);
        //RichPeak1D to Peak1D transformation for the compare function query
        PeakSpectrum quer;
        bool peak_ok = true;
        query[j].sortByIntensity(true);
        double min_high_intensity = 0;

        if (query[j].empty() || query[j].getMSLevel() != 2)
        {
          continue;
        }
        if (query[j].getPrecursors().empty())
        {
          writeLog_("Warning MS2 spectrum without precursor information");
          continue;
        }

        min_high_intensity = (1 / cut_peaks_below) * query[j][0].getIntensity();

        query[j].sortByPosition();
        for (UInt k = 0; k < query[j].size() && k < max_peaks; ++k)
        {
          if (query[j][k].getIntensity() >  remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity)
          {
            Peak1D peak;
            peak.setIntensity(sqrt(query[j][k].getIntensity()));
            peak.setMZ(query[j][k].getMZ());
            peak.setPosition(query[j][k].getPosition());
            quer.push_back(peak);
          }
        }
        if (quer.size() >= min_peaks)
        {
          peak_ok = true;
        }
        else
        {
          peak_ok = false;
        }
        double query_MZ = query[j].getPrecursors()[0].getMZ();
        if (peak_ok)
        {
          bool charge_one = false;
          Int percent = (Int) Math::round((query[j].size() / 100.0) * 3.0);
          Int margin  = (Int) Math::round((query[j].size() / 100.0) * 1.0);
          for (vector<RichPeak1D>::iterator peak = query[j].end() - 1; percent >= 0; --peak, --percent)
          {
            if (peak->getMZ() < query_MZ)
            {
              break;
            }
          }
          if (percent > margin)
          {
            charge_one = true;
          }
          float min_MZ = (query_MZ - precursor_mass_tolerance) * precursor_mass_multiplier;
          float max_MZ = (query_MZ + precursor_mass_tolerance) * precursor_mass_multiplier;
          for (Size mz = (Size)min_MZ; mz <= ((Size)max_MZ) + 1; ++mz)
          {
            map<Size, vector<PeakSpectrum> >::iterator found;
            found = MSLibrary.find(mz);
            if (found != MSLibrary.end())
            {
              vector<PeakSpectrum>& library = found->second;
              for (Size i = 0; i < library.size(); ++i)
              {
                float this_MZ  = library[i].getPrecursors()[0].getMZ() * precursor_mass_multiplier;
                if (this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) || charge_one == false))
                {
                  PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0];
                  PeakSpectrum& librar = library[i];
                  //Special treatment for SpectraST score as it computes a score based on the whole library
                  if (compare_function == "SpectraSTSimilarityScore")
                  {
                    SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor);
                    BinnedSpectrum quer_bin = sp->transform(quer);
                    BinnedSpectrum librar_bin = sp->transform(librar);
                    score = (*sp)(quer, librar); //(*sp)(quer_bin,librar_bin);
                    double dot_bias = sp->dot_bias(quer_bin, librar_bin, score);
                    hit.setMetaValue("DOTBIAS", dot_bias);
                  }
                  else
                  {
                    score = (*comparor)(quer, librar);
                  }

                  DataValue RT(library[i].getRT());
                  DataValue MZ(library[i].getPrecursors()[0].getMZ());
                  hit.setMetaValue("RT", RT);
                  hit.setMetaValue("MZ", MZ);
                  hit.setScore(score);
                  PeptideEvidence pe;
                  pe.setProteinAccession(pr_hit.getAccession());
                  hit.addPeptideEvidence(pe);
                  pid.insertHit(hit);
                }
              }
            }
          }
        }
        pid.setHigherScoreBetter(true);
        pid.sort();
        if (compare_function == "SpectraSTSimilarityScore")
        {
          if (!pid.empty() && !pid.getHits().empty())
          {
            vector<PeptideHit> final_hits;
            final_hits.resize(pid.getHits().size());
            SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor);
            Size runner_up = 1;
            for (; runner_up < pid.getHits().size(); ++runner_up)
            {
              if (pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString() || runner_up > 5)
              {
                break;
              }
            }
            double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore());
            for (Size s = 0; s < pid.getHits().size(); ++s)
            {
              final_hits[s] = pid.getHits()[s];
              final_hits[s].setMetaValue("delta D", delta_D);
              final_hits[s].setMetaValue("dot product", pid.getHits()[s].getScore());
              final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(), delta_D, pid.getHits()[s].getMetaValue("DOTBIAS")));

              //final_hits[s].removeMetaValue("DOTBIAS");
            }
            pid.setHits(final_hits);
            pid.sort();
            pid.setMZ(query[j].getPrecursors()[0].getMZ());
            pid.setRT(query_MZ);
          }
        }
        if (top_hits != -1 && (UInt)top_hits < pid.getHits().size())
        {
          vector<PeptideHit> hits;
          hits.resize(top_hits);
          for (Size i = 0; i < (UInt)top_hits; ++i)
          {
            hits[i] = pid.getHits()[i];
          }
          pid.setHits(hits);
        }
        peptide_ids.push_back(pid);
      }
      protein_ids.push_back(prot_id);
      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------
      IdXMLFile id_xml_file;
      id_xml_file.store(*out_file, protein_ids, peptide_ids);
      time_t end_time = time(NULL);
      cout << "Search time: " << difftime(end_time, start_time) << " seconds for " << *in << "\n";
    }
    time_t end_time = time(NULL);
    cout << "Total time: " << difftime(end_time, prog_time) << " secconds\n";
    return EXECUTION_OK;
  }