void InternalCalibration::calibrateMapGlobally(const FeatureMap<> & feature_map, FeatureMap<> & calibrated_feature_map, std::vector<PeptideIdentification> & ref_ids, String trafo_file_name)
  {
    checkReferenceIds_(ref_ids);

    calibrated_feature_map = feature_map;
    // clear the ids
    for (Size f = 0; f < calibrated_feature_map.size(); ++f)
    {
      calibrated_feature_map[f].getPeptideIdentifications().clear();
    }

    // map the reference ids onto the features
    IDMapper mapper;
    Param param;
    param.setValue("rt_tolerance", (DoubleReal)param_.getValue("rt_tolerance"));
    param.setValue("mz_tolerance", param_.getValue("mz_tolerance"));
    param.setValue("mz_measure", param_.getValue("mz_tolerance_unit"));
    mapper.setParameters(param);
    std::vector<ProteinIdentification> vec;
    mapper.annotate(calibrated_feature_map, ref_ids, vec);

    // calibrate
    calibrateMapGlobally(calibrated_feature_map, calibrated_feature_map, trafo_file_name);

    // copy the old ids
    calibrated_feature_map.setUnassignedPeptideIdentifications(feature_map.getUnassignedPeptideIdentifications());
    for (Size f = 0; f < feature_map.size(); ++f)
    {
      calibrated_feature_map[f].getPeptideIdentifications().clear();
      if (!feature_map[f].getPeptideIdentifications().empty())
      {
        calibrated_feature_map[f].setPeptideIdentifications(feature_map[f].getPeptideIdentifications());
      }
    }
  }
  /// Tool entry point: trains an SvmTheoreticalSpectrumGeneratorTrainer model.
  ///
  /// Reads the options "in_spectra", "in_identifications", "model_output_file" and
  /// "precursor_charge", loads the spectra (mzML) and identifications (idXML), maps
  /// the identifications onto the spectra with IDMapper, collects the sequence of the
  /// first hit of the first identification of every spectrum and passes everything to
  /// SvmTheoreticalSpectrumGeneratorTrainer::trainModel().
  ///
  /// @return EXECUTION_OK on success; INCOMPATIBLE_INPUT_DATA if a spectrum has no
  ///         mapped identification or the first identification has no hit.
  ExitCodes main_(int, const char **)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    String in_spectra = getStringOption_("in_spectra");
    String in_identifications = getStringOption_("in_identifications");
    String outfile = getStringOption_("model_output_file");
    Int precursor_charge = getIntOption_("precursor_charge");

    //-------------------------------------------------------------
    // init SvmTheoreticalSpectrumGeneratorTrainer
    //-------------------------------------------------------------
    SvmTheoreticalSpectrumGeneratorTrainer trainer;

    // the trainer takes the "algorithm:" subsection plus the write_training_files flag
    Param param = getParam_().copy("algorithm:", true);
    String write_files = getFlag_("write_training_files") ? "true" : "false";
    param.setValue("write_training_files", write_files);
    trainer.setParameters(param);

    //-------------------------------------------------------------
    // loading input
    //-------------------------------------------------------------
    PeakMap map;
    MzMLFile().load(in_spectra, map);

    std::vector<PeptideIdentification> pep_ids;
    std::vector<ProteinIdentification> prot_ids;
    String tmp_str;
    IdXMLFile().load(in_identifications, prot_ids, pep_ids, tmp_str);

    // map the identifications onto the spectra using fixed, tight tolerances
    IDMapper idmapper;
    Param par;
    par.setValue("rt_tolerance", 0.001);
    par.setValue("mz_tolerance", 0.001);
    idmapper.setParameters(par);
    idmapper.annotate(map, pep_ids, prot_ids);

    // generate vector of annotations: one sequence per spectrum, in map order
    std::vector<AASequence> annotations;
    annotations.reserve(map.size());
    for (PeakMap::iterator it = map.begin(); it != map.end(); ++it)
    {
      const std::vector<PeptideIdentification> & ids = it->getPeptideIdentifications();
      // Indexing [0] on an empty id or hit list would be an out-of-bounds access,
      // so reject input in which a spectrum ended up without an annotated hit.
      if (ids.empty() || ids[0].getHits().empty())
      {
        writeLog_("Error: found a spectrum without an annotated peptide hit. Every spectrum in 'in_spectra' needs a matching identification with at least one hit in 'in_identifications'.");
        return INCOMPATIBLE_INPUT_DATA;
      }
      annotations.push_back(ids[0].getHits()[0].getSequence());
    }

    trainer.trainModel(map, annotations, outfile, precursor_charge);
    return EXECUTION_OK;
  }
Example #3
0
  /// Tool entry point: maps peptide/protein identifications onto a quantitation file.
  ///
  /// The identifications are read from the file named by option "id" (idXML or
  /// mzIdentML). Depending on the type of the file named by option "in"
  /// (consensusXML, featureXML or mzQuantML) the matching IDMapper::annotate()
  /// overload is applied and the result is stored to the file named by option "out".
  ///
  /// Note: if the type of "in" is none of the three handled types, nothing is
  /// written and EXECUTION_OK is still returned.
  ///
  /// @return EXECUTION_OK
  /// @throws Exception::IllegalArgument if the id file is neither idXML nor mzIdentML
  ExitCodes main_(int, const char**)
  {
    //----------------------------------------------------------------
    // read the identifications
    //----------------------------------------------------------------
    const String id_file = getStringOption_("id");
    vector<ProteinIdentification> protein_ids;
    vector<PeptideIdentification> peptide_ids;

    const FileTypes::Type id_type = FileHandler::getType(id_file);
    switch (id_type)
    {
      case FileTypes::IDXML:
      {
        IdXMLFile().load(id_file, protein_ids, peptide_ids);
        break;
      }

      case FileTypes::MZIDENTML:
      {
        MzIdentMLFile().load(id_file, protein_ids, peptide_ids);
        break;
      }

      default:
      {
        throw Exception::IllegalArgument(__FILE__, __LINE__,
                                         __PRETTY_FUNCTION__,
                                         "wrong id fileformat");
      }
    }

    const String input_file = getStringOption_("in");
    const String output_file = getStringOption_("out");
    const FileTypes::Type in_type = FileHandler::getType(input_file);

    //----------------------------------------------------------------
    // set up the mapper: defaults overridden by the tool options
    //----------------------------------------------------------------
    IDMapper mapper;
    Param mapper_params = mapper.getParameters();
    mapper_params.setValue("rt_tolerance", getDoubleOption_("rt_tolerance"));
    mapper_params.setValue("mz_tolerance", getDoubleOption_("mz_tolerance"));
    mapper_params.setValue("mz_measure", getStringOption_("mz_measure"));
    mapper_params.setValue("mz_reference", getStringOption_("mz_reference"));
    mapper_params.setValue("ignore_charge", getFlag_("ignore_charge") ? "true" : "false");
    mapper.setParameters(mapper_params);

    //----------------------------------------------------------------
    // dispatch on the input type (the three cases are mutually exclusive)
    //----------------------------------------------------------------
    if (in_type == FileTypes::CONSENSUSXML)
    {
      ConsensusXMLFile consensus_file;
      ConsensusMap consensus_map;
      consensus_file.load(input_file, consensus_map);

      const bool measure_from_subelements = getFlag_("consensus:use_subelements");
      const bool annotate_ids_with_subelements = getFlag_("consensus:annotate_ids_with_subelements");
      mapper.annotate(consensus_map, peptide_ids, protein_ids, measure_from_subelements, annotate_ids_with_subelements);

      // record this processing step in the output
      addDataProcessing_(consensus_map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

      consensus_file.store(output_file, consensus_map);
    }
    else if (in_type == FileTypes::FEATUREXML)
    {
      FeatureMap feature_map;
      FeatureXMLFile feature_file;
      feature_file.load(input_file, feature_map);

      const bool use_centroid_rt = getFlag_("feature:use_centroid_rt");
      const bool use_centroid_mz = getFlag_("feature:use_centroid_mz");
      mapper.annotate(feature_map, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz);

      // record this processing step in the output
      addDataProcessing_(feature_map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

      feature_file.store(output_file, feature_map);
    }
    else if (in_type == FileTypes::MZQUANTML)
    {
      MSQuantifications quantifications;
      MzQuantMLFile quant_file;
      quant_file.load(input_file, quantifications);

      // every consensus map of the document is annotated with the same ids
      const bool measure_from_subelements = getFlag_("consensus:use_subelements");
      std::vector<ConsensusMap> & consensus_maps = quantifications.getConsensusMaps();
      for (std::vector<ConsensusMap>::iterator cm = consensus_maps.begin(); cm != consensus_maps.end(); ++cm)
      {
        mapper.annotate(*cm, peptide_ids, protein_ids, measure_from_subelements);
        // record this processing step in the output
        addDataProcessing_(*cm, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));
      }

      quant_file.store(output_file, quantifications);
    }

    return EXECUTION_OK;
  }
  /// Tool entry point: computes precursor and fragment mass error statistics.
  ///
  /// For every pair of spectrum file ("in") and identification file ("id_in") the
  /// identifications are mapped onto the spectra. If "precursor_out" is set, the
  /// difference between experimental and theoretical precursor m/z of the first hit
  /// of every identification is collected; if "fragment_out" is set, theoretical
  /// b/y ions are aligned to the (normalized) spectra and the differences of the
  /// matched peaks are collected. For each requested output the differences are
  /// written to file, summary statistics are reported via writeDebug_(), a Gaussian
  /// fit of the histogram is attempted and, if requested, gnuplot files are written.
  ///
  /// If no mass differences could be collected for a requested output, the output
  /// file is still created (empty), a log message is written and the statistics for
  /// that output are skipped.
  ///
  /// @return ILLEGAL_PARAMETERS if the number of "in" and "id_in" files differs,
  ///         EXECUTION_OK otherwise.
  ExitCodes main_(int, const char **)
  {
    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------

    StringList id_in(getStringList_("id_in"));
    StringList in_raw(getStringList_("in"));
    Size number_of_bins((UInt)getIntOption_("number_of_bins"));
    bool precursor_error_ppm(getFlag_("precursor_error_ppm"));
    bool fragment_error_ppm(getFlag_("fragment_error_ppm"));
    bool generate_gnuplot_scripts(DataValue(getStringOption_("generate_gnuplot_scripts")).toBool());

    // spectrum files and identification files are processed pairwise by index
    if (in_raw.size() != id_in.size())
    {
      writeLog_("Number of spectrum files and identification files differs...");
      return ILLEGAL_PARAMETERS;
    }

    //-------------------------------------------------------------
    // reading input
    //-------------------------------------------------------------

    vector<vector<PeptideIdentification> > pep_ids;
    vector<vector<ProteinIdentification> > prot_ids;
    pep_ids.resize(id_in.size());
    prot_ids.resize(id_in.size());

    IdXMLFile idxmlfile;
    for (Size i = 0; i != id_in.size(); ++i)
    {
      String doc_id;
      idxmlfile.load(id_in[i], prot_ids[i], pep_ids[i], doc_id);
    }

    // read mzML files
    vector<RichPeakMap> maps_raw;
    maps_raw.resize(in_raw.size());

    MzMLFile mzml_file;
    for (Size i = 0; i != in_raw.size(); ++i)
    {
      mzml_file.load(in_raw[i], maps_raw[i]);
    }

    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------

    // mapping ids (mapper is used with its default parameters)
    IDMapper mapper;
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      mapper.annotate(maps_raw[i], pep_ids[i], prot_ids[i]);
    }

    // normalize the spectra
    Normalizer normalizer;
    for (vector<RichPeakMap>::iterator it1 = maps_raw.begin(); it1 != maps_raw.end(); ++it1)
    {
      for (RichPeakMap::Iterator it2 = it1->begin(); it2 != it1->end(); ++it2)
      {
        normalizer.filterSpectrum(*it2);
      }
    }

    // generate precursor statistics
    vector<MassDifference> precursor_diffs;
    if (getStringOption_("precursor_out") != "")
    {
      for (Size i = 0; i != maps_raw.size(); ++i)
      {
        for (Size j = 0; j != maps_raw[i].size(); ++j)
        {
          if (maps_raw[i][j].getPeptideIdentifications().empty())
          {
            continue;
          }
          for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
          {
            if (it->getHits().size() > 0)
            {
              // only the first hit of each identification is evaluated
              PeptideHit hit = *it->getHits().begin();
              MassDifference md;
              Int charge = hit.getCharge();
              // a charge of 0 is treated as 1 to avoid dividing by zero below
              if (charge == 0)
              {
                charge = 1;
              }
              md.exp_mz = it->getMZ();
              md.theo_mz = (hit.getSequence().getMonoWeight() + (double)charge * Constants::PROTON_MASS_U) / (double)charge;
              md.charge = charge;
              precursor_diffs.push_back(md);
            }
          }
        }
      }
    }

    // generate fragment ions statistics
    vector<MassDifference> fragment_diffs;
    TheoreticalSpectrumGenerator tsg;
    SpectrumAlignment sa;
    double fragment_mass_tolerance(getDoubleOption_("fragment_mass_tolerance"));
    Param sa_param(sa.getParameters());
    sa_param.setValue("tolerance", fragment_mass_tolerance);
    sa.setParameters(sa_param);

    if (getStringOption_("fragment_out") != "")
    {
      for (Size i = 0; i != maps_raw.size(); ++i)
      {
        for (Size j = 0; j != maps_raw[i].size(); ++j)
        {
          if (maps_raw[i][j].getPeptideIdentifications().empty())
          {
            continue;
          }
          for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
          {
            if (it->getHits().size() > 0)
            {
              // only the first hit of each identification is evaluated
              PeptideHit hit = *it->getHits().begin();

              // theoretical spectrum made of y and b ions of the hit's sequence
              RichPeakSpectrum theo_spec;
              tsg.addPeaks(theo_spec, hit.getSequence(), Residue::YIon);
              tsg.addPeaks(theo_spec, hit.getSequence(), Residue::BIon);

              // pairs of (index in theo_spec, index in experimental spectrum)
              vector<pair<Size, Size> > pairs;
              sa.getSpectrumAlignment(pairs, theo_spec, maps_raw[i][j]);
              for (vector<pair<Size, Size> >::const_iterator pit = pairs.begin(); pit != pairs.end(); ++pit)
              {
                MassDifference md;
                md.exp_mz = maps_raw[i][j][pit->second].getMZ();
                md.theo_mz = theo_spec[pit->first].getMZ();
                md.intensity = maps_raw[i][j][pit->second].getIntensity();
                md.charge = hit.getCharge();
                fragment_diffs.push_back(md);
              }
            }
          }
        }
      }
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    String precursor_out_file(getStringOption_("precursor_out"));
    if (precursor_out_file != "")
    {
      vector<double> errors;
      ofstream precursor_out(precursor_out_file.c_str());
      // max_diff starts at the most negative finite double; numeric_limits<double>::min()
      // is the smallest *positive* value and would yield a wrong maximum if all
      // errors are negative.
      double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
      for (Size i = 0; i != precursor_diffs.size(); ++i)
      {
        double diff = getMassDifference(precursor_diffs[i].theo_mz, precursor_diffs[i].exp_mz, precursor_error_ppm);
        precursor_out << diff << "\n";
        errors.push_back(diff);

        if (diff > max_diff)
        {
          max_diff = diff;
        }
        if (diff < min_diff)
        {
          min_diff = diff;
        }
      }
      precursor_out.close();

      if (errors.empty())
      {
        // without values there is no range for the histogram and no median to read
        writeLog_("No precursor mass differences available, skipping precursor error statistics");
      }
      else
      {
        // fill histogram with the collected values
        // NOTE(review): if all errors are identical, bin_size is 0 - confirm that
        // Histogram handles this as intended.
        double bin_size = (max_diff - min_diff) / (double)number_of_bins;
        Histogram<double, double> hist(min_diff, max_diff, bin_size);
        for (Size i = 0; i != errors.size(); ++i)
        {
          hist.inc(errors[i], 1.0);
        }

        writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

        // transform the histogram into a vector<DPosition<2> > for the fitting
        vector<DPosition<2> > values;
        for (Size i = 0; i != hist.size(); ++i)
        {
          DPosition<2> p;
          p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
          p.setY(hist[i]);
          values.push_back(p);
        }

        double mean = Math::mean(errors.begin(), errors.end());
        double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
        double sdv = Math::sd(errors.begin(), errors.end(), mean);
        // the median needs sorted values; sort only after the order-independent statistics
        sort(errors.begin(), errors.end());
        double median = errors[(Size)(errors.size() / 2.0)];

        writeDebug_("Precursor mean error: " + String(mean), 1);
        writeDebug_("Precursor abs. dev.:  " + String(abs_dev), 1);
        writeDebug_("Precursor std. dev.:  " + String(sdv), 1);
        writeDebug_("Precursor median error:  " + String(median), 1);


        // calculate histogram for gauss fitting
        GaussFitter gf;
        GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv/500.0);
        gf.setInitialParameters(init_param);

        try
        {
          gf.fit(values);

          // write gnuplot scripts
          if (generate_gnuplot_scripts)
          {
            ofstream out(String(precursor_out_file + "_gnuplot.dat").c_str());
            for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
            {
              out << it->getX() << " " << it->getY() << endl;
            }
            out.close();

            // NOTE(review): the script plots f(x) but never defines it, and the result
            // of gf.fit() is not used here - confirm the script is usable as written.
            ofstream gpl_out(String(precursor_out_file + "_gnuplot.gpl").c_str());
            gpl_out << "set terminal png" << endl;
            gpl_out << "set output \"" << precursor_out_file  << "_gnuplot.png\"" << endl;
            if (precursor_error_ppm)
            {
              gpl_out << "set xlabel \"error in ppm\"" << endl;
            }
            else
            {
              gpl_out << "set xlabel \"error in Da\"" << endl;
            }
            gpl_out << "set ylabel \"frequency\"" << endl;
            gpl_out << "plot '" << precursor_out_file << "_gnuplot.dat' title 'Precursor mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
            gpl_out.close();
          }

        }
        catch (const Exception::UnableToFit &)
        {
          writeLog_("Unable to fit a Gaussian distribution to the precursor mass errors");
        }
      }
    }

    String fragment_out_file(getStringOption_("fragment_out"));
    if (fragment_out_file != "")
    {
      vector<double> errors;
      ofstream fragment_out(fragment_out_file.c_str());
      // see the precursor section: -max() is the correct start value for a running maximum
      double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
      for (Size i = 0; i != fragment_diffs.size(); ++i)
      {
        double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
        // plain newline (no flush per value); the stream is closed right after the loop
        fragment_out << diff << "\n";
        errors.push_back(diff);

        if (diff > max_diff)
        {
          max_diff = diff;
        }
        if (diff < min_diff)
        {
          min_diff = diff;
        }
      }
      fragment_out.close();

      if (errors.empty())
      {
        // without values there is no range for the histogram and no median to read
        writeLog_("No fragment mass differences available, skipping fragment error statistics");
      }
      else
      {
        // fill histogram with the collected values
        // here we use the intensities to scale the error
        // low intensity peaks are likely to be random matches
        // NOTE(review): if all errors are identical, bin_size is 0 - confirm that
        // Histogram handles this as intended.
        double bin_size = (max_diff - min_diff) / (double)number_of_bins;
        Histogram<double, double> hist(min_diff, max_diff, bin_size);
        for (Size i = 0; i != fragment_diffs.size(); ++i)
        {
          double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
          hist.inc(diff, fragment_diffs[i].intensity);
        }

        writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

        // transform the histogram into a vector<DPosition<2> > for the fitting
        vector<DPosition<2> > values;
        for (Size i = 0; i != hist.size(); ++i)
        {
          DPosition<2> p;
          p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
          p.setY(hist[i]);
          values.push_back(p);
        }

        double mean = Math::mean(errors.begin(), errors.end());
        double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
        double sdv = Math::sd(errors.begin(), errors.end(), mean);
        // the median needs sorted values; sort only after the order-independent statistics
        sort(errors.begin(), errors.end());
        double median = errors[(Size)(errors.size() / 2.0)];

        writeDebug_("Fragment mean error:  " + String(mean), 1);
        writeDebug_("Fragment abs. dev.:   " + String(abs_dev), 1);
        writeDebug_("Fragment std. dev.:   " + String(sdv), 1);
        writeDebug_("Fragment median error:   " + String(median), 1);

        // calculate histogram for gauss fitting
        GaussFitter gf;
        GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv / 100.0);
        gf.setInitialParameters(init_param);

        try
        {
          gf.fit(values);


          // write gnuplot script
          if (generate_gnuplot_scripts)
          {
            ofstream out(String(fragment_out_file + "_gnuplot.dat").c_str());
            for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
            {
              out << it->getX() << " " << it->getY() << endl;
            }
            out.close();

            // NOTE(review): the script plots f(x) but never defines it, and the result
            // of gf.fit() is not used here - confirm the script is usable as written.
            ofstream gpl_out(String(fragment_out_file + "_gnuplot.gpl").c_str());
            gpl_out << "set terminal png" << endl;
            gpl_out << "set output \"" << fragment_out_file  << "_gnuplot.png\"" << endl;
            if (fragment_error_ppm)
            {
              gpl_out << "set xlabel \"error in ppm\"" << endl;
            }
            else
            {
              gpl_out << "set xlabel \"error in Da\"" << endl;
            }
            gpl_out << "set ylabel \"frequency\"" << endl;
            gpl_out << "plot '" << fragment_out_file << "_gnuplot.dat' title 'Fragment mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
            gpl_out.close();
          }
        }
        catch (const Exception::UnableToFit &)
        {
          writeLog_("Unable to fit a Gaussian distribution to the fragment mass errors");
        }
      }
    }

    return EXECUTION_OK;
  }