void EDTAFile::store(const String& filename, const FeatureMap& map) const
  {
    TextFile tf;
    tf.addLine("RT\tm/z\tintensity\tcharge");

    for (Size i = 0; i < map.size(); ++i)
    {
      const Feature& f = map[i];
      tf.addLine(String(f.getRT()) + "\t" + f.getMZ() + "\t" + f.getIntensity() + "\t" + f.getCharge());
    }

    tf.store(filename);
  }
  void EDTAFile::store(const String& filename, const ConsensusMap& map) const
  {
    TextFile tf;

    // search for maximum number of sub-features (since this determines the number of columns)
    Size max_sub(0);
    for (Size i = 0; i < map.size(); ++i)
    {
      max_sub = std::max(max_sub, map[i].getFeatures().size());
    }

    // write header
    String header("RT\tm/z\tintensity\tcharge");
    for (Size i = 1; i <= max_sub; ++i)
    {
      header += "\tRT" + String(i) + "\tm/z" + String(i) + "\tintensity" + String(i) + "\tcharge" + String(i);
    }
    tf.addLine(header);

    for (Size i = 0; i < map.size(); ++i)
    {
      const ConsensusFeature& f = map[i]; // avoid copying the feature
      // consensus
      String entry = String(f.getRT()) + "\t" + f.getMZ() + "\t" + f.getIntensity() + "\t" + f.getCharge();
      // sub-features
      const ConsensusFeature::HandleSetType& handle = f.getFeatures();
      for (ConsensusFeature::HandleSetType::const_iterator it = handle.begin(); it != handle.end(); ++it)
      {
        entry += String("\t") + it->getRT() + "\t" + it->getMZ() + "\t" + it->getIntensity() + "\t" + it->getCharge();
      }
      // missing sub-features
      for (Size j = handle.size(); j < max_sub; ++j)
      {
        entry += "\tNA\tNA\tNA\tNA";
      }
      tf.addLine(entry);
    }

    tf.store(filename);
  }
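
  // Usage sketch (not part of the original source): how the two store()
  // overloads above might be called. File names and values are placeholders.
  //
  //   FeatureMap fm;
  //   Feature f;
  //   f.setRT(100.0); f.setMZ(500.25); f.setIntensity(1e5); f.setCharge(2);
  //   fm.push_back(f);
  //   EDTAFile().store("features.edta", fm);   // one RT/m-z/intensity/charge row per feature
  //
  //   ConsensusMap cm;                         // filled e.g. by ConsensusXMLFile().load(...)
  //   EDTAFile().store("consensus.edta", cm);  // pads missing sub-features with NA columns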
 void writeTermTree_(const String& accession, const ControlledVocabulary& cv, TextFile& file, UInt indent)
 {
   const ControlledVocabulary::CVTerm& term = cv.getTerm(accession);
   for (set<String>::const_iterator it = term.children.begin(); it != term.children.end(); ++it)
   {
     const ControlledVocabulary::CVTerm& child_term = cv.getTerm(*it);
     String subterm_line;
     for (Size i = 0; i < 4 * indent; ++i) subterm_line += "&nbsp;";
     String description = child_term.description;
     if (child_term.synonyms.size() != 0)
     {
       description += String(" -- Synonyms: '") + ListUtils::concatenate(child_term.synonyms, ", ") + "'";
     }
     subterm_line += "- <span title=\"" + description + "\">" + child_term.id + " ! " + child_term.name + "</span>";
     StringList tags;
     if (child_term.obsolete)
     {
       tags.push_back("<font color=darkred>obsolete</font>");
     }
     if (child_term.xref_type != ControlledVocabulary::CVTerm::NONE)
     {
       tags.push_back("value-type=" + ControlledVocabulary::CVTerm::getXRefTypeName(child_term.xref_type));
     }
     if (child_term.units.size() > 0)
     {
       StringList units;
       for (set<String>::const_iterator u_it = child_term.units.begin(); u_it != child_term.units.end(); ++u_it)
       {
         units.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
       }
       tags.push_back(String("units=") + ListUtils::concatenate(units, ","));
     }
     if (child_term.xref_binary.size() > 0)
     {
       StringList types;
       for (StringList::const_iterator u_it = child_term.xref_binary.begin(); u_it != child_term.xref_binary.end(); ++u_it)
       {
         types.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
       }
       tags.push_back(String("binary-array-types=") + ListUtils::concatenate(types, ","));
     }
     if (tags.size() != 0)
     {
       subterm_line += String("<FONT color=\"grey\"> (") + ListUtils::concatenate(tags, ", ") + ")</FONT>";
     }
     file.addLine(subterm_line + "<BR>");
     writeTermTree_(child_term.id, cv, file, indent + 1);
   }
 }
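
 // Usage sketch (not part of the original source): rendering a CV subtree with
 // writeTermTree_. The OBO path and root accession are placeholders.
 //
 //   ControlledVocabulary cv;
 //   cv.loadFromOBO("PSI-MS", "psi-ms.obo");    // CV name + OBO file
 //   TextFile html;
 //   writeTermTree_("MS:1000031", cv, html, 0); // emits one indented HTML line per child term
 //   html.store("term_tree.html");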
  ExitCodes main_(int, const char**)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    StringList in = getStringList_("in");
    String edta = getStringOption_("pos");
    String out = getStringOption_("out");
    String out_sep = getStringOption_("out_separator");
    String out_TIC_debug = getStringOption_("auto_rt:out_debug_TIC");

    StringList in_header = getStringList_("in_header");


    // number of out_debug_TIC files and input files must be identical
    /*if (out_TIC_debug.size() > 0 && in.size() != out_TIC_debug.size())
    {
        LOG_FATAL_ERROR << "Error: number of input file 'in' and auto_rt:out_debug_TIC files must be identical!" << std::endl;
        return ILLEGAL_PARAMETERS;
    }*/

    // number of header files and input files must be identical
    if (in_header.size() > 0 && in.size() != in_header.size())
    {
      LOG_FATAL_ERROR << "Error: the number of 'in' files and 'in_header' files must be identical!" << std::endl;
      return ILLEGAL_PARAMETERS;
    }

    if (!getFlag_("auto_rt:enabled") && !out_TIC_debug.empty())
    {
      LOG_FATAL_ERROR << "Error: TIC output file requested, but auto_rt is not enabled! Either do not request the file or switch on 'auto_rt:enabled'." << std::endl;
      return ILLEGAL_PARAMETERS;
    }

    double rttol = getDoubleOption_("rt_tol");
    double mztol = getDoubleOption_("mz_tol");
    Size rt_collect = getIntOption_("rt_collect");

    //-------------------------------------------------------------
    // loading input
    //-------------------------------------------------------------
    MzMLFile mzml_file;
    mzml_file.setLogType(log_type_);
    MSExperiment<Peak1D> exp;

    EDTAFile ed;
    ConsensusMap cm;
    ed.load(edta, cm);

    StringList tf_single_header0, tf_single_header1, tf_single_header2; // header content, for each column

    std::vector<String> vec_single; // one line for each compound, multiple columns per experiment
    vec_single.resize(cm.size());
    for (Size fi = 0; fi < in.size(); ++fi)
    {
      // load raw data
      mzml_file.load(in[fi], exp);
      exp.sortSpectra(true);

      if (exp.empty())
      {
        LOG_WARN << "The given file does not contain any conventional peak data, but might"
                    " contain chromatograms. This tool currently cannot handle them, sorry." << std::endl;
        return INCOMPATIBLE_INPUT_DATA;
      }

      // try to detect RT peaks (only for the first input file -- all others should align!)
      // cm.size() might change in here...
      if (getFlag_("auto_rt:enabled") && fi == 0)
      {
        ConsensusMap cm_local = cm; // we might have different RT peaks for each map if 'auto_rt' is enabled
        cm.clear(false); // reset global list (about to be filled)

        // compute TIC
        MSChromatogram<> tic = exp.getTIC();
        MSSpectrum<> tics, tic_gf, tics_pp, tics_sn;
        for (Size ic = 0; ic < tic.size(); ++ic)
        { // rewrite Chromatogram to MSSpectrum (GaussFilter requires it)
          Peak1D peak;
          peak.setMZ(tic[ic].getRT());
          peak.setIntensity(tic[ic].getIntensity());
          tics.push_back(peak);
        }
        // smooth (no PP_CWT here for efficiency reasons -- a large FWHM takes longer!)
        double fwhm = getDoubleOption_("auto_rt:FWHM");
        GaussFilter gf;
        Param p = gf.getParameters();
        p.setValue("gaussian_width", fwhm * 2); // wider than FWHM, just to be sure we have a fully smoothed peak. Merging two peaks is unlikely
        p.setValue("use_ppm_tolerance", "false");
        gf.setParameters(p);
        tic_gf = tics;
        gf.filter(tic_gf);
        // pick peaks
        PeakPickerHiRes pp;
        p = pp.getParameters();
        p.setValue("signal_to_noise", getDoubleOption_("auto_rt:SNThreshold"));
        pp.setParameters(p);
        pp.pick(tic_gf, tics_pp);

        if (tics_pp.size())
        {
          LOG_INFO << "Found " << tics_pp.size() << " auto-rt peaks at: ";
          for (Size ipp = 0; ipp != tics_pp.size(); ++ipp) LOG_INFO << " " << tics_pp[ipp].getMZ();
        }
        else
        {
          LOG_INFO << "Found no auto-rt peaks. Change threshold parameters!";
        }
        LOG_INFO << std::endl;

        if (!out_TIC_debug.empty()) // if debug file was given
        { // store intermediate steps for debug
          MSExperiment<> out_debug;
          out_debug.addChromatogram(toChromatogram(tics));
          out_debug.addChromatogram(toChromatogram(tic_gf));

          SignalToNoiseEstimatorMedian<MSSpectrum<> > snt;
          snt.init(tics);
          for (Size is = 0; is < tics.size(); ++is)
          {
            Peak1D peak;
            peak.setMZ(tics[is].getMZ());
            peak.setIntensity(snt.getSignalToNoise(tics[is]));
            tics_sn.push_back(peak);
          }
          out_debug.addChromatogram(toChromatogram(tics_sn));

          out_debug.addChromatogram(toChromatogram(tics_pp));
          // get rid of "native-id" missing warning
          for (Size id = 0; id < out_debug.size(); ++id) out_debug[id].setNativeID(String("spectrum=") + id);

          mzml_file.store(out_TIC_debug, out_debug);
          LOG_DEBUG << "Storing debug AUTO-RT: " << out_TIC_debug << std::endl;
        }

        // add target EICs: for each m/z with no/negative RT, add all combinations of that m/z with auto-RTs
        // duplicate m/z entries will be ignored!
        // all other lines with positive RT values are copied unaffected
        //do not allow doubles
        std::set<double> mz_doubles;
        for (ConsensusMap::Iterator cit = cm_local.begin(); cit != cm_local.end(); ++cit)
        {
          if (cit->getRT() < 0)
          {
            if (mz_doubles.find(cit->getMZ()) == mz_doubles.end())
            {
              mz_doubles.insert(cit->getMZ());
            }
            else
            {
              LOG_INFO << "Found duplicate m/z entry (" << cit->getMZ() << ") for auto-rt. Skipping ..." << std::endl;
              continue;
            }

            for (MSSpectrum<>::ConstIterator itp = tics_pp.begin(); itp != tics_pp.end(); ++itp)
            {
              ConsensusFeature f = *cit;
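              // tics_pp is an MSSpectrum standing in for a chromatogram, so its
              // m/z axis actually holds RT values (see the TIC conversion above)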
              f.setRT(itp->getMZ());
              cm.push_back(f);
            }

          }
          else
          { // default feature with no auto-rt
            LOG_INFO << "copying feature with RT " << cit->getRT() << std::endl;
            cm.push_back(*cit);
          }
        }

        // resize, since we have more positions now
        vec_single.resize(cm.size());
      }


      // search for each EIC and add up
      Int not_found(0);

      String description;
      if (fi < in_header.size())
      {
        HeaderInfo info(in_header[fi]);
        description = info.header_description;
      }

      if (fi == 0)
      { // two additional columns for first file (theoretical RT and m/z)
        tf_single_header0 << "" << "";
        tf_single_header1 << "" << "";
        tf_single_header2 << "RT" << "mz";
      }

      // 5 entries for each input file
      tf_single_header0 << File::basename(in[fi]) << "" << "" << "" << "";
      tf_single_header1 << description << "" << "" << "" << "";
      tf_single_header2 << "RTobs" << "dRT" << "mzobs" << "dppm" << "intensity";

      for (Size i = 0; i < cm.size(); ++i)
      {
        //std::cerr << "Rt" << cm[i].getRT() << "  mz: " << cm[i].getMZ() << " R " <<  cm[i].getMetaValue("rank") << "\n";

        double mz_da = mztol * cm[i].getMZ() / 1e6; // ppm tolerance converted to Dalton
        MSExperiment<>::ConstAreaIterator it = exp.areaBeginConst(cm[i].getRT() - rttol / 2,
                                                                  cm[i].getRT() + rttol / 2,
                                                                  cm[i].getMZ() - mz_da,
                                                                  cm[i].getMZ() + mz_da);
        Peak2D max_peak;
        max_peak.setIntensity(0);
        max_peak.setRT(cm[i].getRT());
        max_peak.setMZ(cm[i].getMZ());
        for (; it != exp.areaEndConst(); ++it)
        {
          if (max_peak.getIntensity() < it->getIntensity())
          {
            max_peak.setIntensity(it->getIntensity());
            max_peak.setRT(it.getRT());
            max_peak.setMZ(it->getMZ());
          }
        }
        double ppm = 0; // observed m/z offset

        if (max_peak.getIntensity() == 0)
        {
          ++not_found;
        }
        else
        {
          // take median for m/z found
          std::vector<double> mz;
          MSExperiment<>::Iterator itm = exp.RTBegin(max_peak.getRT());
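          // collect up to 'rt_collect' spectra on either side of the apex,
          // clamped so the window stays inside the experiment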
          SignedSize low = std::min<SignedSize>(std::distance(exp.begin(), itm), rt_collect);
          SignedSize high = std::min<SignedSize>(std::distance(itm, exp.end()) - 1, rt_collect);
          MSExperiment<>::AreaIterator itt = exp.areaBegin((itm - low)->getRT() - 0.01, (itm + high)->getRT() + 0.01, cm[i].getMZ() - mz_da, cm[i].getMZ() + mz_da);
          for (; itt != exp.areaEnd(); ++itt)
          {
            mz.push_back(itt->getMZ());
            //std::cerr << "ppm: " << itt.getRT() << " " <<  itt->getMZ() << " " << itt->getIntensity() << std::endl;
          }

          if ((SignedSize)mz.size() > (low + high + 1)) LOG_WARN << "Compound " << i << " has overlapping peaks [" << mz.size() << "/" << low + high + 1 << "]" << std::endl;

          if (!mz.empty())
          {
            double avg_mz = std::accumulate(mz.begin(), mz.end(), 0.0) / double(mz.size());
            //std::cerr << "avg: " << avg_mz << "\n";
            ppm = (avg_mz - cm[i].getMZ()) / cm[i].getMZ() * 1e6;
          }

        }

        // appending the second column set requires separator
        String append_sep = (fi == 0 ? "" : out_sep);

        vec_single[i] += append_sep; // separator before this file's column block (none for the first file)
        if (fi == 0)
        {
          vec_single[i] += String(cm[i].getRT()) + out_sep +
                           String(cm[i].getMZ()) + out_sep;
        }
        vec_single[i] += String(max_peak.getRT()) + out_sep +
                         String(max_peak.getRT() - cm[i].getRT()) + out_sep +
                         String(max_peak.getMZ()) + out_sep +
                         String(ppm)  + out_sep +
                         String(max_peak.getIntensity());
      }

      if (not_found) LOG_INFO << "Missing peaks for " << not_found << " compounds in file '" << in[fi] << "'.\n";
    }

    //-------------------------------------------------------------
    // create header
    //-------------------------------------------------------------
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header2, out_sep));
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header1, out_sep));
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header0, out_sep));

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------
    TextFile tf;
    for (std::vector<String>::iterator v_it = vec_single.begin(); v_it != vec_single.end(); ++v_it)
    {
      tf.addLine(*v_it);
    }
    tf.store(out);

    return EXECUTION_OK;
  }
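
  // Standalone sketch of the tolerance arithmetic used above (not part of the
  // original source): a ppm tolerance is converted into a Dalton window around
  // the target m/z, and an observed offset is converted back into ppm.
  double ppmToDa(double mz, double ppm_tol)
  {
    return ppm_tol * mz / 1e6; // e.g. 10 ppm at m/z 500 -> 0.005 Da
  }

  double observedPpm(double observed_mz, double expected_mz)
  {
    return (observed_mz - expected_mz) / expected_mz * 1e6;
  }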
  ExitCodes main_(int, const char**) override
  {
    //load data
    FeatureMap features_in, features_truth;
    FeatureXMLFile().load(getStringOption_("in"), features_in);
    features_in.sortByPosition();
    FeatureXMLFile().load(getStringOption_("truth"), features_truth);
    features_truth.sortByPosition();
    FeatureMap abort_reasons;
    if (getStringOption_("abort_reasons") != "")
    {
      FeatureXMLFile().load(getStringOption_("abort_reasons"), abort_reasons);
    }
    double mz_tol = getDoubleOption_("mz_tol");
    writeDebug_(String("Final MZ tolerance: ") + mz_tol, 1);

    //determine average RT tolerance:
    //median feature RT span times given factor
    vector<double> rt_spans;
    for (Size t = 0; t < features_in.size(); ++t)
    {
      if (features_in[t].getConvexHulls().size() != 0)
      {
        rt_spans.push_back(features_in[t].getConvexHull().getBoundingBox().width());
      }
    }
    // if no absolute RT tolerance is given, derive it from the feature convex hulls (relative RT span)
    double rt_tol = getDoubleOption_("rt_tol_abs");
    if (rt_tol < 0.0)
    {
      if (!rt_spans.empty())
      {
        sort(rt_spans.begin(), rt_spans.end());
        rt_tol = getDoubleOption_("rt_tol") * rt_spans[rt_spans.size() / 2];
      }
      else if (features_in.empty())
      {
        // rt_tol does not matter here: with no input features there will be no
        // matches anyway, but we still want the statistics at the end, so do not abort
      }
      else
      {
        writeLog_("Error: Input features do not have convex hulls. You have to set 'rt_tol_abs'!");
        return ILLEGAL_PARAMETERS;
      }
    }
    writeDebug_(String("Final RT tolerance: ") + rt_tol, 1);

    //general statistics
    std::vector<double> ints_t;
    std::vector<double> ints_i;
    std::vector<double> ints_found;
    std::vector<double> ints_missed;
    Map<String, UInt> abort_strings;

    for (Size m = 0; m < features_truth.size(); ++m)
    {
      Feature& f_t =  features_truth[m];
      UInt match_count = 0;
      bool correct_charge = false;
      bool exact_centroid_match = false;
      Size last_match_index = features_in.size() + 1;
      for (Size a = 0; a < features_in.size(); ++a)
      {
        const Feature& f_i =  features_in[a];
        //RT match
        if (fabs(f_i.getRT() - f_t.getRT()) < rt_tol)
        {
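          // the m/z tolerance is given on the mass scale, so scale it down by
          // the charge before comparing m/z values directly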
          double charge_mz_tol = mz_tol / f_t.getCharge();
          //Exact m/z match
          if (fabs(f_i.getMZ() - f_t.getMZ()) < charge_mz_tol)
          {
            ++match_count;
            exact_centroid_match = true;
            if (f_i.getCharge() == f_t.getCharge()) correct_charge = true;
            last_match_index = a;
          }
          //Centroid is one trace off, but still contained in the convex hull
          else if (f_i.getConvexHull().getBoundingBox().encloses(f_t.getPosition())
                  &&
                   (
                     fabs(f_i.getMZ() + 1.0 / f_t.getCharge() - f_t.getMZ()) < charge_mz_tol
                   ||
                     fabs(f_i.getMZ() - 1.0 / f_t.getCharge() - f_t.getMZ()) < charge_mz_tol
                   )
                   )
          {
            ++match_count;
            last_match_index = a;
            if (f_i.getCharge() == f_t.getCharge()) correct_charge = true;
          }
        }
      }

      f_t.setMetaValue("matches", match_count);
      if (match_count == 1)
      {
        //flag matched feature with additional information
        if (correct_charge)
        {
          f_t.setMetaValue("correct_charge", String("true"));
          f_t.setMetaValue("intensity_ratio", features_in[last_match_index].getIntensity() / f_t.getIntensity());
          features_in[last_match_index].setMetaValue("correct_hit", "true"); //flag the feature for ROC curve
        }
        else
        {
          f_t.setMetaValue("correct_charge", String("false"));
        }

        if (exact_centroid_match)
        {
          f_t.setMetaValue("exact_centroid_match", String("true"));
        }
        else
        {
          f_t.setMetaValue("exact_centroid_match", String("false"));
        }
      }
      //evaluation of correct features only
      if (match_count == 1 && correct_charge)
      {
        ints_t.push_back(f_t.getIntensity());
        ints_i.push_back(features_in[last_match_index].getIntensity());
        ints_found.push_back(f_t.getIntensity());
      }
      else
      {
        ints_missed.push_back(f_t.getIntensity());

        //look up the abort reason of the nearest seed
        double best_score_ab = 0;
        String reason = "";
        for (Size b = 0; b < abort_reasons.size(); ++b)
        {
          const Feature& f_ab =  abort_reasons[b];
          if (fabs(f_ab.getRT() - f_t.getRT()) <= rt_tol
             && fabs(f_ab.getMZ() - f_t.getMZ()) <= mz_tol)
          {
            double score = (1.0 - fabs(f_ab.getMZ() - f_t.getMZ()) / mz_tol) * (1.0 - fabs(f_ab.getRT() - f_t.getRT()) / rt_tol);
            if (score > best_score_ab)
            {
              best_score_ab = score;
              reason = f_ab.getMetaValue("abort_reason");
            }
          }
        }
        if (reason == "")
        {
          reason = "No seed found";
        }
        if (abort_strings.has(reason))
        {
          abort_strings[reason]++;
        }
        else
        {
          abort_strings[reason] = 1;
        }
      }
    }

    //------------------------ general statistics ------------------------
    cout << endl;
    cout << "general information:" << endl;
    cout << "====================" << endl;
    cout << "input features: " << features_in.size() << endl;
    cout << "truth features: " << features_truth.size() << endl;

    //------------------------ matches ------------------------
    cout << endl;
    cout << "feature matching statistics:" << endl;
    cout << "============================" << endl;
    Size no_match = count(features_truth, "matches", "0");
    cout << "no match: " << no_match << percentage(no_match, features_truth.size()) << endl;
    Size one_match = count(features_truth, "matches", "1");
    cout << "one match: " << one_match << percentage(one_match, features_truth.size()) << endl;
    Size charge_match = count(features_truth, "correct_charge", "true");
    cout << " - correct charge: " << charge_match << percentage(charge_match, features_truth.size()) << endl;
    Size centroid_match = count(features_truth, "exact_centroid_match", "true");
    cout << " - exact centroid match: " << centroid_match << percentage(centroid_match, features_truth.size()) << endl;
    Size multi_match = features_truth.size() - no_match - one_match;
    cout << "multiple matches: " << multi_match << percentage(multi_match, features_truth.size()) << endl;
    Size incorrect_match = multi_match + one_match - charge_match;
    cout << "incorrect matches: " << incorrect_match << percentage(incorrect_match, features_truth.size()) << endl;
    if (abort_reasons.size())
    {
      cout << "reasons for unmatched features:" << endl;
      for (Map<String, UInt>::iterator it = abort_strings.begin(); it != abort_strings.end(); ++it)
      {
        cout << " - " << String(it->second).fillLeft(' ', 4) << ": " << it->first << endl;
      }
    }
    //------------------------ intensity ------------------------
    cout << endl;
    cout << "intensity statistics:" << endl;
    cout << "=====================" << endl;
    if (ints_i.empty())
    {
      cout << "correlation of found features: nan" << endl;
    }
    else
    {
      cout << "correlation of found features: " << pearsonCorrelationCoefficient(ints_i.begin(), ints_i.end(), ints_t.begin(), ints_t.end()) << endl;
    }
    if (ints_found.empty())
    {
      cout << "intensity distribution of found: 0.0 0.0 0.0 0.0 0.0" << endl;
    }
    else
    {
      cout << "intensity distribution of found: " << fiveNumbers(ints_found, 1) << endl;
    }
    if (ints_missed.empty())
    {
      cout << "intensity distribution of missed: 0.0 0.0 0.0 0.0 0.0" << endl;
    }
    else
    {
      cout << "intensity distribution of missed: " << fiveNumbers(ints_missed, 1) << endl;
    }

    //------------------------ charges ------------------------
    cout << endl;
    cout << "charge match statistics:" << endl;
    cout << "========================" << endl;
    Map<UInt, UInt> present_charges, found_charges;
    for (Size i = 0; i < features_truth.size(); ++i)
    {
      UInt charge = features_truth[i].getCharge();
      present_charges[charge]++;
      if (features_truth[i].getMetaValue("correct_charge").toString() == "true")
      {
        found_charges[charge]++;
      }
    }
    for (Map<UInt, UInt>::const_iterator it = present_charges.begin(); it != present_charges.end(); ++it)
    {
      cout << "charge " << it->first << ": " << found_charges[it->first] << "/" << it->second << percentage(found_charges[it->first], it->second) << endl;
    }

    //write output
    if (getStringOption_("out") != "")
    {
      FeatureXMLFile().store(getStringOption_("out"), features_truth);
    }

    //ROC curve
    if (getStringOption_("out_roc") != "")
    {
      TextFile tf;
      tf.addLine("false\tcorrect\tFDR\tTPR");

      features_in.sortByIntensity(true);
      UInt f_correct = 0;
      UInt f_false = 0;
      double found = features_in.size();
      double correct = features_truth.size();
      for (Size i = 0; i < features_in.size(); ++i)
      {
        if (features_in[i].metaValueExists("correct_hit"))
        {
          ++f_correct;
        }
        else
        {
          ++f_false;
        }
        tf.addLine(String(f_false) + "\t" + f_correct + "\t" + String::number(f_false / found, 3) + "\t" + String::number(f_correct / correct, 3));
      }
      tf.store(getStringOption_("out_roc"));
    }

    return EXECUTION_OK;
  }
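
  // Standalone sketch of the matching criterion above (not part of the original
  // source): besides an exact centroid match, a centroid that is one isotope
  // trace (1/z) off still counts, provided the truth position lies inside the
  // candidate's convex hull (checked separately above).
  bool centroidMatches(double mz_in, double mz_truth, int charge_truth, double mz_tol)
  {
    const double tol = mz_tol / charge_truth;    // mass tolerance -> m/z tolerance
    const double trace = 1.0 / charge_truth;     // isotope spacing on the m/z axis
    return fabs(mz_in - mz_truth) < tol          // exact centroid match
        || fabs(mz_in + trace - mz_truth) < tol  // one trace below
        || fabs(mz_in - trace - mz_truth) < tol; // one trace above
  }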
  ExitCodes main_(int, const char**)
  {
    //----------------------------------------------------------------
    // load data
    //----------------------------------------------------------------
    StringList in_list = getStringList_("in");
    String out = getStringOption_("out");
    String out_csv = getStringOption_("out_csv");
    String format = getStringOption_("out_type");

    if (out.empty() && out_csv.empty())
    {
      LOG_ERROR << "Neither 'out' nor 'out_csv' was provided. Please assign at least one of them." << std::endl;
      return ILLEGAL_PARAMETERS;
    }

    if (!out.empty() && format == "") // get from filename
    {
      try
      {
        format = out.suffix('.');
      }
      catch (Exception::ElementNotFound& /*e*/)
      {
        format = "nosuffix";
      }
      // check if format is valid:
      if (!ListUtils::contains(out_formats_, format.toLower()))
      {
        LOG_ERROR << "No explicit image output format was provided via 'out_type', and the suffix ('" << format << "') does not resemble a valid type. Please fix one of them." << std::endl;
        return ILLEGAL_PARAMETERS;
      }
    }

    double q_min = getDoubleOption_("q_min");
    double q_max = getDoubleOption_("q_max");
    if (q_min >= q_max)
    {
      LOG_ERROR << "The parameter 'q_min' must be smaller than 'q_max'. Quitting..." << std::endl;
      return ILLEGAL_PARAMETERS;
    }


    IDEvaluationBase* mw = new IDEvaluationBase();
    Param alg_param = mw->getParameters();
    alg_param.insert("", getParam_().copy("algorithm:", true));
    mw->setParameters(alg_param);

    if (!mw->loadFiles(in_list))
    {
      LOG_ERROR << "Tool failed. See above." << std::endl;
      return INCOMPATIBLE_INPUT_DATA;
    }
    mw->setVisibleArea(q_min, q_max);

    if (!out.empty()) // save as image and exit
    {
      String error;
      bool r = mw->exportAsImage(out.toQString(), error, format.toQString());
      if (r) return EXECUTION_OK;
      else
      {
        LOG_ERROR << error << std::endl;
        return ILLEGAL_PARAMETERS;
      }
    }

    if (!out_csv.empty())
    {
      TextFile tf;
      for (Size i = 0; i < mw->getPoints().size(); ++i)
      {
        MSSpectrum s = mw->getPoints()[i];
        StringList sl1;
        StringList sl2;
        for (Size j = 0; j < s.size(); ++j)
        {
          sl1.push_back(s[j].getMZ());
          sl2.push_back(s[j].getIntensity());
        }
        tf.addLine(String("# ") + String(s.getMetaValue("search_engine")));
        tf.addLine(ListUtils::concatenate(sl1, ","));
        tf.addLine(ListUtils::concatenate(sl2, ","));
      }
      tf.store(out_csv);
    }

    delete mw;
    return EXECUTION_OK;
  }
  void IBSpectraFile::store(const String& filename, const ConsensusMap& cm)
  {
    // typedefs for shorter code
    typedef std::vector<ProteinHit>::iterator ProtHitIt;

    // general settings .. do we need to expose these?
    // ----------------------------------------------------------------------
    /// Allow also non-unique peptides to be exported
    bool allow_non_unique = true;
    /// Intensities below this value will be set to 0.0 to avoid numerical problems when quantifying
    double intensity_threshold = 0.00001;
    // ----------------------------------------------------------------------


    // guess experiment type
    boost::shared_ptr<IsobaricQuantitationMethod> quantMethod = guessExperimentType_(cm);

    // we need the protein identifications to reference the protein names
    ProteinIdentification protIdent;
    bool has_proteinIdentifications = false;
    if (cm.getProteinIdentifications().size() > 0)
    {
      protIdent = cm.getProteinIdentifications()[0];
      has_proteinIdentifications = true;
    }

    // start the file by adding the tsv header
    TextFile textFile;
    textFile.addLine(ListUtils::concatenate(constructHeader_(*quantMethod), "\t"));

    for (ConsensusMap::ConstIterator cm_iter = cm.begin();
         cm_iter != cm.end();
         ++cm_iter)
    {
      const ConsensusFeature& cFeature = *cm_iter;
      std::vector<IdCSV> entries;

      /// 1st we extract the identification information from the consensus feature
      if (cFeature.getPeptideIdentifications().size() == 0 || !has_proteinIdentifications)
      {
        // we store unidentified hits anyway, because the iTRAQ quant is still helpful for normalization
        entries.push_back(IdCSV());
      }
      else
      {
        // protein name:
        const PeptideHit& peptide_hit = cFeature.getPeptideIdentifications()[0].getHits()[0];
        std::set<String> protein_accessions = peptide_hit.extractProteinAccessions();
        if (protein_accessions.size() != 1)
        {
          if (!allow_non_unique) continue; // we only want unique peptides
        }

        for (std::set<String>::const_iterator prot_ac = protein_accessions.begin(); prot_ac != protein_accessions.end(); ++prot_ac)
        {
          IdCSV entry;
          entry.charge = peptide_hit.getCharge();
          entry.peptide = peptide_hit.getSequence().toUnmodifiedString();
          entry.theo_mass = peptide_hit.getSequence().getMonoWeight(Residue::Full, peptide_hit.getCharge());

          // write modification string
          entry.modif = getModifString_(peptide_hit.getSequence());

          ProtHitIt proteinHit = protIdent.findHit(*prot_ac);
          if (proteinHit == protIdent.getHits().end())
          {
            std::cerr << "Protein referenced in peptide not found...\n";
            continue; // protein not found
          }

          entry.accession = proteinHit->getAccession();
          entries.push_back(entry);
        }
      }

      // 2nd we add the quantitative information of the channels

      // .. skip features with 0 intensity
      if (cFeature.getIntensity() == 0)
      {
        continue;
      }

      for (std::vector<IdCSV>::iterator entry = entries.begin();
           entry != entries.end();
           ++entry)
      {
        // set parent intensity
        entry->parent_intens = cFeature.getIntensity();
        entry->retention_time = cFeature.getRT();
        entry->spectrum = cFeature.getUniqueId();
        entry->exp_mass = cFeature.getMZ();

        // create output line
        StringList currentLine;

        // add entry to currentLine
        entry->toStringList(currentLine);

        // extract channel intensities and positions
        std::map<Int, double> intensityMap;
        ConsensusFeature::HandleSetType features = cFeature.getFeatures();

        for (ConsensusFeature::HandleSetType::const_iterator fIt = features.begin();
             fIt != features.end();
             ++fIt)
        {
          intensityMap[Int(fIt->getMZ())] = (fIt->getIntensity() > intensity_threshold ? fIt->getIntensity() : 0.0);
        }
        for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
             it != quantMethod->getChannelInformation().end();
             ++it)
        {
          currentLine.push_back(String(it->center));
        }
        for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
             it != quantMethod->getChannelInformation().end();
             ++it)
        {
          currentLine.push_back(String(intensityMap[int(it->center)]));
        }

        textFile.addLine(ListUtils::concatenate(currentLine, "\t"));
      }
    }

    // write to file
    textFile.store(filename);
  }
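
  // Usage sketch (not part of the original source): exporting an isobaric
  // quantitation result as ibspectra CSV. File names are placeholders; store()
  // will fail if the experiment type cannot be guessed from the map.
  //
  //   ConsensusMap cm;
  //   ConsensusXMLFile().load("quantified.consensusXML", cm);
  //   IBSpectraFile().store("quantified.ibspectra.csv", cm);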
  ExitCodes main_(int, const char**)
  {
    StringList cv_files = getStringList_("cv_files");
    StringList cv_names = getStringList_("cv_names");
    if (cv_files.size() != cv_names.size())
    {
      cerr << "Error: You have to specify an identifier for each CV file. Aborting!" << endl;
      return ILLEGAL_PARAMETERS;
    }

    // load cv terms
    ControlledVocabulary cv;
    for (Size i = 0; i < cv_files.size(); ++i)
    {
      cv.loadFromOBO(cv_names[i], cv_files[i]);
    }
    Map<String, ControlledVocabulary::CVTerm> terms = cv.getTerms();

    // load mappings from mapping file
    String mapping_file = getStringOption_("mapping_file");
    CVMappings mappings;
    CVMappingFile().load(mapping_file, mappings);

    //store HTML version of mapping and CV
    if (getStringOption_("html") != "")
    {
      TextFile file;
      file.addLine("<HTML>");
      file.addLine("  <HEAD>");
      file.addLine("    <TITLE>CV mapping file</TITLE>");
      file.addLine("    <SCRIPT language=javascript type='text/javascript'>");
      file.addLine("      function toggleDiv(layer_ref,force_state) ");
      file.addLine("      {");
      file.addLine("        if (document.getElementById(layer_ref).style.display=='none' || force_state=='true')");
      file.addLine("        {");
      file.addLine("          document.getElementById(layer_ref).style.display = 'block';");
      file.addLine("        }");
      file.addLine("        else if (document.getElementById(layer_ref).style.display=='block' || force_state=='false')");
      file.addLine("        {");
      file.addLine("          document.getElementById(layer_ref).style.display = 'none';");
      file.addLine("        }");
      file.addLine("      }");
      file.addLine("    </SCRIPT>");
      file.addLine("  </HEAD>");
      file.addLine("  <BODY>");

      //count the number of terms and add buttons to expand/collapse all terms
      Int term_count = 0;
      for (vector<CVMappingRule>::const_iterator it = mappings.getMappingRules().begin(); it != mappings.getMappingRules().end(); ++it)
      {
        for (vector<CVMappingTerm>::const_iterator tit = it->getCVTerms().begin(); tit != it->getCVTerms().end(); ++tit)
        {
          ++term_count;
        }
      }
      String expand_all = "    <a href=\"javascript:toggleDiv('div0','true')";
      String collapse_all = "    <a href=\"javascript:toggleDiv('div0','false')";
      for (Int i = 1; i < term_count; ++i)
      {
        expand_all += String(";toggleDiv('div") + i + "','true')";
        collapse_all += String(";toggleDiv('div") + i + "','false')";
      }
      file.addLine(expand_all + "\">Expand all</a><BR>");
      file.addLine(collapse_all + "\">Collapse all</a>");
      file.addLine("    <TABLE width=100% border=0>");
      term_count = -1;
      for (vector<CVMappingRule>::const_iterator it = mappings.getMappingRules().begin(); it != mappings.getMappingRules().end(); ++it)
      {
        //create rule line
        file.addLine("      <TR><TD colspan=\"2\"><HR></TD></TR>");
        file.addLine(String("      <TR><TD>Identifier:</TD><TD><B>") + it->getIdentifier() + "</B></TD></TR>");
        file.addLine(String("      <TR><TD>Element:</TD><TD><B>") + it->getElementPath() + "</B></TD></TR>");
        if (it->getRequirementLevel() == CVMappingRule::MUST)
        {
          file.addLine("      <TR><TD>Requirement level:</TD><TD><FONT color=\"red\">MUST</FONT></TD></TR>");
        }
        else if (it->getRequirementLevel() == CVMappingRule::SHOULD)
        {
          file.addLine("      <TR><TD>Requirement level:</TD><TD><FONT color=\"orange\">SHOULD</FONT></TD></TR>");
        }
        else if (it->getRequirementLevel() == CVMappingRule::MAY)
        {
          file.addLine("      <TR><TD>Requirement level:</TD><TD><FONT color=\"green\">MAY</FONT></TD></TR>");
        }
        if (it->getCombinationsLogic() == CVMappingRule::AND)
        {
          file.addLine("      <TR><TD>Combination logic:</TD><TD><FONT color=\"red\">AND</FONT></TD></TR>");
        }
        else if (it->getCombinationsLogic() == CVMappingRule::XOR)
        {
          file.addLine("      <TR><TD>Combination logic:</TD><TD><FONT color=\"orange\">XOR</FONT></TD></TR>");
        }
        else if (it->getCombinationsLogic() == CVMappingRule::OR)
        {
          file.addLine("      <TR><TD>Combination logic:</TD><TD><FONT color=\"green\">OR</FONT></TD></TR>");
        }

        //create table with terms
        for (vector<CVMappingTerm>::const_iterator tit = it->getCVTerms().begin(); tit != it->getCVTerms().end(); ++tit)
        {
          //create term line
          String term_line = String("      <TR><TD valign=\"top\">Term:</TD><TD>");
          if (tit->getAllowChildren())
          {
            ++term_count;
            term_line += String("<a href=\"javascript:toggleDiv('div") + term_count + "','')\" style=\"text-decoration:none\" >+</a> ";
          }
          else
          {
            term_line += String("&nbsp;&nbsp;");
          }
          //add Term accession, name and description (as popup)
          if (cv.exists(tit->getAccession()))
          {
            const ControlledVocabulary::CVTerm& child_term = cv.getTerm(tit->getAccession());

            String description = child_term.description;
            if (child_term.synonyms.size() != 0)
            {
              description += String(" -- Synonyms: '") + ListUtils::concatenate(child_term.synonyms, ", ") + "'";
            }
            term_line += "<span title=\"" + description + "\">";
          }
          term_line += tit->getAccession() + " ! " + tit->getTermName();
          if (cv.exists(tit->getAccession()))
          {
            term_line += "</span>";
            //check if term accession and term name correspond to the CV
            const ControlledVocabulary::CVTerm& main_term = cv.getTerm(tit->getAccession());
            if (main_term.name != tit->getTermName())
            {
              cerr << "Warning: Accession '" << tit->getAccession() << "' and name '" << tit->getTermName() << "' do not match. Name should be '" << main_term.name << "'." << endl;
            }
          }
          //tags
          StringList tags;
          if (!tit->getUseTerm())
          {
            tags.push_back("children only");
          }
          if (tit->getIsRepeatable())
          {
            tags.push_back("repeatable");
          }
          if (cv.exists(tit->getAccession()))
          {
            const ControlledVocabulary::CVTerm& term = cv.getTerm(tit->getAccession());
            if (term.obsolete)
            {
              tags.push_back("<font color=darkred>obsolete</font>");
            }
            if (term.xref_type != ControlledVocabulary::CVTerm::NONE)
            {
              tags.push_back("value-type=" + ControlledVocabulary::CVTerm::getXRefTypeName(term.xref_type));
            }
            if (term.units.size() > 0)
            {
              StringList units;
              for (set<String>::const_iterator u_it = term.units.begin(); u_it != term.units.end(); ++u_it)
              {
                units.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
              }
              tags.push_back(String("units=") + ListUtils::concatenate(units, ","));
            }
            if (term.xref_binary.size() > 0)
            {
              StringList types;
              for (StringList::const_iterator u_it = term.xref_binary.begin(); u_it != term.xref_binary.end(); ++u_it)
              {
                types.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
              }
              tags.push_back(String("binary-array-types=") + ListUtils::concatenate(types, ","));
            }
          }
          if (tags.size() != 0)
          {
            term_line += String("<FONT color=\"grey\"> (") + ListUtils::concatenate(tags, ", ") + ")</FONT>";
          }
          file.addLine(term_line);

          // check whether we need the whole tree, or just the term itself
          if (tit->getAllowChildren())
          {
            file.addLine(String("        <div id=\"div") + term_count + "\" style=\"display: none\">");
            if (cv.exists(tit->getAccession()))
            {
              writeTermTree_(tit->getAccession(), cv, file, 1);
              //BEGIN - THIS IS NEEDED FOR WRITING PARSERS ONLY
              /*
              set<String> allowed_terms;
              cv.getAllChildTerms(allowed_terms, tit->getAccession());
              for (set<String>::const_iterator atit=allowed_terms.begin(); atit!=allowed_terms.end(); ++atit)
              {
                  const ControlledVocabulary::CVTerm& child_term = cv.getTerm(*atit);
                  String parser_string = String("os << \"&lt;cvParam cvRef=\\\"MS\\\" accession=\\\"") + child_term.id + "\\\" name=\\\"" + child_term.name + "\\\"";
                  for (Size i=0; i<child_term.unparsed.size(); ++i)
                  {
                      //TODO this does not work anymore. The type is now stored as a member
                      if (child_term.unparsed[i].hasSubstring("value-type:xsd\\:int") || child_term.unparsed[i].hasSubstring("value-type:xsd\\:float") || child_term.unparsed[i].hasSubstring("value-type:xsd\\:string"))
                      {
                          parser_string += " value=\\\"\" &lt;&lt; &lt;&lt; \"\\\"";
                      }
                  }
                  parser_string += "/&gt;\\n\";<BR>";
                  file.push_back(parser_string);
              }*/
            }
            else
            {
              file.addLine("          &nbsp;&nbsp;&nbsp;- Missing terms, CV not loaded...");
              cerr << "Warning: no child terms for " << tit->getAccession() << " found!" << endl;
            }
            file.addLine("          </div>");
            file.addLine("        </TD></TR>");
          }
        }
      }
      file.addLine("    </TABLE>");
      file.addLine("  </BODY>");
      file.addLine("</HTML>");
      file.store(getStringOption_("html"));
      return EXECUTION_OK;
    }

    // iterate over all mapping rules and record the terms they mention
    StringList ignore_namespaces = getStringList_("ignore_cv");
    set<String> ignore_cv_list;
    for (StringList::const_iterator it = ignore_namespaces.begin(); it != ignore_namespaces.end(); ++it)
    {
      ignore_cv_list.insert(*it);
    }
    set<String> used_terms;
    for (vector<CVMappingRule>::const_iterator it = mappings.getMappingRules().begin(); it != mappings.getMappingRules().end(); ++it)
    {
      set<String> allowed_terms;
      // iterate over all allowed terms
      for (vector<CVMappingTerm>::const_iterator tit = it->getCVTerms().begin(); tit != it->getCVTerms().end(); ++tit)
      {
        // check whether the term itself is allowed, or only its children
        if (tit->getUseTerm())
        {
          allowed_terms.insert(tit->getAccession());
        }

        // check whether we need the whole tree, or just the term itself
        if (tit->getAllowChildren())
        {
          // check whether we want to ignore this term
          if (!(tit->getAccession().has(':') && ignore_cv_list.find(tit->getAccession().prefix(':')) != ignore_cv_list.end()))
          {
            cv.getAllChildTerms(allowed_terms, tit->getAccession());
          }

          // also add the term itself to the used_terms, because all the children are allowed
          used_terms.insert(tit->getAccession());
        }
      }

      // print the allowed terms for the rule
      cout << "MappingRule: id=" << it->getIdentifier() << ", elementPath=" << it->getElementPath() << ", #terms=" << it->getCVTerms().size() << endl;
      for (set<String>::const_iterator ait = allowed_terms.begin(); ait != allowed_terms.end(); ++ait)
      {
        cout << *ait << " " << terms[*ait].name << endl;
      }
      used_terms.insert(allowed_terms.begin(), allowed_terms.end());
    }

    // find unused terms, which CANNOT be used in the XML due to the mapping file
    set<String> unused_terms;
    for (Map<String, ControlledVocabulary::CVTerm>::ConstIterator it = terms.begin(); it != terms.end(); ++it)
    {
      if (used_terms.find(it->first) == used_terms.end())
      {
        unused_terms.insert(it->first);
      }
    }

    cout << "\n\nCVTerms which are unused in the mapping file and therefore MUST NOT be used in an instance document" << endl;
    for (set<String>::const_iterator it = unused_terms.begin(); it != unused_terms.end(); ++it)
    {
      cout << *it << " " << terms[*it].name;

      // print also parent names
      for (set<String>::const_iterator pit = terms[*it].parents.begin(); pit != terms[*it].parents.end(); ++pit)
      {
        cout << " " << terms[*pit].id << " " << terms[*pit].name;
      }
      cout << endl;
    }


    return EXECUTION_OK;
  }
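
  // Standalone sketch of the 'ignore_cv' namespace test used above (not part
  // of the original source): accessions like "MS:1000031" are split at the
  // first ':' and the prefix is compared against the ignore list.
  //
  //   String accession = "MS:1000031";
  //   if (accession.has(':') && ignore_cv_list.count(accession.prefix(':'))) // prefix "MS"
  //   {
  //     // skip child-term expansion for this namespace
  //   }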