CachedmzML cacheFile(std::string & tmp_filename, PeakMap& exp)
{
  NEW_TMP_FILE(tmp_filename);

  // Load experiment
  MzMLFile().load(OPENMS_GET_TEST_DATA_PATH("MzMLFile_1.mzML"), exp);
  TEST_EQUAL(exp.getNrSpectra() > 0, true)
  TEST_EQUAL(exp.getNrChromatograms() > 0, true)

  // Cache the experiment to a temporary file
  CachedmzML cache;
  cache.writeMemdump(exp, tmp_filename);
  // Create the index from the given file
  cache.createMemdumpIndex(tmp_filename);
  return cache;
}
  ExitCodes main_(int , const char**)
  {
    String out_meta = getStringOption_("out");
    String out_cached = out_meta + ".cached";
    bool convert_back =  getFlag_("convert_back");

    FileHandler fh;

    //input file type
    String in = getStringOption_("in");
    String in_cached = in + ".cached";
    FileTypes::Type in_type = FileTypes::nameToType(getStringOption_("in_type"));

    if (in_type == FileTypes::UNKNOWN)
    {
      in_type = fh.getType(in);
      writeDebug_(String("Input file type: ") + FileTypes::typeToName(in_type), 2);
    }

    if (in_type == FileTypes::UNKNOWN)
    {
      writeLog_("Error: Could not determine input file type!");
      return PARSE_ERROR;
    }

    //output file names and types
    String out = getStringOption_("out");
    FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));

    if (out_type == FileTypes::UNKNOWN)
    {
      out_type = fh.getTypeByFileName(out);
    }

    if (out_type == FileTypes::UNKNOWN)
    {
      writeLog_("Error: Could not determine output file type!");
      return PARSE_ERROR;
    }

    if (in_type == FileTypes::SQMASS && out_type == FileTypes::MZML)
    {
      MapType exp;
      SqMassFile sqfile;
      MzMLFile f;
      sqfile.load(in, exp);
      f.store(out, exp);
      return EXECUTION_OK;
    }
    else if (in_type == FileTypes::MZML && out_type == FileTypes::SQMASS)
    {
      MzMLFile f;
      SqMassFile sqfile;
      MapType exp;
      f.load(in, exp);
      sqfile.store(out, exp);
      return EXECUTION_OK;
    }


    if (!convert_back)
    {
      MapType exp;
      CachedmzML cacher;
      MzMLFile f;

      cacher.setLogType(log_type_);
      f.setLogType(log_type_);

      f.load(in,exp);
      cacher.writeMemdump(exp, out_cached);
      cacher.writeMetadata(exp, out_meta, true);
    }
    else
    {
      MzMLFile f;
      MapType meta_exp;
      CachedmzML cacher;
      MapType exp_reading;

      cacher.setLogType(log_type_);
      f.setLogType(log_type_);

      f.load(in,meta_exp);
      cacher.readMemdump(exp_reading, in_cached);

      std::cout << " read back, got " << exp_reading.size() << " spectra " << exp_reading.getChromatograms().size() << " chromats " << std::endl;

      {
      for (Size i=0; i<meta_exp.size(); ++i)
      {
        for (Size j = 0; j < meta_exp[i].getDataProcessing().size(); j++)
        {
          if (meta_exp[i].getDataProcessing()[j]->metaValueExists("cached_data"))
          {
            meta_exp[i].getDataProcessing()[j]->removeMetaValue("cached_data");
          }
        }
      }

      for (Size i=0; i < meta_exp.getNrChromatograms(); ++i)
      {
        for (Size j = 0; j < meta_exp.getChromatogram(i).getDataProcessing().size(); j++)
        {
          if (meta_exp.getChromatogram(i).getDataProcessing()[j]->metaValueExists("cached_data"))
          {
            meta_exp.getChromatogram(i).getDataProcessing()[j]->removeMetaValue("cached_data");
          }
        }
      }
      }

      if (meta_exp.size() != exp_reading.size())
      {
        std::cerr << " Both experiments need to have the same size!";
      }

      for (Size i=0; i<exp_reading.size(); ++i)
      {
        for (Size j = 0; j < exp_reading[i].size(); j++)
        {
          meta_exp[i].push_back(exp_reading[i][j]);
        }
      }
      std::vector<MSChromatogram<ChromatogramPeak> > chromatograms = exp_reading.getChromatograms();
      std::vector<MSChromatogram<ChromatogramPeak> > old_chromatograms = meta_exp.getChromatograms();
      for (Size i=0; i<chromatograms.size(); ++i)
      {
        for (Size j = 0; j < chromatograms[i].size(); j++)
        {
          old_chromatograms[i].push_back(chromatograms[i][j]);
        }
      }
      meta_exp.setChromatograms(old_chromatograms);


      f.store(out_meta,meta_exp);
    }

    return EXECUTION_OK;
  }
  ExitCodes main_(int , const char**)
  {
    String in = getStringOption_("in");
    String out_meta = getStringOption_("out");
    String in_cached = in + ".cached";
    String out_cached = out_meta + ".cached";
    bool convert_back =  getFlag_("convert_back");

    if (!convert_back)
    {
      MapType exp;
      CachedmzML cacher;
      MzMLFile f;

      cacher.setLogType(log_type_);
      f.setLogType(log_type_);

      f.load(in,exp);
      cacher.writeMemdump(exp, out_cached);

      DataProcessing dp;
      std::set<DataProcessing::ProcessingAction> actions;
      actions.insert(DataProcessing::FORMAT_CONVERSION);
      dp.setProcessingActions(actions);
      dp.setMetaValue("cached_data", "true");
      for (Size i=0; i<exp.size(); ++i)
      {
        exp[i].getDataProcessing().push_back(dp);
      }
      std::vector<MSChromatogram<ChromatogramPeak> > chromatograms = exp.getChromatograms();
      for (Size i=0; i<chromatograms.size(); ++i)
      {
        chromatograms[i].getDataProcessing().push_back(dp);
      }
      exp.setChromatograms(chromatograms);
      cacher.writeMetadata(exp, out_meta);
    }
    else
    {
      MzMLFile f;
      MapType meta_exp;
      CachedmzML cacher;
      MapType exp_reading;

      cacher.setLogType(log_type_);
      f.setLogType(log_type_);

      f.load(in,meta_exp);
      cacher.readMemdump(exp_reading, in_cached);

      std::cout << " read back, got " << exp_reading.size() << " spectra " << exp_reading.getChromatograms().size() << " chromats " << std::endl;

      {
      for (Size i=0; i<meta_exp.size(); ++i)
      {
        for (Size j = 0; j < meta_exp[i].getDataProcessing().size(); j++)
        {
          DataProcessing& dp = meta_exp[i].getDataProcessing()[j];
          if (dp.metaValueExists("cached_data"))
          {
            dp.removeMetaValue("cached_data");
          }
        }
      }

      std::vector<MSChromatogram<ChromatogramPeak> > chromatograms = meta_exp.getChromatograms();
      for (Size i=0; i<chromatograms.size(); ++i)
      {
        for (Size j = 0; j < chromatograms[i].getDataProcessing().size(); j++)
        {
          DataProcessing& dp = chromatograms[i].getDataProcessing()[j];
          if (dp.metaValueExists("cached_data"))
          {
            dp.removeMetaValue("cached_data");
          }
        }
      }
      }

      if (meta_exp.size() != exp_reading.size())
      {
        std::cerr << " Both experiments need to have the same size!";
      }

      for (Size i=0; i<exp_reading.size(); ++i)
      {
        for (Size j = 0; j < exp_reading[i].size(); j++)
        {
          meta_exp[i].push_back(exp_reading[i][j]);
        }
      }
      std::vector<MSChromatogram<ChromatogramPeak> > chromatograms = exp_reading.getChromatograms();
      std::vector<MSChromatogram<ChromatogramPeak> > old_chromatograms = meta_exp.getChromatograms();
      for (Size i=0; i<chromatograms.size(); ++i)
      {
        for (Size j = 0; j < chromatograms[i].size(); j++)
        {
          old_chromatograms[i].push_back(chromatograms[i][j]);
        }
      }
      meta_exp.setChromatograms(old_chromatograms);


      f.store(out_meta,meta_exp);
    }

    return EXECUTION_OK;
  }
Beispiel #4
0
  ExitCodes main_(int, const char**) override
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------

    //input file names
    String in = getStringOption_("in");
    String read_method = getStringOption_("read_method");
    bool load_data = getStringOption_("loadData") == "true";

    if (read_method == "streaming")
    {
      std::cout << "Read method: streaming" << std::endl;

      // Create the consumer, set output file name, transform
      TICConsumer consumer;
      MzMLFile mzml;
      mzml.setLogType(log_type_);

      PeakFileOptions opt = mzml.getOptions();
      opt.setFillData(load_data); // whether to actually load any data
      opt.setSkipXMLChecks(true); // save time by not checking base64 strings for whitespaces 
      opt.setMaxDataPoolSize(100);
      opt.setAlwaysAppendData(false);
      mzml.setOptions(opt);
      mzml.transform(in, &consumer, true, true);

      std::cout << "There are " << consumer.nr_spectra << " spectra and " << consumer.nr_peaks << " peaks in the input file." << std::endl;
      std::cout << "The total ion current is " << consumer.TIC << std::endl;
      size_t after;
      SysInfo::getProcessMemoryConsumption(after);
      std::cout << " Memory consumption after " << after << std::endl;
    }
    else if (read_method == "regular")
    {
      std::cout << "Read method: regular" << std::endl;

      MzMLFile mzml;
      mzml.setLogType(log_type_);
      PeakFileOptions opt = mzml.getOptions();
      opt.setFillData(load_data); // whether to actually load any data
      opt.setSkipXMLChecks(true); // save time by not checking base64 strings for whitespaces 
      mzml.setOptions(opt);
      PeakMap map;
      mzml.load(in, map);
      double TIC = 0.0;
      long int nr_peaks = 0;
      for (Size i =0; i < map.size(); i++)
      {
        nr_peaks += map[i].size();
        for (Size j = 0; j < map[i].size(); j++)
        {
          TIC += map[i][j].getIntensity();
        }
      }

      std::cout << "There are " << map.size() << " spectra and " << nr_peaks << " peaks in the input file." << std::endl;
      std::cout << "The total ion current is " << TIC << std::endl;
      size_t after;
      SysInfo::getProcessMemoryConsumption(after);
      std::cout << " Memory consumption after " << after << std::endl;
    }
    else if (read_method == "indexed")
    {
      std::cout << "Read method: indexed" << std::endl;
      
      IndexedMzMLFileLoader imzml;
      // load data from an indexed MzML file
      OnDiscPeakMap map;
      imzml.load(in, map);
      double TIC = 0.0;
      long int nr_peaks = 0;
      if (load_data)
      {
        for (Size i =0; i < map.getNrSpectra(); i++)
        {
          OpenMS::Interfaces::SpectrumPtr sptr = map.getSpectrumById(i);

          nr_peaks += sptr->getIntensityArray()->data.size();
          TIC += std::accumulate(sptr->getIntensityArray()->data.begin(), sptr->getIntensityArray()->data.end(), 0.0);
        }
      }

      std::cout << "There are " << map.getNrSpectra() << " spectra and " << nr_peaks << " peaks in the input file." << std::endl;
      std::cout << "The total ion current is " << TIC << std::endl;
      size_t after;
      SysInfo::getProcessMemoryConsumption(after);
      std::cout << " Memory consumption after " << after << std::endl;
    }
    else if (read_method == "indexed_parallel")
    {
      std::cout << "Read method: indexed (parallel)" << std::endl;
      
      IndexedMzMLFileLoader imzml;
      PeakFileOptions opt = imzml.getOptions();
      opt.setFillData(load_data); // whether to actually load any data
      imzml.setOptions(opt);

      // load data from an indexed MzML file
      OnDiscPeakMap map;
      map.openFile(in, true);
      map.setSkipXMLChecks(true);

      double TIC = 0.0;
      long int nr_peaks = 0;

      if (load_data)
      {

        // firstprivate means that each thread has its own instance of the
        // variable, each copy initialized with the initial value 
#ifdef _OPENMP
#pragma omp parallel for firstprivate(map) 
#endif
        for (SignedSize i =0; i < (SignedSize)map.getNrSpectra(); i++)
        {
          OpenMS::Interfaces::SpectrumPtr sptr = map.getSpectrumById(i);
          double nr_peaks_l = sptr->getIntensityArray()->data.size();
          double TIC_l = std::accumulate(sptr->getIntensityArray()->data.begin(), sptr->getIntensityArray()->data.end(), 0.0);
#ifdef _OPENMP
#pragma omp critical (indexed)
#endif
          {
            TIC += TIC_l;
            nr_peaks += nr_peaks_l;
          }
        }

      }

      std::cout << "There are " << map.getNrSpectra() << " spectra and " << nr_peaks << " peaks in the input file." << std::endl;
      std::cout << "The total ion current is " << TIC << std::endl;
      size_t after;
      SysInfo::getProcessMemoryConsumption(after);
      std::cout << " Memory consumption after " << after << std::endl;
    }
    else if (read_method == "cached")
    {
      std::cout << "Read method: cached" << std::endl;

      // Special handling of cached mzML as input types: 
      // we expect two paired input files which we should read into exp
      std::vector<String> split_out;
      in.split(".cachedMzML", split_out);
      if (split_out.size() != 2)
      {
        LOG_ERROR << "Cannot deduce base path from input '" << in << 
          "' (note that '.cachedMzML' should only occur once as the final ending)" << std::endl;
        return ILLEGAL_PARAMETERS;
      }
      String in_meta = split_out[0] + ".mzML";

      MzMLFile f;
      f.setLogType(log_type_);
      CachedmzML cacher;
      cacher.setLogType(log_type_);

      CachedmzML cache;
      cache.createMemdumpIndex(in);
      const std::vector<std::streampos> spectra_index = cache.getSpectraIndex();

      std::ifstream ifs_;
      ifs_.open(in.c_str(), std::ios::binary);

      double TIC = 0.0;
      long int nr_peaks = 0;
      for (Size i=0; i < spectra_index.size(); ++i)
      {

        BinaryDataArrayPtr mz_array(new BinaryDataArray);
        BinaryDataArrayPtr intensity_array(new BinaryDataArray);
        int ms_level = -1;
        double rt = -1.0;
        ifs_.seekg(spectra_index[i]);
        CachedmzML::readSpectrumFast(mz_array, intensity_array, ifs_, ms_level, rt);

        nr_peaks += intensity_array->data.size();
        for (Size j = 0; j < intensity_array->data.size(); j++)
        {
          TIC += intensity_array->data[j];
        }
      }

      std::cout << "There are " << spectra_index.size() << " spectra and " << nr_peaks << " peaks in the input file." << std::endl;
      std::cout << "The total ion current is " << TIC << std::endl;
      size_t after;
      SysInfo::getProcessMemoryConsumption(after);
      std::cout << " Memory consumption after " << after << std::endl;
    }
    else if (read_method == "cached_parallel")
    {
      std::cout << "Read method: cached parallel" << std::endl;

      // Special handling of cached mzML as input types: 
      // we expect two paired input files which we should read into exp
      std::vector<String> split_out;
      in.split(".cachedMzML", split_out);
      if (split_out.size() != 2)
      {
        LOG_ERROR << "Cannot deduce base path from input '" << in << 
          "' (note that '.cachedMzML' should only occur once as the final ending)" << std::endl;
        return ILLEGAL_PARAMETERS;
      }
      String in_meta = split_out[0] + ".mzML";

      MzMLFile f;
      f.setLogType(log_type_);
      CachedmzML cacher;
      cacher.setLogType(log_type_);

      CachedmzML cache;
      cache.createMemdumpIndex(in);
      const std::vector<std::streampos> spectra_index = cache.getSpectraIndex();

      FileAbstraction filestream(in);

      double TIC = 0.0;
      long int nr_peaks = 0;
#ifdef _OPENMP
#pragma omp parallel for firstprivate(filestream) 
#endif
      for (SignedSize i=0; i < (SignedSize)spectra_index.size(); ++i)
      {
        BinaryDataArrayPtr mz_array(new BinaryDataArray);
        BinaryDataArrayPtr intensity_array(new BinaryDataArray);
        int ms_level = -1;
        double rt = -1.0;
        // we only change the position of the thread-local filestream
        filestream.getStream().seekg(spectra_index[i]);
        CachedmzML::readSpectrumFast(mz_array, intensity_array, filestream.getStream(), ms_level, rt);

        double nr_peaks_l = intensity_array->data.size();
        double TIC_l = std::accumulate(intensity_array->data.begin(), intensity_array->data.end(), 0.0);
#ifdef _OPENMP
#pragma omp critical (indexed)
#endif
        {
          TIC += TIC_l;
          nr_peaks += nr_peaks_l;
        }
      }

      std::cout << "There are " << spectra_index.size() << " spectra and " << nr_peaks << " peaks in the input file." << std::endl;
      std::cout << "The total ion current is " << TIC << std::endl;
      size_t after;
      SysInfo::getProcessMemoryConsumption(after);
      std::cout << " Memory consumption after " << after << std::endl;
    }

    return EXECUTION_OK;
  }