#include <OpenMS/KERNEL/FeatureMap.h> #include <OpenMS/KERNEL/MSSpectrum.h> #include <OpenMS/KERNEL/MSExperiment.h> #include <OpenMS/KERNEL/RichPeak1D.h> START_TEST(FileHandler, "$Id$") ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// using namespace OpenMS; using namespace std; START_SECTION((static FileTypes::Type getTypeByFileName(const String &filename))) FileHandler tmp; TEST_EQUAL(tmp.getTypeByFileName("test.bla"), FileTypes::UNKNOWN) TEST_EQUAL(tmp.getTypeByFileName("test.dta"), FileTypes::DTA) TEST_EQUAL(tmp.getTypeByFileName("test.DTA2D"), FileTypes::DTA2D) TEST_EQUAL(tmp.getTypeByFileName("test.MzData"), FileTypes::MZDATA) TEST_EQUAL(tmp.getTypeByFileName("test.MZXML"), FileTypes::MZXML) TEST_EQUAL(tmp.getTypeByFileName("test.featureXML"), FileTypes::FEATUREXML) TEST_EQUAL(tmp.getTypeByFileName("test.idXML"), FileTypes::IDXML) TEST_EQUAL(tmp.getTypeByFileName("test.consensusXML"), FileTypes::CONSENSUSXML) TEST_EQUAL(tmp.getTypeByFileName("test.mGf"), FileTypes::MGF) TEST_EQUAL(tmp.getTypeByFileName("test.ini"), FileTypes::INI) TEST_EQUAL(tmp.getTypeByFileName("test.toPPas"), FileTypes::TOPPAS) TEST_EQUAL(tmp.getTypeByFileName("test.TraFoXML"), FileTypes::TRANSFORMATIONXML) TEST_EQUAL(tmp.getTypeByFileName("test.MzML"), FileTypes::MZML) TEST_EQUAL(tmp.getTypeByFileName(OPENMS_GET_TEST_DATA_PATH("MzMLFile_6_uncompressed.mzML.bz2")), FileTypes::MZML) TEST_EQUAL(tmp.getTypeByFileName(OPENMS_GET_TEST_DATA_PATH("MzMLFile_6_uncompressed.mzML.gz")), FileTypes::MZML) TEST_EQUAL(tmp.getTypeByFileName("test.mS2"), FileTypes::MS2)
/**
  @brief Digests all proteins of the input FASTA file in silico and writes the peptides.

  Every protein sequence is either kept as a whole (enzyme "none") or cleaved
  with trypsin. Peptides whose length is outside [min_length, max_length] are
  discarded and counted. The remaining peptides are written either as FASTA
  entries (carrying identifier and description of the parent protein) or as one
  PeptideIdentification per peptide into an idXML file.

  @return EXECUTION_OK on success, PARSE_ERROR if the output file type cannot be
          determined, ILLEGAL_PARAMETERS if the enzyme is neither "Trypsin" nor "none".
*/
ExitCodes main_(int, const char **)
{
  vector<ProteinIdentification> protein_runs;
  vector<PeptideIdentification> identifications;
  PeptideIdentification peptide_template;

  // the identifier of this run is built from the current time stamp
  DateTime date_time = DateTime::now();
  String date_time_string = date_time.get();
  peptide_template.setIdentifier("In-silico_digestion" + date_time_string);

  ProteinIdentification protein_identification; // NOTE(review): not referenced anywhere below
  protein_runs.push_back(ProteinIdentification());

  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  String input_path = getStringOption_("in");
  String output_path = getStringOption_("out");

  // output file type: explicit option first, file extension as fallback
  FileHandler fh;
  FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));
  if (out_type == FileTypes::UNKNOWN)
  {
    out_type = fh.getTypeByFileName(output_path);
    writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
    if (out_type == FileTypes::UNKNOWN)
    {
      LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
      return PARSE_ERROR;
    }
  }

  Size min_size = getIntOption_("min_length");
  Size max_size = getIntOption_("max_length");
  Size missed_cleavages = getIntOption_("missed_cleavages");

  const bool write_fasta = (out_type == FileTypes::FASTA);

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  std::vector<FASTAFile::FASTAEntry> proteins;
  FASTAFile().load(input_path, proteins);

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // This should be updated if more cleavage enzymes are available
  ProteinIdentification::SearchParameters search_parameters;
  String enzyme = getStringOption_("enzyme");
  const bool no_cleavage = (enzyme == "none");
  EnzymaticDigestion digestor;
  if (enzyme == "Trypsin")
  {
    digestor.setEnzyme(EnzymaticDigestion::ENZYME_TRYPSIN);
    digestor.setMissedCleavages(missed_cleavages);
    search_parameters.enzyme = ProteinIdentification::TRYPSIN;
  }
  else if (no_cleavage)
  {
    search_parameters.enzyme = ProteinIdentification::NO_ENZYME;
  }
  else
  {
    LOG_ERROR << "Internal error in Digestor, when evaluating enzyme name! Please report this!" << std::endl;
    return ILLEGAL_PARAMETERS;
  }

  vector<String> accession_holder(1);
  PeptideHit peptide_hit;

  ProteinIdentification& run = protein_runs[0];
  run.setSearchParameters(search_parameters);
  run.setDateTime(date_time);
  run.setSearchEngine("In-silico digestion");
  run.setIdentifier("In-silico_digestion" + date_time_string);

  std::vector<FASTAFile::FASTAEntry> all_peptides;
  Size dropped_bylength(0); // stats for removing candidates

  for (std::vector<FASTAFile::FASTAEntry>::const_iterator protein = proteins.begin(); protein != proteins.end(); ++protein)
  {
    if (!write_fasta)
    {
      // register the parent protein and link subsequent peptide hits to it
      accession_holder[0] = protein->identifier;
      ProteinHit protein_hit;
      protein_hit.setSequence(protein->sequence);
      protein_hit.setAccession(accession_holder[0]);
      run.insertHit(protein_hit);
      peptide_hit.setProteinAccessions(accession_holder);
    }

    vector<AASequence> peptides;
    if (no_cleavage)
    {
      peptides.push_back(AASequence(protein->sequence));
    }
    else
    {
      digestor.digest(AASequence(protein->sequence), peptides);
    }

    for (vector<AASequence>::const_iterator peptide = peptides.begin(); peptide != peptides.end(); ++peptide)
    {
      // length filter
      if ((peptide->size() < min_size) || (peptide->size() > max_size))
      {
        ++dropped_bylength;
        continue;
      }

      if (write_fasta)
      {
        all_peptides.push_back(FASTAFile::FASTAEntry(protein->identifier, protein->description, peptide->toString()));
      }
      else
      {
        // one PeptideIdentification (holding a single hit) per peptide
        peptide_hit.setSequence(*peptide);
        peptide_template.insertHit(peptide_hit);
        identifications.push_back(peptide_template);
        peptide_template.setHits(std::vector<PeptideHit>()); // clear
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  Size pep_remaining_count = 0;
  if (write_fasta)
  {
    FASTAFile().store(output_path, all_peptides);
    pep_remaining_count = all_peptides.size();
  }
  else
  {
    IdXMLFile().store(output_path, protein_runs, identifications);
    pep_remaining_count = identifications.size();
  }

  LOG_INFO << "Statistics:\n"
           << " total #peptides after digestion: " << pep_remaining_count + dropped_bylength << "\n"
           << " removed #peptides (length restrictions): " << dropped_bylength << "\n"
           << " remaining #peptides: " << pep_remaining_count << std::endl;

  return EXECUTION_OK;
}
/**
  @brief Streams proteins from a FASTA file, digests them and writes the peptides.

  Proteins are read one at a time. For FASTA output each peptide is written
  immediately; its identifier is the parent identifier, a running number, or
  both (option "FASTA:ID"), and the parent description is kept only if
  "FASTA:description" is "keep". For any other output type the peptides are
  collected and stored as idXML at the end.

  @return EXECUTION_OK on success, PARSE_ERROR if the output file type cannot be determined.
*/
ExitCodes main_(int, const char**) override
{
  vector<ProteinIdentification> protein_runs;
  vector<PeptideIdentification> identifications;
  PeptideIdentification peptide_template;

  // the identifier of this run is built from the current time stamp
  DateTime date_time = DateTime::now();
  String date_time_string = date_time.get();
  peptide_template.setIdentifier("In-silico_digestion" + date_time_string);

  ProteinIdentification protein_identification; // NOTE(review): not referenced anywhere below
  protein_runs.push_back(ProteinIdentification());

  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  String input_path = getStringOption_("in");
  String output_path = getStringOption_("out");

  // anything other than "parent" or "number" selects the combined identifier
  const String fasta_id_mode = getStringOption_("FASTA:ID");
  FASTAID FASTA_ID = BOTH;
  if (fasta_id_mode == "parent")
  {
    FASTA_ID = PARENT;
  }
  else if (fasta_id_mode == "number")
  {
    FASTA_ID = NUMBER;
  }
  const bool keep_FASTA_desc = (getStringOption_("FASTA:description") == "keep");

  // output file type: explicit option first, file extension as fallback
  FileHandler fh;
  FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));
  if (out_type == FileTypes::UNKNOWN)
  {
    out_type = fh.getTypeByFileName(output_path);
    writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
    if (out_type == FileTypes::UNKNOWN)
    {
      LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
      return PARSE_ERROR;
    }
  }

  Size min_size = getIntOption_("min_length");
  Size max_size = getIntOption_("max_length");
  Size missed_cleavages = getIntOption_("missed_cleavages");

  const bool write_fasta = (out_type == FileTypes::FASTA);

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  FASTAFile ff;
  ff.readStart(input_path);
  if (write_fasta)
  {
    ff.writeStart(output_path);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // This should be updated if more cleavage enzymes are available
  ProteinIdentification::SearchParameters search_parameters;
  String enzyme = getStringOption_("enzyme");
  ProteaseDigestion digestor;
  digestor.setEnzyme(enzyme);
  digestor.setMissedCleavages(missed_cleavages);
  search_parameters.digestion_enzyme = *ProteaseDB::getInstance()->getEnzyme(enzyme);

  PeptideHit peptide_hit;
  PeptideEvidence evidence;

  ProteinIdentification& run = protein_runs[0];
  run.setSearchParameters(search_parameters);
  run.setDateTime(date_time);
  run.setSearchEngine("In-silico digestion");
  run.setIdentifier("In-silico_digestion" + date_time_string);

  Size dropped_by_length(0); // stats for removing candidates
  Size fasta_out_count(0);

  FASTAFile::FASTAEntry fe;
  while (ff.readNext(fe))
  {
    if (!write_fasta)
    {
      // register the parent protein and link subsequent peptide hits to it
      ProteinHit protein_hit;
      protein_hit.setSequence(fe.sequence);
      protein_hit.setAccession(fe.identifier);
      run.insertHit(protein_hit);
      evidence.setProteinAccession(fe.identifier);
      peptide_hit.setPeptideEvidences(vector<PeptideEvidence>(1, evidence));
    }

    vector<AASequence> current_digest;
    if (enzyme == "none")
    {
      current_digest.push_back(AASequence::fromString(fe.sequence));
    }
    else
    {
      // the return value of digest() is accumulated as the number of peptides dropped by length
      dropped_by_length += digestor.digest(AASequence::fromString(fe.sequence), current_digest, min_size, max_size);
    }

    for (auto const& peptide : current_digest)
    {
      if (write_fasta)
      {
        ++fasta_out_count;
        String id = fe.identifier; // PARENT
        if (FASTA_ID == NUMBER)
        {
          id = String(fasta_out_count);
        }
        else if (FASTA_ID == BOTH)
        {
          id = fe.identifier + "_" + String(fasta_out_count);
        }
        ff.writeNext(FASTAFile::FASTAEntry(id, keep_FASTA_desc ? fe.description : "", peptide.toString()));
      }
      else
      {
        // one PeptideIdentification (holding a single hit) per peptide
        peptide_hit.setSequence(peptide);
        peptide_template.insertHit(peptide_hit);
        identifications.push_back(peptide_template);
        peptide_template.setHits(std::vector<PeptideHit>()); // clear
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  Size pep_remaining_count = 0;
  if (write_fasta)
  {
    ff.writeEnd();
    pep_remaining_count = fasta_out_count;
  }
  else
  {
    IdXMLFile().store(output_path, protein_runs, identifications);
    pep_remaining_count = identifications.size();
  }

  LOG_INFO << "Statistics:\n"
           << " file: " << input_path << "\n"
           << " total #peptides after digestion: " << pep_remaining_count + dropped_by_length << "\n"
           << " removed #peptides (length restrictions): " << dropped_by_length << "\n"
           << " remaining #peptides: " << pep_remaining_count << std::endl;

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // general variables and data //------------------------------------------------------------- FileHandler fh; vector<PeptideIdentification> peptide_identifications; vector<ProteinIdentification> protein_identifications; //------------------------------------------------------------- // reading input //------------------------------------------------------------- const String in = getStringOption_("in"); ProgressLogger logger; logger.setLogType(ProgressLogger::CMD); logger.startProgress(0, 1, "Loading..."); if (File::isDirectory(in)) { const String in_directory = File::absolutePath(in).ensureLastChar('/'); const String mz_file = getStringOption_("mz_file"); const bool ignore_proteins_per_peptide = getFlag_("ignore_proteins_per_peptide"); UInt i = 0; FileHandler fh; FileTypes::Type type; MSExperiment<Peak1D> msexperiment; // Note: we had issues with leading zeroes, so let us represent scan numbers as Int (next line used to be map<String, float> num_and_rt;) However, now String::toInt() might throw. map<Int, float> num_and_rt; vector<String> NativeID; // The mz-File (if given) if (!mz_file.empty()) { type = fh.getTypeByFileName(mz_file); fh.loadExperiment(mz_file, msexperiment, type); for (MSExperiment<Peak1D>::Iterator spectra_it = msexperiment.begin(); spectra_it != msexperiment.end(); ++spectra_it) { String(spectra_it->getNativeID()).split('=', NativeID); try { num_and_rt[NativeID[1].toInt()] = spectra_it->getRT(); // cout << "num_and_rt: " << NativeID[1] << " = " << NativeID[1].toInt() << " : " << num_and_rt[NativeID[1].toInt()] << endl; // CG debuggging 2009-07-01 } catch (Exception::ConversionError& e) { writeLog_(String("Error: Cannot read scan number as integer. 
'") + e.getMessage()); } } } // Get list of the actual Sequest .out-Files StringList in_files; if (!File::fileList(in_directory, String("*.out"), in_files)) { writeLog_(String("Error: No .out files found in '") + in_directory + "'. Aborting!"); } // Now get to work ... for (vector<String>::const_iterator in_files_it = in_files.begin(); in_files_it != in_files.end(); ++in_files_it) { vector<PeptideIdentification> peptide_ids_seq; ProteinIdentification protein_id_seq; vector<double> pvalues_seq; vector<String> in_file_vec; SequestOutfile sequest_outfile; writeDebug_(String("Reading file ") + *in_files_it, 3); try { sequest_outfile.load((String) (in_directory + *in_files_it), peptide_ids_seq, protein_id_seq, 1.0, pvalues_seq, "Sequest", ignore_proteins_per_peptide); in_files_it->split('.', in_file_vec); for (Size j = 0; j < peptide_ids_seq.size(); ++j) { // We have to explicitly set the identifiers, because the normal set ones are composed of search engine name and date, which is the same for a bunch of sequest out-files. peptide_ids_seq[j].setIdentifier(*in_files_it + "_" + i); Int scan_number = 0; if (!mz_file.empty()) { try { scan_number = in_file_vec[2].toInt(); peptide_ids_seq[j].setRT(num_and_rt[scan_number]); } catch (Exception::ConversionError& e) { writeLog_(String("Error: Cannot read scan number as integer. '") + e.getMessage()); } catch (exception& e) { writeLog_(String("Error: Cannot read scan number as integer. '") + e.what()); } //double real_mz = ( peptide_ids_seq[j].getMZ() - hydrogen_mass )/ (double)peptide_ids_seq[j].getHits()[0].getCharge(); // ???? 
semantics of mz const double real_mz = peptide_ids_seq[j].getMZ() / (double) peptide_ids_seq[j].getHits()[0].getCharge(); peptide_ids_seq[j].setMZ(real_mz); } writeDebug_(String("scan: ") + String(scan_number) + String(" RT: ") + String(peptide_ids_seq[j].getRT()) + " MZ: " + String(peptide_ids_seq[j].getMZ()) + " Ident: " + peptide_ids_seq[j].getIdentifier(), 4); peptide_identifications.push_back(peptide_ids_seq[j]); } protein_id_seq.setIdentifier(*in_files_it + "_" + i); protein_identifications.push_back(protein_id_seq); ++i; } catch (Exception::ParseError& pe) { writeLog_(pe.getMessage() + String("(file: ") + *in_files_it + ")"); throw; } catch (...) { writeLog_(String("Error reading file: ") + *in_files_it); throw; } } writeDebug_("All files processed.", 3); } // ! directory else { FileTypes::Type in_type = fh.getType(in); if (in_type == FileTypes::PEPXML) { String exp_name = getStringOption_("mz_file"); String orig_name = getStringOption_("mz_name"); bool use_precursor_data = getFlag_("use_precursor_data"); if (exp_name.empty()) { PepXMLFile().load(in, protein_identifications, peptide_identifications, orig_name); } else { MSExperiment<> exp; fh.loadExperiment(exp_name, exp); if (!orig_name.empty()) { exp_name = orig_name; } PepXMLFile().load(in, protein_identifications, peptide_identifications, exp_name, exp, use_precursor_data); } } else if (in_type == FileTypes::IDXML) { IdXMLFile().load(in, protein_identifications, peptide_identifications); } else if (in_type == FileTypes::MZIDENTML) { LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." 
<< endl; MzIdentMLFile().load(in, protein_identifications, peptide_identifications); } else if (in_type == FileTypes::PROTXML) { protein_identifications.resize(1); peptide_identifications.resize(1); ProtXMLFile().load(in, protein_identifications[0], peptide_identifications[0]); } else if (in_type == FileTypes::OMSSAXML) { protein_identifications.resize(1); OMSSAXMLFile().load(in, protein_identifications[0], peptide_identifications, true); } else if (in_type == FileTypes::MASCOTXML) { String scan_regex = getStringOption_("scan_regex"); String exp_name = getStringOption_("mz_file"); MascotXMLFile::RTMapping rt_mapping; if (!exp_name.empty()) { PeakMap exp; // load only MS2 spectra: fh.getOptions().addMSLevel(2); fh.loadExperiment(exp_name, exp, FileTypes::MZML, log_type_); MascotXMLFile::generateRTMapping(exp.begin(), exp.end(), rt_mapping); } protein_identifications.resize(1); MascotXMLFile().load(in, protein_identifications[0], peptide_identifications, rt_mapping, scan_regex); } else if (in_type == FileTypes::XML) { ProteinIdentification protein_id; XTandemXMLFile().load(in, protein_id, peptide_identifications); protein_id.setSearchEngineVersion(""); protein_id.setSearchEngine("XTandem"); protein_identifications.push_back(protein_id); String exp_name = getStringOption_("mz_file"); if (!exp_name.empty()) { PeakMap exp; fh.getOptions().addMSLevel(2); fh.loadExperiment(exp_name, exp, FileTypes::MZML, log_type_); for (vector<PeptideIdentification>::iterator it = peptide_identifications.begin(); it != peptide_identifications.end(); ++it) { UInt id = (Int)it->getMetaValue("spectrum_id"); --id; // native IDs were written 1-based if (id < exp.size()) { it->setRT(exp[id].getRT()); double pre_mz(0.0); if (!exp[id].getPrecursors().empty()) pre_mz = exp[id].getPrecursors()[0].getMZ(); it->setMZ(pre_mz); it->removeMetaValue("spectrum_id"); } else { LOG_ERROR << "XTandem xml: Error: id '" << id << "' not found in peak map!" 
<< endl; } } } } else { writeLog_("Unknown input file type given. Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } } logger.endProgress(); //------------------------------------------------------------- // writing output //------------------------------------------------------------- const String out = getStringOption_("out"); FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type")); if (out_type == FileTypes::UNKNOWN) { out_type = fh.getTypeByFileName(out); } if (out_type == FileTypes::UNKNOWN) { writeLog_("Error: Could not determine output file type!"); return PARSE_ERROR; } logger.startProgress(0, 1, "Storing..."); if (out_type == FileTypes::PEPXML) { bool peptideprophet_analyzed = getFlag_("peptideprophet_analyzed"); String mz_file = getStringOption_("mz_file"); String mz_name = getStringOption_("mz_name"); PepXMLFile().store(out, protein_identifications, peptide_identifications, mz_file, mz_name, peptideprophet_analyzed); } else if (out_type == FileTypes::IDXML) { IdXMLFile().store(out, protein_identifications, peptide_identifications); } else if (out_type == FileTypes::MZIDENTML) { MzIdentMLFile().store(out, protein_identifications, peptide_identifications); } else if (out_type == FileTypes::FASTA) { Size count = 0; ofstream fasta(out.c_str(), ios::out); for (Size i = 0; i < peptide_identifications.size(); ++i) { for (Size l = 0; l < peptide_identifications[i].getHits().size(); ++l) { const PeptideHit& hit = peptide_identifications[i].getHits()[l]; fasta << ">" << hit.getSequence().toUnmodifiedString() << "|" << count++ << "|" << hit.getSequence().toString() << endl; String seq = hit.getSequence().toUnmodifiedString(); // FASTA files should have at most 60 characters of sequence info per line for (Size j = 0; j < seq.size(); j += 60) { Size k = min(j + 60, seq.size()); fasta << string(seq[j], seq[k]) << endl; } } } } else { writeLog_("Unsupported output file type given. 
Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } logger.endProgress(); return EXECUTION_OK; }
/**
  @brief Converts between mzML, sqMass and the cached-mzML representation.

  - sqMass input with mzML output (or the reverse) is converted directly.
  - Without the "convert_back" flag the input mzML is written as a binary dump
    ("<out>.cached") plus a metadata file ("<out>").
  - With "convert_back" the metadata file ("<in>") and the dump ("<in>.cached")
    are merged again and stored as mzML.

  @return EXECUTION_OK on success; PARSE_ERROR if the input or output type cannot
          be determined, or if metadata and cached data do not contain the same
          number of spectra / chromatograms when converting back.
*/
ExitCodes main_(int , const char**)
{
  String out_meta = getStringOption_("out");
  String out_cached = out_meta + ".cached";
  bool convert_back = getFlag_("convert_back");

  FileHandler fh;

  //input file type
  String in = getStringOption_("in");
  String in_cached = in + ".cached";
  FileTypes::Type in_type = FileTypes::nameToType(getStringOption_("in_type"));

  if (in_type == FileTypes::UNKNOWN)
  {
    in_type = fh.getType(in);
    writeDebug_(String("Input file type: ") + FileTypes::typeToName(in_type), 2);
  }

  if (in_type == FileTypes::UNKNOWN)
  {
    writeLog_("Error: Could not determine input file type!");
    return PARSE_ERROR;
  }

  //output file names and types
  String out = getStringOption_("out");
  FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));

  if (out_type == FileTypes::UNKNOWN)
  {
    out_type = fh.getTypeByFileName(out);
  }

  if (out_type == FileTypes::UNKNOWN)
  {
    writeLog_("Error: Could not determine output file type!");
    return PARSE_ERROR;
  }

  // direct conversions between sqMass and mzML
  if (in_type == FileTypes::SQMASS && out_type == FileTypes::MZML)
  {
    MapType exp;
    SqMassFile sqfile;
    MzMLFile f;
    sqfile.load(in, exp);
    f.store(out, exp);
    return EXECUTION_OK;
  }
  else if (in_type == FileTypes::MZML && out_type == FileTypes::SQMASS)
  {
    MzMLFile f;
    SqMassFile sqfile;
    MapType exp;
    f.load(in, exp);
    sqfile.store(out, exp);
    return EXECUTION_OK;
  }

  if (!convert_back)
  {
    MapType exp;
    CachedmzML cacher;
    MzMLFile f;

    cacher.setLogType(log_type_);
    f.setLogType(log_type_);

    f.load(in,exp);
    cacher.writeMemdump(exp, out_cached);
    cacher.writeMetadata(exp, out_meta, true);
  }
  else
  {
    MzMLFile f;
    MapType meta_exp;
    CachedmzML cacher;
    MapType exp_reading;

    cacher.setLogType(log_type_);
    f.setLogType(log_type_);

    f.load(in,meta_exp);
    cacher.readMemdump(exp_reading, in_cached);

    std::cout << " read back, got " << exp_reading.size() << " spectra " << exp_reading.getChromatograms().size() << " chromats " << std::endl;

    // the merged result holds the raw data again, so drop the "cached_data" markers
    for (Size i=0; i<meta_exp.size(); ++i)
    {
      for (Size j = 0; j < meta_exp[i].getDataProcessing().size(); j++)
      {
        if (meta_exp[i].getDataProcessing()[j]->metaValueExists("cached_data"))
        {
          meta_exp[i].getDataProcessing()[j]->removeMetaValue("cached_data");
        }
      }
    }

    for (Size i=0; i < meta_exp.getNrChromatograms(); ++i)
    {
      for (Size j = 0; j < meta_exp.getChromatogram(i).getDataProcessing().size(); j++)
      {
        if (meta_exp.getChromatogram(i).getDataProcessing()[j]->metaValueExists("cached_data"))
        {
          meta_exp.getChromatogram(i).getDataProcessing()[j]->removeMetaValue("cached_data");
        }
      }
    }

    // spectra are merged by index, so a size mismatch would lead to out-of-bounds access below
    if (meta_exp.size() != exp_reading.size())
    {
      std::cerr << " Both experiments need to have the same size!" << std::endl;
      return PARSE_ERROR;
    }

    for (Size i=0; i<exp_reading.size(); ++i)
    {
      for (Size j = 0; j < exp_reading[i].size(); j++)
      {
        meta_exp[i].push_back(exp_reading[i][j]);
      }
    }

    std::vector<MSChromatogram<ChromatogramPeak> > chromatograms = exp_reading.getChromatograms();
    std::vector<MSChromatogram<ChromatogramPeak> > old_chromatograms = meta_exp.getChromatograms();

    // chromatograms are merged by index as well
    if (chromatograms.size() != old_chromatograms.size())
    {
      std::cerr << " Both experiments need to have the same number of chromatograms!" << std::endl;
      return PARSE_ERROR;
    }

    for (Size i=0; i<chromatograms.size(); ++i)
    {
      for (Size j = 0; j < chromatograms[i].size(); j++)
      {
        old_chromatograms[i].push_back(chromatograms[i][j]);
      }
    }
    meta_exp.setChromatograms(old_chromatograms);

    f.store(out_meta,meta_exp);
  }

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- //input file names String in = getStringOption_("in"); //input file type FileHandler fh; FileTypes::Type in_type = FileTypes::nameToType(getStringOption_("in_type")); if (in_type == FileTypes::UNKNOWN) { in_type = fh.getType(in); writeDebug_(String("Input file type: ") + FileTypes::typeToName(in_type), 2); } if (in_type == FileTypes::UNKNOWN) { writeLog_("Error: Could not determine input file type!"); return PARSE_ERROR; } //output file names and types String out = getStringOption_("out"); FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type")); if (out_type == FileTypes::UNKNOWN) { out_type = fh.getTypeByFileName(out); } if (out_type == FileTypes::UNKNOWN) { writeLog_("Error: Could not determine output file type!"); return PARSE_ERROR; } bool TIC_DTA2D = getFlag_("TIC_DTA2D"); writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 1); //------------------------------------------------------------- // reading input //------------------------------------------------------------- typedef MSExperiment<Peak1D> MSExperimentType; MSExperimentType exp; typedef MSExperimentType::SpectrumType SpectrumType; typedef FeatureMap<> FeatureMapType; FeatureMapType fm; ConsensusMap cm; writeDebug_(String("Loading input file"), 1); if (in_type == FileTypes::CONSENSUSXML) { ConsensusXMLFile().load(in, cm); cm.sortByPosition(); if ((out_type != FileTypes::FEATUREXML) && (out_type != FileTypes::CONSENSUSXML)) { // You you will lose information and waste memory. Enough reasons to issue a warning! writeLog_("Warning: Converting consensus features to peaks. 
You will lose information!"); exp.set2DData(cm); } } else if (in_type == FileTypes::EDTA) { EDTAFile().load(in, cm); cm.sortByPosition(); if ((out_type != FileTypes::FEATUREXML) && (out_type != FileTypes::CONSENSUSXML)) { // You you will lose information and waste memory. Enough reasons to issue a warning! writeLog_("Warning: Converting consensus features to peaks. You will lose information!"); exp.set2DData(cm); } } else if (in_type == FileTypes::FEATUREXML || in_type == FileTypes::TSV || in_type == FileTypes::PEPLIST || in_type == FileTypes::KROENIK) { fh.loadFeatures(in, fm, in_type); fm.sortByPosition(); if ((out_type != FileTypes::FEATUREXML) && (out_type != FileTypes::CONSENSUSXML)) { // You will lose information and waste memory. Enough reasons to issue a warning! writeLog_("Warning: Converting features to peaks. You will lose information! Mass traces are added, if present as 'num_of_masstraces' and 'masstrace_intensity_<X>' (X>=0) meta values."); exp.set2DData<true>(fm); } } else { fh.loadExperiment(in, exp, in_type, log_type_); } //------------------------------------------------------------- // writing output //------------------------------------------------------------- writeDebug_(String("Writing output file"), 1); if (out_type == FileTypes::MZML) { //add data processing entry addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: CONVERSION_MZML)); MzMLFile f; f.setLogType(log_type_); ChromatogramTools().convertSpectraToChromatograms(exp, true); f.store(out, exp); } else if (out_type == FileTypes::MZDATA) { //annotate output with data processing info addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: CONVERSION_MZDATA)); MzDataFile f; f.setLogType(log_type_); ChromatogramTools().convertChromatogramsToSpectra<MSExperimentType>(exp); f.store(out, exp); } else if (out_type == FileTypes::MZXML) { //annotate output with data processing info addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: CONVERSION_MZXML)); MzXMLFile f; 
f.setLogType(log_type_); ChromatogramTools().convertChromatogramsToSpectra<MSExperimentType>(exp); f.store(out, exp); } else if (out_type == FileTypes::DTA2D) { //add data processing entry addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); DTA2DFile f; f.setLogType(log_type_); ChromatogramTools().convertChromatogramsToSpectra<MSExperimentType>(exp); if (TIC_DTA2D) { // store the total ion chromatogram (TIC) f.storeTIC(out, exp); } else { // store entire experiment f.store(out, exp); } } else if (out_type == FileTypes::MGF) { //add data processing entry addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); MascotGenericFile f; f.setLogType(log_type_); f.store(out, exp); } else if (out_type == FileTypes::FEATUREXML) { if ((in_type == FileTypes::FEATUREXML) || (in_type == FileTypes::TSV) || (in_type == FileTypes::PEPLIST) || (in_type == FileTypes::KROENIK)) { fm.applyMemberFunction(&UniqueIdInterface::setUniqueId); } else if (in_type == FileTypes::CONSENSUSXML || in_type == FileTypes::EDTA) { ConsensusMap::convert(cm, true, fm); } else // not loaded as feature map or consensus map { // The feature specific information is only defaulted. Enough reasons to issue a warning! 
writeLog_("Warning: Converting peaks to features will lead to incomplete features!"); fm.clear(); fm.reserve(exp.getSize()); typedef FeatureMapType::FeatureType FeatureType; FeatureType feature; feature.setQuality(0, 1); // override default feature.setQuality(1, 1); // override default feature.setOverallQuality(1); // override default for (MSExperimentType::ConstIterator spec_iter = exp.begin(); spec_iter != exp.end(); ++spec_iter ) { feature.setRT(spec_iter->getRT()); for (SpectrumType::ConstIterator peak1_iter = spec_iter->begin(); peak1_iter != spec_iter->end(); ++peak1_iter ) { feature.setMZ(peak1_iter->getMZ()); feature.setIntensity(peak1_iter->getIntensity()); feature.setUniqueId(); fm.push_back(feature); } } fm.updateRanges(); } addDataProcessing_(fm, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); FeatureXMLFile().store(out, fm); } else if (out_type == FileTypes::CONSENSUSXML) { if ((in_type == FileTypes::FEATUREXML) || (in_type == FileTypes::TSV) || (in_type == FileTypes::PEPLIST) || (in_type == FileTypes::KROENIK)) { fm.applyMemberFunction(&UniqueIdInterface::setUniqueId); ConsensusMap::convert(0, fm, cm); } // nothing to do for consensus input else if (in_type == FileTypes::CONSENSUSXML || in_type == FileTypes::EDTA) { } else // experimental data { ConsensusMap::convert(0, exp, cm, exp.size()); } addDataProcessing_(cm, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); ConsensusXMLFile().store(out, cm); } else if (out_type == FileTypes::EDTA) { if (fm.size() > 0 && cm.size() > 0) { LOG_ERROR << "Internal error: cannot decide on container (Consensus or Feature)! This is a bug. Please report it!"; return INTERNAL_ERROR; } if (fm.size() > 0) EDTAFile().store(out, fm); else if (cm.size() > 0) EDTAFile().store(out, cm); } else { writeLog_("Unknown output file type given. Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } return EXECUTION_OK; }
/**
  @brief Digests all proteins of the input FASTA file in silico and writes the peptides.

  Every protein is cleaved with the enzyme given by option "enzyme" (or kept as
  a whole for "none"). Digestion products whose length is outside
  [min_length, max_length] are discarded and counted; undigested sequences
  (enzyme "none") are not length-filtered. The remaining peptides are written
  either as FASTA entries (carrying identifier and description of the parent
  protein) or as one PeptideIdentification per peptide into an idXML file.

  @return EXECUTION_OK on success, PARSE_ERROR if the output file type cannot be determined.
*/
ExitCodes main_(int, const char**) override
{
  vector<ProteinIdentification> protein_identifications;
  vector<PeptideIdentification> identifications;
  PeptideIdentification peptide_identification;

  // the identifier of this run is built from the current time stamp
  DateTime date_time = DateTime::now();
  String date_time_string = date_time.get();
  peptide_identification.setIdentifier("In-silico_digestion" + date_time_string);

  protein_identifications.push_back(ProteinIdentification());

  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  String inputfile_name = getStringOption_("in");
  String outputfile_name = getStringOption_("out");

  // output file type: explicit option first, file extension as fallback
  FileHandler fh;
  FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));

  if (out_type == FileTypes::UNKNOWN)
  {
    out_type = fh.getTypeByFileName(outputfile_name);
    writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
  }

  if (out_type == FileTypes::UNKNOWN)
  {
    LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
    return PARSE_ERROR;
  }

  const Size min_size = getIntOption_("min_length");
  const Size max_size = getIntOption_("max_length");
  const Size missed_cleavages = getIntOption_("missed_cleavages");

  const bool has_FASTA_output = (out_type == FileTypes::FASTA);

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  std::vector<FASTAFile::FASTAEntry> protein_data;
  FASTAFile().load(inputfile_name, protein_data);

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // This should be updated if more cleavage enzymes are available
  ProteinIdentification::SearchParameters search_parameters;
  String enzyme = getStringOption_("enzyme");
  ProteaseDigestion digestor;
  digestor.setEnzyme(enzyme);
  digestor.setMissedCleavages(missed_cleavages);
  search_parameters.digestion_enzyme = *ProteaseDB::getInstance()->getEnzyme(enzyme);

  PeptideHit temp_peptide_hit;
  PeptideEvidence temp_pe;

  protein_identifications[0].setSearchParameters(search_parameters);
  protein_identifications[0].setDateTime(date_time);
  protein_identifications[0].setSearchEngine("In-silico digestion");
  protein_identifications[0].setIdentifier("In-silico_digestion" + date_time_string);

  std::vector<FASTAFile::FASTAEntry> all_peptides;

  Size dropped_bylength(0); // stats for removing candidates

  for (const FASTAFile::FASTAEntry& protein : protein_data)
  {
    if (!has_FASTA_output)
    {
      // register the parent protein and link subsequent peptide hits to it
      ProteinHit temp_protein_hit;
      temp_protein_hit.setSequence(protein.sequence);
      temp_protein_hit.setAccession(protein.identifier);
      protein_identifications[0].insertHit(temp_protein_hit);
      temp_pe.setProteinAccession(protein.identifier);
      temp_peptide_hit.setPeptideEvidences(vector<PeptideEvidence>(1, temp_pe));
    }

    vector<AASequence> temp_peptides;
    if (enzyme == "none")
    {
      temp_peptides.push_back(AASequence::fromString(protein.sequence));
    }
    else
    {
      vector<AASequence> current_digest;
      digestor.digest(AASequence::fromString(protein.sequence), current_digest);

      // keep peptides that match length restrictions (and count those that don't match)
      std::copy_if(current_digest.begin(), current_digest.end(), std::back_inserter(temp_peptides),
                   [&dropped_bylength, min_size, max_size](const AASequence& s) -> bool
                   {
                     const bool valid_length = (s.size() >= min_size && s.size() <= max_size);
                     if (!valid_length)
                     {
                       ++dropped_bylength;
                     }
                     return valid_length;
                   });
    }

    // iterate by const reference: copying every AASequence is not needed
    for (const auto& s : temp_peptides)
    {
      if (!has_FASTA_output)
      {
        // one PeptideIdentification (holding a single hit) per peptide
        temp_peptide_hit.setSequence(s);
        peptide_identification.insertHit(temp_peptide_hit);
        identifications.push_back(peptide_identification);
        peptide_identification.setHits(std::vector<PeptideHit>()); // clear
      }
      else // for FASTA file output
      {
        all_peptides.emplace_back(protein.identifier, protein.description, s.toString());
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------

  if (has_FASTA_output)
  {
    FASTAFile().store(outputfile_name, all_peptides);
  }
  else
  {
    IdXMLFile().store(outputfile_name, protein_identifications, identifications);
  }

  Size pep_remaining_count = (has_FASTA_output ? all_peptides.size() : identifications.size());
  LOG_INFO << "Statistics:\n"
           << " total #peptides after digestion: " << pep_remaining_count + dropped_bylength << "\n"
           << " removed #peptides (length restrictions): " << dropped_bylength << "\n"
           << " remaining #peptides: " << pep_remaining_count << std::endl;

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- //input file names String in = getStringOption_("in"); bool write_mzML_index = getFlag_("write_mzML_index"); //input file type FileHandler fh; FileTypes::Type in_type = FileTypes::nameToType(getStringOption_("in_type")); if (in_type == FileTypes::UNKNOWN) { in_type = fh.getType(in); writeDebug_(String("Input file type: ") + FileTypes::typeToName(in_type), 2); } if (in_type == FileTypes::UNKNOWN) { writeLog_("Error: Could not determine input file type!"); return PARSE_ERROR; } //output file names and types String out = getStringOption_("out"); FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type")); if (out_type == FileTypes::UNKNOWN) { out_type = fh.getTypeByFileName(out); } if (out_type == FileTypes::UNKNOWN) { writeLog_("Error: Could not determine output file type!"); return PARSE_ERROR; } bool TIC_DTA2D = getFlag_("TIC_DTA2D"); bool process_lowmemory = getFlag_("process_lowmemory"); writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 1); String uid_postprocessing = getStringOption_("UID_postprocessing"); //------------------------------------------------------------- // reading input //------------------------------------------------------------- typedef MSExperiment<Peak1D> MSExperimentType; MSExperimentType exp; typedef MSExperimentType::SpectrumType SpectrumType; typedef FeatureMap FeatureMapType; FeatureMapType fm; ConsensusMap cm; writeDebug_(String("Loading input file"), 1); if (in_type == FileTypes::CONSENSUSXML) { ConsensusXMLFile().load(in, cm); cm.sortByPosition(); if ((out_type != FileTypes::FEATUREXML) && (out_type != FileTypes::CONSENSUSXML)) { // You you will lose information and waste memory. Enough reasons to issue a warning! writeLog_("Warning: Converting consensus features to peaks. 
You will lose information!"); exp.set2DData(cm); } } else if (in_type == FileTypes::EDTA) { EDTAFile().load(in, cm); cm.sortByPosition(); if ((out_type != FileTypes::FEATUREXML) && (out_type != FileTypes::CONSENSUSXML)) { // You you will lose information and waste memory. Enough reasons to issue a warning! writeLog_("Warning: Converting consensus features to peaks. You will lose information!"); exp.set2DData(cm); } } else if (in_type == FileTypes::FEATUREXML || in_type == FileTypes::TSV || in_type == FileTypes::PEPLIST || in_type == FileTypes::KROENIK) { fh.loadFeatures(in, fm, in_type); fm.sortByPosition(); if ((out_type != FileTypes::FEATUREXML) && (out_type != FileTypes::CONSENSUSXML)) { // You will lose information and waste memory. Enough reasons to issue a warning! writeLog_("Warning: Converting features to peaks. You will lose information! Mass traces are added, if present as 'num_of_masstraces' and 'masstrace_intensity_<X>' (X>=0) meta values."); exp.set2DData<true>(fm); } } else if (process_lowmemory) { // Special switch for the low memory options: // We can transform the complete experiment directly without first // loading the complete data into memory. PlainMSDataWritingConsumer will // write out mzML to disk as they are read from the input. 
if (in_type == FileTypes::MZML && out_type == FileTypes::MZML) { PlainMSDataWritingConsumer consumer(out); consumer.getOptions().setWriteIndex(write_mzML_index); consumer.addDataProcessing(getProcessingInfo_(DataProcessing::CONVERSION_MZML)); MzMLFile mzmlfile; mzmlfile.setLogType(log_type_); mzmlfile.transform(in, &consumer); return EXECUTION_OK; } else if (in_type == FileTypes::MZXML && out_type == FileTypes::MZML) { PlainMSDataWritingConsumer consumer(out); consumer.getOptions().setWriteIndex(write_mzML_index); consumer.addDataProcessing(getProcessingInfo_(DataProcessing::CONVERSION_MZML)); MzXMLFile mzxmlfile; mzxmlfile.setLogType(log_type_); mzxmlfile.transform(in, &consumer); return EXECUTION_OK; } else { throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Process_lowmemory option can only be used with mzML / mzXML input and mzML output data types."); } } else { fh.loadExperiment(in, exp, in_type, log_type_); } //------------------------------------------------------------- // writing output //------------------------------------------------------------- writeDebug_(String("Writing output file"), 1); if (out_type == FileTypes::MZML) { //add data processing entry addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: CONVERSION_MZML)); MzMLFile f; f.setLogType(log_type_); f.getOptions().setWriteIndex(write_mzML_index); ChromatogramTools().convertSpectraToChromatograms(exp, true); f.store(out, exp); } else if (out_type == FileTypes::MZDATA) { //annotate output with data processing info addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: CONVERSION_MZDATA)); MzDataFile f; f.setLogType(log_type_); ChromatogramTools().convertChromatogramsToSpectra<MSExperimentType>(exp); f.store(out, exp); } else if (out_type == FileTypes::MZXML) { //annotate output with data processing info addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: CONVERSION_MZXML)); MzXMLFile f; f.setLogType(log_type_); 
ChromatogramTools().convertChromatogramsToSpectra<MSExperimentType>(exp); f.store(out, exp); } else if (out_type == FileTypes::DTA2D) { //add data processing entry addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); DTA2DFile f; f.setLogType(log_type_); ChromatogramTools().convertChromatogramsToSpectra<MSExperimentType>(exp); if (TIC_DTA2D) { // store the total ion chromatogram (TIC) f.storeTIC(out, exp); } else { // store entire experiment f.store(out, exp); } } else if (out_type == FileTypes::MGF) { //add data processing entry addDataProcessing_(exp, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); MascotGenericFile f; f.setLogType(log_type_); f.store(out, exp, getFlag_("MGF_compact")); } else if (out_type == FileTypes::FEATUREXML) { if ((in_type == FileTypes::FEATUREXML) || (in_type == FileTypes::TSV) || (in_type == FileTypes::PEPLIST) || (in_type == FileTypes::KROENIK)) { if (uid_postprocessing == "ensure") { fm.applyMemberFunction(&UniqueIdInterface::ensureUniqueId); } else if (uid_postprocessing == "reassign") { fm.applyMemberFunction(&UniqueIdInterface::setUniqueId); } } else if (in_type == FileTypes::CONSENSUSXML || in_type == FileTypes::EDTA) { MapConversion::convert(cm, true, fm); } else // not loaded as feature map or consensus map { // The feature specific information is only defaulted. Enough reasons to issue a warning! 
writeLog_("Warning: Converting peaks to features will lead to incomplete features!"); fm.clear(); fm.reserve(exp.getSize()); typedef FeatureMapType::FeatureType FeatureType; FeatureType feature; feature.setQuality(0, 1); // override default feature.setQuality(1, 1); // override default feature.setOverallQuality(1); // override default for (MSExperimentType::ConstIterator spec_iter = exp.begin(); spec_iter != exp.end(); ++spec_iter ) { feature.setRT(spec_iter->getRT()); for (SpectrumType::ConstIterator peak1_iter = spec_iter->begin(); peak1_iter != spec_iter->end(); ++peak1_iter ) { feature.setMZ(peak1_iter->getMZ()); feature.setIntensity(peak1_iter->getIntensity()); feature.setUniqueId(); fm.push_back(feature); } } fm.updateRanges(); } addDataProcessing_(fm, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); FeatureXMLFile().store(out, fm); } else if (out_type == FileTypes::CONSENSUSXML) { if ((in_type == FileTypes::FEATUREXML) || (in_type == FileTypes::TSV) || (in_type == FileTypes::PEPLIST) || (in_type == FileTypes::KROENIK)) { if (uid_postprocessing == "ensure") { fm.applyMemberFunction(&UniqueIdInterface::ensureUniqueId); } else if (uid_postprocessing == "reassign") { fm.applyMemberFunction(&UniqueIdInterface::setUniqueId); } MapConversion::convert(0, fm, cm); } // nothing to do for consensus input else if (in_type == FileTypes::CONSENSUSXML || in_type == FileTypes::EDTA) { } else // experimental data { MapConversion::convert(0, exp, cm, exp.size()); } addDataProcessing_(cm, getProcessingInfo_(DataProcessing:: FORMAT_CONVERSION)); ConsensusXMLFile().store(out, cm); } else if (out_type == FileTypes::EDTA) { if (fm.size() > 0 && cm.size() > 0) { LOG_ERROR << "Internal error: cannot decide on container (Consensus or Feature)! This is a bug. 
Please report it!"; return INTERNAL_ERROR; } if (fm.size() > 0) EDTAFile().store(out, fm); else if (cm.size() > 0) EDTAFile().store(out, cm); } else if (out_type == FileTypes::CSV) { // as ibspectra is currently the only csv/text based format we assume // that out_type == FileTypes::CSV means ibspectra, if more formats // are added we need a more intelligent strategy to decide which // conversion is requested // IBSpectra selected as output type if (in_type != FileTypes::CONSENSUSXML) { LOG_ERROR << "Incompatible input data: FileConverter can only convert consensusXML files to ibspectra format."; return INCOMPATIBLE_INPUT_DATA; } IBSpectraFile ibfile; ibfile.store(out, cm); } else { writeLog_("Unknown output file type given. Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } return EXECUTION_OK; }