/**
 * Generic tokenizer.
 * Removes an optional prefix and postfix from source, splits what is left at every
 * match of separator and lexically casts each token to TARGET.
 * If a cast fails, boost::bad_lexical_cast is thrown.
 * \param source the source string to be split up (taken by value, modified locally only)
 * \param separator regular expression to delimit the tokens; must not be empty (asserted)
 * (documented default: \\s+ -- no default is visible on this definition)
 * \param prefix regular expression for text to be removed from the string before it is split up
 * ("^" is added at the beginning if the expression does not start with it; an empty expression
 * disables the prefix removal)
 * \param postfix regular expression for text to be removed from the string before it is split up
 * ("$" is added at the end if the expression does not end with it; an empty expression
 * disables the postfix removal)
 * \returns a list of the casted tokens in the order they occur in source
 */
template<typename TARGET> std::list<TARGET> stringToList(
	std::string source, const boost::regex &separator, boost::regex prefix, boost::regex postfix )
{
	assert( ! separator.empty() );

	if ( ! prefix.empty() ) {
		// Anchor the expression to the start of the string unless the caller already did.
		const std::string prefixText = prefix.str();

		if ( prefixText[0] != '^' ) {
			prefix = boost::regex( std::string( "^" ) + prefixText, prefix.flags() );
		}

		// Only the first (i.e. the anchored) occurrence is stripped.
		source = boost::regex_replace( source, prefix, "", boost::format_first_only | boost::match_default );
	}

	if ( ! postfix.empty() ) {
		// Anchor the expression to the end of the string unless the caller already did.
		// NOTE(review): the index is derived from postfix.size(); confirm that this equals the
		// length of the pattern string for the Boost version in use.
		const std::string postfixText = postfix.str();

		if ( postfixText[postfix.size() - 1] != '$' ) {
			postfix = boost::regex( postfixText + "$", postfix.flags() );
		}

		source = boost::regex_replace( source, postfix, "", boost::format_first_only | boost::match_default );
	}

	// Sub-match index -1 selects the text between the separator matches.
	std::list<TARGET> tokens;
	const boost::sregex_token_iterator noMoreTokens;

	for ( boost::sregex_token_iterator tok = boost::make_regex_token_iterator( source, separator, -1 );
		  tok != noMoreTokens; ++tok ) {
		tokens.push_back( boost::lexical_cast<TARGET>( tok->str() ) );
	}

	return tokens;
}
int ChessEngineGnu::match(const string &str, boost::regex &re, vector<string> &matches) { boost::cmatch what; matches.clear(); GC_TRACE("match(): %s\n", re.str().c_str()); if( boost::regex_match(str.c_str(), what, re) ) { // what[0] is the whole string for(size_t i=1; i<what.size(); ++i) { GC_TRACE(" \"%s\"\n", what[i].str().c_str()); matches.push_back(what[i].str()); } } GC_TRACE(" %zu matches\n", matches.size()); return (int)matches.size(); }
/**
 * Reads all test case files found directly in testCaseDir into m_testCases.
 *
 * Entries whose extension is not ".xml" are skipped silently. The remaining file
 * names have to match "<3-4 digits>-<description>.xml"; the number becomes the index
 * into m_testCases (which is grown as needed). Names that do not match are reported
 * on std::wcerr and skipped.
 *
 * The process is terminated with exit(1) if two files carry the same test case
 * number, and with exit(2) if deserializing a file throws a std::exception.
 *
 * \param testCaseDir directory that is scanned (not recursively) for test case files
 */
TestCaseReader::TestCaseReader(const boost::filesystem::path & testCaseDir)
{
    namespace fs = boost::filesystem;

    const boost::regex expr("(\\d{3,4})-(.*)\\.xml");
    const fs::directory_iterator dirEnd;

    for (fs::directory_iterator entry(testCaseDir); entry != dirEnd; ++entry)
    {
        const fs::path path = entry->path();

        //ignore files that don't end in ".xml"
        if (path.extension() != ".xml")
        {
            continue;
        }

        // filename() returns a path in Boost.Filesystem v3 and a string before that.
#if defined (BOOST_FILESYSTEM_VERSION) && BOOST_FILESYSTEM_VERSION == 3
        const std::string filename = path.filename().string();
#else
        const std::string filename = path.filename();
#endif

        boost::smatch matchResults;
        if (!boost::regex_match(filename, matchResults, expr))
        {
            std::wcerr << "File '" << path.filename().c_str()
                       << "' did not match the pattern for test case files: '" << expr.str().c_str() << "'" << std::endl;
            continue;
        }

        //std::wcout << "Found testcase " << matchResults[1].str().c_str() << " brief description '" << matchResults[2].str().c_str()<< "'" << std::endl;

        // First capture group is the test case number, used as the slot index.
        const size_t tc = boost::lexical_cast<size_t>(matchResults[1]);

        if (tc >= m_testCases.size())
        {
            m_testCases.resize(tc + 1);
        }

        if (m_testCases[tc] != NULL)
        {
            std::wcerr << "There appears to be two test cases with number " << tc << std::endl;
            exit(1);
        }

        // Slurp the complete file; the stream is closed again before parsing starts.
        std::ostringstream xml;
        {
            fs::ifstream input(path);
            xml << input.rdbuf();
        }

        //std::wcout << "Read xml (" << xml.str().size() << " bytes) '" << xml.str().c_str() << "'" << std::endl;

        try
        {
            m_testCases[tc] = boost::dynamic_pointer_cast<DoseTest::Items::TestCase>
                (Safir::Dob::Typesystem::Serialization::ToObject
                    (Safir::Dob::Typesystem::Utilities::ToWstring(xml.str())));
        }
        catch (const std::exception & exc)
        {
            std::wcerr << "Failed to read file '" << path.string().c_str()
                       << "' due to exception with message" << std::endl
                       << exc.what() << std::endl;
            exit(2);
        }
    }
}
void write( const string& sourceFilepath, pwiz::identdata::IdentDataFile::Format outputFormat, const string& filenameSuffix, const string& searchEngineName, const string& searchEngineVersion, const string& searchEngineURI, const string& searchDatabase, boost::regex cleavageAgentRegex, const string& decoyPrefix, const RunTimeVariableMap& vars ) const { using namespace pwiz::identdata; namespace msdata = pwiz::msdata; namespace proteome = pwiz::proteome; IdentData mzid; mzid.id = sourceFilepath + " " + searchDatabase + " " + searchEngineName + " " + searchEngineVersion; mzid.creationDate = GetDateTime(); // add default CVs mzid.cvs = defaultCVList(); // add the SpectrumIdentificationProtocol SpectrumIdentificationProtocolPtr sipPtr(new SpectrumIdentificationProtocol("SIP")); mzid.analysisProtocolCollection.spectrumIdentificationProtocol.push_back(sipPtr); CVTranslator cvTranslator; CVID searchEngineCVID = cvTranslator.translate(searchEngineName); // add analysis software sipPtr->analysisSoftwarePtr.reset(new AnalysisSoftware("AS")); mzid.analysisSoftwareList.push_back(sipPtr->analysisSoftwarePtr); // set software name if (searchEngineCVID != CVID_Unknown) sipPtr->analysisSoftwarePtr->softwareName.set(searchEngineCVID); else sipPtr->analysisSoftwarePtr->softwareName.set(MS_custom_unreleased_software_tool, searchEngineName); // set version and URI sipPtr->analysisSoftwarePtr->version = searchEngineVersion; sipPtr->analysisSoftwarePtr->URI = searchEngineURI; // set search type sipPtr->searchType.cvid = MS_ms_ms_search; // add a mass table for all MS levels MassTablePtr massTable(new MassTable("MT")); massTable->msLevel.push_back(1); massTable->msLevel.push_back(2); massTable->msLevel.push_back(3); sipPtr->massTable.push_back(massTable); // specify amino acid masses used const char* residueSymbols = "ACDEFGHIKLMNPQRSTUVWY"; for (int i=0; i < 21; ++i) { const AminoAcid::Info::Record& record = AminoAcid::Info::record(residueSymbols[i]); ResiduePtr rp(new Residue); rp->code 
= record.symbol; rp->mass = record.residueFormula.monoisotopicMass(); massTable->residues.push_back(rp); } // add the SpectrumIdentificationList SpectrumIdentificationListPtr silPtr(new SpectrumIdentificationList("SIL")); mzid.dataCollection.analysisData.spectrumIdentificationList.push_back(silPtr); if (vars.count("SearchStats: Overall")) { string searchStats = vars.find("SearchStats: Overall")->second; silPtr->numSequencesSearched = lexical_cast<int>(searchStats.substr(0, searchStats.find_first_of(' '))); } // add the SpectrumIdentification SpectrumIdentificationPtr siPtr(new SpectrumIdentification("SI")); siPtr->spectrumIdentificationListPtr = silPtr; siPtr->spectrumIdentificationProtocolPtr = sipPtr; siPtr->activityDate = mzid.creationDate; mzid.analysisCollection.spectrumIdentification.push_back(siPtr); // add search database SearchDatabasePtr sdb(new SearchDatabase("SDB")); sdb->fileFormat.cvid = MS_FASTA_format; sdb->location = searchDatabase; sdb->name = bfs::path(searchDatabase).filename(); sdb->set(MS_database_type_amino_acid); sdb->databaseName.userParams.push_back(UserParam("database name", sdb->name, "xsd:string")); mzid.dataCollection.inputs.searchDatabase.push_back(sdb); mzid.analysisCollection.spectrumIdentification[0]->searchDatabase.push_back(sdb); // add source file SpectraDataPtr spectraData(new SpectraData("SD")); spectraData->location = sourceFilepath; spectraData->name = bfs::path(spectraData->location).filename(); mzid.dataCollection.inputs.spectraData.push_back(spectraData); mzid.analysisCollection.spectrumIdentification[0]->inputSpectra.push_back(spectraData); // set source file format (required for a semantically valid mzIdentML file) msdata::ReaderPtr readers(new msdata::FullReaderList); CVID sourceFileFormat = msdata::identifyFileFormat(readers, sourceFilepath); if (sourceFileFormat != CVID_Unknown) spectraData->fileFormat.cvid = sourceFileFormat; else if (outputFormat == IdentDataFile::Format_MzIdentML) throw 
runtime_error("[SearchSpectraList::write] unable to determine source file format of \"" + sourceFilepath + "\""); { msdata::MSDataFile msd(sourceFilepath, readers.get()); spectraData->spectrumIDFormat.cvid = msdata::id::getDefaultNativeIDFormat(msd); } // add the cleavage rules EnzymePtr enzyme(new Enzyme); enzyme->id = "ENZ_" + lexical_cast<string>(sipPtr->enzymes.enzymes.size()+1); enzyme->terminalSpecificity = (proteome::Digestion::Specificity) lexical_cast<int>(vars.find("Config: MinTerminiCleavages")->second); enzyme->nTermGain = "H"; enzyme->cTermGain = "OH"; enzyme->missedCleavages = lexical_cast<int>(vars.find("Config: MaxMissedCleavages")->second); enzyme->minDistance = 1; enzyme->siteRegexp = cleavageAgentRegex.str(); CVID cleavageAgent = proteome::Digestion::getCleavageAgentByRegex(enzyme->siteRegexp); if (cleavageAgent != CVID_Unknown) enzyme->enzymeName.set(cleavageAgent); sipPtr->enzymes.enzymes.push_back(enzyme); // use monoisotopic mass unless PrecursorMzToleranceRule forces average bool forceAverageMass = vars.find("Config: PrecursorMzToleranceRule")->second == "avg"; if (forceAverageMass) sipPtr->additionalSearchParams.set(MS_parent_mass_type_average); else sipPtr->additionalSearchParams.set(MS_parent_mass_type_mono); sipPtr->additionalSearchParams.set(MS_fragment_mass_type_mono); MZTolerance precursorMzTolerance; string precursorMassType = forceAverageMass ? "Avg" : "Mono"; parse(precursorMzTolerance, vars.find("Config: " + precursorMassType + "PrecursorMzTolerance")->second); sipPtr->parentTolerance.set(MS_search_tolerance_minus_value, precursorMzTolerance.value); sipPtr->parentTolerance.set(MS_search_tolerance_plus_value, precursorMzTolerance.value); sipPtr->parentTolerance.cvParams[0].units = sipPtr->parentTolerance.cvParams[1].units = precursorMzTolerance.units == MZTolerance::PPM ? 
UO_parts_per_million : UO_dalton; MZTolerance fragmentMzTolerance; parse(fragmentMzTolerance, vars.find("Config: FragmentMzTolerance")->second); sipPtr->fragmentTolerance.set(MS_search_tolerance_minus_value, fragmentMzTolerance.value); sipPtr->fragmentTolerance.set(MS_search_tolerance_plus_value, fragmentMzTolerance.value); sipPtr->fragmentTolerance.cvParams[0].units = sipPtr->fragmentTolerance.cvParams[1].units = fragmentMzTolerance.units == MZTolerance::PPM ? UO_parts_per_million : UO_dalton; sipPtr->threshold.set(MS_no_threshold); string fragmentationRule = vars.find("Config: FragmentationRule")->second; if (bal::icontains(fragmentationRule, "cid")) translateIonSeriesConsidered(*sipPtr, "b,y"); if (bal::icontains(fragmentationRule, "etd")) translateIonSeriesConsidered(*sipPtr, "c,z+1"); if (bal::icontains(fragmentationRule, "manual")) translateIonSeriesConsidered(*sipPtr, fragmentationRule.substr(7)); // skip "manual:" DynamicModSet dynamicMods( vars.find("Config: DynamicMods")->second ); BOOST_FOREACH(const DynamicMod& mod, dynamicMods) { SearchModificationPtr searchModification(new SearchModification); switch( mod.unmodChar ) { case PEPTIDE_N_TERMINUS_SYMBOL: searchModification->massDelta = mod.modMass; searchModification->fixedMod = false; searchModification->specificityRules.cvid = MS_modification_specificity_N_term; break; case PEPTIDE_C_TERMINUS_SYMBOL: searchModification->massDelta = mod.modMass; searchModification->fixedMod = false; searchModification->specificityRules.cvid = MS_modification_specificity_C_term; break; default: { string specificity; // either empty, n, or c, but not both if (mod.NTerminalFilters.size() == 1 && mod.NTerminalFilters[0].m_filter[PEPTIDE_N_TERMINUS_SYMBOL]) specificity += 'n'; else if (mod.CTerminalFilters.size() == 1 && mod.CTerminalFilters[0].m_filter[PEPTIDE_C_TERMINUS_SYMBOL]) specificity += 'c'; searchModification->massDelta = mod.modMass; searchModification->residues.push_back(mod.unmodChar); 
searchModification->fixedMod = false; if (specificity == "n") searchModification->specificityRules.cvid = MS_modification_specificity_N_term; else if (specificity == "c") searchModification->specificityRules.cvid = MS_modification_specificity_C_term; break; } } sipPtr->modificationParams.push_back(searchModification); }