Beispiel #1
0
/**
 * Generic tokenizer.
 * Splits source into tokens and tries to lexically cast them to TARGET.
 * If that fails, boost::bad_lexical_cast is thrown.
 *
 * Processing order: the prefix is stripped first, then the postfix, and only
 * then is the remaining text split at every match of the separator.
 *
 * \param source the source string to be split up (taken by value, the caller's string is not modified)
 * \param separator regular expression to delimit the tokens (defaults to \\s+);
 * must not be an empty (invalid) regex, this is checked with assert()
 * \param prefix regular expression for text to be removed from the string before it is split up
 * ("^" if not given, will be added at the beginning); an empty regex object disables prefix stripping
 * \param postfix regular expression for text to be removed from the string before it is split up
 * ("$" if not given, will be added at the end); an empty regex object disables postfix stripping
 * \returns a list of the casted tokens
 *
 * NOTE(review): the anchors are added by plain string concatenation, so a pattern
 * with a top-level alternation (e.g. "a|b") is only anchored on one alternative,
 * and a pattern ending in an escaped "\$" is treated as already anchored.
 */
template<typename TARGET> std::list<TARGET> stringToList(
	std::string source, const boost::regex &separator,
	boost::regex prefix, boost::regex postfix )
{
	std::list<TARGET> ret;
	assert( ! separator.empty() );

	if ( ! prefix.empty() ) {
		const std::string prefixPattern = prefix.str();

		// Anchor the prefix at the start of the string unless it already is.
		if ( prefixPattern.empty() || prefixPattern[0] != '^' )
			prefix = boost::regex( "^" + prefixPattern, prefix.flags() );

		source = boost::regex_replace( source, prefix, "", boost::format_first_only | boost::match_default );
	}

	if ( ! postfix.empty() ) {
		const std::string postfixPattern = postfix.str();

		// Anchor the postfix at the end of the string unless it already is.
		// The last character is looked up through the pattern string's own
		// length (not the regex object's size()), and an empty pattern string
		// is handled explicitly so the index can never wrap around.
		if ( postfixPattern.empty() || postfixPattern[postfixPattern.size() - 1] != '$' )
			postfix = boost::regex( postfixPattern + "$", postfix.flags() );

		source = boost::regex_replace( source, postfix, "", boost::format_first_only | boost::match_default );
	}

	// Submatch index -1 makes the iterator yield the text *between* separator matches.
	const boost::sregex_token_iterator token_end;

	for ( boost::sregex_token_iterator i = boost::make_regex_token_iterator( source, separator, -1 );
		  i != token_end; ++i ) {
		ret.push_back( boost::lexical_cast<TARGET>( i->str() ) );
	}

	return ret;
}
int ChessEngineGnu::match(const string   &str,
                          boost::regex   &re,
                          vector<string> &matches)
{
  boost::cmatch what;

  matches.clear();

  GC_TRACE("match(): %s\n", re.str().c_str());

  if( boost::regex_match(str.c_str(), what, re) )
  {
    // what[0] is the whole string
    for(size_t i=1; i<what.size(); ++i)
    {
      GC_TRACE("  \"%s\"\n", what[i].str().c_str());
      matches.push_back(what[i].str());
    }
  }

  GC_TRACE("  %zu matches\n", matches.size());

  return (int)matches.size();
}
/**
 * Loads all test case files found directly in testCaseDir into m_testCases.
 *
 * Only files with the extension ".xml" are looked at. Their file name has to
 * match "(\d{3,4})-(.*)\.xml"; the leading number is the test case number and
 * is used as the index into m_testCases (the vector is grown as needed).
 * ".xml" files that do not match the pattern are reported on std::wcerr and
 * skipped.
 *
 * The process is terminated with exit(1) if two files carry the same test case
 * number, and with exit(2) if deserializing a file throws a std::exception.
 *
 * \param testCaseDir directory that is scanned for test case files
 */
TestCaseReader::TestCaseReader(const boost::filesystem::path & testCaseDir)
{
    typedef boost::filesystem::directory_iterator DirectoryIterator;

    const boost::regex expr("(\\d{3,4})-(.*)\\.xml");
    const DirectoryIterator endOfDirectory;

    for (DirectoryIterator entry(testCaseDir); entry != endOfDirectory; ++entry)
    {
        const boost::filesystem::path path = entry->path();

        // Anything that is not an xml file is silently skipped.
        if (path.extension() != ".xml")
        {
            continue;
        }

        // filename() returns a path in Boost.Filesystem v3 and a string in earlier versions.
#if defined (BOOST_FILESYSTEM_VERSION) && BOOST_FILESYSTEM_VERSION == 3
        const std::string filename = path.filename().string();
#else
        const std::string filename = path.filename();
#endif

        boost::smatch nameParts;

        if (!boost::regex_match(filename,nameParts,expr))
        {
            // An xml file with an unexpected name: report it and move on.
            std::wcerr << "File '"
                       << path.filename().c_str()
                       << "' did not match the pattern for test case files: '"
                       << expr.str().c_str()
                       << "'"  << std::endl;
            continue;
        }

        // The first capture group is the test case number, used as the slot index.
        const size_t tc = boost::lexical_cast<size_t>(nameParts[1]);
        const size_t slotsNeeded = tc + 1;

        if (m_testCases.size() < slotsNeeded)
        {
            m_testCases.resize(slotsNeeded);
        }

        // A slot that is already filled means the number was used by another file.
        if (m_testCases[tc] != NULL)
        {
            std::wcerr << "There appears to be two test cases with number " << tc << std::endl;
            exit(1);
        }

        // Read the complete file content into memory.
        std::ostringstream xml;
        {
            boost::filesystem::ifstream input(path);
            xml << input.rdbuf();
        }

        try
        {
            const std::wstring xmlText = Safir::Dob::Typesystem::Utilities::ToWstring(xml.str());
            m_testCases[tc] = boost::dynamic_pointer_cast<DoseTest::Items::TestCase>
                (Safir::Dob::Typesystem::Serialization::ToObject(xmlText));
        }
        catch (const std::exception & exc)
        {
            std::wcerr << "Failed to read file '" << path.string().c_str() << "' due to exception with message" << std::endl
                       <<exc.what() << std::endl;
            exit(2);
        }
    }
}
void write( const string& sourceFilepath,
            pwiz::identdata::IdentDataFile::Format outputFormat,
            const string& filenameSuffix,
            const string& searchEngineName,
            const string& searchEngineVersion,
            const string& searchEngineURI,
            const string& searchDatabase,
            boost::regex cleavageAgentRegex,
            const string& decoyPrefix,
            const RunTimeVariableMap& vars ) const
{
    using namespace pwiz::identdata;
    namespace msdata = pwiz::msdata;
    namespace proteome = pwiz::proteome;

    IdentData mzid;

    mzid.id = sourceFilepath + " " + searchDatabase + " " + searchEngineName + " " + searchEngineVersion;
    mzid.creationDate = GetDateTime();

    // add default CVs
    mzid.cvs = defaultCVList();

    // add the SpectrumIdentificationProtocol
    SpectrumIdentificationProtocolPtr sipPtr(new SpectrumIdentificationProtocol("SIP"));
    mzid.analysisProtocolCollection.spectrumIdentificationProtocol.push_back(sipPtr);

    CVTranslator cvTranslator;
    CVID searchEngineCVID = cvTranslator.translate(searchEngineName);

    // add analysis software
    sipPtr->analysisSoftwarePtr.reset(new AnalysisSoftware("AS"));
    mzid.analysisSoftwareList.push_back(sipPtr->analysisSoftwarePtr);

    // set software name
    if (searchEngineCVID != CVID_Unknown)
        sipPtr->analysisSoftwarePtr->softwareName.set(searchEngineCVID);
    else
        sipPtr->analysisSoftwarePtr->softwareName.set(MS_custom_unreleased_software_tool, searchEngineName);

    // set version and URI
    sipPtr->analysisSoftwarePtr->version = searchEngineVersion;
    sipPtr->analysisSoftwarePtr->URI = searchEngineURI;

    // set search type
    sipPtr->searchType.cvid = MS_ms_ms_search;

    // add a mass table for all MS levels
    MassTablePtr massTable(new MassTable("MT"));
    massTable->msLevel.push_back(1);
    massTable->msLevel.push_back(2);
    massTable->msLevel.push_back(3);
    sipPtr->massTable.push_back(massTable);

    // specify amino acid masses used
    const char* residueSymbols = "ACDEFGHIKLMNPQRSTUVWY";
    for (int i=0; i < 21; ++i)
    {
        const AminoAcid::Info::Record& record = AminoAcid::Info::record(residueSymbols[i]);       
        ResiduePtr rp(new Residue);
        rp->code = record.symbol;
        rp->mass = record.residueFormula.monoisotopicMass();
        massTable->residues.push_back(rp);
    }

    // add the SpectrumIdentificationList
    SpectrumIdentificationListPtr silPtr(new SpectrumIdentificationList("SIL"));
    mzid.dataCollection.analysisData.spectrumIdentificationList.push_back(silPtr);

    if (vars.count("SearchStats: Overall"))
    {
        string searchStats = vars.find("SearchStats: Overall")->second;
        silPtr->numSequencesSearched = lexical_cast<int>(searchStats.substr(0, searchStats.find_first_of(' ')));
    }

    // add the SpectrumIdentification
    SpectrumIdentificationPtr siPtr(new SpectrumIdentification("SI"));
    siPtr->spectrumIdentificationListPtr = silPtr;
    siPtr->spectrumIdentificationProtocolPtr = sipPtr;
    siPtr->activityDate = mzid.creationDate;
    mzid.analysisCollection.spectrumIdentification.push_back(siPtr);

    // add search database
    SearchDatabasePtr sdb(new SearchDatabase("SDB"));
    sdb->fileFormat.cvid = MS_FASTA_format;
    sdb->location = searchDatabase;
    sdb->name = bfs::path(searchDatabase).filename();
    sdb->set(MS_database_type_amino_acid);
    sdb->databaseName.userParams.push_back(UserParam("database name", sdb->name, "xsd:string"));
    mzid.dataCollection.inputs.searchDatabase.push_back(sdb);
    mzid.analysisCollection.spectrumIdentification[0]->searchDatabase.push_back(sdb);

    // add source file
    SpectraDataPtr spectraData(new SpectraData("SD"));
    spectraData->location = sourceFilepath;
    spectraData->name = bfs::path(spectraData->location).filename();
    mzid.dataCollection.inputs.spectraData.push_back(spectraData);
    mzid.analysisCollection.spectrumIdentification[0]->inputSpectra.push_back(spectraData);

    // set source file format (required for a semantically valid mzIdentML file)
    msdata::ReaderPtr readers(new msdata::FullReaderList);
    CVID sourceFileFormat = msdata::identifyFileFormat(readers, sourceFilepath);
    if (sourceFileFormat != CVID_Unknown)
        spectraData->fileFormat.cvid = sourceFileFormat;
    else if (outputFormat == IdentDataFile::Format_MzIdentML)
        throw runtime_error("[SearchSpectraList::write] unable to determine source file format of \"" + sourceFilepath + "\"");

    {
        msdata::MSDataFile msd(sourceFilepath, readers.get());
        spectraData->spectrumIDFormat.cvid = msdata::id::getDefaultNativeIDFormat(msd);
    }

    // add the cleavage rules
    EnzymePtr enzyme(new Enzyme);
    enzyme->id = "ENZ_" + lexical_cast<string>(sipPtr->enzymes.enzymes.size()+1);
    enzyme->terminalSpecificity = (proteome::Digestion::Specificity) lexical_cast<int>(vars.find("Config: MinTerminiCleavages")->second);
    enzyme->nTermGain = "H";
    enzyme->cTermGain = "OH";
    enzyme->missedCleavages = lexical_cast<int>(vars.find("Config: MaxMissedCleavages")->second);
    enzyme->minDistance = 1;
    enzyme->siteRegexp = cleavageAgentRegex.str();

    CVID cleavageAgent = proteome::Digestion::getCleavageAgentByRegex(enzyme->siteRegexp);
    if (cleavageAgent != CVID_Unknown)
        enzyme->enzymeName.set(cleavageAgent);

    sipPtr->enzymes.enzymes.push_back(enzyme);


    // use monoisotopic mass unless PrecursorMzToleranceRule forces average
    bool forceAverageMass = vars.find("Config: PrecursorMzToleranceRule")->second == "avg";

    if (forceAverageMass)
        sipPtr->additionalSearchParams.set(MS_parent_mass_type_average);
    else
        sipPtr->additionalSearchParams.set(MS_parent_mass_type_mono);

    sipPtr->additionalSearchParams.set(MS_fragment_mass_type_mono);

    MZTolerance precursorMzTolerance;
    string precursorMassType = forceAverageMass ? "Avg" : "Mono";
    parse(precursorMzTolerance, vars.find("Config: " + precursorMassType + "PrecursorMzTolerance")->second);
    sipPtr->parentTolerance.set(MS_search_tolerance_minus_value, precursorMzTolerance.value);
    sipPtr->parentTolerance.set(MS_search_tolerance_plus_value, precursorMzTolerance.value);
    sipPtr->parentTolerance.cvParams[0].units = sipPtr->parentTolerance.cvParams[1].units =
        precursorMzTolerance.units == MZTolerance::PPM ? UO_parts_per_million : UO_dalton;

    MZTolerance fragmentMzTolerance;
    parse(fragmentMzTolerance, vars.find("Config: FragmentMzTolerance")->second);
    sipPtr->fragmentTolerance.set(MS_search_tolerance_minus_value, fragmentMzTolerance.value);
    sipPtr->fragmentTolerance.set(MS_search_tolerance_plus_value, fragmentMzTolerance.value);
    sipPtr->fragmentTolerance.cvParams[0].units = sipPtr->fragmentTolerance.cvParams[1].units =
        fragmentMzTolerance.units == MZTolerance::PPM ? UO_parts_per_million : UO_dalton;

    sipPtr->threshold.set(MS_no_threshold);

    string fragmentationRule = vars.find("Config: FragmentationRule")->second;
    if (bal::icontains(fragmentationRule, "cid"))     translateIonSeriesConsidered(*sipPtr, "b,y");
    if (bal::icontains(fragmentationRule, "etd"))     translateIonSeriesConsidered(*sipPtr, "c,z+1");
    if (bal::icontains(fragmentationRule, "manual"))  translateIonSeriesConsidered(*sipPtr, fragmentationRule.substr(7)); // skip "manual:"


    DynamicModSet dynamicMods( vars.find("Config: DynamicMods")->second );
    BOOST_FOREACH(const DynamicMod& mod, dynamicMods)
    {
        SearchModificationPtr searchModification(new SearchModification);

        switch( mod.unmodChar )
        {
            case PEPTIDE_N_TERMINUS_SYMBOL:
                searchModification->massDelta = mod.modMass;
                searchModification->fixedMod = false;
                searchModification->specificityRules.cvid = MS_modification_specificity_N_term;
                break;

            case PEPTIDE_C_TERMINUS_SYMBOL:
                searchModification->massDelta = mod.modMass;
                searchModification->fixedMod = false;
                searchModification->specificityRules.cvid = MS_modification_specificity_C_term;
                break;

            default:
            {
                string specificity; // either empty, n, or c, but not both

                if (mod.NTerminalFilters.size() == 1 &&
                    mod.NTerminalFilters[0].m_filter[PEPTIDE_N_TERMINUS_SYMBOL])
                    specificity += 'n';
                else if (mod.CTerminalFilters.size() == 1 &&
                         mod.CTerminalFilters[0].m_filter[PEPTIDE_C_TERMINUS_SYMBOL])
                specificity += 'c';

                searchModification->massDelta = mod.modMass;
                searchModification->residues.push_back(mod.unmodChar);
                searchModification->fixedMod = false;

                if (specificity == "n")
                    searchModification->specificityRules.cvid = MS_modification_specificity_N_term;
                else if (specificity == "c")
                    searchModification->specificityRules.cvid = MS_modification_specificity_C_term;
                break;
            }
        }
        sipPtr->modificationParams.push_back(searchModification);
    }