bool FastSearchFormat::ObtainTarget(OBConversion* pConv, vector<OBMol>& patternMols, const string& indexname) { //Obtains an OBMol from: // the filename in the -s option or // the SMARTS string in the -s option or // by converting the file in the -S or -aS options (deprecated). // If there is no -s -S or -aS option, information on the index file is displayed. OBMol patternMol; patternMol.SetIsPatternStructure(); const char* p = pConv->IsOption("s",OBConversion::GENOPTIONS); bool OldSOption=false; //If no -s option, make OBMol from file in -S option or -aS option (both deprecated) if(!p) { p = pConv->IsOption("S",OBConversion::GENOPTIONS); if(!p) p = pConv->IsOption("S",OBConversion::INOPTIONS);//for GUI mainly OldSOption = true; } if(p) { vector<string> vec; tokenize(vec, p); //ignore leading ~ (not relevant to fastsearch) if(vec[0][0]=='~') vec[0].erase(0,1); if(vec.size()>1 && vec[1]=="exact") pConv->AddOption("e", OBConversion::INOPTIONS); OBConversion patternConv; OBFormat* pFormat; //Interpret as a filename if possible string& txt =vec [0]; if( txt.empty() || txt.find('.')==string::npos || !(pFormat = patternConv.FormatFromExt(txt.c_str())) || !patternConv.SetInFormat(pFormat) || !patternConv.ReadFile(&patternMol, txt) || patternMol.NumAtoms()==0) //if false, have a valid patternMol from a file { //is SMARTS/SMILES //Replace e.g. [#6] in SMARTS by C so that it can be converted as SMILES //for the fingerprint phase, but allow more generality in the SMARTS phase. 
for(;;) { string::size_type pos1, pos2; pos1 = txt.find("[#"); if(pos1==string::npos) break; pos2 = txt.find(']'); int atno; if(pos2!=string::npos && (atno = atoi(txt.substr(pos1+2, pos2-pos1-2).c_str())) && atno>0) txt.replace(pos1, pos2-pos1+1, etab.GetSymbol(atno)); else { obErrorLog.ThrowError(__FUNCTION__,"Ill-formed [#n] atom in SMARTS", obError); return false; } } bool hasTildeBond; if( (hasTildeBond = (txt.find('~')!=string::npos)) ) // extra parens to indicate truth value { //Find ~ bonds and make versions of query molecule with a single and aromatic bonds //To avoid having to parse the SMILES here, replace ~ by $ (quadruple bond) //and then replace this in patternMol. Check first that there are no $ already //Sadly, isocynanides may have $ bonds. if(txt.find('$')!=string::npos) { obErrorLog.ThrowError(__FUNCTION__, "Cannot use ~ bonds in patterns with $ (quadruple) bonds.)", obError); return false; } replace(txt.begin(),txt.end(), '~' , '$'); } //read as standard SMILES patternConv.SetInFormat("smi"); if(!patternConv.ReadString(&patternMol, vec[0])) { obErrorLog.ThrowError(__FUNCTION__,"Cannot read the SMILES string",obError); return false; } if(hasTildeBond) { AddPattern(patternMols, patternMol, 0); //recursively add all combinations of tilde bond values return true; } } else { // target(s) are in a file patternMols.push_back(patternMol); while(patternConv.Read(&patternMol)) patternMols.push_back(patternMol); return true; } } if(OldSOption) //only when using deprecated -S and -aS options { //make -s option for later SMARTS test OBConversion conv; if(conv.SetOutFormat("smi")) { string optiontext = conv.WriteString(&patternMol, true); pConv->AddOption("s", OBConversion::GENOPTIONS, optiontext.c_str()); } } if(!p) { //neither -s or -S options provided. 
Output info rather than doing search const FptIndexHeader& header = fs.GetIndexHeader(); string id(header.fpid); if(id.empty()) id = "default"; clog << indexname << " is an index of\n " << header.datafilename << ".\n It contains " << header.nEntries << " molecules. The fingerprint type is " << id << " with " << OBFingerprint::Getbitsperint() * header.words << " bits.\n" << "Typical usage for a substructure search:\n" << "obabel indexfile.fs -osmi -sSMILES\n" << "(-s option in GUI is 'Convert only if match SMARTS or mols in file')" << endl; return false; } patternMols.push_back(patternMol); return true; }
bool FastSearchFormat::ObtainTarget(OBConversion* pConv, OBMol& patternMol, const string& indexname) { //Obtains an OBMol // either from the SMARTS string in the -s option // or by converting the file in the -S option //or, if neither option is provided, displays information on the index file. stringstream smiles(stringstream::out); ifstream patternstream; OBConversion PatternConv(&patternstream,&smiles); const char* p = pConv->IsOption("s",OBConversion::GENOPTIONS); string txt; if(p) { // Use the -s option txt=p; stringstream smarts(txt, stringstream::in); OBConversion Convsm(&smarts); if(!Convsm.SetInFormat("smi")) return false; Convsm.Read(&patternMol); //erase -s option in GeneralOptions since it will be rewritten pConv->RemoveOption("s",OBConversion::GENOPTIONS); if(patternMol.Empty()) { obErrorLog.ThrowError(__FUNCTION__, "Could not make a molecule from " + smarts.str() + "\nThis needs to be valid SMILES when using fastsearch." "You can use the more versatile SMARTS in a normal substructure search." , obError); return false; } } else { // or Make OBMol from file in -S option or -aS option p = pConv->IsOption("S",OBConversion::GENOPTIONS); if(!p) p = pConv->IsOption("S",OBConversion::INOPTIONS);//for GUI mainly } if(!p) { //neither -s or -S options provided. Output info rather than doing search const FptIndexHeader& header = fs.GetIndexHeader(); string id(header.fpid); if(id.empty()) id = "default"; clog << indexname << " is an index of\n " << header.datafilename << ".\n It contains " << header.nEntries << " molecules. 
The fingerprint type is " << id << " with " << OBFingerprint::Getbitsperint() * header.words << " bits.\n" << "Typical usage for a substructure search:\n" << "babel indexfile.fs -osmi -sSMILES" << endl; return false; } if(p && patternMol.Empty()) { txt=p; string::size_type pos = txt.find_last_of('.'); if(pos==string::npos) { obErrorLog.ThrowError(__FUNCTION__, "Filename of pattern molecule in -S option must have an extension", obError); return false; } patternstream.open(txt.c_str()); if(!patternstream) { stringstream errorMsg; errorMsg << "Cannot open " << txt << endl; obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); return false; } PatternConv.SetOneObjectOnly(); if(PatternConv.SetInFormat(txt.substr(pos+1).c_str())) PatternConv.Read(&patternMol); } if(patternMol.Empty()) { obErrorLog.ThrowError(__FUNCTION__, "Cannot derive a molecule from the -s or -S options", obWarning); return false; } patternMol.ConvertDativeBonds();//use standard form for dative bonds //Convert to SMILES and generate a -s option for use in the final filtering if(!PatternConv.SetOutFormat("smi")) return false; PatternConv.Write(&patternMol); //remove name to leave smiles string string smilesstr(smiles.str()); string::size_type pos = smilesstr.find_first_of(" \t\r\n"); if(pos!=string::npos) smilesstr = smilesstr.substr(0,pos); pConv->AddOption("s", OBConversion::GENOPTIONS, smilesstr.c_str()); return true; }
// Searches the index file for structural matches and pushes the matching entries
// of the associated datafile through pConv.
// This function is called only once per search.
//
// Steps:
//   1. derive the .fs index name from the input filename and read the index;
//   2. obtain the pattern molecule(s) via ObtainTarget();
//   3. open the datafile named in the index and make it pConv's input;
//   4. hand dative-bond variants of the patterns to the "s" op, if it is found;
//   5. run a similarity search (-at option) or a structure search (exact match
//      with -ae, otherwise substructure) and convert each hit.
//
// Always returns false ("To finish"), both on error and after a completed search.
bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
{
  std::string auditMsg = "OpenBabel::Read fastsearch index ";
  std::string description(Description());
  auditMsg += description.substr(0,description.find('\n'));
  obErrorLog.ThrowError(__FUNCTION__,
                        auditMsg,
                        obAuditMsg);

  //Derive index name: replace the extension of the input filename by ".fs"
  string indexname = pConv->GetInFilename();
  string::size_type pos=indexname.find_last_of('.');
  if(pos!=string::npos)
  {
    indexname.erase(pos);
    indexname += ".fs";
  }

  //Have to open input stream again because needs to be in binary mode
  ifstream ifs;
  stringstream errorMsg;
  if(!indexname.empty())
    ifs.open(indexname.c_str(),ios::binary);
  //A stream that was never opened has no error flag set, so an empty index name
  //has to be tested for explicitly.
  if(indexname.empty() || !ifs)
  {
    errorMsg << "Couldn't open " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  string datafilename = fs.ReadIndex(&ifs);
  if(datafilename.empty())
  {
    errorMsg << "Difficulty reading from index " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  vector<OBMol> patternMols;
  if(!ObtainTarget(pConv, patternMols, indexname))
    return false;

  bool exactmatch = pConv->IsOption("e",OBConversion::INOPTIONS)!=NULL;// -ae option

  //Open the datafile and put it in pConv
  //datafile name derived from index file probably won't have a file path
  //but indexname may. Derive a full datafile name
  string path;
  pos = indexname.find_last_of("/\\");
  if(pos==string::npos)
    path = datafilename;
  else
    path = indexname.substr(0,pos+1) + datafilename;

  ifstream datastream(path.c_str());
  if(!datastream)
  {
    errorMsg << "Difficulty opening " << path << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }
  // NOTE(review): pConv is given a pointer to this function-local stream;
  // presumably it must not be read through pConv after this function returns — confirm.
  pConv->SetInStream(&datastream);

  //Input format is currently fs; set it appropriately
  if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str()),pConv->GetOutFormat()))
    return false;

  // If target has dative bonds like -[N+](=O)[O-] convert it to the uncharged form
  // (-N(=O)=O and add uncharged form to vector of mols which are sent to
  // the -s (SMARTS)filter.
  // Also check whether the target has dative bonds in the uncharged form and supply
  // the charged form to the -s filter.
  // Together with the automatic conversion to the uncharged form when the fs index is made,
  // this ensures that both forms are found however they occur in the datafile or the taget.
  vector<OBBase*> extraSMARTSMols;
  vector<OBMol> extraUnchargedMols;
  //extraSMARTSMols holds pointers to elements of extraUnchargedMols, so that vector
  //must never reallocate while it is being filled: reserve the maximum it can reach.
  extraUnchargedMols.reserve(patternMols.size());
  for(unsigned i=0;i<patternMols.size();++i)
  {
    if(patternMols[i].ConvertDativeBonds())
      extraSMARTSMols.push_back(&patternMols[i]);
    else
    {
      // If target has uncharged dative bonds, still use it for fastsearching,
      // but add the charged form for -s filter.
      extraUnchargedMols.push_back(patternMols[i]);
      if(extraUnchargedMols.back().MakeDativeBonds())
        extraSMARTSMols.push_back(&extraUnchargedMols.back());
    }
  }
  OBOp* sFilter = OBOp::FindType("s");
  if(sFilter)
    sFilter->ProcessVec(extraSMARTSMols);

  //Now do searching
  const char* p = pConv->IsOption("t",OBConversion::INOPTIONS);
  if(p)
  {
    //Do a similarity search (only the first pattern molecule is used)
    multimap<double, unsigned int> SeekposMap;
    string txt=p;
    if(txt.find('.')==string::npos)
    {
      //No decimal point: the option is a count.
      //Finds n molecules with largest Tanimoto
      int n = atoi(p);
      fs.FindSimilar(&patternMols[0], SeekposMap, n);
    }
    else
    {
      //Finds molecules with Tanimoto > MinTani
      //Option text is "min" or "min,max"; without a max the 1.1 default is used.
      double MaxTani = 1.1;
      const string::size_type commaPos = txt.find(',');
      if( commaPos != string::npos )
        MaxTani = atof( txt.substr( commaPos + 1 ).c_str() );
      double MinTani = atof( txt.substr( 0, commaPos ).c_str() );
      fs.FindSimilar(&patternMols[0], SeekposMap, MinTani, MaxTani);
    }

    //Don't want to filter through SMARTS filter
    pConv->RemoveOption("s", OBConversion::GENOPTIONS);
    //also because op names are case independent
    pConv->RemoveOption("S", OBConversion::GENOPTIONS);

    //Walk the map backwards, i.e. from the largest key (Tanimoto) downwards
    multimap<double, unsigned int>::reverse_iterator itr;
    for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
    {
      datastream.seekg(itr->second);

      if(pConv->IsOption("a", OBConversion::INOPTIONS))
      {
        //Adds Tanimoto coeff to title
        //First remove any previous value
        pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
        stringstream ss;
        ss << " " << itr->first;
        pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());
      }
      pConv->SetOneObjectOnly();
      if(itr != --SeekposMap.rend())
        pConv->SetMoreFilesToCome();//so that not seen as last on output
      pConv->Convert(NULL,NULL);
    }
  }
  else
  {
    //Structure search
    int MaxCandidates = 4000;
    p = pConv->IsOption("l",OBConversion::INOPTIONS);
    if(p && atoi(p))
      MaxCandidates = atoi(p);

    vector<unsigned int> SeekPositions;

    if(exactmatch)
    {
      //Find mols where all fingerprint bits are the same as the target
      fs.FindMatch(&patternMols[0], SeekPositions, MaxCandidates);
      // ensure that SMARTS filter in transform.cpp looks only for an exact match
      // by setting an option with the number of heavy atoms in the pattern mol included.
      stringstream ss;
      ss << patternMols[0].NumHvyAtoms();
      pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
    }
    else
    {
      //Do a substructure search for each target
      vector<OBMol>::iterator iter;
      for(iter=patternMols.begin();iter!=patternMols.end();++iter)
        fs.Find(&*iter, SeekPositions, MaxCandidates);
      clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
    }

    vector<unsigned int>::iterator seekitr,
        begin = SeekPositions.begin(), end = SeekPositions.end();

    if(patternMols.size()>1)//only sort and eliminate duplicates if necessary
    {
      sort(begin, end);
      end = unique(begin, end); //removed duplicates are after new end
    }

    //Output the candidate molecules, filtering through s filter, unless it was not requested
    if(pConv->IsOption("n", OBConversion::INOPTIONS) )
      pConv->RemoveOption("s",OBConversion::GENOPTIONS);

    pConv->SetLast(false);
    for(seekitr=begin; seekitr!=end; ++seekitr)
    {
      datastream.seekg(*seekitr);
      if(!pConv->GetInFormat()->ReadChemObject(pConv))
        return false;
      pConv->SetFirstInput(false); //needed for OpSort
    }
  }
  return false; //To finish
}
// Runs one fastsearch: reads the index that belongs to the input file, finds the
// seek positions of candidate molecules in the datafile and converts each of them
// through pConv. Called only once per search.
//
// Always returns false ("To finish"), whether an error occurred or the search ran.
bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
{
  //Audit message: fixed prefix followed by the first line of the format description
  std::string description(Description());
  std::string auditMsg = "OpenBabel::Read fastsearch index ";
  auditMsg += description.substr(0,description.find('\n'));
  obErrorLog.ThrowError(__FUNCTION__, auditMsg, obAuditMsg);

  //The index file is the input file with its extension swapped for ".fs"
  string indexname = pConv->GetInFilename();
  const string::size_type dotPos = indexname.find_last_of('.');
  if(dotPos!=string::npos)
    indexname.replace(dotPos, string::npos, ".fs");

  //The index must be read in binary mode, so it is opened afresh here
  stringstream errorMsg;
  ifstream indexStream;
  if(!indexname.empty())
    indexStream.open(indexname.c_str(),ios::binary);
  if(!indexStream)
  {
    errorMsg << "Couldn't open " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  const string datafilename = fs.ReadIndex(&indexStream);
  if(datafilename.empty())
  {
    errorMsg << "Difficulty reading from index " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  OBMol patternMol;
  const bool doSubset   = pConv->IsOption("s",OBConversion::INOPTIONS)!=NULL;// -as option
  const bool exactmatch = pConv->IsOption("e",OBConversion::INOPTIONS)!=NULL;// -ae option

  //Similarity or substructure: a pattern molecule is needed.
  //With -as no pattern is obtained and patternMol is left as constructed.
  if(!doSubset && !ObtainTarget(pConv, patternMol, indexname))
    return false;

  //The datafile name stored in the index usually has no directory part, whereas
  //indexname may have one; prepend it (including the separator) when present.
  const string::size_type sepPos = indexname.find_last_of("/\\");
  string path;
  if(sepPos!=string::npos)
    path = indexname.substr(0,sepPos+1) + datafilename;
  else
    path = datafilename;

  ifstream datastream(path.c_str());
  if(!datastream)
  {
    errorMsg << "Difficulty opening " << path << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }
  pConv->SetInStream(&datastream);

  //The input format is currently fs; switch it to that of the datafile
  OBFormat* const dataFormat = pConv->FormatFromExt(datafilename.c_str());
  if(!pConv->SetInAndOutFormats(dataFormat, pConv->GetOutFormat()))
    return false;
  pConv->AddOption("b",OBConversion::GENOPTIONS);

  const char* optText = pConv->IsOption("t",OBConversion::INOPTIONS);
  if(optText)
  {
    //---- Similarity search ----
    multimap<double, unsigned int> hitsByTanimoto;
    const string txt = optText;
    if(txt.find('.')!=string::npos)
    {
      //Text with a decimal point is a threshold: molecules with Tanimoto > MinTani
      //(a subset search via FindSubset is not enabled here)
      double MinTani = atof(txt.c_str());
      fs.FindSimilar(&patternMol, hitsByTanimoto, MinTani);
    }
    else
    {
      //Otherwise it is a count: the n molecules with largest Tanimoto
      int n = atoi(optText);
      fs.FindSimilar(&patternMol, hitsByTanimoto, n);
    }

    //The hits are not to be filtered through the SMARTS filter
    pConv->RemoveOption("s", OBConversion::GENOPTIONS);

    //Walk the map in reverse, i.e. from the largest key downwards
    typedef multimap<double, unsigned int>::reverse_iterator HitIter;
    for(HitIter hit=hitsByTanimoto.rbegin(); hit!=hitsByTanimoto.rend(); ++hit)
    {
      datastream.seekg(hit->second);

      if(pConv->IsOption("a", OBConversion::INOPTIONS))
      {
        //Append the Tanimoto coefficient to the title, replacing any earlier value
        stringstream ss;
        ss << " " << hit->first;
        pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
        pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());
      }

      pConv->SetOneObjectOnly();

      //Every hit but the final one is flagged so that it is not seen as last on output
      HitIter finalHit = hitsByTanimoto.rend();
      --finalHit;
      if(hit != finalHit)
        pConv->SetMoreFilesToCome();

      pConv->Convert(NULL,NULL);
    }
    return false; //To finish
  }

  //---- Structure search ----
  int MaxCandidates = 4000;
  optText = pConv->IsOption("l",OBConversion::INOPTIONS);
  const int requestedMax = optText ? atoi(optText) : 0;
  if(requestedMax)
    MaxCandidates = requestedMax;

  vector<unsigned int> SeekPositions;
  if(!exactmatch)
  {
    //Substructure search
    fs.Find(&patternMol, SeekPositions, MaxCandidates);
    clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
  }
  else
  {
    //Molecules whose fingerprint bits are all the same as the target's
    fs.FindMatch(&patternMol, SeekPositions, MaxCandidates);
    //Tell the SMARTS filter in transform.cpp to accept only an exact match by
    //passing it the number of heavy atoms in the pattern molecule.
    stringstream ss;
    ss << patternMol.NumHvyAtoms();
    pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
  }

  //Candidates go through the s filter unless the fingerprint type does not require it
  if(fs.GetFingerprint()->Flags() & OBFingerprint::FPT_UNIQUEBITS)
    pConv->RemoveOption("s",OBConversion::GENOPTIONS);

  for(vector<unsigned int>::size_type i=0; i<SeekPositions.size(); ++i)
  {
    const unsigned int seekPos = SeekPositions[i];
    datastream.seekg(seekPos);

    //debugging kludge to output all candidates directly
    if(pConv->IsOption("c",OBConversion::GENOPTIONS))
    {
      string ln;
      getline(datastream,ln);
      datastream.seekg(seekPos); //rewind to the start of the entry again
      *pConv->GetOutStream() << "** " << ln << endl;
    }

    pConv->SetOneObjectOnly();
    pConv->SetLast(i+1 == SeekPositions.size());
    pConv->Convert(NULL,NULL);
  }
  return false; //To finish
}