// Searches the fastsearch index belonging to the input file and pushes the
// matching molecules from the indexed datafile through the conversion.
//
// Steps, in order:
//   1. Derive the index name (input filename with its extension replaced by
//      ".fs"), open it in binary mode and let fs.ReadIndex() report the name
//      of the datafile that was indexed.
//   2. Obtain the target (pattern) molecule(s) via ObtainTarget().
//   3. Open the datafile, install it as pConv's input stream and switch the
//      input format to the one implied by the datafile's extension.
//   4. Hand extra charged/uncharged dative-bond forms of the targets to the
//      "s" op, if that op is registered.
//   5. Run either a similarity search (input option "t") or a structure
//      search (exact match with input option "e", otherwise substructure) and
//      convert each hit by seeking to its stored position in the datafile.
//
// Input options read here: "t", "a", "e", "l", "n".
// General options modified here: "s", "S", "addtotitle", "exactmatch".
//
// pConv: the conversion being run; its input stream, formats and options are
//        modified as described above.
// Returns false on every path, including success (see the final
// "To finish" return) -- presumably this tells the caller there is nothing
// more to read; confirm against OBConversion.
bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
{
  //Searches index file for structural matches
  //This function is called only once per search

  std::string auditMsg = "OpenBabel::Read fastsearch index ";
  std::string description(Description());
  //Only the first line of the format description goes into the audit log
  auditMsg += description.substr(0,description.find('\n'));
  obErrorLog.ThrowError(__FUNCTION__, auditMsg, obAuditMsg);

  //Derive index name
  string indexname = pConv->GetInFilename();
  string::size_type pos=indexname.find_last_of('.');
  if(pos!=string::npos)
  {
    indexname.erase(pos);
    indexname += ".fs";
  }

  //Have to open input stream again because needs to be in binary mode
  ifstream ifs;
  stringstream errorMsg;
  if(!indexname.empty())
    ifs.open(indexname.c_str(),ios::binary);
  //A stream that was never opened (empty indexname) still tests as good,
  //so is_open() has to be checked as well as the stream state.
  if(!ifs.is_open() || !ifs)
  {
    errorMsg << "Couldn't open " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  string datafilename = fs.ReadIndex(&ifs);
  if(datafilename.empty())
  {
    errorMsg << "Difficulty reading from index " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  vector<OBMol> patternMols;
  if(!ObtainTarget(pConv, patternMols, indexname))
    return false;

  bool exactmatch = pConv->IsOption("e",OBConversion::INOPTIONS)!=NULL;// -ae option

  //Open the datafile and put it in pConv
  //datafile name derived from index file probably won't have a file path
  //but indexname may. Derive a full datafile name
  string path;
  pos = indexname.find_last_of("/\\");
  if(pos==string::npos)
    path = datafilename;
  else
    path = indexname.substr(0,pos+1) + datafilename;

  ifstream datastream(path.c_str());
  if(!datastream)
  {
    errorMsg << "Difficulty opening " << path << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }
  pConv->SetInStream(&datastream);

  //Input format is currently fs; set it appropriately
  if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str()),pConv->GetOutFormat()))
    return false;

  // If target has dative bonds like -[N+](=O)[O-] convert it to the uncharged form
  // (-N(=O)=O and add uncharged form to vector of mols which are sent to
  // the -s (SMARTS)filter.
  // Also check whether the target has dative bonds in the uncharged form and supply
  // the charged form to the -s filter.
  // Together with the automatic conversion to the uncharged form when the fs index is made,
  // this ensures that both forms are found however they occur in the datafile or the target.
  vector<OBBase*> extraSMARTSMols;
  vector<OBMol> extraUnchargedMols;
  // extraSMARTSMols keeps raw pointers to elements of extraUnchargedMols.
  // Reserving the maximum possible size up front guarantees push_back never
  // reallocates, so those pointers stay valid until ProcessVec() has run.
  extraUnchargedMols.reserve(patternMols.size());
  for(unsigned i=0;i<patternMols.size();++i)
  {
    if(patternMols[i].ConvertDativeBonds())
      extraSMARTSMols.push_back(&patternMols[i]);
    else
    {
      // If target has uncharged dative bonds, still use it for fastsearching,
      // but add the charged form for -s filter.
      extraUnchargedMols.push_back(patternMols[i]);
      if(extraUnchargedMols.back().MakeDativeBonds())
        extraSMARTSMols.push_back(&extraUnchargedMols.back());
    }
  }
  OBOp* sFilter = OBOp::FindType("s");
  if(sFilter)
    sFilter->ProcessVec(extraSMARTSMols);

  //Now do searching
  const char* p = pConv->IsOption("t",OBConversion::INOPTIONS);

  // Similarity and exact-match searches use patternMols[0]; refuse to index
  // into an empty vector. The substructure search iterates over the vector
  // and so copes with it being empty.
  if((p || exactmatch) && patternMols.empty())
  {
    errorMsg << "No target molecule available for the search" << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  if(p)
  {
    //Do a similarity search
    multimap<double, unsigned int> SeekposMap;
    string txt=p;
    if(txt.find('.')==string::npos)
    {
      //Finds n molecules with largest Tanimoto
      int n = atoi(p);
      fs.FindSimilar(&patternMols[0], SeekposMap, n);
    }
    else
    {
      //Finds molecules with Tanimoto > MinTani
      //Option text is "MinTani" or "MinTani,MaxTani"
      double MaxTani = 1.1;
      string::size_type comma = txt.find(',');
      if( comma != string::npos )
      {
        MaxTani = atof( txt.substr( comma + 1 ).c_str() );
      }
      //substr(0, npos) yields the whole text when there is no comma
      double MinTani = atof( txt.substr( 0, comma ).c_str() );
      fs.FindSimilar(&patternMols[0], SeekposMap, MinTani, MaxTani);
    }

    //Don't want to filter through SMARTS filter
    pConv->RemoveOption("s", OBConversion::GENOPTIONS);
    //also because op names are case independent
    pConv->RemoveOption("S", OBConversion::GENOPTIONS);

    //Iterate in reverse so the largest map key (the Tanimoto value) comes first
    multimap<double, unsigned int>::reverse_iterator itr;
    for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
    {
      datastream.seekg(itr->second);

      if(pConv->IsOption("a", OBConversion::INOPTIONS))
      {
        //Adds Tanimoto coeff to title
        //First remove any previous value
        pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
        stringstream ss;
        ss << " " << itr->first;
        pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());
      }
      pConv->SetOneObjectOnly();
      if(itr != --SeekposMap.rend())
        pConv->SetMoreFilesToCome();//so that not seen as last on output
      pConv->Convert(NULL,NULL);
    }
  }
  else
  {
    //Structure search
    int MaxCandidates = 4000;
    p = pConv->IsOption("l",OBConversion::INOPTIONS);
    if(p && atoi(p))
      MaxCandidates = atoi(p);

    vector<unsigned int> SeekPositions;

    if(exactmatch)
    {
      //Find mols where all fingerprint bits are the same as the target
      fs.FindMatch(&patternMols[0], SeekPositions, MaxCandidates);
      // ensure that SMARTS filter in transform.cpp looks only for an exact match
      // by setting an option with the number of heavy atoms in the pattern mol included.
      stringstream ss;
      ss << patternMols[0].NumHvyAtoms();
      pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
    }
    else
    {
      //Do a substructure search for each target
      vector<OBMol>::iterator iter;
      for(iter=patternMols.begin();iter!=patternMols.end();++iter)
        fs.Find(&*iter, SeekPositions, MaxCandidates);
      clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
    }

    vector<unsigned int>::iterator seekitr;
    vector<unsigned int>::iterator begin = SeekPositions.begin();
    vector<unsigned int>::iterator end = SeekPositions.end();

    if(patternMols.size()>1)//only sort and eliminate duplicates if necessary
    {
      sort(begin, end);
      end = unique(begin, end); //removed duplicates are after new end
    }

    //Output the candidate molecules, filtering through s filter, unless it was not requested
    if(pConv->IsOption("n", OBConversion::INOPTIONS) )
      pConv->RemoveOption("s",OBConversion::GENOPTIONS);

    pConv->SetLast(false);
    for(seekitr=begin; seekitr!=end; ++seekitr)
    {
      datastream.seekg(*seekitr);
      //GetInFormat() is now the datafile's format (set above), not this one
      if(!pConv->GetInFormat()->ReadChemObject(pConv))
        return false;
      pConv->SetFirstInput(false); //needed for OpSort
    }
  }
  return false; //To finish
}
// Runs one fastsearch query: reads the ".fs" index that sits beside the input
// file, locates the datafile it refers to, and converts each molecule selected
// by the search by seeking to its recorded position in that datafile.
//
// Search mode is chosen from the input options:
//   "t"  similarity search (an integer selects the n most similar, a value
//        containing '.' is a minimum Tanimoto); "a" appends the coefficient
//        to each title via the "addtotitle" general option
//   "e"  exact match (-ae option)
//   otherwise a substructure search; "l" overrides the candidate limit
// With input option "s" (-as option) no target molecule is obtained.
//
// pConv: the running conversion; its input stream, formats and general
//        options ("b", "s", "addtotitle", "exactmatch") are modified here.
// Returns false on every path, including after a successful search.
bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
{
  //Called only once per search

  //Audit entry uses just the first line of the format description
  const std::string fullDescription(Description());
  std::string auditMsg = "OpenBabel::Read fastsearch index ";
  auditMsg += fullDescription.substr(0, fullDescription.find('\n'));
  obErrorLog.ThrowError(__FUNCTION__, auditMsg, obAuditMsg);

  //Index name: input filename with its extension swapped for ".fs"
  string indexname = pConv->GetInFilename();
  const string::size_type dotPos = indexname.find_last_of('.');
  if(dotPos != string::npos)
    indexname.replace(dotPos, string::npos, ".fs");

  //The index has to be reopened here because it must be read in binary mode
  stringstream errorMsg;
  ifstream indexStream;
  if(!indexname.empty())
    indexStream.open(indexname.c_str(), ios::binary);
  if(!indexStream)
  {
    errorMsg << "Couldn't open " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  const string datafilename = fs.ReadIndex(&indexStream);
  if(datafilename.empty())
  {
    errorMsg << "Difficulty reading from index " << indexname << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }

  OBMol patternMol;
  const bool doSubset   = pConv->IsOption("s", OBConversion::INOPTIONS) != NULL; // -as option
  const bool exactmatch = pConv->IsOption("e", OBConversion::INOPTIONS) != NULL; // -ae option

  //Similarity or substructure searches need a target molecule
  if(!doSubset && !ObtainTarget(pConv, patternMol, indexname))
    return false;

  //The datafile name stored in the index probably has no directory part,
  //whereas indexname may have one; borrow it to build the full datafile path.
  const string::size_type sepPos = indexname.find_last_of("/\\");
  const string path = (sepPos == string::npos)
                        ? datafilename
                        : indexname.substr(0, sepPos + 1) + datafilename;

  ifstream datastream(path.c_str());
  if(!datastream)
  {
    errorMsg << "Difficulty opening " << path << endl;
    obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
    return false;
  }
  pConv->SetInStream(&datastream);

  //The input format is still fs at this point; switch to the datafile's format
  OBFormat* dataFormat = pConv->FormatFromExt(datafilename.c_str());
  if(!pConv->SetInAndOutFormats(dataFormat, pConv->GetOutFormat()))
    return false;
  pConv->AddOption("b", OBConversion::GENOPTIONS);

  //Now do searching
  const char* optText = pConv->IsOption("t", OBConversion::INOPTIONS);
  if(optText)
  {
    //Similarity search
    typedef multimap<double, unsigned int> TanimotoMap;
    TanimotoMap hits;
    const string threshold = optText;
    if(threshold.find('.') != string::npos)
    {
      //Value with a decimal point: molecules with Tanimoto > MinTani
      double MinTani = atof(threshold.c_str());
      fs.FindSimilar(&patternMol, hits, MinTani);
    }
    else
    {
      //Plain integer: the n molecules with largest Tanimoto
      int n = atoi(optText);
      fs.FindSimilar(&patternMol, hits, n);
    }

    //Results must not pass through the SMARTS filter
    pConv->RemoveOption("s", OBConversion::GENOPTIONS);

    //Walk from the largest key (Tanimoto coefficient) downwards
    for(TanimotoMap::reverse_iterator hit = hits.rbegin(); hit != hits.rend(); ++hit)
    {
      datastream.seekg(hit->second);

      if(pConv->IsOption("a", OBConversion::INOPTIONS))
      {
        //Replace any previous title suffix with this hit's coefficient
        pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
        stringstream suffix;
        suffix << " " << hit->first;
        pConv->AddOption("addtotitle", OBConversion::GENOPTIONS, suffix.str().c_str());
      }
      pConv->SetOneObjectOnly();

      //Every hit except the final one is flagged so that it is not seen as last on output
      TanimotoMap::reverse_iterator following = hit;
      ++following;
      if(following != hits.rend())
        pConv->SetMoreFilesToCome();
      pConv->Convert(NULL, NULL);
    }
  }
  else
  {
    //Structure search
    int MaxCandidates = 4000;
    optText = pConv->IsOption("l", OBConversion::INOPTIONS);
    const int requestedLimit = optText ? atoi(optText) : 0;
    if(requestedLimit)
      MaxCandidates = requestedLimit;

    vector<unsigned int> SeekPositions;
    if(!exactmatch)
    {
      //Substructure search
      fs.Find(&patternMol, SeekPositions, MaxCandidates);
      clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
    }
    else
    {
      //Molecules whose fingerprint bits are all the same as the target's
      fs.FindMatch(&patternMol, SeekPositions, MaxCandidates);
      //Pass the pattern's heavy-atom count in the "exactmatch" option so that the
      //SMARTS filter in transform.cpp looks only for an exact match.
      stringstream heavyAtoms;
      heavyAtoms << patternMol.NumHvyAtoms();
      pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, heavyAtoms.str().c_str());
    }

    //Candidates go through the s filter unless the fingerprint type does not require it
    if(fs.GetFingerprint()->Flags() & OBFingerprint::FPT_UNIQUEBITS)
      pConv->RemoveOption("s", OBConversion::GENOPTIONS);

    const vector<unsigned int>::iterator lastPos = SeekPositions.end();
    for(vector<unsigned int>::iterator seekPos = SeekPositions.begin(); seekPos != lastPos; ++seekPos)
    {
      datastream.seekg(*seekPos);

      //debugging kludge to output all candidates directly
      if(pConv->IsOption("c", OBConversion::GENOPTIONS))
      {
        string firstLine;
        getline(datastream, firstLine);
        datastream.seekg(*seekPos); //rewind to the start of the record
        *pConv->GetOutStream() << "** " << firstLine << endl;
      }
      pConv->SetOneObjectOnly();
      pConv->SetLast(seekPos + 1 == lastPos);
      pConv->Convert(NULL, NULL);
    }
  }
  return false; //To finish
}