bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
  {
    //Searches index file for structural matches
    //This function is called only once per search

    std::string auditMsg = "OpenBabel::Read fastsearch index ";
    std::string description(Description());
    auditMsg += description.substr(0,description.find('\n'));
    obErrorLog.ThrowError(__FUNCTION__,
                          auditMsg,
                          obAuditMsg);

    //Derive index name
    string indexname = pConv->GetInFilename();
    string::size_type pos=indexname.find_last_of('.');
    if(pos!=string::npos)
      {
        indexname.erase(pos);
        indexname += ".fs";
      }

    //Have to open input stream again because needs to be in binary mode
    ifstream ifs;
    stringstream errorMsg;
    if(!indexname.empty())
      ifs.open(indexname.c_str(),ios::binary);
    if(!ifs)
      {
        errorMsg << "Couldn't open " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
      }

    string datafilename = fs.ReadIndex(&ifs);
    if(datafilename.empty())
      {
        errorMsg << "Difficulty reading from index " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
      }

    vector<OBMol> patternMols;
    if(!ObtainTarget(pConv, patternMols, indexname))
      return false;

    bool exactmatch = pConv->IsOption("e",OBConversion::INOPTIONS)!=NULL;// -ae option

    //Open the datafile and put it in pConv
    //datafile name derived from index file probably won't have a file path
    //but indexname may. Derive a full datafile name
    string path;
    pos = indexname.find_last_of("/\\");
    if(pos==string::npos)
      path = datafilename;
    else
      path = indexname.substr(0,pos+1) + datafilename;

    ifstream datastream(path.c_str());
    if(!datastream)
      {
        errorMsg << "Difficulty opening " << path << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
      }
    pConv->SetInStream(&datastream);

    //Input format is currently fs; set it appropriately
    if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str()),pConv->GetOutFormat()))
      return false;

    // If target has dative bonds like -[N+](=O)[O-] convert it to the uncharged form
    // (-N(=O)=O and add uncharged form to vector of mols which are sent to
    // the -s (SMARTS)filter.
    // Also check whether the target has dative bonds in the uncharged form and supply
    // the charged form to the -s filter.
    // Together with the automatic conversion to the uncharged form when the fs index is made,
    // this ensures that both forms are found however they occur in the datafile or the taget.
    vector<OBBase*> extraSMARTSMols;
    vector<OBMol>extraUnchargedMols;
    for(unsigned i=0;i<patternMols.size();++i)
    {
      if(patternMols[i].ConvertDativeBonds())
        extraSMARTSMols.push_back(&patternMols[i]);
      else 
      {
        // If target has uncharged dative bonds, still use it for fastsearching,
        // but add the charged form for -s filter.
        extraUnchargedMols.push_back(patternMols[i]);
        if(extraUnchargedMols.back().MakeDativeBonds())
          extraSMARTSMols.push_back(&extraUnchargedMols.back());
      }
    }
    OBOp* sFilter = OBOp::FindType("s");
    if(sFilter)
      sFilter->ProcessVec(extraSMARTSMols);

    //Now do searching
    const char* p = pConv->IsOption("t",OBConversion::INOPTIONS);
    if(p)
      {
        //Do a similarity search
        multimap<double, unsigned int> SeekposMap;
        string txt=p;
        if(txt.find('.')==string::npos)
          {
            //Finds n molecules with largest Tanimoto
            int n = atoi(p);
            fs.FindSimilar(&patternMols[0], SeekposMap, n);
          }
        else
          {
            //Finds molecules with Tanimoto > MinTani
            double MaxTani = 1.1;
            size_t pos = txt.find(',');
            if( pos != string::npos ) {
              MaxTani = atof( txt.substr( pos + 1 ).c_str() );
            }
            double MinTani = atof( txt.substr( 0, pos ).c_str() );
            fs.FindSimilar(&patternMols[0], SeekposMap, MinTani, MaxTani);
          }

        //Don't want to filter through SMARTS filter
        pConv->RemoveOption("s", OBConversion::GENOPTIONS);
        //also because op names are case independent
        pConv->RemoveOption("S", OBConversion::GENOPTIONS);

        multimap<double, unsigned int>::reverse_iterator itr;
        for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
          {
            datastream.seekg(itr->second);

            if(pConv->IsOption("a", OBConversion::INOPTIONS))
              {
                //Adds Tanimoto coeff to title
                //First remove any previous value
                pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
                stringstream ss;
                ss << " " << itr->first;
                pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());

              }
            pConv->SetOneObjectOnly();
            if(itr != --SeekposMap.rend())
              pConv->SetMoreFilesToCome();//so that not seen as last on output
            pConv->Convert(NULL,NULL);
          }
      }

    else
    {
      //Structure search
      int MaxCandidates = 4000;
      p = pConv->IsOption("l",OBConversion::INOPTIONS);
      if(p && atoi(p))
        MaxCandidates = atoi(p);

      vector<unsigned int> SeekPositions;

      if(exactmatch)
      {
        //Find mols where all fingerprint bits are the same as the target
        fs.FindMatch(&patternMols[0], SeekPositions, MaxCandidates);
        // ensure that SMARTS filter in transform.cpp looks only for an exact match
        // by setting an option with the number of heavy atoms in the pattern mol included.
        stringstream ss;
        ss << patternMols[0].NumHvyAtoms();
        pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
      }

      else
      {
        //Do a substructure search for each target
        vector<OBMol>::iterator iter;
        for(iter=patternMols.begin();iter!=patternMols.end();++iter)
          fs.Find(&*iter, SeekPositions, MaxCandidates);
        clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
      }

      vector<unsigned int>::iterator seekitr,
          begin = SeekPositions.begin(), end = SeekPositions.end();

      if(patternMols.size()>1)//only sort and eliminate duplicates if necessary
      {
        sort(begin, end);
        end = unique(begin, end); //removed duplicates are after new end
      }

      //Output the candidate molecules, filtering through s filter, unless it was not requested
      if(pConv->IsOption("n", OBConversion::INOPTIONS) )
        pConv->RemoveOption("s",OBConversion::GENOPTIONS);

      pConv->SetLast(false);
      for(seekitr=begin; seekitr!=end; ++seekitr)
      {
        datastream.seekg(*seekitr);
        if(!pConv->GetInFormat()->ReadChemObject(pConv))
          return false;
        pConv->SetFirstInput(false); //needed for OpSort
      }
    }
    return false;	//To finish
  }
Beispiel #2
0
bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
{
    //Searches index file for structural matches
    //This function is called only once per search

    std::string auditMsg = "OpenBabel::Read fastsearch index ";
    std::string description(Description());
    auditMsg += description.substr(0,description.find('\n'));
    obErrorLog.ThrowError(__FUNCTION__,
                          auditMsg,
                          obAuditMsg);

    //Derive index name
    string indexname = pConv->GetInFilename();
    string::size_type pos=indexname.find_last_of('.');
    if(pos!=string::npos)
    {
        indexname.erase(pos);
        indexname += ".fs";
    }

    //Have to open input stream again because needs to be in binary mode
    ifstream ifs;
    stringstream errorMsg;
    if(!indexname.empty())
        ifs.open(indexname.c_str(),ios::binary);
    if(!ifs)
    {
        errorMsg << "Couldn't open " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
    }

    string datafilename = fs.ReadIndex(&ifs);
    if(datafilename.empty())
    {
        errorMsg << "Difficulty reading from index " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
    }

    OBMol patternMol;
    bool doSubset = pConv->IsOption("s",OBConversion::INOPTIONS)!=NULL;// -as option
    bool exactmatch = pConv->IsOption("e",OBConversion::INOPTIONS)!=NULL;// -ae option
    if(!doSubset)
    {
        //Similarity or substructure
        if(!ObtainTarget(pConv, patternMol, indexname))
            return false;
    }

    //Open the datafile and put it in pConv
    //datafile name derived from index file probably won't have a file path
    //but indexname may. Derive a full datafile name
    string path;
    pos = indexname.find_last_of("/\\");
    if(pos==string::npos)
        path = datafilename;
    else
        path = indexname.substr(0,pos+1) + datafilename;

    ifstream datastream(path.c_str());
    if(!datastream)
    {
        errorMsg << "Difficulty opening " << path << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
    }
    pConv->SetInStream(&datastream);

    //Input format is currently fs; set it appropriately
    if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str()),pConv->GetOutFormat()))
        return false;
    pConv->AddOption("b",OBConversion::GENOPTIONS);


    //Now do searching
    const char* p = pConv->IsOption("t",OBConversion::INOPTIONS);
    if(p)
    {
        //Do a similarity search
        multimap<double, unsigned int> SeekposMap;
        string txt=p;
        if(txt.find('.')==string::npos)
        {
            //Finds n molecules with largest Tanimoto
            int n = atoi(p);
            fs.FindSimilar(&patternMol, SeekposMap, n);
        }
        else
        {
            //Finds molecules with Tanimoto > MinTani
            double MinTani = atof(txt.c_str());
//            if(doSubset)
//              fs.FindSubset(SeekposMap, MinTani);
//            else
            fs.FindSimilar(&patternMol, SeekposMap, MinTani);
        }

        //Don't want to filter through SMARTS filter
        pConv->RemoveOption("s", OBConversion::GENOPTIONS);

        multimap<double, unsigned int>::reverse_iterator itr;
        for(itr=SeekposMap.rbegin(); itr!=SeekposMap.rend(); ++itr)
        {
            datastream.seekg(itr->second);

            if(pConv->IsOption("a", OBConversion::INOPTIONS))
            {
                //Adds Tanimoto coeff to title
                //First remove any previous value
                pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
                stringstream ss;
                ss << " " << itr->first;
                pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());

            }
            pConv->SetOneObjectOnly();
            if(itr != --SeekposMap.rend())
                pConv->SetMoreFilesToCome();//so that not seen as last on output
            pConv->Convert(NULL,NULL);
        }
    }

    else
    {
        //Structure search
        int MaxCandidates = 4000;
        p = pConv->IsOption("l",OBConversion::INOPTIONS);
        if(p && atoi(p))
            MaxCandidates = atoi(p);

        vector<unsigned int> SeekPositions;
        if(exactmatch)
        {
            //Find mols where all fingerprint bits are the same as the target
            fs.FindMatch(&patternMol, SeekPositions, MaxCandidates);
            // ensure that SMARTS filter in transform.cpp looks only for an exact match
            // by setting an option with the number of heavy atoms in the pattern mol included.
            stringstream ss;
            ss << patternMol.NumHvyAtoms();
            pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
        }

        else
        {
            //Do a substructure search
            fs.Find(&patternMol, SeekPositions, MaxCandidates);
            clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
        }

        //Output the candidate molecules
        //filtering through s filter, unless the fingerprint type does not require it
        if(fs.GetFingerprint()->Flags() & OBFingerprint::FPT_UNIQUEBITS)
            pConv->RemoveOption("s",OBConversion::GENOPTIONS);

        vector<unsigned int>::iterator itr;
        for(itr=SeekPositions.begin(); itr!=SeekPositions.end(); itr++)
        {
            datastream.seekg(*itr);
            //	datastream.seekg(*itr - datastream.tellg(), ios_base::cur); //Avoid retrieving start

            //debugging kludge to output all candidates directly
            if(pConv->IsOption("c",OBConversion::GENOPTIONS))
            {
                string ln;
                getline(datastream,ln);
                datastream.seekg(*itr);
                *pConv->GetOutStream() << "** " << ln << endl;
            }
            pConv->SetOneObjectOnly();
            pConv->SetLast(itr+1 == SeekPositions.end());
            pConv->Convert(NULL,NULL);
        }
    }
    return false;	//To finish
}