bool FastSearchFormat::ObtainTarget(OBConversion* pConv, vector<OBMol>& patternMols, const string& indexname)
  {
    //Obtains an OBMol from:
    // the filename in the -s option or
    // the SMARTS string in the -s option or
    // by converting the file in the -S or -aS options (deprecated).
    // If there is no -s -S or -aS option, information on the index file is displayed.

    OBMol patternMol;
    patternMol.SetIsPatternStructure();

    const char* p = pConv->IsOption("s",OBConversion::GENOPTIONS);

    bool OldSOption=false;
    //If no -s option, make OBMol from file in -S option or -aS option (both deprecated)
    if(!p)
    {
      p = pConv->IsOption("S",OBConversion::GENOPTIONS);
      if(!p)
        p = pConv->IsOption("S",OBConversion::INOPTIONS);//for GUI mainly
      OldSOption = true;
    }
    if(p)
    {
      vector<string> vec;
      tokenize(vec, p);

      //ignore leading ~ (not relevant to fastsearch)
      if(vec[0][0]=='~')
        vec[0].erase(0,1);

      if(vec.size()>1 && vec[1]=="exact")
        pConv->AddOption("e", OBConversion::INOPTIONS);

      OBConversion patternConv;
      OBFormat* pFormat;
      //Interpret as a filename if possible
      string& txt =vec [0];
      if( txt.empty() ||
          txt.find('.')==string::npos ||
          !(pFormat = patternConv.FormatFromExt(txt.c_str())) ||
          !patternConv.SetInFormat(pFormat) ||
          !patternConv.ReadFile(&patternMol, txt) ||
          patternMol.NumAtoms()==0)
        //if false, have a valid patternMol from a file
      {
        //is SMARTS/SMILES
        //Replace e.g. [#6] in SMARTS by C so that it can be converted as SMILES
        //for the fingerprint phase, but allow more generality in the SMARTS phase.
        for(;;)
        {
          string::size_type pos1, pos2;
          pos1 = txt.find("[#");
          if(pos1==string::npos)
            break;
          pos2 = txt.find(']');
          int atno;
          if(pos2!=string::npos &&  (atno = atoi(txt.substr(pos1+2, pos2-pos1-2).c_str())) && atno>0)
            txt.replace(pos1, pos2-pos1+1, etab.GetSymbol(atno));
          else
          {
            obErrorLog.ThrowError(__FUNCTION__,"Ill-formed [#n] atom in SMARTS", obError);
            return false;
          }
        }

        bool hasTildeBond;
        if( (hasTildeBond = (txt.find('~')!=string::npos)) ) // extra parens to indicate truth value
        {
          //Find ~ bonds and make versions of query molecule with a single and aromatic bonds
          //To avoid having to parse the SMILES here, replace ~ by $ (quadruple bond)
          //and then replace this in patternMol. Check first that there are no $ already
          //Sadly, isocynanides may have $ bonds.
          if(txt.find('$')!=string::npos)
          {
            obErrorLog.ThrowError(__FUNCTION__,
              "Cannot use ~ bonds in patterns with $ (quadruple) bonds.)", obError);
            return false;
          }
          replace(txt.begin(),txt.end(), '~' , '$');
        }

        //read as standard SMILES
        patternConv.SetInFormat("smi");
        if(!patternConv.ReadString(&patternMol, vec[0]))
        {
          obErrorLog.ThrowError(__FUNCTION__,"Cannot read the SMILES string",obError);
          return false;
        }
        if(hasTildeBond)
        {
          AddPattern(patternMols, patternMol, 0); //recursively add all combinations of tilde bond values
          return true;
        }
      }
      else
      {
        // target(s) are in a file
        patternMols.push_back(patternMol);
        while(patternConv.Read(&patternMol))
          patternMols.push_back(patternMol);
        return true;
      }
    }

    if(OldSOption) //only when using deprecated -S and -aS options
    {
      //make -s option for later SMARTS test
      OBConversion conv;
      if(conv.SetOutFormat("smi"))
      {
        string optiontext = conv.WriteString(&patternMol, true);
        pConv->AddOption("s", OBConversion::GENOPTIONS, optiontext.c_str());
      }
    }

    if(!p)
    {
      //neither -s or -S options provided. Output info rather than doing search
      const FptIndexHeader& header = fs.GetIndexHeader();
      string id(header.fpid);
      if(id.empty())
        id = "default";
      clog << indexname << " is an index of\n " << header.datafilename
           << ".\n It contains " << header.nEntries
           << " molecules. The fingerprint type is " << id << " with "
           << OBFingerprint::Getbitsperint() * header.words << " bits.\n"
           << "Typical usage for a substructure search:\n"
           << "obabel indexfile.fs -osmi -sSMILES\n"
           << "(-s option in GUI is 'Convert only if match SMARTS or mols in file')" << endl;
      return false;
    }

    patternMols.push_back(patternMol);
    return true;
  }
Exemple #2
0
bool FastSearchFormat::ObtainTarget(OBConversion* pConv, OBMol& patternMol, const string& indexname)
{
    //Obtains an OBMol
    //   either from the SMARTS string in the -s option
    //   or by converting the file in the -S option
    //or, if neither option is provided, displays information on the index file.

    stringstream smiles(stringstream::out);
    ifstream patternstream;
    OBConversion PatternConv(&patternstream,&smiles);

    const char* p = pConv->IsOption("s",OBConversion::GENOPTIONS);
    string txt;
    if(p)
    {
        // Use the -s option
        txt=p;
        stringstream smarts(txt, stringstream::in);
        OBConversion Convsm(&smarts);
        if(!Convsm.SetInFormat("smi")) return false;
        Convsm.Read(&patternMol);

        //erase -s option in GeneralOptions since it will be rewritten
        pConv->RemoveOption("s",OBConversion::GENOPTIONS);
        if(patternMol.Empty())
        {
            obErrorLog.ThrowError(__FUNCTION__,
                                  "Could not make a molecule from " + smarts.str()
                                  + "\nThis needs to be valid SMILES when using fastsearch."
                                  "You can use the more versatile SMARTS in a normal substructure search." , obError);
            return false;
        }
    }
    else
    {
        // or Make OBMol from file in -S option or -aS option
        p = pConv->IsOption("S",OBConversion::GENOPTIONS);
        if(!p)
            p = pConv->IsOption("S",OBConversion::INOPTIONS);//for GUI mainly
    }

    if(!p)
    {
        //neither -s or -S options provided. Output info rather than doing search
        const FptIndexHeader& header = fs.GetIndexHeader();
        string id(header.fpid);
        if(id.empty())
            id = "default";
        clog << indexname << " is an index of\n " << header.datafilename
             << ".\n It contains " << header.nEntries
             << " molecules. The fingerprint type is " << id << " with "
             << OBFingerprint::Getbitsperint() * header.words << " bits.\n"
             << "Typical usage for a substructure search:\n"
             << "babel indexfile.fs -osmi -sSMILES" << endl;
        return false;
    }

    if(p && patternMol.Empty())
    {
        txt=p;
        string::size_type pos = txt.find_last_of('.');
        if(pos==string::npos)
        {
            obErrorLog.ThrowError(__FUNCTION__, "Filename of pattern molecule in -S option must have an extension", obError);
            return false;
        }
        patternstream.open(txt.c_str());
        if(!patternstream)
        {
            stringstream errorMsg;

            errorMsg << "Cannot open " << txt << endl;
            obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
            return false;
        }

        PatternConv.SetOneObjectOnly();
        if(PatternConv.SetInFormat(txt.substr(pos+1).c_str()))
            PatternConv.Read(&patternMol);
    }

    if(patternMol.Empty())
    {
        obErrorLog.ThrowError(__FUNCTION__, "Cannot derive a molecule from the -s or -S options", obWarning);
        return false;
    }
    patternMol.ConvertDativeBonds();//use standard form for dative bonds

    //Convert to SMILES and generate a -s option for use in the final filtering
    if(!PatternConv.SetOutFormat("smi"))
        return false;
    PatternConv.Write(&patternMol);
    //remove name to leave smiles string
    string smilesstr(smiles.str());
    string::size_type pos = smilesstr.find_first_of(" \t\r\n");
    if(pos!=string::npos)
        smilesstr = smilesstr.substr(0,pos);
    pConv->AddOption("s", OBConversion::GENOPTIONS, smilesstr.c_str());

    return true;
}
  bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
  {
    //Searches index file for structural matches
    //This function is called only once per search

    std::string auditMsg = "OpenBabel::Read fastsearch index ";
    std::string description(Description());
    auditMsg += description.substr(0,description.find('\n'));
    obErrorLog.ThrowError(__FUNCTION__,
                          auditMsg,
                          obAuditMsg);

    //Derive index name
    string indexname = pConv->GetInFilename();
    string::size_type pos=indexname.find_last_of('.');
    if(pos!=string::npos)
      {
        indexname.erase(pos);
        indexname += ".fs";
      }

    //Have to open input stream again because needs to be in binary mode
    ifstream ifs;
    stringstream errorMsg;
    if(!indexname.empty())
      ifs.open(indexname.c_str(),ios::binary);
    if(!ifs)
      {
        errorMsg << "Couldn't open " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
      }

    string datafilename = fs.ReadIndex(&ifs);
    if(datafilename.empty())
      {
        errorMsg << "Difficulty reading from index " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
      }

    vector<OBMol> patternMols;
    if(!ObtainTarget(pConv, patternMols, indexname))
      return false;

    bool exactmatch = pConv->IsOption("e",OBConversion::INOPTIONS)!=NULL;// -ae option

    //Open the datafile and put it in pConv
    //datafile name derived from index file probably won't have a file path
    //but indexname may. Derive a full datafile name
    string path;
    pos = indexname.find_last_of("/\\");
    if(pos==string::npos)
      path = datafilename;
    else
      path = indexname.substr(0,pos+1) + datafilename;

    ifstream datastream(path.c_str());
    if(!datastream)
      {
        errorMsg << "Difficulty opening " << path << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
      }
    pConv->SetInStream(&datastream);

    //Input format is currently fs; set it appropriately
    if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str()),pConv->GetOutFormat()))
      return false;

    // If target has dative bonds like -[N+](=O)[O-] convert it to the uncharged form
    // (-N(=O)=O and add uncharged form to vector of mols which are sent to
    // the -s (SMARTS)filter.
    // Also check whether the target has dative bonds in the uncharged form and supply
    // the charged form to the -s filter.
    // Together with the automatic conversion to the uncharged form when the fs index is made,
    // this ensures that both forms are found however they occur in the datafile or the taget.
    vector<OBBase*> extraSMARTSMols;
    vector<OBMol>extraUnchargedMols;
    for(unsigned i=0;i<patternMols.size();++i)
    {
      if(patternMols[i].ConvertDativeBonds())
        extraSMARTSMols.push_back(&patternMols[i]);
      else 
      {
        // If target has uncharged dative bonds, still use it for fastsearching,
        // but add the charged form for -s filter.
        extraUnchargedMols.push_back(patternMols[i]);
        if(extraUnchargedMols.back().MakeDativeBonds())
          extraSMARTSMols.push_back(&extraUnchargedMols.back());
      }
    }
    OBOp* sFilter = OBOp::FindType("s");
    if(sFilter)
      sFilter->ProcessVec(extraSMARTSMols);

    //Now do searching
    const char* p = pConv->IsOption("t",OBConversion::INOPTIONS);
    if(p)
      {
        //Do a similarity search
        multimap<double, unsigned int> SeekposMap;
        string txt=p;
        if(txt.find('.')==string::npos)
          {
            //Finds n molecules with largest Tanimoto
            int n = atoi(p);
            fs.FindSimilar(&patternMols[0], SeekposMap, n);
          }
        else
          {
            //Finds molecules with Tanimoto > MinTani
            double MaxTani = 1.1;
            size_t pos = txt.find(',');
            if( pos != string::npos ) {
              MaxTani = atof( txt.substr( pos + 1 ).c_str() );
            }
            double MinTani = atof( txt.substr( 0, pos ).c_str() );
            fs.FindSimilar(&patternMols[0], SeekposMap, MinTani, MaxTani);
          }

        //Don't want to filter through SMARTS filter
        pConv->RemoveOption("s", OBConversion::GENOPTIONS);
        //also because op names are case independent
        pConv->RemoveOption("S", OBConversion::GENOPTIONS);

        multimap<double, unsigned int>::reverse_iterator itr;
        for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
          {
            datastream.seekg(itr->second);

            if(pConv->IsOption("a", OBConversion::INOPTIONS))
              {
                //Adds Tanimoto coeff to title
                //First remove any previous value
                pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
                stringstream ss;
                ss << " " << itr->first;
                pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());

              }
            pConv->SetOneObjectOnly();
            if(itr != --SeekposMap.rend())
              pConv->SetMoreFilesToCome();//so that not seen as last on output
            pConv->Convert(NULL,NULL);
          }
      }

    else
    {
      //Structure search
      int MaxCandidates = 4000;
      p = pConv->IsOption("l",OBConversion::INOPTIONS);
      if(p && atoi(p))
        MaxCandidates = atoi(p);

      vector<unsigned int> SeekPositions;

      if(exactmatch)
      {
        //Find mols where all fingerprint bits are the same as the target
        fs.FindMatch(&patternMols[0], SeekPositions, MaxCandidates);
        // ensure that SMARTS filter in transform.cpp looks only for an exact match
        // by setting an option with the number of heavy atoms in the pattern mol included.
        stringstream ss;
        ss << patternMols[0].NumHvyAtoms();
        pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
      }

      else
      {
        //Do a substructure search for each target
        vector<OBMol>::iterator iter;
        for(iter=patternMols.begin();iter!=patternMols.end();++iter)
          fs.Find(&*iter, SeekPositions, MaxCandidates);
        clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
      }

      vector<unsigned int>::iterator seekitr,
          begin = SeekPositions.begin(), end = SeekPositions.end();

      if(patternMols.size()>1)//only sort and eliminate duplicates if necessary
      {
        sort(begin, end);
        end = unique(begin, end); //removed duplicates are after new end
      }

      //Output the candidate molecules, filtering through s filter, unless it was not requested
      if(pConv->IsOption("n", OBConversion::INOPTIONS) )
        pConv->RemoveOption("s",OBConversion::GENOPTIONS);

      pConv->SetLast(false);
      for(seekitr=begin; seekitr!=end; ++seekitr)
      {
        datastream.seekg(*seekitr);
        if(!pConv->GetInFormat()->ReadChemObject(pConv))
          return false;
        pConv->SetFirstInput(false); //needed for OpSort
      }
    }
    return false;	//To finish
  }
Exemple #4
0
bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
{
    //Searches index file for structural matches
    //This function is called only once per search

    std::string auditMsg = "OpenBabel::Read fastsearch index ";
    std::string description(Description());
    auditMsg += description.substr(0,description.find('\n'));
    obErrorLog.ThrowError(__FUNCTION__,
                          auditMsg,
                          obAuditMsg);

    //Derive index name
    string indexname = pConv->GetInFilename();
    string::size_type pos=indexname.find_last_of('.');
    if(pos!=string::npos)
    {
        indexname.erase(pos);
        indexname += ".fs";
    }

    //Have to open input stream again because needs to be in binary mode
    ifstream ifs;
    stringstream errorMsg;
    if(!indexname.empty())
        ifs.open(indexname.c_str(),ios::binary);
    if(!ifs)
    {
        errorMsg << "Couldn't open " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
    }

    string datafilename = fs.ReadIndex(&ifs);
    if(datafilename.empty())
    {
        errorMsg << "Difficulty reading from index " << indexname << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
    }

    OBMol patternMol;
    bool doSubset = pConv->IsOption("s",OBConversion::INOPTIONS)!=NULL;// -as option
    bool exactmatch = pConv->IsOption("e",OBConversion::INOPTIONS)!=NULL;// -ae option
    if(!doSubset)
    {
        //Similarity or substructure
        if(!ObtainTarget(pConv, patternMol, indexname))
            return false;
    }

    //Open the datafile and put it in pConv
    //datafile name derived from index file probably won't have a file path
    //but indexname may. Derive a full datafile name
    string path;
    pos = indexname.find_last_of("/\\");
    if(pos==string::npos)
        path = datafilename;
    else
        path = indexname.substr(0,pos+1) + datafilename;

    ifstream datastream(path.c_str());
    if(!datastream)
    {
        errorMsg << "Difficulty opening " << path << endl;
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
        return false;
    }
    pConv->SetInStream(&datastream);

    //Input format is currently fs; set it appropriately
    if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str()),pConv->GetOutFormat()))
        return false;
    pConv->AddOption("b",OBConversion::GENOPTIONS);


    //Now do searching
    const char* p = pConv->IsOption("t",OBConversion::INOPTIONS);
    if(p)
    {
        //Do a similarity search
        multimap<double, unsigned int> SeekposMap;
        string txt=p;
        if(txt.find('.')==string::npos)
        {
            //Finds n molecules with largest Tanimoto
            int n = atoi(p);
            fs.FindSimilar(&patternMol, SeekposMap, n);
        }
        else
        {
            //Finds molecules with Tanimoto > MinTani
            double MinTani = atof(txt.c_str());
//            if(doSubset)
//              fs.FindSubset(SeekposMap, MinTani);
//            else
            fs.FindSimilar(&patternMol, SeekposMap, MinTani);
        }

        //Don't want to filter through SMARTS filter
        pConv->RemoveOption("s", OBConversion::GENOPTIONS);

        multimap<double, unsigned int>::reverse_iterator itr;
        for(itr=SeekposMap.rbegin(); itr!=SeekposMap.rend(); ++itr)
        {
            datastream.seekg(itr->second);

            if(pConv->IsOption("a", OBConversion::INOPTIONS))
            {
                //Adds Tanimoto coeff to title
                //First remove any previous value
                pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
                stringstream ss;
                ss << " " << itr->first;
                pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());

            }
            pConv->SetOneObjectOnly();
            if(itr != --SeekposMap.rend())
                pConv->SetMoreFilesToCome();//so that not seen as last on output
            pConv->Convert(NULL,NULL);
        }
    }

    else
    {
        //Structure search
        int MaxCandidates = 4000;
        p = pConv->IsOption("l",OBConversion::INOPTIONS);
        if(p && atoi(p))
            MaxCandidates = atoi(p);

        vector<unsigned int> SeekPositions;
        if(exactmatch)
        {
            //Find mols where all fingerprint bits are the same as the target
            fs.FindMatch(&patternMol, SeekPositions, MaxCandidates);
            // ensure that SMARTS filter in transform.cpp looks only for an exact match
            // by setting an option with the number of heavy atoms in the pattern mol included.
            stringstream ss;
            ss << patternMol.NumHvyAtoms();
            pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
        }

        else
        {
            //Do a substructure search
            fs.Find(&patternMol, SeekPositions, MaxCandidates);
            clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
        }

        //Output the candidate molecules
        //filtering through s filter, unless the fingerprint type does not require it
        if(fs.GetFingerprint()->Flags() & OBFingerprint::FPT_UNIQUEBITS)
            pConv->RemoveOption("s",OBConversion::GENOPTIONS);

        vector<unsigned int>::iterator itr;
        for(itr=SeekPositions.begin(); itr!=SeekPositions.end(); itr++)
        {
            datastream.seekg(*itr);
            //	datastream.seekg(*itr - datastream.tellg(), ios_base::cur); //Avoid retrieving start

            //debugging kludge to output all candidates directly
            if(pConv->IsOption("c",OBConversion::GENOPTIONS))
            {
                string ln;
                getline(datastream,ln);
                datastream.seekg(*itr);
                *pConv->GetOutStream() << "** " << ln << endl;
            }
            pConv->SetOneObjectOnly();
            pConv->SetLast(itr+1 == SeekPositions.end());
            pConv->Convert(NULL,NULL);
        }
    }
    return false;	//To finish
}