Пример #1
0
 /// Parses a comma-separated list of numbers that is terminated by the
 /// ";>" end tag, appending each parsed value to \c res.
 ///
 /// \param inLine    first chunk of text to tokenize
 /// \param res       receives the parsed values, in the order encountered
 /// \param inStream  optional stream; when supplied, additional lines are
 ///                  pulled from it until the end tag is seen
 ///
 /// \throws FileParseException if the input is exhausted before a ";>"
 ///         end tag is found. A failing boost::lexical_cast propagates
 ///         its own exception.
 void ParseNumberList(std::string inLine,
                      std::vector<T> &res,
                      std::istream *inStream=0){
   bool foundEnd=false;
   while(!foundEnd) {
     CommaTokenizer commaTok(inLine);
     for(CommaTokenizer::const_iterator commaTokIt=commaTok.begin();
         commaTokIt!=commaTok.end();
         ++commaTokIt){
       std::string number=*commaTokIt;
       const std::string::size_type endTagPos=number.find(";>");
       const bool atEnd= endTagPos!=std::string::npos;
       if(atEnd){
         // keep only what precedes the end tag: a final value written as
         // "1.5;>" (no comma before the tag) must not be thrown away
         number.erase(endTagPos);
       }
       boost::trim_if(number,boost::is_any_of(" \r\n\t;>"));
       if(number!=""){
         res.push_back(boost::lexical_cast<T>(number));
       }
       if(atEnd){
         // that's it, we're done:
         foundEnd=true;
         break;
       }
     }
     if(foundEnd || !inStream || inStream->eof()){
       break;
     }
     // stop if the read fails: a failed getline leaves inLine unchanged,
     // so continuing would re-parse the same line forever
     if(!std::getline(*inStream,inLine)){
       break;
     }
   }
   if(!foundEnd){
     throw FileParseException("no end tag found for numeric list");
   }
 }
Пример #2
0
// Reads the molecule at the current index (d_last) and advances the index.
//
// Returns the result of _next(), or NULL when the stream reports EOF right
// after seeking to the stored position.
// Throws FileParseException when the end flag is set and the index is
// already at or past the known length.
ROMol *SDMolSupplier::next() {
  PRECONDITION(dp_inStream, "no stream");

  const bool pastKnownEnd = df_end && d_last >= d_len;
  if (pastKnownEnd) {
    throw FileParseException("EOF hit.");
  }

  // position the stream at the start of the requested record
  dp_inStream->seekg(d_molpos[d_last]);

  ROMol *mol = NULL;
  if (dp_inStream->eof()) {
    // FIX: we should probably be throwing an exception here
    // nothing left to read: record the final length and flag the end
    df_end = true;
    d_len = d_molpos.size();
    return mol;
  }

  mol = _next();
  ++d_last;

  // remember where the following record starts before probing for the end
  const std::streampos followingPos = dp_inStream->tellg();
  this->checkForEnd();

  const bool positionUnknown = d_last >= static_cast<int>(d_molpos.size());
  if (positionUnknown && !this->df_end) {
    d_molpos.push_back(followingPos);
  }

  return mol;
}
Пример #3
0
  void TDTMolSupplier::moveTo(unsigned int idx) {
    PRECONDITION(dp_inStream,"no stream");
    CHECK_INVARIANT(idx >= 0, "");

    // dp_inStream->seekg() is called for all idx values
    // and earlier calls to next() may have put the stream into a bad state
    dp_inStream->clear();

    // move until we hit the desired idx
    if (idx < d_molpos.size() ) {
      dp_inStream->seekg(d_molpos[idx]);
      d_last = idx;
    }
    else {
      std::string tempStr;
      d_last = d_molpos.size() - 1;
      dp_inStream->seekg(d_molpos.back());
      while ((d_last < static_cast<int>(idx)) && (!dp_inStream->eof()) ) {
        d_line++;
        std::getline(*dp_inStream,tempStr);
        
        if (tempStr.find("|") == 0) {
          d_molpos.push_back(dp_inStream->tellg());
          d_last++;
        }
      }
      // if we reached end of file without reaching "idx" we have an index error
      if (dp_inStream->eof()) {
        d_len = d_molpos.size();
        std::ostringstream errout;
        errout << "ERROR: Index error (idx = " << idx  << ") : " << " we do no have enough molecule blocks";
        throw FileParseException(errout.str());
      }
    }
  }
Пример #4
0
 // Builds a writer on top of an existing output stream.
 //
 // outStream must be non-null; takeOwnership is stored in df_owner.
 // Throws FileParseException when the stream's bad() flag is set.
 SDWriter::SDWriter(std::ostream *outStream,bool takeOwnership) {
   PRECONDITION(outStream,"null stream");
   const bool streamIsBad = outStream->bad();
   if (streamIsBad) {
     throw FileParseException("Bad output stream");
   }
   // the molecule counter starts at zero
   d_molid = 0;
   df_owner = takeOwnership;
   dp_ostream = outStream;
 }
Пример #5
0
 // Builds a writer on top of an existing output stream.
 //
 // outStream must be non-null; takeOwnership is stored in df_owner.
 // Throws FileParseException when the stream's bad() flag is set.
 TDTWriter::TDTWriter(std::ostream *outStream, bool takeOwnership) {
   PRECONDITION(outStream,"null stream");
   const bool streamIsBad = outStream->bad();
   if (streamIsBad) {
     throw FileParseException("Bad output stream");
   }

   // stream bookkeeping
   df_owner = takeOwnership;
   dp_ostream = outStream;
   d_molid = 0;

   // initial output settings
   df_writeNames=true;
   df_write2D=false;
   d_numDigits=4;
 }
Пример #6
0
void SDMolSupplier::moveTo(unsigned int idx) {
  PRECONDITION(dp_inStream, "no stream");

  // dp_inStream->seekg() is called for all idx values
  // and earlier calls to next() may have put the stream into a bad state
  dp_inStream->clear();

  // move until we hit the desired idx
  if (idx < d_molpos.size()) {
    dp_inStream->seekg(d_molpos[idx]);
    d_last = idx;
  } else {
    std::string tempStr;
    dp_inStream->seekg(d_molpos.back());
    d_last = d_molpos.size() - 1;
    while ((d_last < static_cast<int>(idx)) && (!dp_inStream->eof())) {
      d_line++;
      tempStr = getLine(dp_inStream);

      if (tempStr[0] == '$' && tempStr.substr(0, 4) == "$$$$") {
        std::streampos posHold = dp_inStream->tellg();
        this->checkForEnd();
        if (!this->df_end) {
          d_molpos.push_back(posHold);
          d_last++;
        }
      }
    }
    // if we reached end of file without reaching "idx" we have an index error
    if (dp_inStream->eof()) {
      d_len = d_molpos.size();
      std::ostringstream errout;
      errout << "ERROR: Index error (idx = " << idx << ") : "
             << " we do no have enough mol blocks";
      throw FileParseException(errout.str());
    }
  }
}
Пример #7
0
  // Builds a molecule from the SMILES found between the first '<' and the
  // last '>' of inLine, then reads the property lines that follow from the
  // stream until a line starting with "|" (or EOF) is reached.
  //
  // Property handling:
  //   - "2D" / "3D" (only when the matching conformer id is >= 0): the
  //     numeric list is parsed into a new conformer; atoms for which not
  //     enough coordinates were supplied are left untouched.
  //   - anything else: stored as a string property; if the property name
  //     equals d_nameProp the value is also stored as "_Name".
  //
  // Returns the molecule produced by SmilesToMol (may be NULL); the caller
  // owns it.
  // Throws FileParseException if a property line has no closing '>' or a
  // numeric list has no end tag. The partially built molecule is deleted
  // before any exception leaves this function.
  ROMol *TDTMolSupplier::parseMol(std::string inLine){
    PRECONDITION(dp_inStream,"no stream");
    Utils::LocaleSwitcher ls;
    std::size_t startP=inLine.find("<");
    std::size_t endP=inLine.find_last_of(">");
    std::string smiles = inLine.substr(startP+1,endP-startP-1);
    ROMol *res = SmilesToMol(smiles,0,df_sanitize);

    if(res && res->getNumAtoms()>0){
      try {
        // -----------
        //   Process the properties:
        d_line++;
        std::getline(*dp_inStream,inLine);
        while(!dp_inStream->eof() && inLine.find("|")!=0){
          // the property name is everything before the first '<'
          endP=inLine.find("<");
          std::string propName = inLine.substr(0,endP);
          boost::trim_if(propName,boost::is_any_of(" \t"));
          startP = endP+1;

          if(propName=="2D" && d_confId2D>=0){
            std::string rest=inLine.substr(startP,inLine.size()-startP);
            std::vector<double> coords;
            // may consume further lines from the stream
            TDTParseUtils::ParseNumberList(rest,coords,dp_inStream);
            Conformer *conf=new Conformer(res->getNumAtoms());
            conf->setId(d_confId2D);
            conf->set3D(false);
            for(unsigned int atIdx=0;atIdx<res->getNumAtoms();atIdx++){
              if(2*atIdx+1 < coords.size()){
                conf->setAtomPos(atIdx,RDGeom::Point3D(coords[2*atIdx],coords[2*atIdx+1],0.0));
              } else {
                // we're going to let this slide... but maybe we should do something else?
              }
            }
            res->addConformer(conf,false);
          } else if(propName=="3D" && d_confId3D>=0){
            std::string rest=inLine.substr(startP,inLine.size()-startP);
            std::vector<double> coords;
            // may consume further lines from the stream
            TDTParseUtils::ParseNumberList(rest,coords,dp_inStream);
            Conformer *conf=new Conformer(res->getNumAtoms());
            conf->setId(d_confId3D);
            conf->set3D(true);
            for(unsigned int atIdx=0;atIdx<res->getNumAtoms();atIdx++){
              if(3*atIdx+2 < coords.size()){
                conf->setAtomPos(atIdx,RDGeom::Point3D(coords[3*atIdx],
                                                       coords[3*atIdx+1],
                                                       coords[3*atIdx+2]));
              } else {
                // we're going to let this slide... but maybe we should do something else?
              }
            }
            res->addConformer(conf,false);
          } else {
            endP=inLine.find_last_of(">");
            if(endP==std::string::npos){
              std::ostringstream errout;
              errout << "no end tag found for property " << propName;
              throw FileParseException(errout.str());
            } else {
              std::string propVal = inLine.substr(startP,endP-startP);
              res->setProp(propName,propVal);
              if(propName==d_nameProp) res->setProp("_Name",propVal);
            }
          }
          // NOTE(review): d_line is not advanced here (nor for lines consumed
          // inside ParseNumberList), unlike the first read above - confirm
          // whether the counter is meant to track these lines.
          std::getline(*dp_inStream,inLine);
        }
      } catch (...) {
        // the caller never receives the pointer when we throw, so release
        // the molecule here instead of leaking it
        delete res;
        throw;
      }
    }

    return res;
  }
Пример #8
0
// --------------------------------------------------
//
//  Positions the stream at the start of entry idx so that
//  the next read returns that entry.
//
//  Throws a FileParseException when the stream does not
//  contain enough entries.
//
void SmilesMolSupplier::moveTo(unsigned int idx) {
  PRECONDITION(dp_inStream, "bad instream");

  // Quick rejection: the total length is already known and idx lies
  // beyond it.
  const bool lengthKnown = d_len > -1;
  if (lengthKnown && idx >= static_cast<unsigned int>(d_len)) {
    df_end = true;
    std::ostringstream msg;
    msg << "ERROR: Index error (idx = " << idx << "): "
        << "ran out of lines\n";
    throw FileParseException(msg.str());
  }

  // seekg() is used on every path below, and earlier reads may have left
  // error flags (including EOF) set on the stream, so reset them first.
  dp_inStream->clear();

  // -----------
  // Already-indexed entry: seek directly to the cached position and
  // restore the matching line number.
  // -----------
  if (d_molpos.size() > idx) {
    df_end = false;
    dp_inStream->seekg(d_molpos[idx]);
    d_next = idx;
    d_line = d_lineNums[idx];
    return;
  }

  // -----------
  // Not yet indexed: get to the end of what has been indexed so far,
  // then keep indexing forward until idx is covered.
  // -----------
  if (!d_molpos.empty()) {
    // jump to the last indexed entry and consume its line
    dp_inStream->seekg(d_molpos.back());
    std::string consumed = getLine(dp_inStream);
  } else {
    // nothing indexed yet: start from the top, handling the title line
    dp_inStream->seekg(0);
    if (df_title) {
      this->processTitleLine();
    }
  }

  while (d_molpos.size() <= idx) {
    const int entryPos = this->skipComments();
    if (entryPos < 0) {
      std::ostringstream msg;
      msg << "ERROR: Index error (idx = " << idx << "): "
          << "ran out of lines\n";
      throw FileParseException(msg.str());
    }

    d_molpos.push_back(entryPos);
    d_lineNums.push_back(d_line);

    const bool reachedTarget = d_molpos.size() == idx + 1;
    if (reachedTarget && df_end) {
      // The requested entry exists but nothing follows it: park the
      // stream at the very end and record the final length.
      dp_inStream->clear();
      dp_inStream->seekg(0, std::ios_base::end);
      d_len = d_molpos.size();
      break;
    }
  }

  POSTCONDITION(d_molpos.size() > idx, "not enough lines");
  dp_inStream->seekg(d_molpos[idx]);
  d_next = idx;
}
Пример #9
0
// Converts one delimited text line into a molecule.
//
// The line is split on the characters in d_delim (empty tokens are kept) and
// each token is passed through strip(). Column d_smi supplies the SMILES;
// column d_name (or the current line number when d_name == -1) supplies the
// molecule name; every other column is stored as a string property, named
// from d_props when available and "Column_<n>" otherwise.
//
// Returns the new molecule (owned by the caller), or NULL if anything goes
// wrong. All exceptions are caught and logged here; none propagate.
ROMol *SmilesMolSupplier::processLine(std::string inLine) {
  ROMol *res = NULL;

  try {
    // -----------
    // tokenize the input line:
    // -----------
    boost::char_separator<char> sep(d_delim.c_str(), "",
                                    boost::keep_empty_tokens);
    tokenizer tokens(inLine, sep);
    STR_VECT recs;
    for (tokenizer::iterator tokIter = tokens.begin(); tokIter != tokens.end();
         ++tokIter) {
      recs.push_back(strip(*tokIter));
    }
    if (recs.size() <= static_cast<unsigned int>(d_smi)) {
      std::ostringstream errout;
      errout << "ERROR: line #" << d_line
             << " does not contain enough tokens\n";
      throw FileParseException(errout.str());
    }

    // -----------
    // get the smiles and create a molecule
    // -----------
    res = SmilesToMol(recs[d_smi], 0, df_sanitize);
    if (!res) {
      std::stringstream errout;
      errout << "Cannot create molecule from : '" << recs[d_smi] << "'";
      throw SmilesParseException(errout.str());
    }

    // -----------
    // get the name (if there's a name column)
    // -----------
    if (d_name == -1) {
      // no name column: fall back to the line number the entry was read from
      std::ostringstream tstr;
      tstr << d_line;
      std::string mname = tstr.str();
      res->setProp(common_properties::_Name, mname);
    } else {
      if (d_name >= static_cast<int>(recs.size())) {
        BOOST_LOG(rdWarningLog) << "WARNING: no name column found on line "
                                << d_line << std::endl;
      } else {
        res->setProp(common_properties::_Name, recs[d_name]);
      }
    }

    // -----------
    // read in the properties
    // -----------
    for (unsigned int col = 0; col < recs.size(); col++) {
      // the SMILES and name columns have already been consumed
      if (static_cast<int>(col) == d_smi || static_cast<int>(col) == d_name)
        continue;
      std::string pname, pval;
      if (d_props.size() > col) {
        pname = d_props[col];
      } else {
        // no configured name for this column: synthesize one
        pname = "Column_";
        std::stringstream ss;
        ss << col;
        pname += ss.str();
      }

      pval = recs[col];
      res->setProp(pname, pval);
    }

  } catch (const SmilesParseException &pe) {
    // Couldn't parse the passed in smiles
    // Simply print out a message
    BOOST_LOG(rdErrorLog) << "ERROR: Smiles parse error on line " << d_line
                          << "\n";
    BOOST_LOG(rdErrorLog) << "ERROR: " << pe.message() << "\n";
    // release anything that was built before the failure (no-op on NULL)
    delete res;
    res = NULL;
  } catch (const MolSanitizeException &se) {
    // We couldn't sanitize the molecule
    //  write out an error message
    BOOST_LOG(rdErrorLog) << "ERROR: Could not sanitize molecule on line "
                          << d_line << std::endl;
    BOOST_LOG(rdErrorLog) << "ERROR: " << se.message() << "\n";
    delete res;
    res = NULL;
  } catch (...) {
    //  write out an error message
    BOOST_LOG(rdErrorLog) << "ERROR: Could not process molecule on line "
                          << d_line << std::endl;
    // the molecule may already exist if a later step threw; don't leak it
    delete res;
    res = NULL;
  }

  return res;
}
Пример #10
0
  // Reads the data items that follow a mol block and stores them as string
  // properties on mol, stopping at a line that begins with "$$$$" or at EOF.
  //
  // Each data item starts with a header line beginning with '>'. When the
  // header carries a non-empty label enclosed in '<' and '>', the following
  // lines up to the next blank line become the property value (joined with
  // '\n', trailing '\r' removed). Items without such a label are skipped.
  //
  // Throws FileParseException if a non-blank line that is not a data header
  // is found where one is expected, or if EOF is hit while skipping an
  // unlabeled data item.
  void ForwardSDMolSupplier::readMolProps(ROMol *mol){
    PRECONDITION(dp_inStream,"no stream");
    PRECONDITION(mol,"no molecule");
    d_line++;
    std::string tempStr;
    std::getline(*dp_inStream,tempStr);

    // FIX: report files missing the $$$$ marker
    while(!(dp_inStream->eof()) && (tempStr[0]!='$'||tempStr.substr(0,4)!="$$$$") ){
      tempStr = strip(tempStr);
      if(tempStr!=""){
        if (tempStr[0] == '>') { // data header line: start of a data item
            // ignore everything else on the line and look for a data label
            // enclosed by '<' and '>'
            // FIX: "CTfile.pdf" (page 51) says that a data header line does not
            // have to contain a data label (it can instead have something like a
            // field id into a MACCS db). But we do not currently know what to do
            // in this situation - so ignore such data items for now
            tempStr.erase(0,1); // remove the first ">" sign
            const std::string::size_type sl = tempStr.find("<"); // begin datalabel
            const std::string::size_type se = tempStr.find(">"); // end datalabel
            if ((sl == std::string::npos) || (se == std::string::npos) || (se == (sl+1)) ) {
              // we either do not have a data label or the label is empty:
              // ignore everything until the next data item,
              // i.e. until we hit a blank line
              d_line++;
              std::getline(*dp_inStream,tempStr);
              std::string stmp = strip(tempStr);
              while (stmp.length() != 0) {
                d_line++;
                std::getline(*dp_inStream,tempStr);
                if(dp_inStream->eof()) throw FileParseException("End of data field name not found");
                // re-evaluate the freshly read line; without this the loop
                // could only ever exit through the EOF exception above
                stmp = strip(tempStr);
              }
            } else {
              std::string dlabel = tempStr.substr(sl+1, se-sl-1);
              // we know the label - now read in the relevant properties
              // until we hit a blank line
              d_line++;
              std::getline(*dp_inStream,tempStr);

              std::string prop="";
              std::string stmp = strip(tempStr);
              int nplines = 0; // number of lines for this property
              // a line consisting only of whitespace but starting with a space
              // or tab still counts as part of the value
              while (stmp.length() != 0 || tempStr[0]==' ' || tempStr[0]=='\t') {
                nplines++;
                if (nplines > 1) {
                  prop += "\n";
                }
                // take off \r if it's still in the property:
                if (tempStr[tempStr.length()-1]=='\r'){
                  tempStr.erase(tempStr.length()-1);
                }
                prop += tempStr;
                d_line++;
                // erase tempStr in case the file does not end with a carriage
                // return (we will end up in an infinite loop if we don't do
                // this and we do not check for EOF in this while loop body)
                tempStr.erase();
                std::getline(*dp_inStream,tempStr);
                stmp = strip(tempStr);
              }
              mol->setProp(dlabel, prop);
            }
        } else {
          // at this point we should always be at a line starting with '>'
          // following a blank line. If this is not true throw an exception
          // FIX: should we be deleting the molecule (which is probably fine)
          // because we couldn't read the data ???
          throw FileParseException("Problems encountered parsing data fields");
        }
      }
      d_line++;
      std::getline(*dp_inStream,tempStr);
    }  
  }