void ParseNumberList(std::string inLine, std::vector<T> &res, std::istream *inStream=0){ bool foundEnd=false; while(!foundEnd) { CommaTokenizer commaTok(inLine); for(CommaTokenizer::const_iterator commaTokIt=commaTok.begin(); commaTokIt!=commaTok.end(); commaTokIt++){ std::string number=*commaTokIt; bool atEnd= number.find(";>")!=std::string::npos; boost::trim_if(number,boost::is_any_of(" \r\n\t;>")); if(number!="" && !atEnd ){ res.push_back(boost::lexical_cast<T>(number)); } else if(atEnd){ // that's it, we're done: foundEnd=true; break; } } if(foundEnd || !inStream || inStream->eof()){ break; } else { std::getline(*inStream,inLine); } } if(!foundEnd){ throw FileParseException("no end tag found for numeric list"); } }
// Reads the entry stored at index d_last and advances d_last by one.
//
// Returns whatever _next() produces for that entry, or NULL when the stream
// reports EOF right after seeking to the entry's position.
//
// Throws FileParseException("EOF hit.") when the end flag is already set and
// d_last has reached d_len.
ROMol *SDMolSupplier::next() {
  PRECONDITION(dp_inStream, "no stream");
  if (df_end && d_last >= d_len) {
    throw FileParseException("EOF hit.");
  }

  // set the stream to the current position
  dp_inStream->seekg(d_molpos[d_last]);

  // if we reached the end of the file, record that and hand back nothing
  if (dp_inStream->eof()) {
    // FIX: we should probably be throwing an exception here
    df_end = true;
    d_len = d_molpos.size();
    return NULL;
  }

  ROMol *res = _next();
  ++d_last;

  // Remember where the next entry would start *before* checkForEnd() runs;
  // the position is only recorded if df_end is still unset afterwards and the
  // entry has not been indexed yet.
  const std::streampos posHold = dp_inStream->tellg();
  this->checkForEnd();
  if (!this->df_end && d_last >= static_cast<int>(d_molpos.size())) {
    d_molpos.push_back(posHold);
  }
  return res;
}
void TDTMolSupplier::moveTo(unsigned int idx) { PRECONDITION(dp_inStream,"no stream"); CHECK_INVARIANT(idx >= 0, ""); // dp_inStream->seekg() is called for all idx values // and earlier calls to next() may have put the stream into a bad state dp_inStream->clear(); // move until we hit the desired idx if (idx < d_molpos.size() ) { dp_inStream->seekg(d_molpos[idx]); d_last = idx; } else { std::string tempStr; d_last = d_molpos.size() - 1; dp_inStream->seekg(d_molpos.back()); while ((d_last < static_cast<int>(idx)) && (!dp_inStream->eof()) ) { d_line++; std::getline(*dp_inStream,tempStr); if (tempStr.find("|") == 0) { d_molpos.push_back(dp_inStream->tellg()); d_last++; } } // if we reached end of file without reaching "idx" we have an index error if (dp_inStream->eof()) { d_len = d_molpos.size(); std::ostringstream errout; errout << "ERROR: Index error (idx = " << idx << ") : " << " we do no have enough molecule blocks"; throw FileParseException(errout.str()); } } }
// Builds a writer on top of an existing output stream.
//
//  outStream     : stream to write to; must be non-null and usable
//  takeOwnership : stored in df_owner
//
// Throws FileParseException if the stream is already in a failed state.
SDWriter::SDWriter(std::ostream *outStream,bool takeOwnership) {
  PRECONDITION(outStream,"null stream");
  // fail() reports failbit as well as badbit; bad() alone would let through a
  // stream whose open() failed, since that only sets failbit.
  if (outStream->fail()){
    throw FileParseException("Bad output stream");
  }
  dp_ostream = outStream;
  df_owner = takeOwnership;
  d_molid = 0;
}
// Builds a writer on top of an existing output stream and sets the initial
// values of the writer options (d_numDigits, df_write2D, df_writeNames).
//
//  outStream     : stream to write to; must be non-null and usable
//  takeOwnership : stored in df_owner
//
// Throws FileParseException if the stream is already in a failed state.
TDTWriter::TDTWriter(std::ostream *outStream, bool takeOwnership) {
  PRECONDITION(outStream,"null stream");
  // fail() reports failbit as well as badbit; bad() alone would let through a
  // stream whose open() failed, since that only sets failbit.
  if (outStream->fail()) {
    throw FileParseException("Bad output stream");
  }
  dp_ostream = outStream;
  df_owner = takeOwnership;
  d_molid = 0;
  d_numDigits=4;
  df_write2D=false;
  df_writeNames=true;
}
void SDMolSupplier::moveTo(unsigned int idx) { PRECONDITION(dp_inStream, "no stream"); // dp_inStream->seekg() is called for all idx values // and earlier calls to next() may have put the stream into a bad state dp_inStream->clear(); // move until we hit the desired idx if (idx < d_molpos.size()) { dp_inStream->seekg(d_molpos[idx]); d_last = idx; } else { std::string tempStr; dp_inStream->seekg(d_molpos.back()); d_last = d_molpos.size() - 1; while ((d_last < static_cast<int>(idx)) && (!dp_inStream->eof())) { d_line++; tempStr = getLine(dp_inStream); if (tempStr[0] == '$' && tempStr.substr(0, 4) == "$$$$") { std::streampos posHold = dp_inStream->tellg(); this->checkForEnd(); if (!this->df_end) { d_molpos.push_back(posHold); d_last++; } } } // if we reached end of file without reaching "idx" we have an index error if (dp_inStream->eof()) { d_len = d_molpos.size(); std::ostringstream errout; errout << "ERROR: Index error (idx = " << idx << ") : " << " we do no have enough mol blocks"; throw FileParseException(errout.str()); } } }
// Builds a molecule from one record of the stream.
//
// inLine is the record's first line; the text between the first '<' and the
// last '>' is handed to SmilesToMol(). If that yields a molecule with at least
// one atom, the following lines are read as properties until a line starting
// with '|' (or EOF) is reached:
//   * "2D" / "3D" lines become conformers (only when the corresponding
//     d_confId2D / d_confId3D is non-negative); atoms for which the list holds
//     too few coordinates keep the position the new Conformer starts with
//   * every other line is stored as a string property; if its name equals
//     d_nameProp the value is also stored as "_Name"
//
// Returns the molecule (possibly NULL, as returned by SmilesToMol()).
// Throws FileParseException for a property line without a closing '>';
// exceptions from the coordinate parser propagate. In either case the
// partially built molecule is released before the exception leaves.
ROMol *TDTMolSupplier::parseMol(std::string inLine){
  PRECONDITION(dp_inStream,"no stream");
  Utils::LocaleSwitcher ls;
  std::size_t startP=inLine.find("<");
  std::size_t endP=inLine.find_last_of(">");
  std::string smiles = inLine.substr(startP+1,endP-startP-1);
  ROMol *res = SmilesToMol(smiles,0,df_sanitize);

  if(!res || res->getNumAtoms()==0){
    return res;
  }

  try {
    // -----------
    // Process the properties:
    d_line++;
    // a failed read leaves inLine untouched, so remember whether we got data
    bool gotLine = !std::getline(*dp_inStream,inLine).fail();
    while(gotLine && !dp_inStream->eof() && inLine.find("|")!=0){
      endP=inLine.find("<");
      std::string propName = inLine.substr(0,endP);
      boost::trim_if(propName,boost::is_any_of(" \t"));
      startP = endP+1;

      const bool is2D = (propName=="2D" && d_confId2D>=0);
      const bool is3D = (propName=="3D" && d_confId3D>=0);
      if(is2D || is3D){
        // number of coordinates stored per atom in the list
        const unsigned int dim = is3D ? 3 : 2;
        std::string rest=inLine.substr(startP,inLine.size()-startP);
        std::vector<double> coords;
        TDTParseUtils::ParseNumberList(rest,coords,dp_inStream);
        Conformer *conf=new Conformer(res->getNumAtoms());
        conf->setId(is3D ? d_confId3D : d_confId2D);
        conf->set3D(is3D);
        for(unsigned int atIdx=0;atIdx<res->getNumAtoms();atIdx++){
          const unsigned int first = dim*atIdx;
          if(first+dim-1 < coords.size()){
            const double z = is3D ? coords[first+2] : 0.0;
            conf->setAtomPos(atIdx,RDGeom::Point3D(coords[first],coords[first+1],z));
          } else {
            // we're going to let this slide... but maybe we should do something else?
          }
        }
        res->addConformer(conf,false);
      } else {
        endP=inLine.find_last_of(">");
        if(endP==std::string::npos){
          std::ostringstream errout;
          errout << "no end tag found for property" << propName;
          throw FileParseException(errout.str());
        }
        std::string propVal = inLine.substr(startP,endP-startP);
        res->setProp(propName,propVal);
        if(propName==d_nameProp) res->setProp("_Name",propVal);
      }

      // keep the line counter in step with every line pulled in this loop
      d_line++;
      gotLine = !std::getline(*dp_inStream,inLine).fail();
    }
  } catch (...) {
    // nobody else has seen the molecule yet, so release it before rethrowing
    delete res;
    throw;
  }
  return res;
}
// -------------------------------------------------- // // Moves to the position of a particular entry in the // stream. // // If insufficient entries are present, a FileParseException // will be thrown // void SmilesMolSupplier::moveTo(unsigned int idx) { PRECONDITION(dp_inStream, "bad instream"); // get the easy situations (boundary conditions) out of the // way first: if (d_len > -1 && idx >= static_cast<unsigned int>(d_len)) { df_end = true; std::ostringstream errout; errout << "ERROR: Index error (idx = " << idx << "): " << "ran out of lines\n"; throw FileParseException(errout.str()); } // dp_inStream->seekg() is called for all idx values // and earlier calls to next() may have put the stream into a bad state dp_inStream->clear(); // ----------- // Case 1: we have already read the particular entry: // // Set the stream position and return // ----------- if (!d_molpos.empty() && d_molpos.size() > idx) { dp_inStream->clear(); // clear the EOF tag if it has been set df_end = false; dp_inStream->seekg(d_molpos[idx]); d_next = idx; d_line = d_lineNums[idx]; return; } // ----------- // Case 2: we haven't read the entry, so move forward until // we've gone far enough. // ----------- if (d_molpos.empty()) { // if we are just starting out, process the title line dp_inStream->seekg(0); if (df_title) this->processTitleLine(); } else { // move to the last position we've seen: dp_inStream->seekg(d_molpos.back()); // read that line: std::string tmp = getLine(dp_inStream); } // the stream pointer is now at the last thing we read in while (d_molpos.size() <= idx) { int nextP = this->skipComments(); if (nextP < 0) { std::ostringstream errout; errout << "ERROR: Index error (idx = " << idx << "): " << "ran out of lines\n"; throw FileParseException(errout.str()); } else { d_molpos.push_back(nextP); d_lineNums.push_back(d_line); if (d_molpos.size() == idx + 1 && df_end) { // boundary condition: we could read the point we were looking for // but not the next one. 
// indicate that we've reached EOF: dp_inStream->clear(); dp_inStream->seekg(0, std::ios_base::end); d_len = d_molpos.size(); break; } } } POSTCONDITION(d_molpos.size() > idx, "not enough lines"); dp_inStream->seekg(d_molpos[idx]); d_next = idx; return; }
// Converts one text line into a molecule.
//
// The line is split on the characters in d_delim (empty fields are kept) and
// every field is passed through strip(). Column d_smi is handed to
// SmilesToMol(); column d_name, when configured, becomes the molecule name,
// otherwise the current line number d_line is used. All remaining columns are
// stored as string properties named after d_props, or "Column_<n>" when no
// name is available for that column.
//
// Returns the new molecule, or NULL if anything went wrong. Failures are
// written to the error log and never propagate to the caller.
ROMol *SmilesMolSupplier::processLine(std::string inLine) {
  ROMol *res = NULL;

  try {
    // -----------
    // tokenize the input line:
    // -----------
    boost::char_separator<char> sep(d_delim.c_str(), "", boost::keep_empty_tokens);
    tokenizer tokens(inLine, sep);
    STR_VECT recs;
    for (tokenizer::iterator tokIter = tokens.begin(); tokIter != tokens.end(); ++tokIter) {
      recs.push_back(strip(*tokIter));
    }
    if (recs.size() <= static_cast<unsigned int>(d_smi)) {
      std::ostringstream errout;
      errout << "ERROR: line #" << d_line << "does not contain enough tokens\n";
      throw FileParseException(errout.str());
    }

    // -----------
    // get the smiles and create a molecule
    // -----------
    res = SmilesToMol(recs[d_smi], 0, df_sanitize);
    if (!res) {
      std::stringstream errout;
      errout << "Cannot create molecule from : '" << recs[d_smi] << "'";
      throw SmilesParseException(errout.str());
    }

    // -----------
    // get the name (if there's a name column)
    // -----------
    if (d_name == -1) {
      // no name column: fall back to the line number the entry came from
      std::ostringstream tstr;
      tstr << d_line;
      std::string mname = tstr.str();
      res->setProp(common_properties::_Name, mname);
    } else if (d_name >= static_cast<int>(recs.size())) {
      BOOST_LOG(rdWarningLog) << "WARNING: no name column found on line " << d_line << std::endl;
    } else {
      res->setProp(common_properties::_Name, recs[d_name]);
    }

    // -----------
    // read in the properties
    // -----------
    for (unsigned int col = 0; col < recs.size(); col++) {
      if (static_cast<int>(col) == d_smi || static_cast<int>(col) == d_name) continue;
      std::string pname, pval;
      if (d_props.size() > col) {
        pname = d_props[col];
      } else {
        pname = "Column_";
        std::stringstream ss;
        ss << col;
        pname += ss.str();
      }
      pval = recs[col];
      res->setProp(pname, pval);
    }
  } catch (const SmilesParseException &pe) {
    // Couldn't parse the passed in smiles
    // Simply print out a message
    BOOST_LOG(rdErrorLog) << "ERROR: Smiles parse error on line " << d_line << "\n";
    BOOST_LOG(rdErrorLog) << "ERROR: " << pe.message() << "\n";
    // res may already point at a molecule if a later step threw; deleting a
    // null pointer is a no-op, so this is safe in every case
    delete res;
    res = NULL;
  } catch (const MolSanitizeException &se) {
    // We couldn't sanitize the molecule
    // write out an error message
    BOOST_LOG(rdErrorLog) << "ERROR: Could not sanitize molecule on line " << d_line << std::endl;
    BOOST_LOG(rdErrorLog) << "ERROR: " << se.message() << "\n";
    delete res;
    res = NULL;
  } catch (...) {
    // write out an error message
    BOOST_LOG(rdErrorLog) << "ERROR: Could not process molecule on line " << d_line << std::endl;
    delete res;
    res = NULL;
  }
  return res;
}
// Reads the data items that follow a mol block and stores them on mol.
//
// Lines are consumed until a line starting with "$$$$" or EOF is reached.
// Each data item starts with a header line beginning with '>' whose label is
// enclosed in '<' and '>'; the lines after it, up to the next blank line,
// form the value (joined with '\n', trailing '\r' removed) and are stored via
// mol->setProp(label, value). Data items without a usable label are skipped.
//
// Throws FileParseException when a non-blank line that is not a data header
// shows up where a header is expected, or when the input ends while an
// unlabeled data item is being skipped.
void ForwardSDMolSupplier::readMolProps(ROMol *mol){
  PRECONDITION(dp_inStream,"no stream");
  PRECONDITION(mol,"no molecule");
  d_line++;
  std::string tempStr;
  std::getline(*dp_inStream,tempStr);

  // FIX: report files missing the $$$$ marker
  // (the compare is non-zero unless the line starts with "$$$$")
  while(!(dp_inStream->eof()) && tempStr.compare(0,4,"$$$$")!=0 ){
    tempStr = strip(tempStr);
    if(tempStr!=""){
      if (tempStr[0] != '>') {
        // at this point we should always be at a line starting with '>'
        // following a blank line. If this is not true throw an exception
        // FIX: should we be deleting the molecule (which is probably fine)
        // because we couldn't read the data ???
        throw FileParseException("Problems encountered parsing data fields");
      }

      // data header line: start of a data item
      // ignore everything else and look for a data label enclosed
      // by '<' and '>'
      // FIX: "CTfile.pdf" (page 51) says that a data header line does not
      // have to contain a data label (it can instead have something like a
      // field id into a MACCS db). We do not currently know what to do in
      // this situation - so such data items are ignored for now
      tempStr.erase(0,1); // remove the first ">" sign
      const std::string::size_type sl = tempStr.find('<'); // begin datalabel
      // the closing bracket only counts if it comes after the opening one
      const std::string::size_type se =
          (sl == std::string::npos) ? std::string::npos : tempStr.find('>', sl+1); // end datalabel

      if ((sl == std::string::npos) || (se == std::string::npos) || (se == (sl+1)) ) {
        // we either do not have a data label or the label is empty:
        // ignore everything until the next data item, i.e. until we hit a
        // blank line
        d_line++;
        std::getline(*dp_inStream,tempStr);
        std::string stmp = strip(tempStr);
        while (stmp.length() != 0) {
          d_line++;
          std::getline(*dp_inStream,tempStr);
          if(dp_inStream->eof() || dp_inStream->fail())
            throw FileParseException("End of data field name not found");
          // re-evaluate the line just read; without this the loop could only
          // ever end through the exception above
          stmp = strip(tempStr);
        }
      } else {
        std::string dlabel = tempStr.substr(sl+1, se-sl-1);
        // we know the label - now read in the relevant properties
        // until we hit a blank line
        d_line++;
        std::getline(*dp_inStream,tempStr);
        std::string prop="";
        std::string stmp = strip(tempStr);
        int nplines = 0; // number of lines for this property
        while (stmp.length() != 0 || tempStr[0]==' ' || tempStr[0]=='\t') {
          nplines++;
          if (nplines > 1) {
            prop += "\n";
          }
          // take off \r if it's still in the property:
          if (tempStr[tempStr.length()-1]=='\r'){
            tempStr.erase(tempStr.length()-1);
          }
          prop += tempStr;
          d_line++;
          // erase tempStr in case the file does not end with a carriage
          // return (we would end up in an infinite loop if we didn't do
          // this, since EOF is not checked in this loop body)
          tempStr.erase();
          std::getline(*dp_inStream,tempStr);
          stmp = strip(tempStr);
        }
        mol->setProp(dlabel, prop);
      }
    }
    d_line++;
    // A failed read leaves tempStr untouched; stop instead of looping forever
    // on a stream that failed without reaching EOF.
    if(!std::getline(*dp_inStream,tempStr)){
      break;
    }
  }
}