// Collects the files found under `pathname` (POSIX implementation based on opendir/readdir).
//
// pathname  - directory to scan; a trailing '/' is appended when it does not already end
//             in ':', '\\' or '/'.
// filename  - not used for matching in this implementation; it is only forwarded to the
//             recursive calls so the signature stays the same for every caller.
// files     - receives `pathname + entry name` for every file that is found (appended,
//             never cleared here).
// recursive - when true, sub-directories are scanned as well.
//
// Entries whose name starts with '.' (including "." and "..") are skipped.
// Returns false when a directory cannot be opened, true otherwise.
static bool find_files(std::string pathname, const std::string &filename, string_array &files, bool recursive)
{
   if (!pathname.empty())
   {
      char c = pathname[pathname.size() - 1];
      if ((c != ':') && (c != '\\') && (c != '/'))
         pathname += "/";
   }

   DIR *dp = opendir(pathname.c_str());
   if (!dp)
      return false;

   string_array paths;

   for ( ; ; )
   {
      struct dirent *ep = readdir(dp);
      if (!ep)
         break;

      if (ep->d_name[0] == '.')
         continue;

      // d_type holds exactly one DT_* constant, it is not a bit mask. Testing it with '&'
      // classifies other entry kinds (e.g. sockets, block devices) as directories, and a
      // recursive scan then fails as soon as opendir() is attempted on such an entry.
      const bool is_directory = (ep->d_type == DT_DIR);

      // Symbolic links are still reported as files: the earlier mask based test accepted
      // them on the usual DT_* encodings, and callers may depend on symlinked inputs.
      const bool is_file = (ep->d_type == DT_REG) || (ep->d_type == DT_LNK);

      const std::string entry_name(ep->d_name);

      if (is_directory)
      {
         if (recursive)
            paths.push_back(entry_name);
      }
      else if (is_file)
         files.push_back(pathname + entry_name);
   }

   closedir(dp);

   if (recursive)
   {
      for (size_t i = 0; i < paths.size(); i++)
      {
         // A sub-directory that cannot be scanned makes the whole search fail.
         if (!find_files(pathname + paths[i], filename, files, true))
            return false;
      }
   }

   return true;
}
// Fills `genres` with the genre names found in the title-info section.
//
// The title-info text is fetched through GetXmlTag(titleInfoTag, ...) the first time it is
// needed and kept in m_titleInfo afterwards. When it is still empty the function returns
// false and leaves `genres` untouched.
//
// Every non-empty value returned by GetXmlTag(genreTag, m_titleInfo, pos) is handled as
// follows:
//   - without a comma the value is appended as is (no trimming);
//   - with commas it is split at each comma, leading/trailing spaces are removed from every
//     piece and empty pieces are dropped.
// All stored values are converted with Utf8toAnsi().
//
// Returns true when at least one non-empty genre value was read.
bool MyFb2File::GetGenres(string_array &genres)
{
    int pos = 0;

    if (m_titleInfo.length() == 0)
        m_titleInfo = GetXmlTag(titleInfoTag, pos);

    if (m_titleInfo.length() == 0)
        return false;

    genres.clear();
    pos = 0;

    const QString space = tr(" ");
    bool anyGenre = false;
    QString tagText = tr("");

    for (;;)
    {
        tagText = GetXmlTag(genreTag, m_titleInfo, pos);
        if (tagText.length() <= 0)
            break;

        anyGenre = true;

        if (tagText.indexOf(',') == -1)
        {
            // Single value: stored untrimmed.
            genres.push_back(Utf8toAnsi(tagText));
            continue;
        }

        // Comma separated list: walk the pieces one by one, the last piece ends at the
        // end of the string instead of at a comma.
        int pieceStart = 0;
        bool lastPiece = false;
        while (!lastPiece)
        {
            const int comma = tagText.indexOf(',', pieceStart);
            lastPiece = (comma == -1);

            int first = pieceStart;
            int last = lastPiece ? tagText.length() : comma;

            // Strip spaces on both sides; mid() yields an empty string outside the text,
            // which stops both loops.
            while (tagText.mid(first, 1) == space)
                ++first;
            while (tagText.mid(last - 1, 1) == space)
                --last;

            if (last > first)
            {
                QString piece = tagText.mid(first, last - first);
                genres.push_back(Utf8toAnsi(piece));
            }

            pieceStart = comma + 1;
        }
    }

    return anyGenre;
}
int main(int argc, char* argv[]) { // Check the required number of command line arguments. if (argc != 5) { cout << "usr host user pwd jobs_path" << endl; return 0; } // Fetch command line arguments. const auto host = argv[1]; const auto user = argv[2]; const auto pwd = argv[3]; const path jobs_path = argv[4]; // Connect to host and authenticate user. DBClientConnection conn; { cout << local_time() << "Connecting to " << host << " and authenticating " << user << endl; string errmsg; if ((!conn.connect(host, errmsg)) || (!conn.auth("istar", user, pwd, errmsg))) { cerr << local_time() << errmsg << endl; return 1; } } // Initialize constants. cout << local_time() << "Initializing" << endl; const auto collection = "istar.usr"; const auto epoch = date(1970, 1, 1); const size_t num_usrs = 2; constexpr array<size_t, num_usrs> qn{{ 12, 60 }}; constexpr array<double, num_usrs> qv{{ 1.0 / qn[0], 1.0 / qn[1] }}; const size_t num_references = 4; const size_t num_subsets = 5; const array<string, num_subsets> SubsetSMARTS {{ "[!#1]", // heavy "[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]", // hydrophobic "[a]", // aromatic "[$([O,S;H1;v2]-[!$(*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),$([N&v3;H1,H2]-[!$(*=[O,N,P,S])]),$([N;v3;H0]),$([n,o,s;+0]),F]", // acceptor "[N!H0v3,N!H0+v4,OH+0,SH+0,nH+0]", // donor }}; // Initialize variables. array<array<double, qn.back()>, 1> qw; array<array<double, qn.back()>, 1> lw; auto q = qw[0]; auto l = lw[0]; // Read ZINC ID file. const string_array<size_t> zincids("16_zincid.txt"); const auto num_ligands = zincids.size(); // Read SMILES file. const string_array<size_t> smileses("16_smiles.txt"); assert(smileses.size() == num_ligands); // Read supplier file. const string_array<size_t> suppliers("16_supplier.txt"); assert(suppliers.size() == num_ligands); // Read property files of floating point types and integer types. 
const auto zfproperties = read<array<float, 4>>("16_zfprop.f32"); assert(zfproperties.size() == num_ligands); const auto ziproperties = read<array<int16_t, 5>>("16_ziprop.i16"); assert(ziproperties.size() == num_ligands); // Open files for subsequent reading. std::ifstream usrcat_bin("16_usrcat.f64"); stream_array<size_t> ligands("16_ligand.pdbqt"); assert(ligands.size() == num_ligands); array<vector<double>, 2> scores {{ vector<double>(num_ligands, 0), vector<double>(num_ligands, 0) }}; const auto& u0scores = scores[0]; const auto& u1scores = scores[1]; vector<size_t> scase(num_ligands); // Enter event loop. cout << local_time() << "Entering event loop" << endl; bool sleeping = false; while (true) { // Fetch an incompleted job in a first-come-first-served manner. if (!sleeping) cout << local_time() << "Fetching an incompleted job" << endl; BSONObj info; conn.runCommand("istar", BSON("findandmodify" << "usr" << "query" << BSON("done" << BSON("$exists" << false) << "started" << BSON("$exists" << false)) << "sort" << BSON("submitted" << 1) << "update" << BSON("$set" << BSON("started" << Date_t(duration_cast<std::chrono::milliseconds>(system_clock::now().time_since_epoch()).count())))), info); // conn.findAndModify() is available since MongoDB C++ Driver legacy-1.0.0 const auto value = info["value"]; if (value.isNull()) { // No incompleted jobs. Sleep for a while. if (!sleeping) cout << local_time() << "Sleeping" << endl; sleeping = true; this_thread::sleep_for(chrono::seconds(10)); continue; } sleeping = false; const auto job = value.Obj(); // Obtain job properties. const auto _id = job["_id"].OID(); cout << local_time() << "Executing job " << _id.str() << endl; const auto job_path = jobs_path / _id.str(); const auto format = job["format"].String(); const auto email = job["email"].String(); // Parse the user-supplied ligand. OBMol obMol; OBConversion obConversion; obConversion.SetInFormat(format.c_str()); obConversion.ReadFile(&obMol, (job_path / ("ligand." 
+ format)).string()); const auto num_atoms = obMol.NumAtoms(); // obMol.AddHydrogens(); // Adding hydrogens does not seem to affect SMARTS matching. // Classify subset atoms. array<vector<int>, num_subsets> subsets; for (size_t k = 0; k < num_subsets; ++k) { auto& subset = subsets[k]; subset.reserve(num_atoms); OBSmartsPattern smarts; smarts.Init(SubsetSMARTS[k]); smarts.Match(obMol); for (const auto& map : smarts.GetMapList()) { subset.push_back(map.front()); } } const auto& subset0 = subsets.front(); // Check user-provided ligand validity. if (subset0.empty()) { // Record job completion time stamp. const auto millis_since_epoch = duration_cast<std::chrono::milliseconds>(system_clock::now().time_since_epoch()).count(); conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("done" << Date_t(millis_since_epoch)))); // Send error notification email. cout << local_time() << "Sending an error notification email to " << email << endl; MailMessage message; message.setSender("usr <*****@*****.**>"); message.setSubject("Your usr job has failed"); message.setContent("Description: " + job["description"].String() + "\nSubmitted: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(job["submitted"].Date().millis))) + " UTC\nFailed: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(millis_since_epoch))) + " UTC\nReason: failed to parse the provided ligand."); message.addRecipient(MailRecipient(MailRecipient::PRIMARY_RECIPIENT, email)); SMTPClientSession session("137.189.91.190"); session.login(); session.sendMessage(message); session.close(); continue; } // Calculate the four reference points. 
const auto n = subset0.size(); const auto v = 1.0 / n; array<vector3, num_references> references{}; auto& ctd = references[0]; auto& cst = references[1]; auto& fct = references[2]; auto& ftf = references[3]; for (const auto i : subset0) { ctd += obMol.GetAtom(i)->GetVector(); } ctd *= v; double cst_dist = numeric_limits<double>::max(); double fct_dist = numeric_limits<double>::lowest(); double ftf_dist = numeric_limits<double>::lowest(); for (const auto i : subset0) { const auto& a = obMol.GetAtom(i)->GetVector(); const auto this_dist = a.distSq(ctd); if (this_dist < cst_dist) { cst = a; cst_dist = this_dist; } if (this_dist > fct_dist) { fct = a; fct_dist = this_dist; } } for (const auto i : subset0) { const auto& a = obMol.GetAtom(i)->GetVector(); const auto this_dist = a.distSq(fct); if (this_dist > ftf_dist) { ftf = a; ftf_dist = this_dist; } } // Precalculate the distances between each atom and each reference point. array<vector<double>, num_references> dista; for (size_t k = 0; k < num_references; ++k) { const auto& reference = references[k]; auto& dists = dista[k]; dists.resize(1 + num_atoms); // OpenBabel atom index starts from 1. dists[0] is dummy. for (size_t i = 0; i < n; ++i) { dists[subset0[i]] = sqrt(obMol.GetAtom(subset0[i])->GetVector().distSq(reference)); } } // Calculate USR and USRCAT features of the input ligand. 
size_t qo = 0; for (const auto& subset : subsets) { const auto n = subset.size(); for (size_t k = 0; k < num_references; ++k) { const auto& distp = dista[k]; vector<double> dists(n); for (size_t i = 0; i < n; ++i) { dists[i] = distp[subset[i]]; } array<double, 3> m{}; if (n > 2) { const auto v = 1.0 / n; for (size_t i = 0; i < n; ++i) { const auto d = dists[i]; m[0] += d; } m[0] *= v; for (size_t i = 0; i < n; ++i) { const auto d = dists[i] - m[0]; m[1] += d * d; } m[1] = sqrt(m[1] * v); for (size_t i = 0; i < n; ++i) { const auto d = dists[i] - m[0]; m[2] += d * d * d; } m[2] = cbrt(m[2] * v); } else if (n == 2) { m[0] = 0.5 * (dists[0] + dists[1]); m[1] = 0.5 * fabs(dists[0] - dists[1]); } else if (n == 1) { m[0] = dists[0]; } #pragma unroll for (const auto e : m) { q[qo++] = e; } } } assert(qo == qn.back()); // Compute USR and USRCAT scores. usrcat_bin.seekg(0); for (size_t k = 0; k < num_ligands; ++k) { usrcat_bin.read(reinterpret_cast<char*>(l.data()), sizeof(l)); double s = 0; #pragma unroll for (size_t i = 0, u = 0; u < num_usrs; ++u) { #pragma unroll for (const auto qnu = qn[u]; i < qnu; ++i) { s += fabs(q[i] - l[i]); } scores[u][k] = s; } } assert(usrcat_bin.tellg() == sizeof(l) * num_ligands); // Sort ligands by USRCAT score and then by USR score and then by ZINC ID. iota(scase.begin(), scase.end(), 0); sort(scase.begin(), scase.end(), [&](const size_t val0, const size_t val1) { const auto u1score0 = u1scores[val0]; const auto u1score1 = u1scores[val1]; if (u1score0 == u1score1) { const auto u0score0 = u0scores[val0]; const auto u0score1 = u0scores[val1]; if (u0score0 == u0score1) { return zincids[val0] < zincids[val1]; } return u0score0 < u0score1; } return u1score0 < u1score1; }); // Write results. 
filtering_ostream log_csv_gz; log_csv_gz.push(gzip_compressor()); log_csv_gz.push(file_sink((job_path / "log.csv.gz").string())); log_csv_gz.setf(ios::fixed, ios::floatfield); log_csv_gz << "ZINC ID,USR score,USRCAT score\n" << setprecision(8); filtering_ostream ligands_pdbqt_gz; ligands_pdbqt_gz.push(gzip_compressor()); ligands_pdbqt_gz.push(file_sink((job_path / "ligands.pdbqt.gz").string())); ligands_pdbqt_gz.setf(ios::fixed, ios::floatfield); for (size_t t = 0; t < 10000; ++t) { const size_t k = scase[t]; const auto zincid = zincids[k].substr(0, 8); // Take another substr() to get rid of the trailing newline. const auto u0score = 1 / (1 + scores[0][k] * qv[0]); const auto u1score = 1 / (1 + scores[1][k] * qv[1]); log_csv_gz << zincid << ',' << u0score << ',' << u1score << '\n'; // Only write conformations of the top ligands to ligands.pdbqt.gz. if (t >= 1000) continue; const auto zfp = zfproperties[k]; const auto zip = ziproperties[k]; ligands_pdbqt_gz << "MODEL " << '\n' << "REMARK 911 " << zincid << setprecision(3) << ' ' << setw(8) << zfp[0] << ' ' << setw(8) << zfp[1] << ' ' << setw(8) << zfp[2] << ' ' << setw(8) << zfp[3] << ' ' << setw(3) << zip[0] << ' ' << setw(3) << zip[1] << ' ' << setw(3) << zip[2] << ' ' << setw(3) << zip[3] << ' ' << setw(3) << zip[4] << '\n' << "REMARK 912 " << smileses[k] // A newline is already included in smileses[k]. << "REMARK 913 " << suppliers[k] // A newline is already included in suppliers[k]. << setprecision(8) << "REMARK 951 USR SCORE: " << setw(10) << u0score << '\n' << "REMARK 952 USRCAT SCORE: " << setw(10) << u1score << '\n' ; const auto lig = ligands[k]; ligands_pdbqt_gz.write(lig.data(), lig.size()); ligands_pdbqt_gz << "ENDMDL\n"; } // Update progress. 
cout << local_time() << "Setting done time" << endl; const auto millis_since_epoch = duration_cast<std::chrono::milliseconds>(system_clock::now().time_since_epoch()).count(); conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("done" << Date_t(millis_since_epoch)))); // Send completion notification email. cout << local_time() << "Sending a completion notification email to " << email << endl; MailMessage message; message.setSender("istar <*****@*****.**>"); message.setSubject("Your usr job has completed"); message.setContent("Description: " + job["description"].String() + "\nSubmitted: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(job["submitted"].Date().millis))) + " UTC\nCompleted: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(millis_since_epoch))) + " UTC\nResult: http://istar.cse.cuhk.edu.hk/usr/iview/?" + _id.str()); message.addRecipient(MailRecipient(MailRecipient::PRIMARY_RECIPIENT, email)); SMTPClientSession session("137.189.91.190"); session.login(); session.sendMessage(message); session.close(); } }
int main(int argc, char* argv[]) { // Check the required number of command line arguments. if (argc != 5) { cout << "usr host user pwd jobs_path" << endl; return 0; } // Fetch command line arguments. const auto host = argv[1]; const auto user = argv[2]; const auto pwd = argv[3]; const path jobs_path = argv[4]; DBClientConnection conn; { // Connect to host and authenticate user. cout << local_time() << "Connecting to " << host << " and authenticating " << user << endl; string errmsg; if ((!conn.connect(host, errmsg)) || (!conn.auth("istar", user, pwd, errmsg))) { cerr << local_time() << errmsg << endl; return 1; } } // Initialize constants. cout << local_time() << "Initializing" << endl; const auto collection = "istar.usr2"; const size_t num_usrs = 2; const array<string, 2> usr_names{{ "USR", "USRCAT" }}; constexpr array<size_t, num_usrs> qn{{ 12, 60 }}; constexpr array<double, num_usrs> qv{{ 1.0 / qn[0], 1.0 / qn[1] }}; const size_t num_refPoints = 4; const size_t num_subsets = 5; const array<string, num_subsets> SubsetSMARTS {{ "[!#1]", // heavy "[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]", // hydrophobic "[a]", // aromatic "[$([O,S;H1;v2]-[!$(*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),$([N&v3;H1,H2]-[!$(*=[O,N,P,S])]),$([N;v3;H0]),$([n,o,s;+0]),F]", // acceptor "[N!H0v3,N!H0+v4,OH+0,SH+0,nH+0]", // donor }}; const size_t num_hits = 100; // Wrap SMARTS strings to RWMol objects. array<unique_ptr<ROMol>, num_subsets> SubsetMols; for (size_t k = 0; k < num_subsets; ++k) { SubsetMols[k].reset(reinterpret_cast<ROMol*>(SmartsToMol(SubsetSMARTS[k]))); } // Read ZINC ID file. const string_array<size_t> zincids("16/zincid.txt"); const auto num_ligands = zincids.size(); cout << local_time() << "Found " << num_ligands << " database molecules" << endl; // Read SMILES file. const string_array<size_t> smileses("16/smiles.txt"); assert(smileses.size() == num_ligands); // Read supplier file. 
const string_array<size_t> suppliers("16/supplier.txt"); assert(suppliers.size() == num_ligands); // Read property files of floating point types and integer types. const auto zfproperties = read<array<float, 4>>("16/zfprop.f32"); assert(zfproperties.size() == num_ligands); const auto ziproperties = read<array<int16_t, 5>>("16/ziprop.i16"); assert(ziproperties.size() == num_ligands); // Read cumulative number of conformers file. const auto mconfss = read<size_t>("16/mconfs.u64"); const auto num_conformers = mconfss.back(); assert(mconfss.size() == num_ligands); assert(num_conformers >= num_ligands); cout << local_time() << "Found " << num_conformers << " database conformers" << endl; // Read feature file. const auto features = read<array<double, qn.back()>>("16/usrcat.f64"); assert(features.size() == num_conformers); // Read ligand footer file and open ligand SDF file for seeking and reading. stream_array<size_t> ligands("16/ligand.sdf"); assert(ligands.size() == num_conformers); // Initialize variables. array<vector<int>, num_subsets> subsets; array<vector<double>, num_refPoints> dista; alignas(32) array<double, qn.back()> q; // Initialize vectors to store compounds' primary score and their corresponding conformer. vector<double> scores(num_ligands); // Primary score of molecules. vector<size_t> cnfids(num_ligands); // ID of conformer with the best primary score. const auto compare = [&](const size_t val0, const size_t val1) // Sort by the primary score. { return scores[val0] < scores[val1]; }; // Initialize an io service pool and create worker threads for later use. const size_t num_threads = thread::hardware_concurrency(); cout << local_time() << "Creating an io service pool of " << num_threads << " worker threads" << endl; io_service_pool io(num_threads); safe_counter<size_t> cnt; // Initialize the number of chunks and the number of molecules per chunk. 
const auto num_chunks = num_threads << 4; const auto chunk_size = 1 + (num_ligands - 1) / num_chunks; assert(chunk_size * num_chunks >= num_ligands); assert(chunk_size >= num_hits); cout << local_time() << "Using " << num_chunks << " chunks and a chunk size of " << chunk_size << endl; vector<size_t> scase(num_ligands); vector<size_t> zcase(num_hits * (num_chunks - 1) + min(num_hits, num_ligands - chunk_size * (num_chunks - 1))); // The last chunk might have fewer than num_hits records. // Enter event loop. cout << local_time() << "Entering event loop" << endl; cout.setf(ios::fixed, ios::floatfield); bool sleeping = false; while (true) { // Fetch an incompleted job in a first-come-first-served manner. if (!sleeping) cout << local_time() << "Fetching an incompleted job" << endl; BSONObj info; const auto started = milliseconds_since_epoch(); conn.runCommand("istar", BSON("findandmodify" << "usr2" << "query" << BSON("started" << BSON("$exists" << false)) << "sort" << BSON("submitted" << 1) << "update" << BSON("$set" << BSON("started" << started))), info); // conn.findAndModify() is available since MongoDB C++ Driver legacy-1.0.0 const auto value = info["value"]; if (value.isNull()) { // No incompleted jobs. Sleep for a while. if (!sleeping) cout << local_time() << "Sleeping" << endl; sleeping = true; this_thread::sleep_for(chrono::seconds(2)); continue; } sleeping = false; const auto job = value.Obj(); // Obtain job properties. const auto _id = job["_id"].OID(); cout << local_time() << "Executing job " << _id.str() << endl; const auto job_path = jobs_path / _id.str(); const size_t usr0 = job["usr"].Int(); // Specify the primary sorting score. 0: USR; 1: USRCAT. assert(usr0 == 0 || usr0 == 1); const auto usr1 = usr0 ^ 1; const auto qnu0 = qn[usr0]; const auto qnu1 = qn[usr1]; // Read and validate the user-supplied SDF file. 
cout << local_time() << "Reading and validating the query file" << endl; SDMolSupplier sup((job_path / "query.sdf").string(), true, false, true); // sanitize, removeHs, strictParsing if (!sup.length() || !sup.atEnd()) { const auto error = 1; cout << local_time() << "Failed to parse the query file, error code = " << error << endl; conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("completed" << milliseconds_since_epoch() << "error" << error))); continue; } // Process each of the query molecules sequentially. const auto num_queries = 1; // Restrict the number of query molecules to 1. Setting num_queries = sup.length() to execute any number of query molecules. for (unsigned int query_number = 0; query_number < num_queries; ++query_number) { cout << local_time() << "Parsing query molecule " << query_number << endl; const unique_ptr<ROMol> qry_ptr(sup.next()); // Calling next() may print "ERROR: Could not sanitize molecule on line XXXX" to stderr. auto& qryMol = *qry_ptr; // Get the number of atoms, including and excluding hydrogens. const auto num_atoms = qryMol.getNumAtoms(); const auto num_heavy_atoms = qryMol.getNumHeavyAtoms(); assert(num_heavy_atoms); cout << local_time() << "Found " << num_atoms << " atoms and " << num_heavy_atoms << " heavy atoms" << endl; // Create an output directory. cout << local_time() << "Creating output directory" << endl; const auto output_dir = job_path / to_string(query_number); create_directory(output_dir); // Draw a SVG. cout << local_time() << "Drawing a SVG" << endl; { const unique_ptr<ROMol> qrz_ptr(removeHs(qryMol)); auto& qrzMol = *qrz_ptr; compute2DCoords(qrzMol); boost::filesystem::ofstream ofs(output_dir / "query.svg"); ofs << DrawingToSVG(MolToDrawing(qrzMol)); } // Calculate Morgan fingerprint. cout << local_time() << "Calculating Morgan fingerprint" << endl; const unique_ptr<SparseIntVect<uint32_t>> qryFp(getFingerprint(qryMol, 2)); // Classify atoms to pharmacophoric subsets. 
cout << local_time() << "Classifying atoms into subsets" << endl; for (size_t k = 0; k < num_subsets; ++k) { vector<vector<pair<int, int>>> matchVect; SubstructMatch(qryMol, *SubsetMols[k], matchVect); const auto num_matches = matchVect.size(); auto& subset = subsets[k]; subset.resize(num_matches); for (size_t i = 0; i < num_matches; ++i) { subset[i] = matchVect[i].front().second; } cout << local_time() << "Found " << num_matches << " atoms for subset " << k << endl; } const auto& subset0 = subsets.front(); assert(subset0.size() == num_heavy_atoms); // Calculate the four reference points. cout << local_time() << "Calculating " << num_refPoints << " reference points" << endl; const auto qryRefPoints = calcRefPoints(qryMol, subset0); const Point3DConstPtrVect qryRefPointv {{ &qryRefPoints[0], &qryRefPoints[1], &qryRefPoints[2], &qryRefPoints[3], }}; // Precalculate the distances of heavy atoms to the reference points, given that subsets[1 to 4] are subsets of subsets[0]. cout << local_time() << "Calculating " << num_heavy_atoms * num_refPoints << " pairwise distances" << endl; const auto& qryCnf = qryMol.getConformer(); for (size_t k = 0; k < num_refPoints; ++k) { const auto& refPoint = qryRefPoints[k]; auto& distp = dista[k]; distp.resize(num_atoms); for (size_t i = 0; i < num_heavy_atoms; ++i) { distp[subset0[i]] = sqrt(dist2(qryCnf.getAtomPos(subset0[i]), refPoint)); } } // Loop over pharmacophoric subsets and reference points. cout << local_time() << "Calculating " << 3 * num_refPoints * num_subsets << " moments of USRCAT feature" << endl; size_t qo = 0; for (const auto& subset : subsets) { const auto n = subset.size(); for (size_t k = 0; k < num_refPoints; ++k) { // Load distances from precalculated ones. const auto& distp = dista[k]; vector<double> dists(n); for (size_t i = 0; i < n; ++i) { dists[i] = distp[subset[i]]; } // Compute moments. 
array<double, 3> m{}; if (n > 2) { const auto v = 1.0 / n; for (size_t i = 0; i < n; ++i) { const auto d = dists[i]; m[0] += d; } m[0] *= v; for (size_t i = 0; i < n; ++i) { const auto d = dists[i] - m[0]; m[1] += d * d; } m[1] = sqrt(m[1] * v); for (size_t i = 0; i < n; ++i) { const auto d = dists[i] - m[0]; m[2] += d * d * d; } m[2] = cbrt(m[2] * v); } else if (n == 2) { m[0] = 0.5 * (dists[0] + dists[1]); m[1] = 0.5 * fabs(dists[0] - dists[1]); } else if (n == 1) { m[0] = dists[0]; } for (const auto e : m) { q[qo++] = e; } } } assert(qo == qn.back()); // Compute USR and USRCAT scores. cout << local_time() << "Calculating " << num_ligands << " " << usr_names[usr0] << " scores" << endl; scores.assign(scores.size(), numeric_limits<double>::max()); iota(scase.begin(), scase.end(), 0); cnt.init(num_chunks); for (size_t l = 0; l < num_chunks; ++l) { io.post([&,l]() { // Loop over molecules of the current chunk. const auto chunk_beg = chunk_size * l; const auto chunk_end = min(chunk_beg + chunk_size, num_ligands); for (size_t k = chunk_beg; k < chunk_end; ++k) { // Loop over conformers of the current molecule and calculate their primary score. auto& scorek = scores[k]; size_t j = k ? mconfss[k - 1] : 0; for (const auto mconfs = mconfss[k]; j < mconfs; ++j) { const auto& d = features[j]; double s = 0; for (size_t i = 0; i < qnu0; ++i) { s += abs(q[i] - d[i]); if (s >= scorek) break; } if (s < scorek) { scorek = s; cnfids[k] = j; } } } // Sort the scores of molecules of the current chunk. sort(scase.begin() + chunk_beg, scase.begin() + chunk_end, compare); // Copy the indexes of top hits of the current chunk to a global vector for final sorting. copy_n(scase.begin() + chunk_beg, min(num_hits, chunk_end - chunk_beg), zcase.begin() + num_hits * l); cnt.increment(); }); } cnt.wait(); // Sort the top hits from chunks. 
cout << local_time() << "Sorting " << zcase.size() << " hits by " << usr_names[usr0] << " score" << endl; sort(zcase.begin(), zcase.end(), compare); // Create output directory and write output files. cout << local_time() << "Writing output files" << endl; SDWriter hits_sdf((output_dir / "hits.sdf").string()); boost::filesystem::ofstream hits_csv(output_dir / "hits.csv"); hits_csv.setf(ios::fixed, ios::floatfield); hits_csv << "ZINC ID,USR score,USRCAT score,2D Tanimoto score,Molecular weight (g/mol),Partition coefficient xlogP,Apolar desolvation (kcal/mol),Polar desolvation (kcal/mol),Hydrogen bond donors,Hydrogen bond acceptors,Polar surface area tPSA (Å^2),Net charge,Rotatable bonds,SMILES,Vendors and annotations\n"; for (size_t l = 0; l < num_hits; ++l) { // Obtain indexes to the hit molecule and the hit conformer. const auto k = zcase[l]; const auto j = cnfids[k]; // Read SDF content of the hit conformer. const auto lig = ligands[j]; // Construct a RDKit ROMol object. istringstream iss(lig); SDMolSupplier sup(&iss, false, true, false, true); assert(sup.length() == 1); assert(sup.atEnd()); const unique_ptr<ROMol> hit_ptr(sup.next()); auto& hitMol = *hit_ptr; // Calculate Morgan fingerprint. const unique_ptr<SparseIntVect<uint32_t>> hitFp(getFingerprint(hitMol, 2)); // Calculate Tanimoto similarity. const auto ts = TanimotoSimilarity(*qryFp, *hitFp); // Find heavy atoms. vector<vector<pair<int, int>>> matchVect; SubstructMatch(hitMol, *SubsetMols[0], matchVect); const auto num_matches = matchVect.size(); assert(num_matches == hitMol.getNumHeavyAtoms()); vector<int> hitHeavyAtoms(num_matches); for (size_t i = 0; i < num_matches; ++i) { hitHeavyAtoms[i] = matchVect[i].front().second; assert(hitHeavyAtoms[i] == i); // hitHeavyAtoms can be constructed using iota(hitHeavyAtoms.begin(), hitHeavyAtoms.end(), 0); because for RDKit-generated SDF molecules, heavy atom are always the first few atoms. } // Calculate the four reference points. 
const auto hitRefPoints = calcRefPoints(hitMol, hitHeavyAtoms); const Point3DConstPtrVect hitRefPointv {{ &hitRefPoints[0], &hitRefPoints[1], &hitRefPoints[2], &hitRefPoints[3], }}; // Calculate a 3D transform from the four reference points of the hit conformer to those of the query molecule. Transform3D trans; AlignPoints(qryRefPointv, hitRefPointv, trans); // Apply the 3D transform to all atoms of the hit conformer. auto& hitCnf = hitMol.getConformer(); transformConformer(hitCnf, trans); // Write the aligned hit conformer. hits_sdf.write(hitMol); // Calculate the secondary score of the saved conformer, which has the best primary score. const auto& d = features[j]; double s = 0; for (size_t i = 0; i < qnu1; ++i) { s += abs(q[i] - d[i]); } const auto u0score = 1 / (1 + scores[k] * qv[usr0]); // Primary score of the current molecule. const auto u1score = 1 / (1 + s * qv[usr1]); // Secondary score of the current molecule. const auto zincid = zincids[k].substr(0, 8); // Take another substr() to get rid of the trailing newline. const auto zfp = zfproperties[k]; const auto zip = ziproperties[k]; const auto smiles = smileses[k]; // A newline is already included in smileses[k]. const auto supplier = suppliers[k]; // A newline is already included in suppliers[k]. hits_csv << zincid << setprecision(8) << ',' << (usr1 ? u0score : u1score) << ',' << (usr1 ? u1score : u0score) << ',' << ts << setprecision(3) << ',' << zfp[0] << ',' << zfp[1] << ',' << zfp[2] << ',' << zfp[3] << ',' << zip[0] << ',' << zip[1] << ',' << zip[2] << ',' << zip[3] << ',' << zip[4] << ',' << smiles.substr(0, smiles.length() - 1) // Get rid of the trailing newline. << ',' << supplier.substr(0, supplier.length() - 1) // Get rid of the trailing newline. << '\n' ; } } // Update job status. 
cout << local_time() << "Setting completed time" << endl; const auto completed = milliseconds_since_epoch(); conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("completed" << completed << "nqueries" << num_queries))); // Calculate runtime in seconds and screening speed in million conformers per second. const auto runtime = (completed - started) * 0.001; const auto speed = num_conformers * 0.000001 * num_queries / runtime; cout << local_time() << "Completed " << num_queries << " " << (num_queries == 1 ? "query" : "queries") << " in " << setprecision(3) << runtime << " seconds" << endl << local_time() << "Screening speed was " << setprecision(0) << speed << " M conformers per second" << endl ; } }
static bool find_files(std::string pathname, const std::string &filename, string_array &files, bool recursive) { if (!pathname.empty()) { char c = pathname[pathname.size() - 1]; if ((c != ':') && (c != '\\') && (c != '/')) pathname += "\\"; } WIN32_FIND_DATAA find_data; HANDLE findHandle = FindFirstFileA((pathname + filename).c_str(), &find_data); if (findHandle == INVALID_HANDLE_VALUE) return false; do { const bool is_directory = (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0; const bool is_system = (find_data.dwFileAttributes & FILE_ATTRIBUTE_SYSTEM) != 0; const bool is_hidden = (find_data.dwFileAttributes & FILE_ATTRIBUTE_HIDDEN) != 0; std::string filename(find_data.cFileName); if ((!is_directory) && (!is_system) && (!is_hidden)) files.push_back(pathname + filename); } while (FindNextFileA(findHandle, &find_data)); FindClose(findHandle); if (recursive) { string_array paths; HANDLE findHandle = FindFirstFileA((pathname + "*").c_str(), &find_data); if (findHandle == INVALID_HANDLE_VALUE) return false; do { const bool is_directory = (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0; const bool is_system = (find_data.dwFileAttributes & FILE_ATTRIBUTE_SYSTEM) != 0; const bool is_hidden = (find_data.dwFileAttributes & FILE_ATTRIBUTE_HIDDEN) != 0; std::string filename(find_data.cFileName); if ((is_directory) && (!is_hidden) && (!is_system)) paths.push_back(filename); } while (FindNextFileA(findHandle, &find_data)); FindClose(findHandle); for (uint i = 0; i < paths.size(); i++) { const std::string &path = paths[i]; if (path[0] == '.') continue; if (!find_files(pathname + path, filename, files, true)) return false; } } return true; }
int main_internal(string_array cmd_line, int num_helper_threads, ilzham &lzham_dll) { comp_options options; options.m_max_helper_threads = num_helper_threads; #ifdef _XBOX options.m_dict_size_log2 = 21; #endif if (!cmd_line.size()) { print_usage(); return simple_test(lzham_dll, options); } enum op_mode_t { OP_MODE_INVALID = -1, OP_MODE_COMPRESS = 0, OP_MODE_DECOMPRESS = 1, OP_MODE_ALL = 2 }; op_mode_t op_mode = OP_MODE_INVALID; for (int i = 0; i < (int)cmd_line.size(); i++) { const std::string &str = cmd_line[i]; if (str[0] == '-') { if (str.size() < 2) { print_error("Invalid option: %s\n", str.c_str()); return EXIT_FAILURE; } switch (tolower(str[1])) { case 'u': { options.m_unbuffered_decompression = true; break; } case 'd': { int dict_size = atoi(str.c_str() + 2); if ((dict_size < LZHAM_MIN_DICT_SIZE_LOG2) || (dict_size > LZHAMTEST_MAX_POSSIBLE_DICT_SIZE)) { print_error("Invalid dictionary size: %s\n", str.c_str()); return EXIT_FAILURE; } options.m_dict_size_log2 = dict_size; break; } case 'm': { int comp_level = atoi(str.c_str() + 2); if ((comp_level < 0) || (comp_level > (int)LZHAM_COMP_LEVEL_UBER)) { print_error("Invalid compression level: %s\n", str.c_str()); return EXIT_FAILURE; } options.m_comp_level = static_cast<lzham_compress_level>(comp_level); break; } case 't': { int num_threads = atoi(str.c_str() + 2); if ((num_threads < 0) || (num_threads > LZHAM_MAX_HELPER_THREADS)) { print_error("Invalid number of helper threads: %s\n", str.c_str()); return EXIT_FAILURE; } options.m_max_helper_threads = num_threads; break; } case 'c': { options.m_compute_adler32_during_decomp = false; break; } case 'v': { options.m_verify_compressed_data = true; break; } case 'r': { options.m_randomize_params = true; break; } case 'p': { options.m_force_polar_codes = true; break; } case 'x': { options.m_extreme_parsing = true; break; } case 'e': { options.m_deterministic_parsing = true; break; } case 's': { int seed = atoi(str.c_str() + 2); srand(seed); printf("Using random seed: 
%i\n", seed); break; } default: { print_error("Invalid option: %s\n", str.c_str()); return EXIT_FAILURE; } } cmd_line.erase(cmd_line.begin() + i); i--; continue; } if (str.size() != 1) { print_error("Invalid mode: %s\n", str.c_str()); return EXIT_FAILURE; } switch (tolower(str[0])) { case 'c': { op_mode = OP_MODE_COMPRESS; break; } case 'd': { op_mode = OP_MODE_DECOMPRESS; break; } case 'a': { op_mode = OP_MODE_ALL; break; } default: { print_error("Invalid mode: %s\n", str.c_str()); return EXIT_FAILURE; } } cmd_line.erase(cmd_line.begin() + i); break; } if (op_mode == OP_MODE_INVALID) { print_error("No mode specified!\n"); print_usage(); return EXIT_FAILURE; } printf("Using options:\n"); options.print(); printf("\n"); int exit_status = EXIT_FAILURE; switch (op_mode) { case OP_MODE_COMPRESS: { if (cmd_line.size() < 2) { print_error("Must specify input and output filenames!\n"); return EXIT_FAILURE; } else if (cmd_line.size() > 2) { print_error("Too many filenames!\n"); return EXIT_FAILURE; } const std::string &src_file = cmd_line[0]; const std::string &cmp_file = cmd_line[1]; bool comp_result = compress_streaming(lzham_dll, src_file.c_str(), cmp_file.c_str(), options); if (comp_result) exit_status = EXIT_SUCCESS; if ((comp_result) && (options.m_verify_compressed_data)) { char decomp_file[256]; #ifdef _XBOX sprintf(decomp_file, "e:\\__decomp_temp_%u__.tmp", (uint)GetTickCount()); #else sprintf(decomp_file, "__decomp_temp_%u__.tmp", (uint)timer::get_ms()); #endif if (!decompress_file(lzham_dll, cmp_file.c_str(), decomp_file, options)) { print_error("Failed decompressing file \"%s\" to \"%s\"\n", cmp_file.c_str(), decomp_file); return EXIT_FAILURE; } printf("Comparing file \"%s\" to \"%s\"\n", decomp_file, src_file.c_str()); if (!compare_files(decomp_file, src_file.c_str())) { print_error("Failed comparing decompressed file data while compressing \"%s\" to \"%s\"\n", src_file.c_str(), cmp_file.c_str()); return EXIT_FAILURE; } else { printf("Decompressed file compared 
OK to original file.\n"); } remove(decomp_file); } break; } case OP_MODE_DECOMPRESS: { if (cmd_line.size() < 2) { print_error("Must specify input and output filenames!\n"); return EXIT_FAILURE; } else if (cmd_line.size() > 2) { print_error("Too many filenames!\n"); return EXIT_FAILURE; } if (decompress_file(lzham_dll, cmd_line[0].c_str(), cmd_line[1].c_str(), options)) exit_status = EXIT_SUCCESS; break; } case OP_MODE_ALL: { if (!cmd_line.size()) { print_error("No directory specified!\n"); return EXIT_FAILURE; } else if (cmd_line.size() != 1) { print_error("Too many filenames!\n"); return EXIT_FAILURE; } if (test_recursive(lzham_dll, cmd_line[0].c_str(), options)) exit_status = EXIT_SUCCESS; break; } default: { print_error("No mode specified!\n"); print_usage(); return EXIT_FAILURE; } } return exit_status; }
bool MyCmdLineParser::Parse(const string_array& args) { // first parse the input command line into entries, don't care if they are valid or not vector<string_array*> entries; string_array pureArgs; string_array* sa = NULL; int argc = args.size(); for ( int i = 1; i < argc; i++ ) { // hack to ignore mac os x bundle argument string arg = args[i]; #ifdef Q_CYGWIN_WIN arg = MyUtils::CygwinPathProof(args[i].c_str()).toStdString(); #endif if (arg.substr(0, 6) != "-psn_0" ) { if ( arg[0] == '-' && arg.length() > 1 && !IsNumber( arg[1] ) && arg[1] != '.' ) { sa = new string_array; sa->clear(); if ( arg.length() > 2 && arg[1] == '-' ) // long name { sa->push_back( arg.substr(2) ); } else { sa->push_back( arg.substr(1) ); } entries.push_back( sa ); } else if ( sa ) { sa->push_back( arg ); } else { pureArgs.push_back( arg ); } } } // m_cmdLineEntries.clear(); CmdLineEntry e; bool bSucceed = true; string error_msg = ""; for ( size_t i = 0; i < entries.size(); i++ ) { string_array strgs = *entries[i]; if ( !IsValid( strgs[0].c_str(), &e ) ) // && !IsValid( strgs[0].c_str() + 1, &e ) ) { bSucceed = false; error_msg += "Option '" + strgs[0] + "' not recognized."; break; } if ( e.type == CMD_LINE_OPTION ) { e.arguments.clear(); for ( size_t j = 1; j < strgs.size(); j++ ) { if ( j <= (size_t)e.maxArguments ) { e.arguments.push_back( strgs[j] ); } else { pureArgs.push_back( strgs[j] ); } } if ( (int)e.arguments.size() < e.minArguments ) { bSucceed = false; // cout << e.arguments.size() << " " << e.minArguments << "\n"; error_msg += "Argument missing for option '" + strgs[0] + "'."; } } else if ( e.type == CMD_LINE_SWITCH ) { for ( size_t j = 1; j < strgs.size(); j++ ) { pureArgs.push_back( strgs[j] ); } } m_cmdLineEntries.push_back( e ); } // release buffers for ( size_t i = 0; i < entries.size(); i++ ) { delete entries[i]; } entries.clear(); /* if ( bSucceed && (int)pureArgs.size() > m_nNumberOfPureArguments ) { bSucceed = false; error_msg += "Option '" + pureArgs[0] + "' not 
recognized."; } */ m_cmdLineFloatingArguments = pureArgs; if ( !bSucceed ) { PrintErrorMessage( error_msg ); } else if ( Found( "h" ) || Found( "help" ) ) { PrintHelp(); bSucceed = false; } return bSucceed; }