示例#1
0
static bool find_files(std::string pathname, const std::string &filename, string_array &files, bool recursive)
{
   if (!pathname.empty())
   {
      char c = pathname[pathname.size() - 1];
      if ((c != ':') && (c != '\\') && (c != '/'))
         pathname += "/";
   }

   DIR *dp = opendir(pathname.c_str());

   if (!dp)
      return false;

   string_array paths;

   for ( ; ; )
   {
      struct dirent *ep = readdir(dp);
      if (!ep)
         break;

      const bool is_directory = (ep->d_type & DT_DIR) != 0;
      const bool is_file =  (ep->d_type & DT_REG) != 0;

      if (ep->d_name[0] == '.')
         continue;

      std::string filename(ep->d_name);

      if (is_directory)
      {
         if (recursive)
            paths.push_back(filename);
      }
      else if (is_file)
         files.push_back(pathname + filename);
   }

   closedir(dp);
   dp = NULL;

   if (recursive)
   {
      for (uint i = 0; i < paths.size(); i++)
      {
         const std::string &path = paths[i];
         if (!find_files(pathname + path, filename, files, true))
            return false;
      }
   }

   return true;
}
示例#2
0
bool MyFb2File::GetGenres(string_array &genres)
{
  bool retVal = false;
  int pos = 0;
  if (m_titleInfo.length() == 0)
  {
    m_titleInfo = GetXmlTag(titleInfoTag, pos);
  }
  if (m_titleInfo.length() == 0)
  {
    return retVal;
  }

  genres.clear();

  pos = 0;
  QString curGenre = tr("");
  while ((curGenre = GetXmlTag(genreTag, m_titleInfo, pos)).length() > 0)
  {
    int comma = curGenre.indexOf(',');
    if (comma != -1)
    {
      int pos = 0;
      QString g = tr("");
      while ((comma = curGenre.indexOf(',', pos)) != -1)
      {
        int beg = pos, end = comma;
        while (curGenre.mid(beg, 1) == tr(" "))
        {
          beg++;
        }
        while (curGenre.mid(end - 1, 1) == tr(" "))
        {
          end--;
        }
        if ((end - beg) > 0)
        {
          g = curGenre.mid(beg, end - beg);
          genres.push_back(Utf8toAnsi(g));
        }
        pos = comma + 1;
      }
      int beg = pos, end = curGenre.length();
      while (curGenre.mid(beg, 1) == tr(" "))
      {
        beg++;
      }
      while (curGenre.mid(end - 1, 1) == tr(" "))
      {
        end--;
      }
      if ((end - beg) > 0)
      {
        g = curGenre.mid(beg, end - beg);
        genres.push_back(Utf8toAnsi(g));
      }
    }
    else
    {
      genres.push_back(Utf8toAnsi(curGenre));
    }
    retVal = true;
  }

  return retVal;

}
示例#3
0
int main(int argc, char* argv[])
{
	// Check the required number of command line arguments.
	if (argc != 5)
	{
		cout << "usr host user pwd jobs_path" << endl;
		return 0;
	}

	// Fetch command line arguments.
	const auto host = argv[1];
	const auto user = argv[2];
	const auto pwd = argv[3];
	const path jobs_path = argv[4];

	// Connect to host and authenticate user.
	DBClientConnection conn;
	{
		cout << local_time() << "Connecting to " << host << " and authenticating " << user << endl;
		string errmsg;
		if ((!conn.connect(host, errmsg)) || (!conn.auth("istar", user, pwd, errmsg)))
		{
			cerr << local_time() << errmsg << endl;
			return 1;
		}
	}

	// Initialize constants.
	cout << local_time() << "Initializing" << endl;
	const auto collection = "istar.usr";
	const auto epoch = date(1970, 1, 1);
	const size_t num_usrs = 2;
	constexpr array<size_t, num_usrs> qn{{ 12, 60 }};
	constexpr array<double, num_usrs> qv{{ 1.0 / qn[0], 1.0 / qn[1] }};
	const size_t num_references = 4;
	const size_t num_subsets = 5;
	const array<string, num_subsets> SubsetSMARTS
	{{
		"[!#1]", // heavy
		"[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]", // hydrophobic
		"[a]", // aromatic
		"[$([O,S;H1;v2]-[!$(*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),$([N&v3;H1,H2]-[!$(*=[O,N,P,S])]),$([N;v3;H0]),$([n,o,s;+0]),F]", // acceptor
		"[N!H0v3,N!H0+v4,OH+0,SH+0,nH+0]", // donor
	}};

	// Initialize variables.
	array<array<double, qn.back()>, 1> qw;
	array<array<double, qn.back()>, 1> lw;
	auto q = qw[0];
	auto l = lw[0];

	// Read ZINC ID file.
	const string_array<size_t> zincids("16_zincid.txt");
	const auto num_ligands = zincids.size();

	// Read SMILES file.
	const string_array<size_t> smileses("16_smiles.txt");
	assert(smileses.size() == num_ligands);

	// Read supplier file.
	const string_array<size_t> suppliers("16_supplier.txt");
	assert(suppliers.size() == num_ligands);

	// Read property files of floating point types and integer types.
	const auto zfproperties = read<array<float, 4>>("16_zfprop.f32");
	assert(zfproperties.size() == num_ligands);
	const auto ziproperties = read<array<int16_t, 5>>("16_ziprop.i16");
	assert(ziproperties.size() == num_ligands);

	// Open files for subsequent reading.
	std::ifstream usrcat_bin("16_usrcat.f64");
	stream_array<size_t> ligands("16_ligand.pdbqt");
	assert(ligands.size() == num_ligands);
	array<vector<double>, 2> scores
	{{
		vector<double>(num_ligands, 0),
		vector<double>(num_ligands, 0)
	}};
	const auto& u0scores = scores[0];
	const auto& u1scores = scores[1];
	vector<size_t> scase(num_ligands);

	// Enter event loop.
	cout << local_time() << "Entering event loop" << endl;
	bool sleeping = false;
	while (true)
	{
		// Fetch an incompleted job in a first-come-first-served manner.
		if (!sleeping) cout << local_time() << "Fetching an incompleted job" << endl;
		BSONObj info;
		conn.runCommand("istar", BSON("findandmodify" << "usr" << "query" << BSON("done" << BSON("$exists" << false) << "started" << BSON("$exists" << false)) << "sort" << BSON("submitted" << 1) << "update" << BSON("$set" << BSON("started" << Date_t(duration_cast<std::chrono::milliseconds>(system_clock::now().time_since_epoch()).count())))), info); // conn.findAndModify() is available since MongoDB C++ Driver legacy-1.0.0
		const auto value = info["value"];
		if (value.isNull())
		{
			// No incompleted jobs. Sleep for a while.
			if (!sleeping) cout << local_time() << "Sleeping" << endl;
			sleeping = true;
			this_thread::sleep_for(chrono::seconds(10));
			continue;
		}
		sleeping = false;
		const auto job = value.Obj();

		// Obtain job properties.
		const auto _id = job["_id"].OID();
		cout << local_time() << "Executing job " << _id.str() << endl;
		const auto job_path = jobs_path / _id.str();
		const auto format = job["format"].String();
		const auto email = job["email"].String();

		// Parse the user-supplied ligand.
		OBMol obMol;
		OBConversion obConversion;
		obConversion.SetInFormat(format.c_str());
		obConversion.ReadFile(&obMol, (job_path / ("ligand." + format)).string());
		const auto num_atoms = obMol.NumAtoms();
//		obMol.AddHydrogens(); // Adding hydrogens does not seem to affect SMARTS matching.

		// Classify subset atoms.
		array<vector<int>, num_subsets> subsets;
		for (size_t k = 0; k < num_subsets; ++k)
		{
			auto& subset = subsets[k];
			subset.reserve(num_atoms);
			OBSmartsPattern smarts;
			smarts.Init(SubsetSMARTS[k]);
			smarts.Match(obMol);
			for (const auto& map : smarts.GetMapList())
			{
				subset.push_back(map.front());
			}
		}
		const auto& subset0 = subsets.front();

		// Check user-provided ligand validity.
		if (subset0.empty())
		{
			// Record job completion time stamp.
			const auto millis_since_epoch = duration_cast<std::chrono::milliseconds>(system_clock::now().time_since_epoch()).count();
			conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("done" << Date_t(millis_since_epoch))));

			// Send error notification email.
			cout << local_time() << "Sending an error notification email to " << email << endl;
			MailMessage message;
			message.setSender("usr <*****@*****.**>");
			message.setSubject("Your usr job has failed");
			message.setContent("Description: " + job["description"].String() + "\nSubmitted: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(job["submitted"].Date().millis))) + " UTC\nFailed: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(millis_since_epoch))) + " UTC\nReason: failed to parse the provided ligand.");
			message.addRecipient(MailRecipient(MailRecipient::PRIMARY_RECIPIENT, email));
			SMTPClientSession session("137.189.91.190");
			session.login();
			session.sendMessage(message);
			session.close();
			continue;
		}

		// Calculate the four reference points.
		const auto n = subset0.size();
		const auto v = 1.0 / n;
		array<vector3, num_references> references{};
		auto& ctd = references[0];
		auto& cst = references[1];
		auto& fct = references[2];
		auto& ftf = references[3];
		for (const auto i : subset0)
		{
			ctd += obMol.GetAtom(i)->GetVector();
		}
		ctd *= v;
		double cst_dist = numeric_limits<double>::max();
		double fct_dist = numeric_limits<double>::lowest();
		double ftf_dist = numeric_limits<double>::lowest();
		for (const auto i : subset0)
		{
			const auto& a = obMol.GetAtom(i)->GetVector();
			const auto this_dist = a.distSq(ctd);
			if (this_dist < cst_dist)
			{
				cst = a;
				cst_dist = this_dist;
			}
			if (this_dist > fct_dist)
			{
				fct = a;
				fct_dist = this_dist;
			}
		}
		for (const auto i : subset0)
		{
			const auto& a = obMol.GetAtom(i)->GetVector();
			const auto this_dist = a.distSq(fct);
			if (this_dist > ftf_dist)
			{
				ftf = a;
				ftf_dist = this_dist;
			}
		}

		// Precalculate the distances between each atom and each reference point.
		array<vector<double>, num_references> dista;
		for (size_t k = 0; k < num_references; ++k)
		{
			const auto& reference = references[k];
			auto& dists = dista[k];
			dists.resize(1 + num_atoms); // OpenBabel atom index starts from 1. dists[0] is dummy.
			for (size_t i = 0; i < n; ++i)
			{
				dists[subset0[i]] = sqrt(obMol.GetAtom(subset0[i])->GetVector().distSq(reference));
			}
		}

		// Calculate USR and USRCAT features of the input ligand.
		size_t qo = 0;
		for (const auto& subset : subsets)
		{
			const auto n = subset.size();
			for (size_t k = 0; k < num_references; ++k)
			{
				const auto& distp = dista[k];
				vector<double> dists(n);
				for (size_t i = 0; i < n; ++i)
				{
					dists[i] = distp[subset[i]];
				}
				array<double, 3> m{};
				if (n > 2)
				{
					const auto v = 1.0 / n;
					for (size_t i = 0; i < n; ++i)
					{
						const auto d = dists[i];
						m[0] += d;
					}
					m[0] *= v;
					for (size_t i = 0; i < n; ++i)
					{
						const auto d = dists[i] - m[0];
						m[1] += d * d;
					}
					m[1] = sqrt(m[1] * v);
					for (size_t i = 0; i < n; ++i)
					{
						const auto d = dists[i] - m[0];
						m[2] += d * d * d;
					}
					m[2] = cbrt(m[2] * v);
				}
				else if (n == 2)
				{
					m[0] = 0.5 *     (dists[0] + dists[1]);
					m[1] = 0.5 * fabs(dists[0] - dists[1]);
				}
				else if (n == 1)
				{
					m[0] = dists[0];
				}
				#pragma unroll
				for (const auto e : m)
				{
					q[qo++] = e;
				}
			}
		}
		assert(qo == qn.back());

		// Compute USR and USRCAT scores.
		usrcat_bin.seekg(0);
		for (size_t k = 0; k < num_ligands; ++k)
		{
			usrcat_bin.read(reinterpret_cast<char*>(l.data()), sizeof(l));
			double s = 0;
			#pragma unroll
			for (size_t i = 0, u = 0; u < num_usrs; ++u)
			{
				#pragma unroll
				for (const auto qnu = qn[u]; i < qnu; ++i)
				{
					s += fabs(q[i] - l[i]);
				}
				scores[u][k] = s;
			}
		}
		assert(usrcat_bin.tellg() == sizeof(l) * num_ligands);

		// Sort ligands by USRCAT score and then by USR score and then by ZINC ID.
		iota(scase.begin(), scase.end(), 0);
		sort(scase.begin(), scase.end(), [&](const size_t val0, const size_t val1)
		{
			const auto u1score0 = u1scores[val0];
			const auto u1score1 = u1scores[val1];
			if (u1score0 == u1score1)
			{
				const auto u0score0 = u0scores[val0];
				const auto u0score1 = u0scores[val1];
				if (u0score0 == u0score1)
				{
					return zincids[val0] < zincids[val1];
				}
				return u0score0 < u0score1;
			}
			return u1score0 < u1score1;
		});

		// Write results.
		filtering_ostream log_csv_gz;
		log_csv_gz.push(gzip_compressor());
		log_csv_gz.push(file_sink((job_path / "log.csv.gz").string()));
		log_csv_gz.setf(ios::fixed, ios::floatfield);
		log_csv_gz << "ZINC ID,USR score,USRCAT score\n" << setprecision(8);
		filtering_ostream ligands_pdbqt_gz;
		ligands_pdbqt_gz.push(gzip_compressor());
		ligands_pdbqt_gz.push(file_sink((job_path / "ligands.pdbqt.gz").string()));
		ligands_pdbqt_gz.setf(ios::fixed, ios::floatfield);
		for (size_t t = 0; t < 10000; ++t)
		{
			const size_t k = scase[t];
			const auto zincid = zincids[k].substr(0, 8); // Take another substr() to get rid of the trailing newline.
			const auto u0score = 1 / (1 + scores[0][k] * qv[0]);
			const auto u1score = 1 / (1 + scores[1][k] * qv[1]);
			log_csv_gz << zincid << ',' << u0score << ',' << u1score << '\n';

			// Only write conformations of the top ligands to ligands.pdbqt.gz.
			if (t >= 1000) continue;

			const auto zfp = zfproperties[k];
			const auto zip = ziproperties[k];
			ligands_pdbqt_gz
				<< "MODEL " << '\n'
				<< "REMARK 911 " << zincid
				<< setprecision(3)
				<< ' ' << setw(8) << zfp[0]
				<< ' ' << setw(8) << zfp[1]
				<< ' ' << setw(8) << zfp[2]
				<< ' ' << setw(8) << zfp[3]
				<< ' ' << setw(3) << zip[0]
				<< ' ' << setw(3) << zip[1]
				<< ' ' << setw(3) << zip[2]
				<< ' ' << setw(3) << zip[3]
				<< ' ' << setw(3) << zip[4]
				<< '\n'
				<< "REMARK 912 " << smileses[k]  // A newline is already included in smileses[k].
				<< "REMARK 913 " << suppliers[k] // A newline is already included in suppliers[k].
				<< setprecision(8)
				<< "REMARK 951    USR SCORE: " << setw(10) << u0score << '\n'
				<< "REMARK 952 USRCAT SCORE: " << setw(10) << u1score << '\n'
			;
			const auto lig = ligands[k];
			ligands_pdbqt_gz.write(lig.data(), lig.size());
			ligands_pdbqt_gz << "ENDMDL\n";
		}

		// Update progress.
		cout << local_time() << "Setting done time" << endl;
		const auto millis_since_epoch = duration_cast<std::chrono::milliseconds>(system_clock::now().time_since_epoch()).count();
		conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("done" << Date_t(millis_since_epoch))));

		// Send completion notification email.
		cout << local_time() << "Sending a completion notification email to " << email << endl;
		MailMessage message;
		message.setSender("istar <*****@*****.**>");
		message.setSubject("Your usr job has completed");
		message.setContent("Description: " + job["description"].String() + "\nSubmitted: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(job["submitted"].Date().millis))) + " UTC\nCompleted: " + to_simple_string(ptime(epoch, boost::posix_time::milliseconds(millis_since_epoch))) + " UTC\nResult: http://istar.cse.cuhk.edu.hk/usr/iview/?" + _id.str());
		message.addRecipient(MailRecipient(MailRecipient::PRIMARY_RECIPIENT, email));
		SMTPClientSession session("137.189.91.190");
		session.login();
		session.sendMessage(message);
		session.close();
	}
}
示例#4
0
// Daemon entry point of the "usr2" screening service.
//
// Command line: usr host user pwd jobs_path
//
// After connecting to the database and loading the library files from the
// "16/" directory, the program loops forever:
//   1. atomically claims the oldest job that has no "started" field,
//   2. reads the query molecule from jobs_path/<job id>/query.sdf,
//   3. computes its 60 feature values (3 moments x 4 reference points x 5 atom subsets),
//   4. scores every library conformer in parallel chunks and keeps, for each
//      molecule, the conformer with the best primary score,
//   5. writes query.svg, hits.sdf and hits.csv for the top hits into
//      jobs_path/<job id>/<query number>/,
//   6. records the completion time in the database.
//
// Returns 0 after printing usage when the argument count is wrong, 1 when the
// database connection or authentication fails; otherwise it never returns.
int main(int argc, char* argv[])
{
	// Check the required number of command line arguments.
	if (argc != 5)
	{
		cout << "usr host user pwd jobs_path" << endl;
		return 0;
	}

	// Fetch command line arguments.
	const auto host = argv[1];
	const auto user = argv[2];
	const auto pwd = argv[3];
	const path jobs_path = argv[4];

	DBClientConnection conn;
	{
		// Connect to host and authenticate user.
		cout << local_time() << "Connecting to " << host << " and authenticating " << user << endl;
		string errmsg;
		if ((!conn.connect(host, errmsg)) || (!conn.auth("istar", user, pwd, errmsg)))
		{
			cerr << local_time() << errmsg << endl;
			return 1;
		}
	}

	// Initialize constants.
	cout << local_time() << "Initializing" << endl;
	const auto collection = "istar.usr2";
	const size_t num_usrs = 2;
	const array<string, 2> usr_names{{ "USR", "USRCAT" }};
	constexpr array<size_t, num_usrs> qn{{ 12, 60 }}; // Number of features used by each of the two scores.
	constexpr array<double, num_usrs> qv{{ 1.0 / qn[0], 1.0 / qn[1] }}; // Reciprocals, used to normalize the summed distances.
	const size_t num_refPoints = 4;
	const size_t num_subsets = 5;
	const array<string, num_subsets> SubsetSMARTS
	{{
		"[!#1]", // heavy
		"[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]", // hydrophobic
		"[a]", // aromatic
		"[$([O,S;H1;v2]-[!$(*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),$([N&v3;H1,H2]-[!$(*=[O,N,P,S])]),$([N;v3;H0]),$([n,o,s;+0]),F]", // acceptor
		"[N!H0v3,N!H0+v4,OH+0,SH+0,nH+0]", // donor
	}};
	const size_t num_hits = 100;

	// Wrap SMARTS strings to RWMol objects.
	array<unique_ptr<ROMol>, num_subsets> SubsetMols;
	for (size_t k = 0; k < num_subsets; ++k)
	{
		SubsetMols[k].reset(reinterpret_cast<ROMol*>(SmartsToMol(SubsetSMARTS[k])));
	}

	// Read ZINC ID file.
	const string_array<size_t> zincids("16/zincid.txt");
	const auto num_ligands = zincids.size();
	cout << local_time() << "Found " << num_ligands << " database molecules" << endl;

	// Read SMILES file.
	const string_array<size_t> smileses("16/smiles.txt");
	assert(smileses.size() == num_ligands);

	// Read supplier file.
	const string_array<size_t> suppliers("16/supplier.txt");
	assert(suppliers.size() == num_ligands);

	// Read property files of floating point types and integer types.
	const auto zfproperties = read<array<float, 4>>("16/zfprop.f32");
	assert(zfproperties.size() == num_ligands);
	const auto ziproperties = read<array<int16_t, 5>>("16/ziprop.i16");
	assert(ziproperties.size() == num_ligands);

	// Read cumulative number of conformers file.
	// mconfss[k] is the index one past the last conformer of molecule k.
	const auto mconfss = read<size_t>("16/mconfs.u64");
	const auto num_conformers = mconfss.back();
	assert(mconfss.size() == num_ligands);
	assert(num_conformers >= num_ligands);
	cout << local_time() << "Found " << num_conformers << " database conformers" << endl;

	// Read feature file.
	const auto features = read<array<double, qn.back()>>("16/usrcat.f64");
	assert(features.size() == num_conformers);

	// Read ligand footer file and open ligand SDF file for seeking and reading.
	stream_array<size_t> ligands("16/ligand.sdf");
	assert(ligands.size() == num_conformers);

	// Initialize variables.
	array<vector<int>, num_subsets> subsets;
	array<vector<double>, num_refPoints> dista;
	alignas(32) array<double, qn.back()> q;

	// Initialize vectors to store compounds' primary score and their corresponding conformer.
	vector<double> scores(num_ligands); // Primary score of molecules.
	vector<size_t> cnfids(num_ligands); // ID of conformer with the best primary score.
	const auto compare = [&](const size_t val0, const size_t val1) // Sort by the primary score.
	{
		return scores[val0] < scores[val1];
	};

	// Initialize an io service pool and create worker threads for later use.
	// hardware_concurrency() is allowed to return 0 when the value cannot be determined;
	// fall back to one thread so that the chunk size computation below never divides by zero.
	const size_t num_threads = max<size_t>(1, thread::hardware_concurrency());
	cout << local_time() << "Creating an io service pool of " << num_threads << " worker threads" << endl;
	io_service_pool io(num_threads);
	safe_counter<size_t> cnt;

	// Initialize the number of chunks and the number of molecules per chunk.
	const auto num_chunks = num_threads << 4;
	const auto chunk_size = 1 + (num_ligands - 1) / num_chunks;
	assert(chunk_size * num_chunks >= num_ligands);
	assert(chunk_size >= num_hits);
	cout << local_time() << "Using " << num_chunks << " chunks and a chunk size of " << chunk_size << endl;
	vector<size_t> scase(num_ligands); // Molecule indexes, sorted chunk by chunk.
	vector<size_t> zcase(num_hits * (num_chunks - 1) + min(num_hits, num_ligands - chunk_size * (num_chunks - 1))); // The last chunk might have fewer than num_hits records.

	// Enter event loop.
	cout << local_time() << "Entering event loop" << endl;
	cout.setf(ios::fixed, ios::floatfield);
	bool sleeping = false; // Suppresses repeated log lines while idle.
	while (true)
	{
		// Fetch an incompleted job in a first-come-first-served manner.
		if (!sleeping) cout << local_time() << "Fetching an incompleted job" << endl;
		BSONObj info;
		const auto started = milliseconds_since_epoch();
		conn.runCommand("istar", BSON("findandmodify" << "usr2" << "query" << BSON("started" << BSON("$exists" << false)) << "sort" << BSON("submitted" << 1) << "update" << BSON("$set" << BSON("started" << started))), info); // conn.findAndModify() is available since MongoDB C++ Driver legacy-1.0.0
		const auto value = info["value"];
		if (value.isNull())
		{
			// No incompleted jobs. Sleep for a while.
			if (!sleeping) cout << local_time() << "Sleeping" << endl;
			sleeping = true;
			this_thread::sleep_for(chrono::seconds(2));
			continue;
		}
		sleeping = false;
		const auto job = value.Obj();

		// Obtain job properties.
		const auto _id = job["_id"].OID();
		cout << local_time() << "Executing job " << _id.str() << endl;
		const auto job_path = jobs_path / _id.str();
		const size_t usr0 = job["usr"].Int(); // Specify the primary sorting score. 0: USR; 1: USRCAT.
		assert(usr0 == 0 || usr0 == 1);
		const auto usr1 = usr0 ^ 1; // Index of the secondary score.
		const auto qnu0 = qn[usr0];
		const auto qnu1 = qn[usr1];

		// Read and validate the user-supplied SDF file.
		cout << local_time() << "Reading and validating the query file" << endl;
		SDMolSupplier sup((job_path / "query.sdf").string(), true, false, true); // sanitize, removeHs, strictParsing
		if (!sup.length() || !sup.atEnd())
		{
			const auto error = 1;
			cout << local_time() << "Failed to parse the query file, error code = " << error << endl;
			conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("completed" << milliseconds_since_epoch() << "error" << error)));
			continue;
		}

		// Process each of the query molecules sequentially.
		const auto num_queries = 1; // Restrict the number of query molecules to 1. Setting num_queries = sup.length() to execute any number of query molecules.
		bool query_failed = false; // Set when a query molecule cannot be parsed.
		for (unsigned int query_number = 0; query_number < num_queries; ++query_number)
		{
			cout << local_time() << "Parsing query molecule " << query_number << endl;
			const unique_ptr<ROMol> qry_ptr(sup.next()); // Calling next() may print "ERROR: Could not sanitize molecule on line XXXX" to stderr.
			if (!qry_ptr)
			{
				// The molecule could not be parsed or sanitized. Report it the same way as an unreadable query file instead of dereferencing a null pointer.
				const auto error = 1;
				cout << local_time() << "Failed to parse query molecule " << query_number << ", error code = " << error << endl;
				conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("completed" << milliseconds_since_epoch() << "error" << error)));
				query_failed = true;
				break;
			}
			auto& qryMol = *qry_ptr;

			// Get the number of atoms, including and excluding hydrogens.
			const auto num_atoms = qryMol.getNumAtoms();
			const auto num_heavy_atoms = qryMol.getNumHeavyAtoms();
			assert(num_heavy_atoms);
			cout << local_time() << "Found " << num_atoms << " atoms and " << num_heavy_atoms << " heavy atoms" << endl;

			// Create an output directory.
			cout << local_time() << "Creating output directory" << endl;
			const auto output_dir = job_path / to_string(query_number);
			create_directory(output_dir);

			// Draw a SVG.
			cout << local_time() << "Drawing a SVG" << endl;
			{
				const unique_ptr<ROMol> qrz_ptr(removeHs(qryMol));
				auto& qrzMol = *qrz_ptr;
				compute2DCoords(qrzMol);
				boost::filesystem::ofstream ofs(output_dir / "query.svg");
				ofs << DrawingToSVG(MolToDrawing(qrzMol));
			}

			// Calculate Morgan fingerprint.
			cout << local_time() << "Calculating Morgan fingerprint" << endl;
			const unique_ptr<SparseIntVect<uint32_t>> qryFp(getFingerprint(qryMol, 2));

			// Classify atoms to pharmacophoric subsets.
			cout << local_time() << "Classifying atoms into subsets" << endl;
			for (size_t k = 0; k < num_subsets; ++k)
			{
				vector<vector<pair<int, int>>> matchVect;
				SubstructMatch(qryMol, *SubsetMols[k], matchVect);
				const auto num_matches = matchVect.size();
				auto& subset = subsets[k];
				subset.resize(num_matches);
				for (size_t i = 0; i < num_matches; ++i)
				{
					subset[i] = matchVect[i].front().second;
				}
				cout << local_time() << "Found " << num_matches << " atoms for subset " << k << endl;
			}
			const auto& subset0 = subsets.front();
			assert(subset0.size() == num_heavy_atoms);

			// Calculate the four reference points.
			cout << local_time() << "Calculating " << num_refPoints << " reference points" << endl;
			const auto qryRefPoints = calcRefPoints(qryMol, subset0);
			const Point3DConstPtrVect qryRefPointv
			{{
				&qryRefPoints[0],
				&qryRefPoints[1],
				&qryRefPoints[2],
				&qryRefPoints[3],
			}};

			// Precalculate the distances of heavy atoms to the reference points, given that subsets[1 to 4] are subsets of subsets[0].
			cout << local_time() << "Calculating " << num_heavy_atoms * num_refPoints << " pairwise distances" << endl;
			const auto& qryCnf = qryMol.getConformer();
			for (size_t k = 0; k < num_refPoints; ++k)
			{
				const auto& refPoint = qryRefPoints[k];
				auto& distp = dista[k];
				distp.resize(num_atoms);
				for (size_t i = 0; i < num_heavy_atoms; ++i)
				{
					distp[subset0[i]] = sqrt(dist2(qryCnf.getAtomPos(subset0[i]), refPoint));
				}
			}

			// Loop over pharmacophoric subsets and reference points.
			cout << local_time() << "Calculating " << 3 * num_refPoints * num_subsets << " moments of USRCAT feature" << endl;
			size_t qo = 0;
			for (const auto& subset : subsets)
			{
				const auto n = subset.size();
				for (size_t k = 0; k < num_refPoints; ++k)
				{
					// Load distances from precalculated ones.
					const auto& distp = dista[k];
					vector<double> dists(n);
					for (size_t i = 0; i < n; ++i)
					{
						dists[i] = distp[subset[i]];
					}

					// Compute moments: the mean, the square root of the second central moment
					// and the cube root of the third central moment.
					array<double, 3> m{};
					if (n > 2)
					{
						const auto v = 1.0 / n;
						for (size_t i = 0; i < n; ++i)
						{
							const auto d = dists[i];
							m[0] += d;
						}
						m[0] *= v;
						for (size_t i = 0; i < n; ++i)
						{
							const auto d = dists[i] - m[0];
							m[1] += d * d;
						}
						m[1] = sqrt(m[1] * v);
						for (size_t i = 0; i < n; ++i)
						{
							const auto d = dists[i] - m[0];
							m[2] += d * d * d;
						}
						m[2] = cbrt(m[2] * v);
					}
					else if (n == 2)
					{
						// Closed forms for two samples; the third moment stays zero.
						m[0] = 0.5 *     (dists[0] + dists[1]);
						m[1] = 0.5 * fabs(dists[0] - dists[1]);
					}
					else if (n == 1)
					{
						m[0] = dists[0];
					}
					for (const auto e : m)
					{
						q[qo++] = e;
					}
				}
			}
			assert(qo == qn.back());

			// Compute USR and USRCAT scores.
			cout << local_time() << "Calculating " << num_ligands << " " << usr_names[usr0] << " scores" << endl;
			scores.assign(scores.size(), numeric_limits<double>::max());
			iota(scase.begin(), scase.end(), 0);
			cnt.init(num_chunks);
			for (size_t l = 0; l < num_chunks; ++l)
			{
				// Each task works on its own index range [chunk_beg, chunk_end) of scores, cnfids and scase,
				// and on its own num_hits-wide slot of zcase.
				io.post([&,l]()
				{
					// Loop over molecules of the current chunk.
					const auto chunk_beg = chunk_size * l;
					const auto chunk_end = min(chunk_beg + chunk_size, num_ligands);
					for (size_t k = chunk_beg; k < chunk_end; ++k)
					{
						// Loop over conformers of the current molecule and calculate their primary score.
						auto& scorek = scores[k];
						size_t j = k ? mconfss[k - 1] : 0;
						for (const auto mconfs = mconfss[k]; j < mconfs; ++j)
						{
							const auto& d = features[j];
							double s = 0;
							for (size_t i = 0; i < qnu0; ++i)
							{
								s += abs(q[i] - d[i]);
								if (s >= scorek) break; // Already no better than the best conformer so far.
							}
							if (s < scorek)
							{
								scorek = s;
								cnfids[k] = j;
							}
						}
					}

					// Sort the scores of molecules of the current chunk.
					sort(scase.begin() + chunk_beg, scase.begin() + chunk_end, compare);

					// Copy the indexes of top hits of the current chunk to a global vector for final sorting.
					copy_n(scase.begin() + chunk_beg, min(num_hits, chunk_end - chunk_beg), zcase.begin() + num_hits * l);

					cnt.increment();
				});
			}
			cnt.wait();

			// Sort the top hits from chunks.
			cout << local_time() << "Sorting " << zcase.size() << " hits by " << usr_names[usr0] << " score" << endl;
			sort(zcase.begin(), zcase.end(), compare);

			// Create output directory and write output files.
			cout << local_time() << "Writing output files" << endl;
			SDWriter hits_sdf((output_dir / "hits.sdf").string());
			boost::filesystem::ofstream hits_csv(output_dir / "hits.csv");
			hits_csv.setf(ios::fixed, ios::floatfield);
			hits_csv << "ZINC ID,USR score,USRCAT score,2D Tanimoto score,Molecular weight (g/mol),Partition coefficient xlogP,Apolar desolvation (kcal/mol),Polar desolvation (kcal/mol),Hydrogen bond donors,Hydrogen bond acceptors,Polar surface area tPSA (Å^2),Net charge,Rotatable bonds,SMILES,Vendors and annotations\n";
			// Never read past the end of zcase when fewer than num_hits candidates are available.
			const auto num_written_hits = min(num_hits, zcase.size());
			for (size_t l = 0; l < num_written_hits; ++l)
			{
				// Obtain indexes to the hit molecule and the hit conformer.
				const auto k = zcase[l];
				const auto j = cnfids[k];

				// Read SDF content of the hit conformer.
				const auto lig = ligands[j];

				// Construct a RDKit ROMol object.
				istringstream iss(lig);
				SDMolSupplier hitSup(&iss, false, true, false, true);
				assert(hitSup.length() == 1);
				assert(hitSup.atEnd());
				const unique_ptr<ROMol> hit_ptr(hitSup.next());
				auto& hitMol = *hit_ptr;

				// Calculate Morgan fingerprint.
				const unique_ptr<SparseIntVect<uint32_t>> hitFp(getFingerprint(hitMol, 2));

				// Calculate Tanimoto similarity.
				const auto ts = TanimotoSimilarity(*qryFp, *hitFp);

				// Find heavy atoms.
				vector<vector<pair<int, int>>> matchVect;
				SubstructMatch(hitMol, *SubsetMols[0], matchVect);
				const auto num_matches = matchVect.size();
				assert(num_matches == hitMol.getNumHeavyAtoms());
				vector<int> hitHeavyAtoms(num_matches);
				for (size_t i = 0; i < num_matches; ++i)
				{
					hitHeavyAtoms[i] = matchVect[i].front().second;
					assert(hitHeavyAtoms[i] == i); // hitHeavyAtoms can be constructed using iota(hitHeavyAtoms.begin(), hitHeavyAtoms.end(), 0); because for RDKit-generated SDF molecules, heavy atom are always the first few atoms.
				}

				// Calculate the four reference points.
				const auto hitRefPoints = calcRefPoints(hitMol, hitHeavyAtoms);
				const Point3DConstPtrVect hitRefPointv
				{{
					&hitRefPoints[0],
					&hitRefPoints[1],
					&hitRefPoints[2],
					&hitRefPoints[3],
				}};

				// Calculate a 3D transform from the four reference points of the hit conformer to those of the query molecule.
				Transform3D trans;
				AlignPoints(qryRefPointv, hitRefPointv, trans);

				// Apply the 3D transform to all atoms of the hit conformer.
				auto& hitCnf = hitMol.getConformer();
				transformConformer(hitCnf, trans);

				// Write the aligned hit conformer.
				hits_sdf.write(hitMol);

				// Calculate the secondary score of the saved conformer, which has the best primary score.
				const auto& d = features[j];
				double s = 0;
				for (size_t i = 0; i < qnu1; ++i)
				{
					s += abs(q[i] - d[i]);
				}

				const auto u0score = 1 / (1 + scores[k] * qv[usr0]); // Primary score of the current molecule.
				const auto u1score = 1 / (1 + s         * qv[usr1]); // Secondary score of the current molecule.
				const auto zincid = zincids[k].substr(0, 8); // Take another substr() to get rid of the trailing newline.
				const auto zfp = zfproperties[k];
				const auto zip = ziproperties[k];
				const auto smiles = smileses[k];    // A newline is already included in smileses[k].
				const auto supplier = suppliers[k]; // A newline is already included in suppliers[k].
				// The USR column always precedes the USRCAT column, whichever of the two is the primary score.
				hits_csv
					<< zincid
					<< setprecision(8)
					<< ',' << (usr1 ? u0score : u1score)
					<< ',' << (usr1 ? u1score : u0score)
					<< ',' << ts
					<< setprecision(3)
					<< ',' << zfp[0]
					<< ',' << zfp[1]
					<< ',' << zfp[2]
					<< ',' << zfp[3]
					<< ',' << zip[0]
					<< ',' << zip[1]
					<< ',' << zip[2]
					<< ',' << zip[3]
					<< ',' << zip[4]
					<< ',' << smiles.substr(0, smiles.length() - 1)     // Get rid of the trailing newline.
					<< ',' << supplier.substr(0, supplier.length() - 1) // Get rid of the trailing newline.
					<< '\n'
				;
			}
		}

		// The failure has already been recorded in the database; move on to the next job.
		if (query_failed) continue;

		// Update job status.
		cout << local_time() << "Setting completed time" << endl;
		const auto completed = milliseconds_since_epoch();
		conn.update(collection, BSON("_id" << _id), BSON("$set" << BSON("completed" << completed << "nqueries" << num_queries)));

		// Calculate runtime in seconds and screening speed in million conformers per second.
		const auto runtime = (completed - started) * 0.001;
		const auto speed = num_conformers * 0.000001 * num_queries / runtime;
		cout
			<< local_time() << "Completed " << num_queries << " " << (num_queries == 1 ? "query" : "queries") << " in " << setprecision(3) << runtime << " seconds" << endl
			<< local_time() << "Screening speed was " << setprecision(0) << speed << " M conformers per second" << endl
		;
	}
}
示例#5
0
static bool find_files(std::string pathname, const std::string &filename, string_array &files, bool recursive)
{
   if (!pathname.empty())
   {
      char c = pathname[pathname.size() - 1];
      if ((c != ':') && (c != '\\') && (c != '/'))
         pathname += "\\";
   }

   WIN32_FIND_DATAA find_data;

   HANDLE findHandle = FindFirstFileA((pathname + filename).c_str(), &find_data);
   if (findHandle == INVALID_HANDLE_VALUE)
      return false;

   do
   {
      const bool is_directory = (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
      const bool is_system =  (find_data.dwFileAttributes & FILE_ATTRIBUTE_SYSTEM) != 0;
      const bool is_hidden =  (find_data.dwFileAttributes & FILE_ATTRIBUTE_HIDDEN) != 0;

      std::string filename(find_data.cFileName);

      if ((!is_directory) && (!is_system) && (!is_hidden))
         files.push_back(pathname + filename);

   } while (FindNextFileA(findHandle, &find_data));

   FindClose(findHandle);

   if (recursive)
   {
      string_array paths;

      HANDLE findHandle = FindFirstFileA((pathname + "*").c_str(), &find_data);
      if (findHandle == INVALID_HANDLE_VALUE)
         return false;

      do
      {
         const bool is_directory = (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
         const bool is_system =  (find_data.dwFileAttributes & FILE_ATTRIBUTE_SYSTEM) != 0;
         const bool is_hidden =  (find_data.dwFileAttributes & FILE_ATTRIBUTE_HIDDEN) != 0;

         std::string filename(find_data.cFileName);

         if ((is_directory) && (!is_hidden) && (!is_system))
            paths.push_back(filename);

      } while (FindNextFileA(findHandle, &find_data));

      FindClose(findHandle);

      for (uint i = 0; i < paths.size(); i++)
      {
         const std::string &path = paths[i];
         if (path[0] == '.')
            continue;

         if (!find_files(pathname + path, filename, files, true))
            return false;
      }
   }

   return true;
}
示例#6
0
// Parses the command line and runs the selected operation.
//
//   cmd_line           - arguments, taken by value; consumed options and the mode token are erased from this copy.
//   num_helper_threads - initial value of options.m_max_helper_threads ("-t<n>" overrides it).
//   lzham_dll          - codec interface forwarded to the compress/decompress/test helpers.
//
// Tokens starting with '-' are options. The first token that does not start with '-' must be a
// single-character mode: 'c' (compress), 'd' (decompress) or 'a' (all), case-insensitive. Parsing stops at
// the mode token, so everything still left in cmd_line afterwards is treated as filenames/directory.
// With an empty command line the usage text is printed and the result of simple_test() is returned.
//
// Returns EXIT_SUCCESS when the selected operation succeeds and EXIT_FAILURE on any parse error or
// operation failure.
int main_internal(string_array cmd_line, int num_helper_threads, ilzham &lzham_dll)
{
   comp_options options;
   options.m_max_helper_threads = num_helper_threads;

#ifdef _XBOX
   options.m_dict_size_log2 = 21;
#endif

   if (!cmd_line.size())
   {
      print_usage();
      return simple_test(lzham_dll, options);
   }

   enum op_mode_t
   {
      OP_MODE_INVALID = -1,
      OP_MODE_COMPRESS = 0,
      OP_MODE_DECOMPRESS = 1,
      OP_MODE_ALL = 2
   };

   op_mode_t op_mode = OP_MODE_INVALID;

   for (int i = 0; i < (int)cmd_line.size(); i++)
   {
      // NOTE: str refers into cmd_line and must not be used after the erase() calls below.
      const std::string &str = cmd_line[i];
      if (str[0] == '-')
      {
         if (str.size() < 2)
         {
            print_error("Invalid option: %s\n", str.c_str());
            return EXIT_FAILURE;
         }
         // tolower() is undefined for negative arguments other than EOF, so go through unsigned char
         // to stay well-defined for non-ASCII bytes when char is signed.
         switch (tolower(static_cast<unsigned char>(str[1])))
         {
            case 'u':
            {
               options.m_unbuffered_decompression = true;
               break;
            }
            case 'd':
            {
               // "-d<n>": dictionary size (log2), parsed from the text following the option letter.
               int dict_size = atoi(str.c_str() + 2);
               if ((dict_size < LZHAM_MIN_DICT_SIZE_LOG2) || (dict_size > LZHAMTEST_MAX_POSSIBLE_DICT_SIZE))
               {
                  print_error("Invalid dictionary size: %s\n", str.c_str());
                  return EXIT_FAILURE;
               }
               options.m_dict_size_log2 = dict_size;
               break;
            }
            case 'm':
            {
               // "-m<n>": compression level in [0, LZHAM_COMP_LEVEL_UBER].
               int comp_level = atoi(str.c_str() + 2);
               if ((comp_level < 0) || (comp_level > (int)LZHAM_COMP_LEVEL_UBER))
               {
                  print_error("Invalid compression level: %s\n", str.c_str());
                  return EXIT_FAILURE;
               }
               options.m_comp_level = static_cast<lzham_compress_level>(comp_level);
               break;
            }
            case 't':
            {
               // "-t<n>": number of helper threads in [0, LZHAM_MAX_HELPER_THREADS].
               int num_threads = atoi(str.c_str() + 2);
               if ((num_threads < 0) || (num_threads > LZHAM_MAX_HELPER_THREADS))
               {
                  print_error("Invalid number of helper threads: %s\n", str.c_str());
                  return EXIT_FAILURE;
               }
               options.m_max_helper_threads = num_threads;
               break;
            }
            case 'c':
            {
               options.m_compute_adler32_during_decomp = false;
               break;
            }
            case 'v':
            {
               options.m_verify_compressed_data = true;
               break;
            }
            case 'r':
            {
               options.m_randomize_params = true;
               break;
            }
            case 'p':
            {
               options.m_force_polar_codes = true;
               break;
            }
            case 'x':
            {
               options.m_extreme_parsing = true;
               break;
            }
            case 'e':
            {
               options.m_deterministic_parsing = true;
               break;
            }
            case 's':
            {
               // "-s<n>": seed the C runtime random number generator.
               int seed = atoi(str.c_str() + 2);
               srand(seed);
               printf("Using random seed: %i\n", seed);
               break;
            }
            default:
            {
               print_error("Invalid option: %s\n", str.c_str());
               return EXIT_FAILURE;
            }
         }

         // Remove the consumed option and revisit the same index, which now holds the next token.
         cmd_line.erase(cmd_line.begin() + i);
         i--;

         continue;
      }

      // First non-option token: it has to be the one-character mode.
      if (str.size() != 1)
      {
         print_error("Invalid mode: %s\n", str.c_str());
         return EXIT_FAILURE;
      }
      // Same unsigned char conversion as above to keep tolower() well-defined.
      switch (tolower(static_cast<unsigned char>(str[0])))
      {
         case 'c':
         {
            op_mode = OP_MODE_COMPRESS;
            break;
         }
         case 'd':
         {
            op_mode = OP_MODE_DECOMPRESS;
            break;
         }
         case 'a':
         {
            op_mode = OP_MODE_ALL;
            break;
         }
         default:
         {
            print_error("Invalid mode: %s\n", str.c_str());
            return EXIT_FAILURE;
         }
      }
      // Drop the mode token and stop parsing; the remaining tokens are the operands.
      cmd_line.erase(cmd_line.begin() + i);
      break;
   }

   if (op_mode == OP_MODE_INVALID)
   {
      print_error("No mode specified!\n");
      print_usage();
      return EXIT_FAILURE;
   }

   printf("Using options:\n");
   options.print();
   printf("\n");

   int exit_status = EXIT_FAILURE;

   switch (op_mode)
   {
      case OP_MODE_COMPRESS:
      {
         // Exactly two operands are required: source file and compressed output file.
         if (cmd_line.size() < 2)
         {
            print_error("Must specify input and output filenames!\n");
            return EXIT_FAILURE;
         }
         else if (cmd_line.size() > 2)
         {
            print_error("Too many filenames!\n");
            return EXIT_FAILURE;
         }

         const std::string &src_file = cmd_line[0];
         const std::string &cmp_file = cmd_line[1];

         bool comp_result = compress_streaming(lzham_dll, src_file.c_str(), cmp_file.c_str(), options);
         if (comp_result)
            exit_status = EXIT_SUCCESS;

         if ((comp_result) && (options.m_verify_compressed_data))
         {
            // Verification: decompress to a temporary file and compare it against the source.
            // NOTE: the temporary file is only removed when both steps succeed; the two failure
            // returns below leave it on disk.
            char decomp_file[256];

#ifdef _XBOX
            sprintf(decomp_file, "e:\\__decomp_temp_%u__.tmp", (uint)GetTickCount());
#else
            sprintf(decomp_file, "__decomp_temp_%u__.tmp", (uint)timer::get_ms());
#endif
            if (!decompress_file(lzham_dll, cmp_file.c_str(), decomp_file, options))
            {
               print_error("Failed decompressing file \"%s\" to \"%s\"\n", cmp_file.c_str(), decomp_file);
               return EXIT_FAILURE;
            }

            printf("Comparing file \"%s\" to \"%s\"\n", decomp_file, src_file.c_str());

            if (!compare_files(decomp_file, src_file.c_str()))
            {
               print_error("Failed comparing decompressed file data while compressing \"%s\" to \"%s\"\n", src_file.c_str(), cmp_file.c_str());
               return EXIT_FAILURE;
            }
            else
            {
               printf("Decompressed file compared OK to original file.\n");
            }

            remove(decomp_file);
         }

         break;
      }
      case OP_MODE_DECOMPRESS:
      {
         // Exactly two operands are required: compressed input file and output file.
         if (cmd_line.size() < 2)
         {
            print_error("Must specify input and output filenames!\n");
            return EXIT_FAILURE;
         }
         else if (cmd_line.size() > 2)
         {
            print_error("Too many filenames!\n");
            return EXIT_FAILURE;
         }
         if (decompress_file(lzham_dll, cmd_line[0].c_str(), cmd_line[1].c_str(), options))
            exit_status = EXIT_SUCCESS;
         break;
      }
      case OP_MODE_ALL:
      {
         // Exactly one operand is required: the directory handed to test_recursive().
         if (!cmd_line.size())
         {
            print_error("No directory specified!\n");
            return EXIT_FAILURE;
         }
         else if (cmd_line.size() != 1)
         {
            print_error("Too many filenames!\n");
            return EXIT_FAILURE;
         }
         if (test_recursive(lzham_dll, cmd_line[0].c_str(), options))
            exit_status = EXIT_SUCCESS;
         break;
      }
      default:
      {
         print_error("No mode specified!\n");
         print_usage();
         return EXIT_FAILURE;
      }
   }

   return exit_status;
}
示例#7
0
// Parses a command line into option/switch entries plus floating (positional) arguments.
//
// args[0] is skipped; the remaining tokens are first grouped into raw entries (an option name
// followed by the tokens after it), then each entry is checked with IsValid() and stored in
// m_cmdLineEntries. Tokens that do not belong to any option end up in m_cmdLineFloatingArguments.
//
// Returns false, after printing the error message, when an option is not recognized or has fewer
// than its minimum number of arguments. Also returns false, after printing the help text, when
// Found() reports "h" or "help". Returns true otherwise.
bool MyCmdLineParser::Parse(const string_array& args)
{
  // first parse the input command line into entries, don't care if they are valid or not
  // Entries are held by value so nothing has to be released by hand, even if an exception is thrown.
  vector<string_array> entries;
  string_array pureArgs;

  int argc = args.size();
  for ( int i = 1; i < argc; i++ )
  {
    string arg = args[i];
#ifdef Q_CYGWIN_WIN
    arg = MyUtils::CygwinPathProof(args[i].c_str()).toStdString();
#endif
    // hack to ignore mac os x bundle argument
    if ( arg.substr(0, 6) == "-psn_0" )
    {
      continue;
    }

    // A token opens a new option when it starts with '-' and is not a negative number such as
    // "-5" or "-.5". The length is checked first so arg[0]/arg[1] are only read when present.
    if ( arg.length() > 1 && arg[0] == '-'
         && !IsNumber( arg[1] ) && arg[1] != '.' )
    {
      // long name ("--name") drops both dashes, short name ("-name") drops one
      const bool bLongName = ( arg.length() > 2 && arg[1] == '-' );
      entries.push_back( string_array() );
      entries.back().push_back( arg.substr( bLongName ? 2 : 1 ) );
    }
    else if ( !entries.empty() )
    {
      // belongs to the most recently opened option
      entries.back().push_back( arg );
    }
    else
    {
      // appears before any option
      pureArgs.push_back( arg );
    }
  }

  //
  m_cmdLineEntries.clear();
  CmdLineEntry e;
  bool bSucceed = true;
  string error_msg = "";
  for ( size_t i = 0; i < entries.size(); i++ )
  {
    const string_array& strgs = entries[i];

    if ( !IsValid( strgs[0].c_str(), &e ) ) // && !IsValid( strgs[0].c_str() + 1, &e ) )
    {
      bSucceed = false;
      error_msg += "Option '" + strgs[0] + "' not recognized.";
      break;
    }
    if ( e.type == CMD_LINE_OPTION )
    {
      // The first maxArguments tokens belong to the option; any surplus becomes floating arguments.
      e.arguments.clear();
      for ( size_t j = 1; j < strgs.size(); j++ )
      {
        if ( j <= (size_t)e.maxArguments )
        {
          e.arguments.push_back( strgs[j] );
        }
        else
        {
          pureArgs.push_back( strgs[j] );
        }
      }
      if ( (int)e.arguments.size() < e.minArguments )
      {
        // Parsing continues so later entries are still recorded; the call fails at the end.
        bSucceed = false;
        //  cout << e.arguments.size() << " " << e.minArguments << "\n";
        error_msg += "Argument missing for option '" + strgs[0] + "'.";
      }
    }
    else if ( e.type == CMD_LINE_SWITCH )
    {
      // A switch takes no arguments, so every token after it is a floating argument.
      for ( size_t j = 1; j < strgs.size(); j++ )
      {
        pureArgs.push_back( strgs[j] );
      }
    }
    m_cmdLineEntries.push_back( e );
  }

  /*
  if ( bSucceed && (int)pureArgs.size() > m_nNumberOfPureArguments )
  {
    bSucceed = false;
    error_msg += "Option '" + pureArgs[0] + "' not recognized.";
  }
  */

  m_cmdLineFloatingArguments = pureArgs;

  if ( !bSucceed )
  {
    PrintErrorMessage( error_msg );
  }
  else if ( Found( "h" ) || Found( "help" ) )
  {
    PrintHelp();
    bSucceed = false;
  }

  return bSucceed;
}