Example #1
0
/**
 * Merge the DockResultFiles listed in @p names into one DockResultFile.
 *
 * Result sections of all inputs are collected first; sections that share the
 * same Result::Method are combined with operator+=.
 *
 * If @p e_property is empty, all molecules are copied to the output in input
 * order. Otherwise the molecules are ranked by their score property, molecules
 * with a score outside [@p score_cuton, @p score_cutoff] or without the property
 * are dropped, and at most @p best_k molecules with the lowest scores are written.
 *
 * @param names         input file names
 * @param output_file   name of the file to create
 * @param best_k        maximal number of molecules kept when sorting
 * @param e_property    in/out: if non-empty and at least one Result was found,
 *                      it is overwritten with "score_" + method string of the
 *                      last collected Result
 * @param score_cutoff  molecules with a larger score are discarded (sorting only)
 * @param score_cuton   molecules with a smaller score are discarded (sorting only)
 */
void mergeDRFiles(vector<String>& names, string& output_file, Size& best_k, string& e_property, double& score_cutoff, double& score_cuton)
{
	DockResultFile* output = new DockResultFile(output_file, ios::out);
	const bool sort_by_scores = (e_property != "");

	vector<Result*> new_results;

	/// First of all, copy Result data
	map<Result::Method, Result*> result_map;
	for (Size file = 0; file < names.size(); file++)
	{
		DockResultFile* input = new DockResultFile(names[file]);

		const vector<Result*>* results = input->getResults();
		for (Size i = 0; i < results->size(); i++)
		{
			map<Result::Method, Result*>::iterator it = result_map.find((*results)[i]->getMethod());
			if (it == result_map.end())
			{
				// First Result seen for this method: keep a private copy.
				Result* result_copy = new Result(*(*results)[i]);
				if (!sort_by_scores) output->addResult(result_copy);
				else new_results.push_back(result_copy);
				result_map.insert(make_pair(result_copy->getMethod(), result_copy));
			}
			else
			{
				// Method already known: merge into the existing copy.
				*it->second += *(*results)[i];
			}
		}

		input->close();
		delete input;
	}

	// The score property is named after the method of the last collected Result.
	// new_results stays empty if no input file contains a Result section; in that
	// case the property name given by the caller is used unchanged instead of
	// calling back() on an empty vector.
	if (sort_by_scores && !new_results.empty())
	{
		e_property = "score_"+new_results.back()->getMethodString();
	}

	/// If no sorting is desired, iterate over all input-files and write each input-molecules to output-file
	if (!sort_by_scores)
	{
		output->disableAutomaticResultCreation();

		for (Size file = 0; file < names.size(); file++)
		{
			GenericMolFile* input = MolFileFactory::open(names[file]);

			int mol_no = 0;
			for (Molecule* mol = input->read(); mol; mol = input->read(), mol_no++)
			{
				*output << *mol;
				delete mol;
				Log.level(20) << "\r" << names[file] << " : " << mol_no+1;
				Log.flush();
			}
			Log.level(20)<<endl;
			Log.flush();

			input->close();
			delete input;
		}
	}

	/// If sorting is desired, iterate over all input-files and save each input-molecules to a map.
	/// Then write all FlexibleMolecules in this map to the output file and adapt the Result objects.
	else
	{
		multimap < double, FlexibleMolecule* > compounds; // map containing score and conformation-ID
		set < String > IDs; // IDs of the base-conformations

		for (Size file = 0; file < names.size(); file++)
		{
			DockResultFile* input = new DockResultFile(names[file]);
			int mol_no = 0;
			for (Molecule* mol = input->read(); mol; mol = input->read(), mol_no++)
			{
				if (!mol->hasProperty(e_property))
				{
					Log.level(10) << "Compound " << mol->getName() << " in file " << names[file] << " has no score property. Skipping this compound." << endl;
					for (Size i = 0; i < new_results.size(); i++)
					{
						new_results[i]->erase(input->getCurrentLigand());
					}
					delete mol;
					continue;
				}

				double score = ((String)mol->getProperty(e_property).toString()).toFloat();

				// Discard molecules outside the requested score window.
				if (score > score_cutoff || score < score_cuton)
				{
					for (Size i = 0; i < new_results.size(); i++)
					{
						new_results[i]->erase(input->getCurrentLigand());
					}
					delete mol;
					continue;
				}

				// Keep the molecule if there is still room, or if it beats the currently
				// worst kept score. The emptiness check avoids dereferencing rbegin() of
				// an empty map when best_k is 0.
				if (compounds.size() < best_k || (!compounds.empty() && score < compounds.rbegin()->first))
				{
					// NOTE(review): 'mol' is not freed on this path although every other
					// path deletes it. This looks like a leak, but whether the copied
					// FlexibleMolecule is independent of 'mol' cannot be seen here - confirm.
					FlexibleMolecule* flexmol_copy = new FlexibleMolecule(*input->getCurrentLigand());
					compounds.insert(make_pair(score, flexmol_copy));
					IDs.insert(flexmol_copy->getId());

					if (compounds.size() > best_k)
					{
						// Evict the entry with the largest score.
						multimap<double, FlexibleMolecule*>::iterator worst = compounds.end();
						--worst;
						for (Size i = 0; i < new_results.size(); i++)
						{
							new_results[i]->erase(worst->second);
						}
						IDs.erase(worst->second->getId());
						delete worst->second;
						compounds.erase(worst);
					}
				}
				else
				{
					for (Size i = 0; i < new_results.size(); i++)
					{
						new_results[i]->erase(input->getCurrentLigand());
					}
					delete mol;
				}
				Log.level(20) <<"\r"<<names[file]<<" : "<<mol_no+1<<flush;
			}
			Log.level(20)<<endl;

			input->close();
			delete input;
		}

		if (compounds.size() < best_k)
		{
			Log.level(20)<<"found "<<compounds.size()<<" compounds matching the given criteria."<<endl;
		}

		// IDs of the written ligands in ascending score order.
		list<String> score_list;

		for (multimap < double, FlexibleMolecule* > ::iterator it = compounds.begin();
			it!=compounds.end(); it++)
		{
			output->writeLigand(it->second);
			score_list.push_back(it->second->getId());
			delete it->second;
		}

		// Remove those ligands from results for which no final result is available (e.g. due to missing atom parameters)
		// Nothing to prune if no Result section was found at all.
		if (!new_results.empty())
		{
			vector < String > import_IDs = *new_results[0]->getInputConformations();
			for (Size i = 0; i < import_IDs.size(); i++)
			{
				if (IDs.find(import_IDs[i]) == IDs.end())
				{
					// Follow the chain of derived conformations through all Results
					// and erase every entry that descends from the dropped input.
					vector<vector<Result::ResultData> > data_list;
					for (Size k = 0; k < new_results.size(); k++)
					{
						if (k == 0)
						{
							data_list.push_back(*new_results[k]->get(import_IDs[i]));
						}

						vector<vector<Result::ResultData> > new_data_list;
						for (Size j = 0; j < data_list.size(); j++)
						{
							for (Size l = 0; l < data_list[j].size(); l++)
							{
								String ID = data_list[j][l].getLigandConformationId();
								new_data_list.push_back(*new_results[k]->get(ID));
								new_results[k]->erase(ID);
							}
						}
						data_list = new_data_list;
					}
				}
			}
		}

		// Re-order every Result according to the score ranking; each Result hands
		// the list used for the next one back through its second argument.
		for (Size i = 0; i < new_results.size(); i++)
		{
			list<String> new_list;
			new_results[i]->sort(score_list, new_list);
			score_list = new_list;
		}

		output->writeResults(new_results);
	}

	output->close();
	delete output;
}
Example #2
0
/**
 * Read all molecules from the files in @p names, rank them by the score stored
 * in property @p e_property and write the best ones to @p output_file.
 *
 * Molecules without the property, or with a score outside
 * [@p score_cuton, @p score_cutoff], are skipped. At most @p best_k molecules
 * with the lowest scores are kept and written in ascending score order.
 * Input files that cannot be opened are reported and skipped.
 *
 * @param names         input file names
 * @param output_file   name of the file to create
 * @param best_k        maximal number of molecules to write
 * @param e_property    name of the property holding the score
 * @param score_cutoff  molecules with a larger score are discarded
 * @param score_cuton   molecules with a smaller score are discarded
 */
void sortMolecules(vector<String>& names, string& output_file, Size& best_k, string& e_property, double& score_cutoff, double& score_cuton)
{
	multimap<double, Molecule*> compounds;

	for (Size file = 0; file < names.size(); file++)
	{
		GenericMolFile* input = MolFileFactory::open(names[file]);
		if (!input)
		{
			Log.error() << "[Error:] Could not open input file '" << names[file] << "'. Skipping this file." << endl;
			continue;
		}

		int mol_no = 0;
		for (Molecule* mol = input->read(); mol; mol = input->read(), mol_no++)
		{
			if (!mol->hasProperty(e_property))
			{
				Log.level(10) << "Compound " << mol->getName() << " in file " << names[file] << " has no score property. Skipping this compound." << endl;
				delete mol;
				continue;
			}

			double score = ((String)mol->getProperty(e_property).toString()).toFloat();

			// Discard molecules outside the requested score window.
			if (score > score_cutoff || score < score_cuton)
			{
				delete mol;
				continue;
			}

			// Keep the molecule if there is still room, or if it beats the currently
			// worst kept score. The emptiness check avoids dereferencing rbegin() of
			// an empty map when best_k is 0.
			if (compounds.size() < best_k || (!compounds.empty() && score < compounds.rbegin()->first))
			{
				compounds.insert(make_pair(score, mol));

				if (compounds.size() > best_k)
				{
					// Evict the entry with the largest score.
					multimap<double, Molecule*>::iterator worst = compounds.end();
					--worst;
					delete worst->second;
					compounds.erase(worst);
				}
			}
			else
			{
				delete mol;
			}
			Log.level(20) << "\r" << names[file] << " : " << mol_no+1 << flush;
		}
		Log.level(20) << endl;
		Log.flush();

		input->close();
		delete input;
	}

	if (compounds.size() < best_k)
	{
		Log.level(20) << "found " << compounds.size() << " compounds matching the given criteria." << endl;
	}

	GenericMolFile* output = MolFileFactory::open(output_file, ios::out, "mol2.gz");
	if (!output)
	{
		Log.error() << "[Error:] Could not open output file '" << output_file << "'." << endl;
		// Release the molecules that can no longer be written.
		for (multimap<double, Molecule*>::iterator it = compounds.begin(); it != compounds.end(); ++it)
		{
			delete it->second;
		}
		return;
	}

	// multimap iteration order is ascending by key, i.e. best score first.
	for (multimap < double, Molecule* > ::iterator it = compounds.begin();
		it!=compounds.end(); ++it)
	{
		*output << *it->second;
		delete it->second;
	}

	output->close();
	delete output;
}
Example #3
0
/**
 * Score, rescore or collect training data for every molecule read from @p input.
 *
 * Per molecule:
 *  - molecules whose "score_ligcheck" property is below 0.95 are skipped;
 *  - without @p rescoring, or in training mode, the molecule is scored with @p scoring_function;
 *  - with @p rescoring and @p train, score contributions are added to the training set
 *    (unless the score exceeds 1000);
 *  - with @p rescoring and without @p train, the score is taken from rescoring->rescore().
 *
 * Molecules with a score below @p output_score_threshold are written to @p output
 * with property "re-score", except in training mode.
 *
 * If @p ignore_top is set, molecules whose docking score (property @p dock_score_label)
 * is below @p min_dock_score keep their docking score. Writing is then deferred to a
 * second pass over @p input, in which all rescored values are shifted so that they
 * rank behind the retained docking scores.
 *
 * This function deletes @p input and @p output before returning.
 *
 * @param scoring_function        scoring function applied to each molecule
 * @param sp                      preparer applied to each molecule before scoring
 * @param par_file                parameter file handed to sp->prepare()
 * @param rescoring               rescoring model, or 0 for plain scoring
 * @param train                   collect training data instead of writing output
 * @param min_dock_score          docking scores below this value are kept (ignore_top only)
 * @param dock_score_label        name of the property holding the docking score
 * @param input                   molecules to process; deleted by this function
 * @param output                  destination file, may be 0 in training mode; deleted by this function
 * @param output_score_threshold  only molecules with a smaller score are written
 * @param ignore_top              keep docking scores of the top fraction (see above)
 */
void processMultiMoleculeFile(ScoringFunction* scoring_function, StructurePreparer* sp, String par_file, Rescoring* rescoring, bool train, double min_dock_score, String dock_score_label, GenericMolFile* input, GenericMolFile* output, double output_score_threshold, bool ignore_top)
{
	if (!input)
	{
		cerr<<"[Error:] No input file available, nothing to process!"<<endl;
		if (output) delete output;
		return;
	}

	// One entry per molecule that passed the LigCheck filter, in input order:
	// (score, true) for rescored molecules, (docking score, false) for retained ones.
	list<pair<double, bool> > rescore_list;
	double min_rescore = 1e12;
	int i = 1;

	for (Molecule* mol = input->read(); mol; mol = input->read(), i++)
	{
		String name = mol->getName();
		Log<<"====== Ligand "<<i;
		if (name != "") Log<<", "<<name;
		Log<<" ============"<<endl;
		Log.flush();

		if (mol->hasProperty("score_ligcheck"))
		{
			double score_ligcheck = ((String)mol->getProperty("score_ligcheck").toString()).toDouble();
			if (score_ligcheck < 0.95) // 0 = error, 1 = check passed
			{
				cout<<"Skipping compound because it has been marked as containing errors by LigCheck."<<endl;
				delete mol;
				continue;
			}
		}

		double score = 0;
		bool failed = false;   // an exception occurred while processing this molecule
		bool recorded = false; // an entry for this molecule was appended to rescore_list
		try
		{
			sp->prepare(mol, par_file);
			if (!rescoring || train)
			{
				scoring_function->setLigand(mol);
				scoring_function->update();
				score = scoring_function->updateScore();
				scoring_function->printResult();
			}
			if (rescoring)
			{
				if (train)
				{
					if (score > 1000)
					{
						// No rescore_list entry is needed here: the second pass
						// below never runs in training mode.
						Log.level(10)<<"score>1000, thus current compound is not added to training data set."<<endl;
						delete mol;
						continue;
					}
					rescoring->addScoreContributions(mol);
				}
				else score = rescoring->rescore(mol);
			}

			if (score < output_score_threshold)
			{
				if (ignore_top)
				{
					bool keep_dock_score = false;
					// Check for the same property that is read below.
					if (mol->hasProperty(dock_score_label))
					{
						double docking_score = mol->getProperty(dock_score_label).toString().toDouble();
						if (docking_score < min_dock_score)
						{
							rescore_list.push_back(make_pair(docking_score, false));
							keep_dock_score = true;
						}
					}
					if (!keep_dock_score)
					{
						rescore_list.push_back(make_pair(score, true));
						if (score < min_rescore)
						{
							min_rescore = score;
						}
					}
					recorded = true;
				}
				else
				{
					mol->setProperty("re-score", score);
				}
			}
		}
		catch (const BALL::Exception::GeneralException& e)
		{
			cout<<e.getMessage()<<endl;
			cout<<"Error! Skipping this molecule!"<<endl;
			failed = true;
		}

		// The second pass advances through rescore_list once per molecule, so
		// every molecule needs an entry. Molecules that failed or did not pass
		// the threshold get a placeholder that can never pass the threshold
		// check of the second pass (value == threshold, no offset applied).
		if (ignore_top && !recorded)
		{
			rescore_list.push_back(make_pair(output_score_threshold, false));
		}

		// Molecules that raised an exception are really skipped, as announced above.
		if (!failed && score < output_score_threshold && (!rescoring || !train) && !ignore_top)
		{
			*output << *mol;
		}

		delete mol;
	}

	/** If ignoring (i.e. not rescoring) the top fraction of docking-results, we need to make sure
	 that all rescored compounds get a rescore-value larger than that of the former. */
	if (ignore_top && !train)
	{
		input->reopen();
		double offset = min_dock_score-min_rescore+0.25;

		list<pair<double, bool> >::iterator rescore_it = rescore_list.begin();
		for (Molecule* mol = input->read(); mol; mol = input->read())
		{
			// Apply the same LigCheck filter as in the first pass so that the
			// molecules line up with the entries of rescore_list.
			if (mol->hasProperty("score_ligcheck"))
			{
				double score_ligcheck = ((String)mol->getProperty("score_ligcheck").toString()).toDouble();
				if (score_ligcheck < 0.95) // 0 = error, 1 = check passed
				{
					delete mol;
					continue;
				}
			}

			// Defensive: never read past the recorded scores.
			if (rescore_it == rescore_list.end())
			{
				delete mol;
				break;
			}

			double rescore = rescore_it->first;
			if (rescore_it->second) // if compound was rescored
			{
				rescore += offset;
			}
			if (rescore < output_score_threshold)
			{
				mol->setProperty("re-score", rescore);
				*output << *mol;
			}
			delete mol;
			++rescore_it;
		}
	}

	delete input;
	if (output) delete output;
}
Example #4
0
/**
 * Driver shared by the rescoring tools: sets up structure preparation, the
 * scoring function and (optionally) a rescoring model from the command line
 * and an optional ini-file, then processes the input molecule file.
 *
 * Modes:
 *  - @p simple_rescoring: molecules are only scored with the scoring function
 *    (@p train is forced to false);
 *  - otherwise with @p train: a rescoring model is trained and saved to the
 *    file given by parameter "o";
 *  - otherwise: the model given by "mod" is loaded, the molecules are rescored
 *    and quality statistics are printed.
 *
 * If parameter "write_ini" is given, only an ini-file with default values is
 * written.
 *
 * @param par               parsed command line
 * @param simple_rescoring  score only, without a rescoring model
 * @param train             train a rescoring model instead of applying one
 * @return 0 on success, 1 on error (an unknown rescoring method terminates via exit(1))
 */
int runRescoring(CommandlineParser& par, bool simple_rescoring, bool train)
{
	// just to make sure ...
	if (simple_rescoring) train = false;


	/** If desired, write ini-file with default parameters and abort */

	String default_inifile = par.get("write_ini");
	if (default_inifile != CommandlineParser::NOT_FOUND)
	{
		// if ini-file already exists, read its entries first
		Options default_options;
		ScoringFunction::getDefaultOptions(default_options);
		list<Constraint*> clist;
		if (ifstream(default_inifile.c_str()))
		{
			DockingAlgorithm::readOptionFile(default_inifile, default_options, clist);
		}

		Options* scoring_options = default_options.getSubcategory("Scoring Function");
		// Same fallback as used for the regular options below.
		if (!scoring_options) scoring_options = &default_options;
		scoring_options->setDefault("scoring_type", "MM");
		scoring_options->setDefault("nonbonded_cutoff_precalculation", scoring_options->get("nonbonded_cutoff"));
		scoring_options->set("nonbonded_cutoff", 3.0);
		DockingAlgorithm::writeOptionFile(par.get("write_ini"), default_options, clist);
		Log << "Ini-file w/ default values has been written to file '"<<default_inifile<<"'. Goodbye!"<<endl;
		return 0;
	}


	/** Fetch information about specification of desired rescoring approach */

	String method = "";
	String free_energy_label = "";
	String model_file = "";
	String scoring_type = "MM";
	String grid_file = "";
	Options option;
	par.copyAdvancedParametersToOptions(option);
	Options* option_category = option.getSubcategory("Scoring Function");
	if (!option_category) option_category = &option;

	list<Constraint*> constraints;
	if (par.get(DockingAlgorithm::OPTION_FILE_PARAMETER_NAME) != CommandlineParser::NOT_FOUND)
	{
		DockingAlgorithm::readOptionFile(par.get(DockingAlgorithm::OPTION_FILE_PARAMETER_NAME), option, constraints);
		scoring_type = option_category->setDefault("scoring_function", "MM");
		grid_file = option_category->setDefault("grid_file", "");
	}
	// Overload ini-file settings with values taken from command-line (if any)
	if (par.get("function") != CommandlineParser::NOT_FOUND)
	{
		scoring_type = par.get("function");
	}

	method = par.get("method");
	free_energy_label = par.get("exp");
	if (par.get("mod") != CommandlineParser::NOT_FOUND)
	{
		model_file = par.get("mod");
	}

	if (!simple_rescoring && method != "Rescoring3D" && method != "Rescoring4D" && method != "Rescoring1D")
	{
		cerr << "[Error:] Type of desired rescoring method unknown: available are 'Rescoring3D', 'Rescoring4D' and 'Rescoring1D'." << endl;
		exit(1);
	}


	/** Setup StructurePreparer, Rescoring and ScoringFunction  */

	//Log.setMinLevel(cout, 39);

	GenericMolFile* ref_ligand_file = MolFileFactory::open(par.get("rl"));
	if (!ref_ligand_file)
	{
		cerr<<"[Error:] Format of reference-ligand file not supported!"<<endl;
		return 1;
	}
	Molecule* ref_ligand = ref_ligand_file->read();
	ref_ligand_file->close();
	delete ref_ligand_file;
	if (!ref_ligand)
	{
		cerr<<"[Error:] Could not read a molecule from the reference-ligand file!"<<endl;
		return 1;
	}

	StructurePreparer* sp;
	if (scoring_type.hasSubstring("PLP"))
	{
		sp = new StructurePreparer("PLP");
	}
	else
	{
		sp = new StructurePreparer;
	}

	String par_file = option_category->get("filename");
	if (par_file == "") par_file="Amber/amber96-docking.ini";
	System receptor;
	GenericMolFile* receptor_file = MolFileFactory::open(par.get("rec"));
	if (!receptor_file)
	{
		cerr<<"Format of receptor-file not supported!"<<endl;
		return 1;
	}

	*receptor_file >> receptor;
	delete receptor_file;
	sp->prepare(&receptor, par_file);
	sp->prepare(ref_ligand, par_file);

	ScoringFunction* scoring_function;
	if (scoring_type == "MM" || scoring_type == "PB")
	{
		// Support for using one and the same config-file for docking and rescoring
		String precalc_nonbonded_cuttoff = option_category->get("nonbonded_cutoff_precalculation");
		if (precalc_nonbonded_cuttoff != "")
		{
			option_category->set("nonbonded_cutoff", precalc_nonbonded_cuttoff);
		}
	}
	if (scoring_type == "MM")
	{
		scoring_function = new MMScoring(receptor, *ref_ligand, option);
	}
	else if (scoring_type == "GridedMM")
	{
		scoring_function = new GridedMM(receptor, *ref_ligand, option);
	}
	else if (scoring_type == "GridedPLP")
	{
		scoring_function = new GridedPLP(receptor, *ref_ligand, option);
	}
	else if (scoring_type == "PLP")
	{
		scoring_function = new PLPScoring(receptor, *ref_ligand, option);
	}
	else if (scoring_type == "PB")
	{
		scoring_function = new PBScoring(receptor, *ref_ligand, option);
	}
	else
	{
		String mess="ScoringFunction type \'"+scoring_type+"\' unknown/unsupported!";
		cerr<<"[Error:] "<<mess<<endl;
		return 1;
	}

	GridBasedScoring* gbs = dynamic_cast<GridBasedScoring*>(scoring_function);
	for (list < Constraint* > ::iterator it = constraints.begin(); it != constraints.end(); it++)
	{
		scoring_function->constraints.push_back(*it);
		(*it)->setScoringFunction(scoring_function);
	}

	cout<<endl<<"-----------------------------------------"<<endl;
	cout<<"Scores will be calculated as : "<<scoring_function->getEquation()<<endl;
	cout<<"-----------------------------------------"<<endl<<endl;

	if (gbs != NULL)
	{
		gbs->replaceGridSetFromFile(grid_file);
	}

	Rescoring* rescoring = 0;

	if (!simple_rescoring)
	{
		if (!train && free_energy_label == "")
		{
			cerr<<"[Error:] free-energy label must be specified !"<<endl;
			return 1;
		}
		if (method == "Rescoring3D")
		{
			rescoring = new Rescoring3D(receptor, *ref_ligand, option, free_energy_label, scoring_function);
		}
		else if (method == "Rescoring4D")
		{
			rescoring = new Rescoring4D(receptor, *ref_ligand, option, free_energy_label, scoring_function);
		}
		else if (method == "Rescoring1D")
		{
			rescoring = new Rescoring1D(receptor, *ref_ligand, option, free_energy_label, scoring_function);
		}
		else
		{
			cerr<<"[Error:] Rescoring-method unknown !"<<endl;
			return 1;
		}
		if (!train)
		{
			rescoring->loadModel(model_file);
		}
		scoring_function->enableStoreInteractions();
	}

	scoring_function->setLigand(ref_ligand);
	scoring_function->update();
	scoring_function->updateScore();
	Log<<"====== Reference ligand ============"<<endl;
	scoring_function->printResult();


	/** If top fraction of docking results should not be rescored, then fetch scores and compute threshold for this fraction */

	bool ignore_top = false;
	double ignore_top_fraction = 0.0;
	if (par.has("tf"))
	{
		ignore_top_fraction = par.get("tf").toDouble();
		// Only fractions in (0, 1] are meaningful.
		if (ignore_top_fraction < 1e-14 || ignore_top_fraction > 1)
		{
			ignore_top_fraction = 0.0;
			ignore_top = false;
		}
		else
		{
			ignore_top = true;
		}
	}
	double min_dock_score = -1e100;
	if (ignore_top)
	{
		set<double> scores;
		GenericMolFile* score_input = MolFileFactory::open(par.get("i"));
		if (!score_input)
		{
			cerr<<"[Error:] Format of input-file not supported!"<<endl;
			return 1;
		}
		for (Molecule* mol = score_input->read(); mol; delete mol, mol = score_input->read())
		{
			if (mol->hasProperty("score"))
			{
				scores.insert(mol->getProperty("score").toString().toDouble());
			}
		}
		delete score_input;

		if (scores.empty())
		{
			// Without docking scores there is no top fraction to protect.
			Log.level(10)<<"No docking scores found in input file, thus no compounds are excluded from rescoring."<<endl;
			ignore_top = false;
		}
		else
		{
			// Walk to the score at the requested quantile. The index is clamped to
			// the last element so that a fraction of 1 does not dereference end().
			Size max = scores.size()*ignore_top_fraction;
			if (max >= scores.size()) max = scores.size()-1;
			set<double>::iterator s_it = scores.begin();
			for (Size i = 0; i < max; i++)
			{
				++s_it;
			}
			min_dock_score = *s_it;
		}
	}


	/** Now, Rescore entire sd-/mol2-file   */

	double threshold = option.setDefaultReal("output_score_threshold", 1e100);

	GenericMolFile* input = MolFileFactory::open(par.get("i"));
	if (!input)
	{
		cerr<<"[Error:] Format of input-file not supported!"<<endl;
		return 1;
	}
	GenericMolFile* output = 0;

	if (simple_rescoring || !train)
	{
		output = MolFileFactory::open(par.get("o"), ios::out, input);

		DockResultFile* drf_output = dynamic_cast<DockResultFile*>(output);
		if (drf_output)
		{
			String dummy = "0";
			BALL::Docking::Result::Method result_method = Result::getMethod(3);
			// 'rescoring' is 0 in simple-rescoring mode; describe the result by
			// the scoring function alone in that case.
			String description = scoring_function->getName();
			if (rescoring)
			{
				description = rescoring->getName()+"+"+scoring_function->getName();
			}
			drf_output->setOutputParameters(result_method, "re-score", dummy, description);
		}
	}

	// Do the actual work ..
	// (processMultiMoleculeFile deletes 'input' and 'output'.)
	processMultiMoleculeFile(scoring_function, sp, par_file, rescoring, train, min_dock_score, "score", input, output, threshold, ignore_top);

	if(!simple_rescoring)
	{
		if (train)
		{
			rescoring->recalibrate();
			rescoring->saveModel(par.get("o"));
		}
		else
		{
			// Not named 'stderr': that identifier is a macro of <cstdio>.
			double correlation, q2, std_error;
			rescoring->calculateQuality(correlation, q2, std_error);
			cout<<"Correlation = "<<String(correlation)<<endl;
			cout<<"Q2="<<String(q2)<<endl;
			cout<<"Standard error="<<String(std_error)<<endl;
		}
	}

	for (list<Constraint*>::iterator it = constraints.begin(); it != constraints.end(); it++)
	{
		delete *it;
	}

	// NOTE(review): scoring_function is never deleted. Whether it may be freed
	// here depends on the ownership of its constraints and on Rescoring - confirm.
	delete rescoring;
	delete sp;
	delete ref_ligand;

	return 0;
}
Example #5
0
int main(int argc, char* argv[])
{
	CommandlineParser parpars("BindingDBCleaner", "fix bindingdb.org downloads", VERSION, String(__DATE__), "Preparation");
	parpars.registerParameter("i", "input file", INFILE, true);
	parpars.registerParameter("type", "type of contained activity values: 'Ki' or 'IC50'",STRING, true);
	parpars.registerParameter("o", "output file", OUTFILE, true);
	parpars.registerParameter("target", "binding-DB target name", STRING, true);
	String manual = "This tool cleans up the sd-properties contained in sd-files downloaded from bindingdb.org.\n\nFor all compounds in the input file, the affinity value for the specified target is searched and retained but all other properties are removed. Furthermore, the IC50 or Ki value of each compound is converted to a binding-free-energy value in units of [kJ/mol] that is added as a property-tag named 'binding_free_energy'.\n\nAll compounds in the input file for which no IC50 resp. Ki value for the specified target can found, are ignored and not written to the output file.";
	parpars.setToolManual(manual);
	list<String> slist;
	slist.push_back("IC50");
	slist.push_back("Ki");
	parpars.setParameterRestrictions("type", slist);
	parpars.setSupportedFormats("i","mol2,sdf,drf");
	parpars.setSupportedFormats("o","mol2,sdf,drf");
	parpars.setOutputFormatSource("o","i");
	parpars.parse(argc, argv);

	GenericMolFile* input = MolFileFactory::open(parpars.get("i"));
	GenericMolFile* output = MolFileFactory::open(parpars.get("o"), ios::out, "mol2.gz");
	String target_name = parpars.get("target");
	target_name.trim();

	String type = parpars.get("type");
	bool use_IC50 = (type == "IC50");
	bool use_Ki = (type == "Ki");
	String response_property = "Enzymologic: IC50 nM";
	if (use_Ki) response_property = "Enzymologic: Ki nM";

	if (!use_IC50 && !use_Ki)
	{
		Log.error() << "[Error:] Please set parameter 'type' to either 'IC50' or 'Ki'!" << endl;
		return 1;
	}

	Size no_mols_found = 0;
	Size no_acitivities_found = 0;
	Size total = 1;
	Size no_invalid = 0;
	for (Molecule* mol = input->read(); mol; mol = input->read(), total++)
	{
        bool found = false;
        int prop_id = 0;
		int target_no = 1;

		for (NamedPropertyIterator it = mol->beginNamedProperty();
			it!=mol->endNamedProperty(); it++)
		{
			String prop = "TARGET Biomolecule "+String(target_no);
			if (it->getName() == prop)
			{
				String resp = response_property+" "+String(target_no);

				if (it->toString().trim() == target_name
					&& mol->hasProperty(resp))
				{
					String resp_value = String(mol->getProperty(resp).toString()).trim();

					if (resp_value == "n/a")
					{
						target_no++;
						continue;
					}

					for (NamedPropertyIterator it2 = mol->beginNamedProperty(); it2 != mol->endNamedProperty(); it2++)
					{
						it2->clear();
					}

					if (resp_value.hasPrefix(">"))
					{
						resp_value = resp_value.after(">");
					}
					if (resp_value.hasPrefix("<"))
					{
						resp_value = resp_value.after("<");
					}

					mol->setProperty("Target", target_name);
					mol->setProperty(response_property, resp_value);
					double value = resp_value.toFloat();
					if (use_IC50)
					{
						String name = "Enzymologic: pIC50 nM";
                        if (prop_id > 0)
                        {
                            name += " "+String(prop_id);
                        }
						mol->setProperty(name, -log10(value));
					}
					else if (use_Ki)
					{
						String name = "Enzymologic: pKi nM";
                        if (prop_id > 0)
                        {
                            name += " " + String(prop_id);
                        }
						mol->setProperty(name, -log10(value));
					}

					String name = "binding_free_energy";
                    if (prop_id > 0)
                    {
                        name += " " + String(prop_id);
                    }
					float free_energy = 1.987*298.15*log(1e-09*value)*4.184/1000;
					mol->setProperty(name, free_energy);

					// remove compounds with completely senseless binding-affinity data
					if (free_energy > -125 && free_energy < 0 && !BALL::Maths::isNan(free_energy) && BALL::Maths::isFinite(free_energy))
					{
                        found = true;
                        prop_id++;
						no_acitivities_found++;
					}
					break;
				}
				target_no++;
			}
		}

		if (found)
		{
			*output << *mol;
			no_mols_found++;
		}
		delete mol;
	}

	if (no_invalid > 0)
	{
		Log << "[Warning:] Ignored "<<no_invalid<<" compounds due to invalid free-energy (<-125 or >0) !" <<endl;
	}

	Log.level(20) << "\rFound " << no_acitivities_found << " activities measurements for " << no_mols_found << " molecules and saved them to " << parpars.get("o") << endl;

	delete input;
	delete output;
}
Example #6
0
int main(int argc, char* argv[])
{
  CommandlineParser parpars("LigandFileSplitter", "split molecule files", VERSION, String(__DATE__), "Preparation");
	parpars.registerParameter("i", "input molecule file", INFILE, true);
  parpars.registerParameter("no", "Number of output files to be created", BALL::INT, false);
  parpars.registerParameter("mpf", "Number of molecules per output file", BALL::INT, false);
	parpars.registerParameter("outname_pattern", "Pattern that will be used to generate the names of the output files, see notes and examples below.", BALL::STRING, false);
	parpars.registerParameter("o", "Output filenames. If none are specified, input filename postfixed with IDs will be used", OUTFILELIST, false);
  
	String man =
	"LigandFileSplitter splits a molecule file into a given number of subsets.\n\n"

	"Examples:\n\n"

  "$ LigandFileSplitter -i Trypsin_actives.sdf -o batch_1 batch_2\n"
	"  will split the input file Trypsin_actives.sdf in the two output files batch_1.sdf and batch_2.sdf.\n\n"
	
  "$ LigandFileSplitter -i Trypsin_actives.sdf -no 3\n"
	"  will split the input file Trypsin_actives.sdf in three files named Trypsin_actives_0.sdf, Trypsin_actives_1.sdf and Trypsin_actives_2.sdf\n\n"
	
  "$ LigandFileSplitter -i ligands.sdf -ligands_per_file 4\n"
	"  will split the input file ligands.sdf in as many files needed to fit at most 4 ligands per file.\n"
	"  The files will be named ligands_0.sdf, ligands_1.sdf ... ligands_N.sdf\n\n"
	
  "$ LigandFileSplitter -i ligands.sdf -ligands_per_file 5 -outname_pattern split_ligands-%d.sdf\n"
	"  will split the input file ligands.sdf in as many files needed to fit at most 5 ligands per file.\n"
  "  The files will be named split_ligands-0.sdf, split_ligands-1.sdf, ... , split_ligands-N.sdf.\n\n"
			  
  "$ LigandFileSplitter -i ligands.sdf -outname_pattern split_ligands_%d.sdf -no 100\n"
	"  will split the input file ligands.sdf in 100 files using the following names:\n"
	"  split_ligands_0.sdf, split_ligands_1.sdf, ... , split_ligands_99.sdf.\n\n"
			
  "NOTES:\n"
	"- Molecules are not sorted in any way.\n"
	"- The tool is no format converter and the format of the output files will be the same as of the input file.\n"
	"- Output_name_pattern accepts a printf-like pattern, expecting exactly one decimal integer placeholder, %d.\n"
	"- The following are valid patterns: output_ligand.sdf_%d, split_%d.mol, %d_lig.drf\n"
	"- The following are invalid patterns: output_%f.sdf, ligands.drf_%u, %d_lig_%d.mol, molecules.sdf\n\n"
  
  "WARNING:\n"
	"- If the parameter outname_pattern is specified, the user is responsible for the occurrence of a valid file extension\n"
	"  in the outname_pattern, which has to be of the same file format as the input file.\n\n";
  
	parpars.setToolManual(man);
	parpars.setSupportedFormats("i","mol2,sdf,drf");
	parpars.setSupportedFormats("o","mol2,sdf,drf");
	parpars.setOutputFormatSource("o","i");
 	parpars.parse(argc, argv);
  
  
  // Check if parameter setting is valid and/or useful  
 	validateParameters(parpars);

  
  unsigned int n_molecules = 0;
  unsigned int n_outfiles = 0;
  unsigned int mpf = 0;
  
  String infile = parpars.get("i");
  String infile_name = infile.substr(0, infile.find_last_of('.'));
  String infile_type = infile.substr(infile.find_last_of('.') + 1, infile.length() - infile.find_last_of('.') - 1);
  
  vector<String> outfile_names;
  HashSet <String> conformation_ids;
  
  Molecule* mol;
 	GenericMolFile* input;
  GenericMolFile* output;

  DockResultFile* drf_input;
  DockResultFile* drf_output;  
  

	// Determine number of molecules in input files.
	// In case of DockResultFiles, we do not need to process all contained molecules 
	// in order to achieve this; we can simply count the result-section entries.
  
  input = MolFileFactory::open(infile);
	
  drf_input = NULL;
  drf_input = dynamic_cast<DockResultFile*>(input);
	if (drf_input)
	{
		n_molecules = drf_input->countConformations();
	}
	else
	{
 		Log.level(10) << "\rCount number of molecules in input file ..." << endl;
		for (mol = input->read(); mol; mol = input->read())
		{
			++n_molecules;
			delete mol;
		}
	}	
	
	Log.level(10) << "\r" << n_molecules << " molecules found." << endl << endl;

	input->close();
	delete input;

	
	// Check which split method should be applied
	
	if (parpars.has("o"))
	{
		// Option 1:
		// Number of output files specified by explicit name passing.
		// Parameter 'o' is specified
    
		list<String> tmp = parpars.getList("o");
    for (list<String>::iterator iter = tmp.begin(); iter != tmp.end(); ++iter)
    {
      outfile_names.push_back(*iter + "." + infile_type);
    }
    
		n_outfiles = outfile_names.size();
  
    if (n_molecules >= n_outfiles)
    {
      mpf = floor((double)n_molecules / n_outfiles);
    }
    else
    {
      Log.level(10) << "\rNOTE: Number of molecules in input file is smaller than number of specified output files." << endl;
      n_outfiles = n_molecules;
      mpf = 1;
    }
	}
  else
  {
    if (parpars.has("no"))
    {
      // Option 2:
		  // Number of output files is specified directly.
		  // Parameter 'no' is specified
      
      n_outfiles = parpars.get("no").toInt();
      
      if (n_molecules >= n_outfiles)
      {
        mpf = floor((double)n_molecules / n_outfiles);
      }
      else
      {
        Log.level(10) << "\rNOTE: Number of molecules in input file is smaller than specified number of output files." << endl;
        n_outfiles = n_molecules;
        mpf = 1;
      }
    }
    else
    {
      if (parpars.has("mpf"))
      {
        // Option 3:
        // Number of molecules per output file is specified directly.
        // Parameter 'mpf' is specified
      
        mpf = parpars.get("mpf").toInt();
        n_outfiles = ceil((double)n_molecules / (double)mpf);
      }
    }
    
    
    // Generate output file names
    
    if (parpars.has("outname_pattern"))
    {
      // Option 1: Generate output file names from specified pattern

      String pattern = parpars.get("outname_pattern");
      
      for (unsigned int i=0; i!= n_outfiles; ++i)
      {
        outfile_names.push_back(getOutputFileName(pattern, true, infile_type, i));
      }
    }
    else
    {
      // Option 2: Simple indexing of input file name
      
      for (unsigned int i=0; i!= n_outfiles; ++i)
      {
        outfile_names.push_back(getOutputFileName(infile_name, false, infile_type, i));
      }
    }
      
  }
  
  
  // Now do the splitting
  
  input = MolFileFactory::open(infile);

  drf_input = NULL;
	drf_input = dynamic_cast<DockResultFile*>(input);
	if (drf_input) 
  {
    drf_input->selectAllResultsForInput();
  }
  
  for (unsigned int i=0; i!=outfile_names.size(); ++i)
  {
    conformation_ids.clear();
    
    output = MolFileFactory::open(outfile_names[i], ios::out, infile_type);
    drf_output = dynamic_cast<DockResultFile*>(output);
    if (drf_input && drf_output)
    {
      drf_output->disableAutomaticResultCreation();
    }

    
    if (i < outfile_names.size()-1)
    {
      // Not the last file - so number of molecules is standard
      
      for (unsigned int j=0; j!=mpf; ++j)
      {
        mol = input->read();
        if (drf_input && drf_output && mol->hasProperty("Conformation_input_UID"))
        {
          conformation_ids.insert(mol->getProperty("Conformation_input_UID").toString());
        }

        output->write(*mol);
        delete mol;
      }
    }
    else
    {
      // Last output file - so write remaining molecules into it
      
      mol = input->read();
      while (mol)
      {
        if (drf_input && drf_output && mol->hasProperty("Conformation_input_UID"))
        {
          conformation_ids.insert(mol->getProperty("Conformation_input_UID").toString());
        }

        output->write(*mol);
        delete mol;
        mol = input->read();
      }
    }

    if (drf_input && drf_output)
    {
      const vector<Result*>* results = drf_input->getResults();
      for (unsigned int i = 0; i < results->size(); ++i)
      {
        Result* new_res = new Result(*(*results)[i], conformation_ids);
        drf_output->addResult(new_res);
      }
    }
    
    output->close();
    delete output;
  }  

  input->close();
  delete input;
  
  
  return 0;
}