Example No. 1
void startValidation(ValidationConfiguration& conf, QSARData* q, String* data_filename)
{
	bool created_data_object=0;
	if(q==NULL || data_filename==NULL)
	{
		q = new QSARData;
		created_data_object=1;
	}

	Registry reg;
	Model* m;
	String model_type;

	ifstream model_input(conf.model.c_str()); // read model-abbreviation
	if(!model_input)
	{
		Log.error()<<"Error: Model-file '"<<conf.model<<"' does not exist!!"<<endl;
		return;
	}
	getline(model_input,model_type); // skip the first line of the model-file
	getline(model_input,model_type); // the model-abbreviation is the first tab-separated field of the second line
	model_type = model_type.getField(0,"\t");
	model_input.close();

	RegistryEntry* entry = reg.getEntry(model_type);

	if(!entry->kernel)
	{
		m = (*entry->create)(*q);
	}
	else
	{
		// parameters irrelevant; will be overwritten by those read from file
		m = (*entry->createKernel1)(*q,1,1, -1);
	}

	m->readFromFile(conf.model.c_str());
	m->model_val->selectStat(conf.statistic);

	if(conf.data!="")
	{
		if(!data_filename || conf.data!=*data_filename)
		{
			q->readFromFile(conf.data);
			if(data_filename) *data_filename = conf.data;
		}

		if(conf.val_type==1) m->model_val->testInputData(1);

		else if(conf.val_type==2) m->model_val->crossValidation(conf.k_folds,1);

		else if(conf.val_type==3) m->model_val->bootstrap(conf.bootstrap_samples);

		else if(conf.val_type==4) m->model_val->yRandomizationTest(conf.no_of_permutation_tests,conf.k_folds);

		else if(conf.val_type==6)
		{
			if(entry->regression)
			{
				((RegressionModel*)m)->validation->calculateCoefficientStdErrors(conf.bootstrap_samples);
			}
		}

		else if(conf.val_type==7)
		{
			if(!data_filename || conf.validation_data!=*data_filename)
			{
				q->readFromFile(conf.validation_data.c_str());
				if(data_filename) *data_filename = conf.validation_data;
			}
			m->model_val->testInputData(1);
		}
	}

	// save the result of the validation to the specified file
	m->model_val->saveToFile(conf.output);

	if(created_data_object) delete q;
	delete m;
}
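
A minimal usage sketch for startValidation(), assuming the declarations above are visible, that the header paths below are correct, and that ValidationConfiguration default-initializes every field not set here; file names and the fold count are placeholders.

// Hypothetical driver for startValidation(); not part of the original source.
#include <BALL/QSAR/QSARData.h>
#include <BALL/DATATYPE/string.h>

using namespace BALL;
using namespace BALL::QSAR;

int main()
{
	ValidationConfiguration conf;
	conf.model    = "model.mod";      // model file written by a previous ModelCreator run
	conf.data     = "training.dat";   // data set to validate against
	conf.output   = "validation.txt"; // where the validation results are saved
	conf.val_type = 2;                // 2 == cross-validation (see dispatch above)
	conf.k_folds  = 5;                // placeholder number of folds

	QSARData data;        // shared data object, reused across calls
	String current_file;  // remembers which data file is currently loaded
	startValidation(conf, &data, &current_file);
	return 0;
}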
Example No. 2
Model* AutomaticModelCreator::generateModel()
{
	Registry registry;
	bool use_regression = !data_->checkforDiscreteY(); 
	bool use_random_testsets = 0;
	Size no_folds = 3;
	if (use_random_testsets) no_folds = 10; 

	Size best_model_id = 0;
	Size best_kernel_id = 0;
	double best_nested_quality = 0;

	/// Do nested cross-validation (no_folds outer folds), including feature selection steps, for each model type and, if applicable, each kernel function

	Log.level(10)<<"model-name  kernel  #features  nested Q^2  stddev"<<std::endl;
	Log.level(10)<<"---------------------------------------------------"<<std::endl;

	Log.level(10)<<std::setiosflags(std::ios::fixed)<<std::left;

	for (Size model_id = 1; model_id < 14; model_id++)
	{
		RegistryEntry* reg_entry;
		try
		{
			reg_entry = registry.getEntry(model_id);
		}
		catch(BALL::Exception::GeneralException& e)
		{
			// a model with the current id does not exist
			continue;
		}

		if (use_regression != reg_entry->regression) continue; 

		Size no_kernel_types = 1;
		if (reg_entry->kernel) no_kernel_types = 3; 

		for (Size kernel_id = 1; kernel_id <= no_kernel_types; kernel_id++)
		{
			double nested_q2 = 0;
			int no_features = 0;
			vector<double> q2_values;

			for (Size fold_id = 0; fold_id < no_folds; fold_id++)
			{
				vector<QSARData*> sets;
				if (use_random_testsets)
				{
					// randomly select 25% of compounds for external validation set
					sets = data_->generateExternalSet(0.25);
				}
				else
				{
					sets = data_->evenSplit(no_folds, fold_id);
				}
				if (data_->isDataCentered())
				{
					bool center_y = data_->isResponseCentered();
					sets[0]->centerData(center_y); // train-partition
					sets[1]->centerData(center_y); // test-partition
				}

				Model* model;
				if (!reg_entry->kernel) model = (*reg_entry->create)(*sets[0]); 
				else model = (*reg_entry->createKernel1)(*sets[0], kernel_id, 1, -1);
				model->setParameters(reg_entry->parameterDefaults);
				optimizeParameters(model);

				// select relevant features using training partition
				selectFeatures(model);
				no_features += model->getDescriptorIDs()->size();

				// train model using only the training partition
				model->readTrainingData();
				model->train();

				// test fit to external validation data
				model->data = sets[1];
				model->model_val->testInputData(true);
				nested_q2 += model->model_val->getFitRes();
				q2_values.push_back(model->model_val->getFitRes());

				delete sets[0];
				delete sets[1];
				delete model;
			}

			nested_q2 /= no_folds;
			no_features /= no_folds;
			double stddev = Statistics::getStddev(q2_values);

			Log.level(10)<<std::setw(10)<<reg_entry->name_abreviation<<"  ";
			if (reg_entry->kernel)
			{
				if (kernel_id == 1) Log.level(10)<<setw(6)<<"polyn."; 
				else if (kernel_id == 2) Log.level(10)<<setw(6)<<"rbf"; 
				else if (kernel_id == 3) Log.level(10)<<setw(6)<<"sigm."; 
			}
			else Log.level(10)<<setw(6)<<"none";
			Log.level(10)<<"  ";
			Log.level(10)<<setw(9)<<no_features<<"  "<<setw(10)<<nested_q2<<"  "<<setw(6)<<stddev<<endl<<flush;

			double quality = nested_q2-stddev; // make sure to prefer models w/ low stddev
			if (quality > best_nested_quality)
			{
				best_nested_quality = quality;
				best_model_id = model_id;
				best_kernel_id = kernel_id;
			}
		}
	}

	if (best_nested_quality < min_quality_)
	{
		Log.level(10)<<"Sorry, no model with satisfactory prediction quality found!"<<endl;
		return 0;
	}

	/// Create best model using ENTIRE data set and return it
	RegistryEntry* reg_entry = registry.getEntry(best_model_id);
	Model* model;
	if (!reg_entry->kernel) model = (*reg_entry->create)(*data_); 
	else model = (*reg_entry->createKernel1)(*data_, best_kernel_id, 1, -1);
	model->setParameters(reg_entry->parameterDefaults);
	optimizeParameters(model);
	selectFeatures(model);
	model->readTrainingData();
	model->train();

	return model;
}
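
A minimal sketch of how generateModel() might be called. The AutomaticModelCreator constructor signature and the header path are assumptions; only generateModel(), readFromFile() and saveToFile() are taken from the examples themselves.

// Hypothetical usage of AutomaticModelCreator; constructor signature is assumed.
#include <BALL/QSAR/QSARData.h>

using namespace BALL::QSAR;

int main()
{
	QSARData data;
	data.readFromFile("training.dat");     // read the input data set, as in the other examples

	AutomaticModelCreator creator(&data);  // assumed: the creator keeps this set as its data_ member
	Model* best_model = creator.generateModel();
	if (best_model)                        // 0 is returned if no model reaches min_quality_
	{
		best_model->saveToFile("best.mod");
		delete best_model;
	}
	return 0;
}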
Example No. 3
void startModelCreation(ModelConfiguration& conf, QSARData* q, String* data_filename)
{
	bool created_data_object=0;
	if(q==NULL || data_filename==NULL || conf.data_file!=*data_filename)
	{
		if(q==NULL)
		{
			q = new QSARData;
			created_data_object=1;
		}
		q->readFromFile(conf.data_file);
		if(data_filename) *data_filename = conf.data_file;
	}
	else
	{
		Log.level(2)<<"[ModelCreator debug-info:] QSARData object for file "<<conf.data_file<<" already in memory; not reading it again."<<endl;
	}
	Registry reg;
	Model* model;
	bool kernel=0;
	RegistryEntry* entry = reg.getEntry(conf.model_no);

	if(entry->create!=NULL)
	{
		model = (*entry->create)(*q);
	}
	else
	{
		if(conf.kernel_type==0 || conf.kernel_par1==0)
		{
			Log.error()<<"For kernel based model, kernel-type and kernel-parameter(s) must be specified!"<<endl;
			return;
		}

		model = (*entry->createKernel1)(*q,conf.kernel_type,conf.kernel_par1, conf.kernel_par2);
		kernel=1;
	}

	if(conf.model_parameters.size()>0)
	{
		model->setParameters(conf.model_parameters);
	}
	if(!conf.no_training && conf.optimize_model_parameters)
	{
		if(conf.k_fold==0)
		{
			Log.error()<<"'k_fold' must be set if model parameters are to be optimized!"<<endl;
			return;
		}
		model->optimizeParameters(conf.k_fold);
	}
	if(!conf.no_training && kernel && conf.grid_search_steps>0)
	{
		if(conf.k_fold==0)
		{
			Log.error()<<"'k_fold' must be set if grid search is to be done!"<<endl;
			return;
		}
		if(conf.grid_search_stepwidth==0 && conf.kernel_type!=2)
		{
			Log.error()<<"'grid_search_stepwidth' must be set if grid search is to be done!"<<endl;
			return;
		}
		((KernelModel*)model)->kernel->gridSearch(conf.grid_search_stepwidth, conf.grid_search_steps, conf.grid_search_recursions, conf.k_fold);
	}

	model->readTrainingData();
	if(!conf.no_training)
	{
		try
		{
			model->train();
		}
		catch(BALL::Exception::GeneralException& e)
		{
			Log.error()<<e.getMessage();
		}
	}

	model->saveToFile(conf.output);

	if(created_data_object) delete q;
	delete model;
}
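
Example No. 5 below shows the full command-line driver for this function; a stripped-down direct call might look like the following sketch, assuming that ModelConfiguration default-initializes the fields not set here and that the header path is correct. Registry id, kernel type and kernel parameter are placeholders.

// Hypothetical direct call to startModelCreation(); not part of the original source.
#include <BALL/QSAR/QSARData.h>

using namespace BALL;
using namespace BALL::QSAR;

int main()
{
	ModelConfiguration conf;
	conf.data_file   = "training.dat";
	conf.output      = "model.mod";
	conf.model_no    = 1;      // placeholder registry id of the desired model type
	conf.kernel_type = 2;      // only read for kernel models (2 == rbf, see Example No. 5)
	conf.kernel_par1 = 0.005;  // placeholder kernel parameter
	conf.k_fold      = 5;
	conf.optimize_model_parameters = true;

	startModelCreation(conf, 0, 0);  // no shared QSARData object, as in Example No. 5
	return 0;
}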
Example No. 4
void startFeatureSelection(FeatureSelectionConfiguration& conf, QSARData* q, String* data_filename)
{
	bool created_data_object=0;
	if(q==NULL || data_filename==NULL || conf.data_file!=*data_filename)
	{
		if(q==NULL)
		{
			q = new QSARData;
			created_data_object=1;
		}
		q->readFromFile(conf.data_file);
		if(data_filename) *data_filename = conf.data_file;
	}
	else
	{
		Log.level(2)<<"[FeatureSelector debug-info:] QSARData object for file "<<conf.data_file<<" already in memory; not reading it again."<<endl;
	}

	Registry reg;
	Model* m;
	String model_type;

	ifstream model_input(conf.model.c_str()); // read model-abbreviation
	if(!model_input)
	{
		Log.error()<<"Error: Model-file '"<<conf.model<<"' does not exist!!"<<endl;
		return;
	}
	getline(model_input,model_type); // skip the first line of the model-file
	getline(model_input,model_type); // the model-abbreviation is the first tab-separated field of the second line
	model_type = model_type.getField(0,"\t");
	model_input.close();

	RegistryEntry* entry = reg.getEntry(model_type);
	if(!entry->kernel)
	{
		m = (*entry->create)(*q);
	}
	else
	{
		// parameters irrelevant; will be overwritten by those read from file
		m = (*entry->createKernel1)(*q,1,1, -1);
	}

	if(conf.statistic>0)
	{
		Log.level(3)<<"  using "<<conf.statistic_name<<" to assess quality of the model ... "<<endl;
		m->model_val->selectStat(conf.statistic);
	}

	m->readFromFile(conf.model.c_str());
	FeatureSelection fs(*m);
	if(conf.quality_increase_cutoff!=-1)
	{
		fs.setQualityIncreaseCutoff(conf.quality_increase_cutoff);
	}
	if(conf.remove_correlated || conf.feat_type==0)
	{
		fs.removeHighlyCorrelatedFeatures(conf.cor_threshold);
	}
	if(conf.feat_type==1)
	{
		fs.forwardSelection(conf.k_fold,conf.opt);
	}
	else if(conf.feat_type==2)
	{
		fs.backwardSelection(conf.k_fold,conf.opt);
	}
	else if(conf.feat_type==3)
	{
		fs.stepwiseSelection(conf.k_fold,conf.opt);
	}
	else if(conf.feat_type==4)
	{
		fs.removeLowResponseCorrelation(conf.cor_threshold);
	}
	else if(conf.feat_type==6)
	{
		fs.twinScan(conf.k_fold,conf.opt);
	}
	if(conf.opt_model_after_fs)
	{
		m->optimizeParameters(conf.opt_k_fold);
	}
	KernelModel* km = dynamic_cast<KernelModel*>(m);
	if(km && conf.opt_kernel_after_fs)
	{
		/// search locally around current kernel parameters
		try
		{
			// specifying start-values for grid search is now obsolete; grid search will automatically search locally around the current kernel parameter(s)
			km->kernel->gridSearch(conf.grid_search_stepwidth, conf.grid_search_steps,conf.grid_search_recursions,conf.opt_k_fold,conf.opt/*,start_par1,start_par2*/);
		}
		catch(BALL::Exception::GeneralException& e)
		{
			Log.error()<<e.getName()<<" : "<<e.getMessage()<<endl;
			return;
		}
	}

	m->readTrainingData();
	m->train();
	m->saveToFile(conf.output);

	if(created_data_object) delete q;
	delete m;
}
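
A minimal sketch of a direct call to startFeatureSelection(), assuming that FeatureSelectionConfiguration default-initializes the fields not set here and that the header path is correct; all values are placeholders.

// Hypothetical driver for startFeatureSelection(); not part of the original source.
#include <BALL/QSAR/QSARData.h>

using namespace BALL;
using namespace BALL::QSAR;

int main()
{
	FeatureSelectionConfiguration conf;
	conf.data_file = "training.dat";
	conf.model     = "model.mod";    // model written by startModelCreation()
	conf.output    = "model_fs.mod";
	conf.feat_type = 3;              // 3 == stepwise selection (see dispatch above)
	conf.k_fold    = 5;
	conf.remove_correlated = true;   // also drop highly correlated features
	conf.cor_threshold     = 0.95;   // placeholder correlation cutoff

	startFeatureSelection(conf, 0, 0);
	return 0;
}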
Example No. 5
int main(int argc, char* argv[])
{
	CommandlineParser par("ModelCreator","create a QSAR model","1.1",String(__DATE__), "QuEasy (QSAR)");
	par.registerMandatoryInputFile("i", "input dat-file");
	par.registerMandatoryOutputFile("o", "output model file");
	par.registerMandatoryStringParameter("type", "model type");
	par.registerOptionalStringParameter("kernel", "kernel type (in case of kernel-model)");
	Registry reg;
	list<String> restr;
	for(RegistryEntryIterator it=reg.beginEntry(); it!=reg.endEntry(); it++)
	{
		restr.push_back(it->second.name_abreviation);
	}
	par.setParameterRestrictions("type", restr);
	restr.clear();
	restr.push_back("none");
	restr.push_back("polynomial");
	restr.push_back("rbf");
	restr.push_back("sigmoidal");
	par.setParameterRestrictions("kernel", restr);
	String man = "ModelCreator creates a QSAR model using an input data set as generated by InputReader.\n\nThe type of QSAR model to be used can be specified by '-type', the type of kernel-function (if any) can be chosen by '-kernel'. Optimization of model- and kernel-parameters will be done automatically using cross-validation.\n\nOutput of this tool is a model-file that can be used by other QuEasy tools (e.g. FeatureSelector).";
	par.setToolManual(man);
	par.setSupportedFormats("i","dat");
	par.setSupportedFormats("o","mod");
	par.parse(argc,argv);

	ModelConfiguration conf;
	conf.data_file = par.get("i");
	conf.output = par.get("o");
	conf.optimize_model_parameters = true;
	conf.kernel_par1 = reg.default_kernel_par1;
	conf.kernel_par2 = reg.default_kernel_par2;
	conf.k_fold = reg.default_k;
	conf.grid_search_recursions = reg.default_gridsearch_recursion;
	conf.grid_search_stepwidth = reg.default_gridsearch_stepwidth;
	conf.grid_search_steps = reg.default_gridsearch_steps;
	try
	{
		conf.model_no = reg.getModelNo(par.get("type"));
	}
	catch(BALL::Exception::GeneralException& e)
	{
		cerr << "The model-type '"<<par.get("type")<<"' does not exist; possible choices are:"<<endl;
		for(RegistryEntryIterator it=reg.beginEntry(); it!=reg.endEntry(); it++)
		{
			cerr<<"   "<<it->second.name_abreviation<<" :  "<<it->second.name<<" ";
			if(it->second.regression) cerr<<"(regression)"<<endl;
			else cerr<<"(classification)"<<endl;
		}
		cerr<<endl;
		exit(1);
	}

	String kernel = par.get("kernel");
	if(kernel!=CommandlineParser::NOT_FOUND && kernel != "none")
	{
		if(!reg.getEntry(par.get("type"))->kernel)
		{
			cerr << "[Error:] The chosen model-type has no kernel but you specified a kernel-type!"<<endl;
			exit(1);
		}
		if(kernel=="polynomial")
		{
			conf.kernel_type = 1;
		}
		else if(kernel=="rbf")
		{
			conf.kernel_type = 2;
		}
		else if(kernel=="sigmoidal")
		{
			conf.kernel_type = 3;
		}
		else
		{
			cerr << "Specified kernel-type '"<<kernel<<"' unknown; possible choices are: polynomial, rbf, sigmoidal"<<endl;
			exit(1);
		}
	}

	startModelCreation(conf,0,0);

}