void startValidation(ValidationConfiguration& conf, QSARData* q, String* data_filename) { bool created_data_object=0; if(q==NULL || data_filename==NULL) { q = new QSARData; created_data_object=1; } Registry reg; Model* m; String model_type; ifstream model_input(conf.model.c_str()); // read model-abbreviation if(!model_input) { Log.error()<<"Error: Model-file '"<<conf.model<<"' does not exist!!"<<endl; return; } getline(model_input,model_type); getline(model_input,model_type); model_type = model_type.getField(0,"\t"); model_input.close(); RegistryEntry* entry = reg.getEntry(model_type); if(!entry->kernel) { m = (*entry->create)(*q); } else { // parameters irrelevant; will be overwritten by those read from file m = (*entry->createKernel1)(*q,1,1, -1); } m->readFromFile(conf.model.c_str()); m->model_val->selectStat(conf.statistic); if(conf.data!="") { if(!data_filename || conf.data!=*data_filename) { q->readFromFile(conf.data); if(data_filename) *data_filename = conf.data; } if(conf.val_type==1) m->model_val->testInputData(1); else if(conf.val_type==2) m->model_val->crossValidation(conf.k_folds,1); else if(conf.val_type==3) m->model_val->bootstrap(conf.bootstrap_samples); else if(conf.val_type==4) m->model_val->yRandomizationTest(conf.no_of_permutation_tests,conf.k_folds); else if(conf.val_type==6) { if(entry->regression) { ((RegressionModel*)m)->validation->calculateCoefficientStdErrors(conf.bootstrap_samples); } } else if(conf.val_type==7) { if(!data_filename || conf.validation_data!=*data_filename) { q->readFromFile(conf.validation_data.c_str()); *data_filename = conf.validation_data; } m->model_val->testInputData(1); } } // save the result of the validation to the specified file m->model_val->saveToFile(conf.output); if(created_data_object) delete q; delete m; }
/**
 * Evaluate every registered model type (and, for kernel models, three kernel
 * functions) on data_ and build the best one on the entire data set.
 *
 * Each candidate is assessed on no_folds train/test splits of data_; for every
 * split the model parameters are optimized, features are selected, the model
 * is trained on the training partition and then tested on the test partition.
 * Candidates are ranked by (mean fit result - standard deviation).
 *
 * @return the trained best model, or 0 if no candidate was selected or the
 *         best candidate's quality is below min_quality_.
 *         NOTE(review): the returned object comes from the registry's create
 *         function and is not deleted here; presumably the caller owns it.
 */
Model* AutomaticModelCreator::generateModel()
{
	Registry registry;
	bool use_regression = !data_->checkforDiscreteY();
	bool use_random_testsets = 0;
	Size no_folds = 3;
	if (use_random_testsets) no_folds = 10;

	// best_model_id stays 0 as long as no candidate reached a quality above 0
	Size best_model_id = 0;
	Size best_kernel_id = 0;
	double best_nested_quality = 0;

	/// Do nested validation over no_folds splits, including feature selection steps, for each model type and, if applicable, each kernel function
	Log.level(10)<<"model-name kernel #features nested Q^2 stddev"<<std::endl;
	Log.level(10)<<"---------------------------------------------------"<<std::endl;
	Log.level(10)<<std::setiosflags(std::ios::fixed)<<std::left;

	for (Size model_id = 1; model_id < 14; model_id++)
	{
		RegistryEntry* reg_entry;
		try
		{
			reg_entry = registry.getEntry(model_id);
		}
		catch (const BALL::Exception::GeneralException&)
		{
			// a model with the current id does not exist
			continue;
		}

		// only consider models matching the kind of response variable
		if (use_regression != reg_entry->regression) continue;

		Size no_kernel_types = 1;
		if (reg_entry->kernel) no_kernel_types = 3;

		for (Size kernel_id = 1; kernel_id <= no_kernel_types; kernel_id++)
		{
			double nested_q2 = 0;
			int no_features = 0;
			vector<double> q2_values;

			for (Size fold_id = 0; fold_id < no_folds; fold_id++)
			{
				vector<QSARData*> sets;
				if (use_random_testsets)
				{
					// randomly select 25% of compounds for external validation set
					sets = data_->generateExternalSet(0.25);
				}
				else
				{
					sets = data_->evenSplit(no_folds, fold_id);
				}

				if (data_->isDataCentered())
				{
					bool center_y = data_->isResponseCentered();
					sets[0]->centerData(center_y); // train-partition
					sets[1]->centerData(center_y); // test-partition
				}

				Model* model;
				if (!reg_entry->kernel)
				{
					model = (*reg_entry->create)(*sets[0]);
				}
				else
				{
					model = (*reg_entry->createKernel1)(*sets[0], kernel_id, 1, -1);
				}
				model->setParameters(reg_entry->parameterDefaults);
				optimizeParameters(model);

				// select relevant features using training partition
				selectFeatures(model);
				no_features += model->getDescriptorIDs()->size();

				// train model using only the training partition
				model->readTrainingData();
				model->train();

				// test fit to external validation data
				model->data = sets[1];
				model->model_val->testInputData(true);
				nested_q2 += model->model_val->getFitRes();
				q2_values.push_back(model->model_val->getFitRes());

				delete sets[0];
				delete sets[1];
				delete model;
			}

			// average over all folds
			nested_q2 /= no_folds;
			no_features /= no_folds;
			double stddev = Statistics::getStddev(q2_values);

			Log.level(10)<<std::setw(10)<<reg_entry->name_abreviation<<" ";
			if (reg_entry->kernel)
			{
				if (kernel_id == 1) Log.level(10)<<setw(6)<<"polyn.";
				else if (kernel_id == 2) Log.level(10)<<setw(6)<<"rbf";
				else if (kernel_id == 3) Log.level(10)<<setw(6)<<"sigm.";
			}
			else
			{
				Log.level(10)<<setw(6)<<"none";
			}
			Log.level(10)<<" ";
			Log.level(10)<<setw(9)<<no_features<<" "<<setw(10)<<nested_q2<<" "<<setw(6)<<stddev<<endl<<flush;

			double quality = nested_q2-stddev; // make sure to prefer models w/ low stddev
			if (quality > best_nested_quality)
			{
				best_nested_quality = quality;
				best_model_id = model_id;
				best_kernel_id = kernel_id;
			}
		}
	}

	// best_model_id == 0 means that no candidate was ever selected; without
	// this check a min_quality_ <= 0 would let us build a model from id 0,
	// which was never evaluated above.
	if (best_model_id == 0 || best_nested_quality < min_quality_)
	{
		Log.level(10)<<"Sorry, no model with satisfactory prediction quality found!"<<endl;
		return 0;
	}

	/// Create best model using ENTIRE data set and return it
	RegistryEntry* reg_entry = registry.getEntry(best_model_id);
	Model* model;
	if (!reg_entry->kernel)
	{
		model = (*reg_entry->create)(*data_);
	}
	else
	{
		model = (*reg_entry->createKernel1)(*data_, best_kernel_id, 1, -1);
	}
	model->setParameters(reg_entry->parameterDefaults);
	optimizeParameters(model);
	selectFeatures(model);
	model->readTrainingData();
	model->train();

	return model;
}
void startModelCreation(ModelConfiguration& conf, QSARData* q, String* data_filename) { bool created_data_object=0; if(q==NULL || data_filename==NULL || conf.data_file!=*data_filename) { if(q==NULL) { q = new QSARData; created_data_object=1; } q->readFromFile(conf.data_file); if(data_filename) *data_filename = conf.data_file; } else { Log.level(2)<<"[ModelCreator debug-info:] QSARData object for file "<<conf.data_file<<" already in memory; not reading it again."<<endl; } Registry reg; Model* model; bool kernel=0; RegistryEntry* entry = reg.getEntry(conf.model_no); if(entry->create!=NULL) { model = (*entry->create)(*q); } else { if(conf.kernel_type==0 || conf.kernel_par1==0) { Log.error()<<"For kernel based model, kernel-type and kernel-parameter(s) must be specified!"<<endl; return; } model = (*entry->createKernel1)(*q,conf.kernel_type,conf.kernel_par1, conf.kernel_par2); kernel=1; } if(conf.model_parameters.size()>0) { model->setParameters(conf.model_parameters); } if(!conf.no_training && conf.optimize_model_parameters) { if(conf.k_fold==0) { Log.error()<<"'k_fold' must be set if model parameters are to be optimized!"<<endl; return; } model->optimizeParameters(conf.k_fold); } if(!conf.no_training && kernel && conf.grid_search_steps>0) { if(conf.k_fold==0) { Log.error()<<"'k_fold' must be set if grid search is to be done!"<<endl; return; } if(conf.grid_search_stepwidth==0 && conf.kernel_type!=2) { Log.error()<<"'grid_search_stepwidth' must be set if grid search is to be done!"<<endl; return; } ((KernelModel*)model)->kernel->gridSearch(conf.grid_search_stepwidth, conf.grid_search_steps, conf.grid_search_recursions, conf.k_fold); } model->readTrainingData(); if(!conf.no_training) { try { model->train(); } catch(BALL::Exception::GeneralException e) { Log.error()<<e.getMessage(); } } model->saveToFile(conf.output); if(created_data_object) delete q; delete model; }
void startFeatureSelection(FeatureSelectionConfiguration& conf, QSARData* q, String* data_filename) { bool created_data_object=0; if(q==NULL || data_filename==NULL || conf.data_file!=*data_filename) { if(q==NULL) { q = new QSARData; created_data_object=1; } q->readFromFile(conf.data_file); if(data_filename) *data_filename = conf.data_file; } else { Log.level(2)<<"[FeatureSelector debug-info:] QSARData object for file "<<conf.data_file<<" already in memory; not reading it again."<<endl; } Registry reg; Model* m; String model_type; ifstream model_input(conf.model.c_str()); // read model-abbreviation if(!model_input) { Log.error()<<"Error: Model-file '"<<conf.model<<"' does not exist!!"<<endl; return; } getline(model_input,model_type); getline(model_input,model_type); model_type = model_type.getField(0,"\t"); model_input.close(); RegistryEntry* entry = reg.getEntry(model_type); if(!entry->kernel) { m = (*entry->create)(*q); } else { // parameters irrelevant; will be overwritten by those read from file m = (*entry->createKernel1)(*q,1,1, -1); } if(conf.statistic>0) { Log.level(3)<<" using "<<conf.statistic_name<<" to assess qualitiy of the model ... 
"<<endl; m->model_val->selectStat(conf.statistic); } m->readFromFile(conf.model.c_str()); FeatureSelection fs(*m); if(conf.quality_increase_cutoff!=-1) { fs.setQualityIncreaseCutoff(conf.quality_increase_cutoff); } if(conf.remove_correlated || conf.feat_type==0) { fs.removeHighlyCorrelatedFeatures(conf.cor_threshold); } if(conf.feat_type==1) { fs.forwardSelection(conf.k_fold,conf.opt); } else if(conf.feat_type==2) { fs.backwardSelection(conf.k_fold,conf.opt); } else if(conf.feat_type==3) { fs.stepwiseSelection(conf.k_fold,conf.opt); } else if(conf.feat_type==4) { fs.removeLowResponseCorrelation(conf.cor_threshold); } else if(conf.feat_type==6) { fs.twinScan(conf.k_fold,conf.opt); } if(conf.opt_model_after_fs) { m->optimizeParameters(conf.opt_k_fold); } KernelModel* km = dynamic_cast<KernelModel*>(m); if(km && conf.opt_kernel_after_fs) { /// search locally around current kernel parameters try { // specifing start-values for grid search now obsolete; grid search will automatically search locally around current kernel parameter(s) km->kernel->gridSearch(conf.grid_search_stepwidth, conf.grid_search_steps,conf.grid_search_recursions,conf.opt_k_fold,conf.opt/*,start_par1,start_par2*/); } catch(BALL::Exception::GeneralException e) { Log.error()<<e.getName()<<" : "<<e.getMessage()<<endl; return; } } m->readTrainingData(); m->train(); m->saveToFile(conf.output); if(created_data_object) delete q; delete m; }
int main(int argc, char* argv[]) { CommandlineParser par("ModelCreator","create a QSAR model ","1.1",String(__DATE__), "QuEasy (QSAR)"); par.registerMandatoryInputFile("i", "input dat-file"); par.registerMandatoryOutputFile("o", "output model file"); par.registerMandatoryStringParameter("type", "model type"); par.registerOptionalStringParameter("kernel", "kernel type (in case of kernel-model)"); Registry reg; list<String> restr; for(RegistryEntryIterator it=reg.beginEntry(); it!=reg.endEntry(); it++) { restr.push_back(it->second.name_abreviation); } par.setParameterRestrictions("type", restr); restr.clear(); restr.push_back("none"); restr.push_back("polynomial"); restr.push_back("rbf"); restr.push_back("sigmoidal"); par.setParameterRestrictions("kernel", restr); String man = "ModelCreator creates a QSAR model using an input data set as generated by InputReader.\n\nThe type of QSAR model to be used can be specified by '-type', the type of kernel-function (if any) can be chosen by '-kernel'. Optimization of model- and kernel-parmeters will be done automatically using cross-validation.\n\nOutput of this tool is a model-file that can be used by other QuEasy tools (e.g. 
FeatureSelector)."; par.setToolManual(man); par.setSupportedFormats("i","dat"); par.setSupportedFormats("o","mod"); par.parse(argc,argv); ModelConfiguration conf; conf.data_file = par.get("i"); conf.output = par.get("o"); conf.optimize_model_parameters = true; conf.kernel_par1 = reg.default_kernel_par1; conf.kernel_par1 = reg.default_kernel_par2; conf.k_fold = reg.default_k; conf.grid_search_recursions = reg.default_gridsearch_recursion; conf.grid_search_stepwidth = reg.default_gridsearch_stepwidth; conf.grid_search_steps = reg.default_gridsearch_steps; try { conf.model_no = reg.getModelNo(par.get("type")); } catch(BALL::Exception::GeneralException e) { cerr << "A model-type '"<<par.get("type")<<"' does not exist; possible choices are:"<<endl; for(RegistryEntryIterator it=reg.beginEntry(); it!=reg.endEntry(); it++) { cerr<<" "<<it->second.name_abreviation<<" : "<<it->second.name<<" "; if(it->second.regression) cerr<<"(regression)"<<endl; else cerr<<"(classification)"<<endl; } cerr<<endl; exit(1); } String kernel = par.get("kernel"); if(kernel!=CommandlineParser::NOT_FOUND && kernel != "none") { if(!reg.getEntry(par.get("type"))->kernel) { cerr << "[Error:] The chosen model-type has no kernel but you specified a kernel-type!"<<endl; exit(1); } if(kernel=="polynomial") { conf.kernel_type = 1; } else if(kernel=="rbf") { conf.kernel_type = 2; } else if(kernel=="sigmoidal") { conf.kernel_type = 3; } else { cerr << "Specified kernel-type '"<<kernel<<"' unknown; possible choices are: polynomial, rbf, sigmoidal"<<endl; exit(1); } } startModelCreation(conf,0,0); }