void makeSimplePage(std::string iVar="puppi*ptodRSO", std::string iCut="(pt > 1. && dr > 0.2 )") {
    TTree *lTree = load("output/OutputTmp.root");
    float lMin = 1e9;
    float lMax = -1e9;
    float lVar = 0;
    /*
    //optional: scan the tree for the actual range of iVar
    lTree->SetBranchAddress(iVar.c_str(), &lVar);
    for(int i0 = 0; i0 < lTree->GetEntries(); i0++) {
        lTree->GetEntry(i0);
        if(lVar < lMin) lMin = lVar;
        if(lVar > lMax) lMax = lVar;
    }
    */
    lMin = -5;
    lMax = 200;

    //leading-vertex (pu == 0) vs pileup (pu > 0) distributions;
    //dashed histograms are the neutral-only (charge == 0) subsets
    TH1F* lH = new TH1F("A","A",100,lMin,lMax);
    lH->SetLineColor(kBlue);
    lH->SetLineWidth(2);
    TH1F* lHPU = new TH1F("B","B",100,lMin,lMax);
    lHPU->SetLineColor(kRed);
    lHPU->SetLineWidth(2);
    TH1F* lH1 = new TH1F("C","C",100,lMin,lMax);
    lH1->SetLineColor(kBlue);
    lH1->SetLineWidth(2);
    lH1->SetLineStyle(kDashed);
    TH1F* lHPU1 = new TH1F("D","D",100,lMin,lMax);
    lHPU1->SetLineColor(kRed);
    lHPU1->SetLineWidth(2);
    lHPU1->SetLineStyle(kDashed);

    lTree->Draw((iVar+">>A").c_str(), (iCut+"*(pu == 0)").c_str());
    lTree->Draw((iVar+">>B").c_str(), (iCut+"*(pu > 0)").c_str());
    lTree->Draw((iVar+">>C").c_str(), (iCut+"*(pu == 0 && charge == 0)").c_str());
    lTree->Draw((iVar+">>D").c_str(), (iCut+"*(pu > 0 && charge == 0)").c_str());
    lH->GetXaxis()->SetTitle(getName(iVar).c_str());

    TCanvas *lCan = new TCanvas("A","A",800,600);
    lH->Draw();
    lHPU->Draw("sames");
    lH1->Draw("sames");
    lHPU1->Draw("sames");
    //lCan->SaveAs((iVar+".png").c_str());

    //second canvas needs its own variable name (the original redeclared lCan)
    TCanvas *lCan2 = new TCanvas("B","B",800,600);
    roc(lHPU1, lH1);
}
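The roc(lHPU1, lH1) call at the end compares the neutral pileup and leading-vertex shapes; its definition lives elsewhere in this macro's package. Below is a minimal sketch of how a histogram-based ROC is commonly drawn in ROOT, assuming a threshold scan over bins; drawRocSketch is a hypothetical name, not the function used above.

//--- Editor's sketch (assumption, not part of the original macro) ---
#include "TH1F.h"
#include "TGraph.h"
#include <vector>

void drawRocSketch(TH1F* iSig, TH1F* iBkg) {
    int lNBins = iSig->GetNbinsX();
    double lSigTot = iSig->Integral(0, lNBins + 1); //include under/overflow
    double lBkgTot = iBkg->Integral(0, lNBins + 1);
    if(lSigTot == 0 || lBkgTot == 0) return;
    std::vector<double> lSigEff, lBkgEff;
    for(int i0 = 0; i0 <= lNBins + 1; i0++) {
        //efficiency to pass a cut at bin i0 (keep everything above it)
        lSigEff.push_back(iSig->Integral(i0, lNBins + 1) / lSigTot);
        lBkgEff.push_back(iBkg->Integral(i0, lNBins + 1) / lBkgTot);
    }
    TGraph* lRoc = new TGraph((int)lSigEff.size(), &lSigEff[0], &lBkgEff[0]);
    lRoc->SetTitle("ROC;signal efficiency;background efficiency");
    lRoc->Draw("AL");
}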
TEST_F(LinkPredictionGTest, testROCMetric) {
    ROCMetric roc(G);
    std::pair<std::vector<double>, std::vector<double>> curve = roc.getCurve(predictions);
    double auc = roc.getAreaUnderCurve();
    EXPECT_EQ(auc, 0.8125);
    EXPECT_EQ(0, curve.first[0]);
    EXPECT_EQ(0.5, curve.second[0]);
    EXPECT_EQ(0.25, curve.first[1]);
    EXPECT_EQ(0.5, curve.second[1]);
    EXPECT_EQ(0.5, curve.first[2]);
    EXPECT_EQ(1, curve.second[2]);
    EXPECT_EQ(0.75, curve.first[3]);
    EXPECT_EQ(1, curve.second[3]);
    EXPECT_EQ(1, curve.first[4]);
    EXPECT_EQ(1, curve.second[4]);
}
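The fixture pins the curve to five (curve.first[i], curve.second[i]) points and an AUC of exactly 0.8125. As a companion illustration, here is a minimal, self-contained sketch of the standard construction such a metric implements; the names rocCurve and rocAuc are hypothetical, not NetworKit's API. Scores are sorted descending, a threshold sweep accumulates false/true positive rates, and the area comes from the trapezoidal rule. Feeding the five expected points from the test into rocAuc (with x = curve.first, y = curve.second) reproduces 0.8125.

//--- Editor's sketch (assumption, not the tested implementation) ---
#include <algorithm>
#include <utility>
#include <vector>

//Build an ROC polyline: one (FPR, TPR) point per threshold step.
//Ties in score are stepped through one item at a time for simplicity.
std::pair<std::vector<double>, std::vector<double>>
rocCurve(std::vector<std::pair<double, bool>> scored) { //(score, isPositive)
    std::sort(scored.begin(), scored.end(),
              [](const std::pair<double, bool>& a, const std::pair<double, bool>& b)
              { return a.first > b.first; });           //highest score first
    double pos = 0, neg = 0;
    for(const std::pair<double, bool>& s : scored) {
        if(s.second) pos += 1; else neg += 1;
    }
    std::vector<double> fpr, tpr;
    double tp = 0, fp = 0;
    for(const std::pair<double, bool>& s : scored) {
        if(s.second) tp += 1; else fp += 1;
        fpr.push_back(fp / neg);
        tpr.push_back(tp / pos);
    }
    return std::make_pair(fpr, tpr);
}

//Area under the polyline by the trapezoidal rule, starting from (0, 0).
double rocAuc(const std::vector<double>& fpr, const std::vector<double>& tpr) {
    double area = 0, prevX = 0, prevY = 0;
    for(std::size_t i = 0; i < fpr.size(); i++) {
        area += (fpr[i] - prevX) * (tpr[i] + prevY) / 2;
        prevX = fpr[i];
        prevY = tpr[i];
    }
    return area;
}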
//ag_merge [-n _start_N_value_] [-a _start_alpha_value_] -d _directory1_ _directory2_
//    [_directory3_] [_directory4_] ...
int main(int argc, char* argv[])
{
try{
    //0. Set log file
    LogStream clog;
    LogStream::init(true);
    clog << "\n-----\nag_merge ";
    for(int argNo = 1; argNo < argc; argNo++)
        clog << argv[argNo] << " ";
    clog << "\n\n";

    //1. Set input parameters from command line
    int startTiGN = 1;
    double startAlpha = 0.5;
    int firstDirNo = 0;
    stringv args(argc);
    for(int argNo = 0; argNo < argc; argNo++)
        args[argNo] = string(argv[argNo]);

    //parse and save input parameters
    for(int argNo = 1; argNo < argc; argNo += 2)
    {
        if(!args[argNo].compare("-n"))
            startTiGN = atoiExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-a"))
            startAlpha = atofExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-d"))
        {
            firstDirNo = argNo + 1;
            break;
        }
        else
            throw INPUT_ERR;
    }

    //check that there are at least two directories
    if(argc < (firstDirNo + 2))
        throw INPUT_ERR;

    //convert names of input directories to strings and check that they exist
    int folderN = argc - firstDirNo;
    stringv folders(folderN);
    for(int argNo = firstDirNo; argNo < argc; argNo++)
    {
        folders[argNo - firstDirNo] = string(argv[argNo]);
        struct stat status;
        if((stat(argv[argNo], &status) != 0) || !(status.st_mode & S_IFDIR))
            throw DIR_ERR;
    }

    //1.a) delete all temp files from the previous run and create a directory AGTemp
#ifdef _WIN32 //in windows
    CreateDirectory("AGTemp", NULL);
#else //in linux
    system("rm -rf ./AGTemp/");
    system("mkdir ./AGTemp/");
#endif

    //2. Set parameters from AGTemp/params.txt from the first directory
    TrainInfo ti;    //set of model parameters in the current directory
    double prevBest; //best value of performance achieved on the previous run

    fstream fparam;
    string paramPathName = folders[0] + "/AGTemp/params.txt";
    fparam.open(paramPathName.c_str(), ios_base::in);
    string modeStr, metric;
    fparam >> ti.seed >> ti.trainFName >> ti.validFName >> ti.attrFName >> ti.minAlpha
        >> ti.maxTiGN >> ti.bagN >> modeStr >> metric;

    //modeStr should be "fast" or "slow" or "layered"
    if(modeStr.compare("fast") == 0)
        ti.mode = FAST;
    else if(modeStr.compare("slow") == 0)
        ti.mode = SLOW;
    else if(modeStr.compare("layered") == 0)
        ti.mode = LAYERED;
    else
        throw TEMP_ERR;

    //metric should be "roc" or "rms"
    if(metric.compare("rms") == 0)
        ti.rms = true;
    else if(metric.compare("roc") == 0)
        ti.rms = false;
    else
        throw TEMP_ERR;

    if(fparam.fail())
        throw TEMP_ERR;
    fparam.close();
    fparam.clear();

    //read best value of performance on previous run
    fstream fbest;
    double stub;
    int itemN; //number of data points in the train set, needed to calculate possible values of alpha
    string fbestPathName = folders[0] + "/AGTemp/best.txt";
    fbest.open(fbestPathName.c_str(), ios_base::in);
    fbest >> prevBest >> stub >> stub >> stub >> itemN;
    if(fbest.fail())
        throw TEMP_ERR;
    fbest.close();

    int alphaN = getAlphaN(ti.minAlpha, itemN); //number of different alpha values
    int tigNN = getTiGNN(ti.maxTiGN);           //number of different TiGN values

    //direction of initialization (1 - up, 0 - right), used in fast mode only
    doublevv dir(tigNN, doublev(alphaN, 0));
    //outer array: column (by TiGN)
    //middle array: row (by alpha)

    //direction of initialization (1 - up, 0 - right), collects average in the slow mode
    doublevv dirStat(tigNN, doublev(alphaN, 0));

    if(ti.mode == FAST)
    {//read part of the directions table from file
        fstream fdir;
        string fdirPathName = folders[0] + "/AGTemp/dir.txt";
        fdir.open(fdirPathName.c_str(), ios_base::in);
        for(int tigNNo = 0; tigNNo < tigNN; tigNNo++)
            for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)
                fdir >> dir[tigNNo][alphaNo];
        if(fdir.fail())
            throw TEMP_ERR;
        fdir.close();
    }

    //3. Read main parameters from all other directories and check that they match
    int allBagN = ti.bagN;
    intv bagNs(folderN, 0);
    bagNs[0] = ti.bagN;
    intv prevBagNs(folderN + 1, 0); //sums of bagNs of all previous directories
    prevBagNs[1] = ti.bagN;
    int lastSeed = ti.seed;
    for(int folderNo = 1; folderNo < folderN; folderNo++)
    {
        TrainInfo extraTI; //set of model parameters in the additional directory
        string fparamPathName = folders[folderNo] + "/AGTemp/params.txt";
        fparam.open(fparamPathName.c_str(), ios_base::in);
        fparam >> extraTI.seed >> extraTI.trainFName >> extraTI.validFName >> extraTI.attrFName
            >> extraTI.minAlpha >> extraTI.maxTiGN >> extraTI.bagN;
        if(fparam.fail())
        {
            clog << fparamPathName << '\n';
            throw TEMP_ERR;
        }
        fparam.close();
        if((ti.minAlpha != extraTI.minAlpha) || (ti.maxTiGN != extraTI.maxTiGN))
        {
            clog << fparamPathName << '\n';
            throw MERGE_MISMATCH_ERR;
        }
        if(extraTI.seed == ti.seed)
            throw SAME_SEED_ERR;
        if(folderNo == (folderN - 1))
            lastSeed = extraTI.seed;

        allBagN += extraTI.bagN;
        bagNs[folderNo] = extraTI.bagN;
        prevBagNs[folderNo + 1] = allBagN;

        //read direction statistics from this directory
        //(the original opened "./AGTemp/dirstat.txt" here, leaving fdirStatPathName unused)
        string fdirStatPathName = folders[folderNo] + "/AGTemp/dirstat.txt";
        fstream fdirStat;
        fdirStat.open(fdirStatPathName.c_str(), ios_base::in);
        for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)
            for(int tigNNo = 0; tigNNo < tigNN; tigNNo++)
            {
                double ds;
                fdirStat >> ds;
                dirStat[tigNNo][alphaNo] += ds * extraTI.bagN;
            }
    }

    //4. Load data
    INDdata data("", ti.validFName.c_str(), "", ti.attrFName.c_str());
    CGrove::setData(data);
    CTreeNode::setData(data);

    doublev validTar;
    int validN = data.getTargets(validTar, VALID);

    clog << "Alpha = " << ti.minAlpha << "\nN = " << ti.maxTiGN << "\n"
        << allBagN << " bagging iterations\n";
    if(ti.mode == FAST)
        clog << "fast mode\n\n";
    else if(ti.mode == SLOW)
        clog << "slow mode\n\n";
    else //if(ti.mode == LAYERED)
        clog << "layered mode\n\n";

    //5. Initialize some internal process variables
    //surfaces of performance values for validation set.
    //Always calculate rms (for convergence analysis); if needed, calculate roc
    doublevvv rmsV(tigNN, doublevv(alphaN, doublev(allBagN, 0)));
    doublevvv rocV;
    if(!ti.rms)
        rocV.resize(tigNN, doublevv(alphaN, doublev(allBagN, 0)));
    //outer array: column (by TiGN)
    //middle array: row (by alpha)
    //inner array: bagging iterations. Performance is kept for all iterations to create bagging curves

    //sums of predictions for each data point (raw material to calculate performance)
    doublevvv predsumsV(tigNN, doublevv(alphaN, doublev(validN, 0)));
    //outer array: column (by TiGN)
    //middle array: row (by alpha)
    //inner array: data points in the validation set

    //6. Read and merge models from the directories
    int startAlphaNo = getAlphaN(startAlpha, itemN) - 1;
    int startTiGNNo = getTiGNN(startTiGN) - 1;

    for(int alphaNo = startAlphaNo; alphaNo < alphaN; alphaNo++)
    {
        double alpha;
        if(alphaNo < alphaN - 1)
            alpha = alphaVal(alphaNo);
        else //this is a special case because minAlpha can be zero
            alpha = ti.minAlpha;

        cout << "Merging models with alpha = " << alpha << endl;

        for(int tigNNo = startTiGNNo; tigNNo < tigNN; tigNNo++)
        {
            int tigN = tigVal(tigNNo); //number of trees in the current grove

            //temp file in the extra directory that keeps models corresponding to alpha and tigN
            string prefix = string("/AGTemp/ag.a.") + alphaToStr(alpha) + ".n." + itoa(tigN, 10);
            string tempFName = prefix + ".tmp";

            //this will kill the pre-existing file in the output directory
            fstream fsave((string(".") + tempFName).c_str(), ios_base::binary | ios_base::out);

            for(int folderNo = 0; folderNo < folderN; folderNo++)
            {
                string inTempFName = folders[folderNo] + tempFName;
                fstream ftemp(inTempFName.c_str(), ios_base::binary | ios_base::in);
                if(ftemp.fail())
                {
                    clog << inTempFName << '\n';
                    throw TEMP_ERR;
                }

                //merge all extra models with the same (alpha, tigN) parameter values into existing models
                for(int bagNo = prevBagNs[folderNo]; bagNo < prevBagNs[folderNo + 1]; bagNo++)
                {
                    //retrieve next grove
                    CGrove extraGrove(alpha, tigN);
                    try{
                        extraGrove.load(ftemp);
                    }catch(TE_ERROR err){
                        clog << inTempFName << '\n';
                        throw err;
                    }
                    //add the loaded grove to a model file with alpha and tigN values in the name
                    extraGrove.save((string(".") + tempFName).c_str());

                    //generate predictions and performance for validation set
                    doublev predictions(validN);
                    for(int itemNo = 0; itemNo < validN; itemNo++)
                    {
                        predsumsV[tigNNo][alphaNo][itemNo] += extraGrove.predict(itemNo, VALID);
                        predictions[itemNo] = predsumsV[tigNNo][alphaNo][itemNo] / (bagNo + 1);
                    }
                    if(bagNo == allBagN - 1)
                    {
                        string predsFName = prefix + ".preds.txt";
                        fstream fpreds((string(".") + predsFName).c_str(), ios_base::out);
                        for(int itemNo = 0; itemNo < validN; itemNo++)
                            fpreds << predictions[itemNo] << endl;
                        fpreds.close();
                    }
                    rmsV[tigNNo][alphaNo][bagNo] = rmse(predictions, validTar);
                    if(!ti.rms)
                        rocV[tigNNo][alphaNo][bagNo] = roc(predictions, validTar);
                }//end for(int bagNo = prevBagNs[folderNo]; bagNo < prevBagNs[folderNo + 1]; bagNo++)
                ftemp.close();
            }//end for(int folderNo = 0; folderNo < folderN; folderNo++)
        }//end for(int tigNNo = startTiGNNo; tigNNo < tigNN; tigNNo++)
    }//end for(int alphaNo = startAlphaNo; alphaNo < alphaN; alphaNo++)

    //7. Output
    ti.bagN = allBagN;
    ti.seed = lastSeed;
    if(ti.rms)
        trainOut(ti, dir, rmsV, rmsV, predsumsV, itemN, dirStat, startAlphaNo, startTiGNNo);
    else
        trainOut(ti, dir, rmsV, rocV, predsumsV, itemN, dirStat, startAlphaNo, startTiGNNo);

}catch(TE_ERROR err){
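Why the merge above only needs running sums: a bagged ensemble's prediction is the plain average of its members, so models from different directories can be folded in one at a time while the per-item sums carry all the state. A minimal sketch of that single step follows; mergeStep is a hypothetical name, not part of the TreeExtra sources.

//--- Editor's sketch (assumption, not TreeExtra API) ---
#include <vector>

//Fold one more model's predictions into the running ensemble and return
//the current ensemble prediction (the average over all models seen so far).
std::vector<double> mergeStep(std::vector<double>& ioPredSums,
                              const std::vector<double>& iModelPreds,
                              int iBagNo) //0-based index of this model
{
    std::vector<double> lEnsemble(ioPredSums.size());
    for(std::size_t i = 0; i < ioPredSums.size(); i++)
    {
        ioPredSums[i] += iModelPreds[i];             //accumulate
        lEnsemble[i] = ioPredSums[i] / (iBagNo + 1); //average so far
    }
    return lEnsemble;
}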
int main(int argc, char* argv[])
{
try{
    //1. Analyze input parameters
    //convert input parameters to string from char*
    stringv args(argc);
    for(int argNo = 0; argNo < argc; argNo++)
        args[argNo] = string(argv[argNo]);

    //check that the number of arguments is even (flag + value pairs)
    if(argc % 2 == 0)
        throw INPUT_ERR;

#ifndef _WIN32
    int threadN = 6; //number of threads
#endif
    TrainInfo ti;    //model training parameters
    int topAttrN = 0; //how many top attributes to output and keep in the cut data
                      //(0 = do not do feature selection)
                      //(-1 = output all available features)

    //parse and save input parameters
    //indicators of presence of required flags in the input
    bool hasTrain = false;
    bool hasVal = false;
    bool hasAttr = false;
    int treeN = 100;
    double shrinkage = 0.01;
    double subsample = -1;

    for(int argNo = 1; argNo < argc; argNo += 2)
    {
        if(!args[argNo].compare("-t"))
        {
            ti.trainFName = args[argNo + 1];
            hasTrain = true;
        }
        else if(!args[argNo].compare("-v"))
        {
            ti.validFName = args[argNo + 1];
            hasVal = true;
        }
        else if(!args[argNo].compare("-r"))
        {
            ti.attrFName = args[argNo + 1];
            hasAttr = true;
        }
        else if(!args[argNo].compare("-a"))
            ti.alpha = atofExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-n"))
            treeN = atoiExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-i"))
            ti.seed = atoiExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-k"))
            topAttrN = atoiExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-sh"))
            shrinkage = atofExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-sub"))
            subsample = atofExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-c"))
        {
            if(!args[argNo + 1].compare("roc"))
                ti.rms = false;
            else if(!args[argNo + 1].compare("rms"))
                ti.rms = true;
            else
                throw INPUT_ERR;
        }
        else if(!args[argNo].compare("-h"))
#ifndef _WIN32
            threadN = atoiExt(argv[argNo + 1]);
#else
            throw WIN_ERR;
#endif
        else
            throw INPUT_ERR;
    }//end for(int argNo = 1; argNo < argc; argNo += 2)

    //check that all required flags were present
    if(!(hasTrain && hasVal && hasAttr))
        throw INPUT_ERR;
    if((ti.alpha < 0) || (ti.alpha > 1))
        throw ALPHA_ERR;

    //1.a) Set log file
    LogStream clog;
    LogStream::init(true);
    clog << "\n-----\ngbt_train ";
    for(int argNo = 1; argNo < argc; argNo++)
        clog << argv[argNo] << " ";
    clog << "\n\n";

    //1.b) Initialize random number generator.
    srand(ti.seed);

    //2. Load data
    INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(),
        ti.attrFName.c_str());
    CTree::setData(data);
    CTreeNode::setData(data);

    //2.a) Start thread pool
#ifndef _WIN32
    TThreadPool pool(threadN);
    CTree::setPool(pool);
#endif

    //------------------
    int attrN = data.getAttrN();
    if(topAttrN == -1)
        topAttrN = attrN;
    idpairv attrCounts; //counts of attribute importance
    bool doFS = (topAttrN != 0); //whether feature selection is requested
    if(doFS)
    {//initialize attrCounts
        attrCounts.resize(attrN);
        for(int attrNo = 0; attrNo < attrN; attrNo++)
        {
            attrCounts[attrNo].first = attrNo; //number of attribute
            attrCounts[attrNo].second = 0;     //counts
        }
    }

    fstream frmscurve("boosting_rms.txt", ios_base::out); //boosting curve (rms)
    frmscurve.close();
    fstream froccurve;
    if(!ti.rms)
    {
        froccurve.open("boosting_roc.txt", ios_base::out); //boosting curve (roc)
        froccurve.close();
    }

    doublev validTar;
    int validN = data.getTargets(validTar, VALID);
    doublev trainTar;
    int trainN = data.getTargets(trainTar, TRAIN);

    int sampleN;
    if(subsample == -1)
        sampleN = trainN;
    else
        sampleN = (int) (trainN * subsample);

    doublev validPreds(validN, 0);
    doublev trainPreds(trainN, 0);

    for(int treeNo = 0; treeNo < treeN; treeNo++)
    {
        if(treeNo % 10 == 0)
            cout << "\titeration " << treeNo + 1 << " out of " << treeN << endl;
        if(subsample == -1)
            data.newBag();
        else
            data.newSample(sampleN);

        CTree tree(ti.alpha);
        tree.setRoot();
        tree.resetRoot(trainPreds);
        tree.grow(doFS, attrCounts);

        //update predictions
        for(int itemNo = 0; itemNo < trainN; itemNo++)
            trainPreds[itemNo] += shrinkage * tree.predict(itemNo, TRAIN);
        for(int itemNo = 0; itemNo < validN; itemNo++)
            validPreds[itemNo] += shrinkage * tree.predict(itemNo, VALID);

        //output
        frmscurve.open("boosting_rms.txt", ios_base::out | ios_base::app);
        frmscurve << rmse(validPreds, validTar) << endl;
        frmscurve.close();
        if(!ti.rms)
        {
            froccurve.open("boosting_roc.txt", ios_base::out | ios_base::app);
            froccurve << roc(validPreds, validTar) << endl;
            froccurve.close();
        }
    }

    //output feature selection results
    if(doFS)
    {
        sort(attrCounts.begin(), attrCounts.end(), idGreater);
        if(topAttrN > attrN)
            topAttrN = attrN;
        fstream ffeatures("feature_scores.txt", ios_base::out);
        ffeatures << "Top " << topAttrN << " features\n";
        for(int attrNo = 0; attrNo < topAttrN; attrNo++)
            ffeatures << data.getAttrName(attrCounts[attrNo].first) << "\t"
                << attrCounts[attrNo].second / ti.bagN / trainN << "\n";
        ffeatures << "\n\nColumn numbers (beginning with 1)\n";
        for(int attrNo = 0; attrNo < topAttrN; attrNo++)
            ffeatures << data.getColNo(attrCounts[attrNo].first) + 1 << " ";
        ffeatures << "\nLabel column number: " << data.getTarColNo() + 1;
        ffeatures.close();

        //output new attribute file
        for(int attrNo = topAttrN; attrNo < attrN; attrNo++)
            data.ignoreAttr(attrCounts[attrNo].first);
        data.outAttr(ti.attrFName);
    }

    //output predictions
    fstream fpreds;
    fpreds.open("preds.txt", ios_base::out);
    for(int itemNo = 0; itemNo < validN; itemNo++)
        fpreds << validPreds[itemNo] << endl;
    fpreds.close();
    //------------------

}catch(TE_ERROR err){
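The training loop above is standard gradient boosting on squared loss: each new tree is grown against the current residuals (which is what passing trainPreds into resetRoot suggests) and added with weight shrinkage, F <- F + shrinkage * h. Here is a minimal sketch of that same update under a loud assumption: the base learner is reduced to a constant fitted to the mean residual, standing in for the real regression tree.

//--- Editor's sketch (assumption; CTree is the real base learner) ---
#include <vector>

//The "tree" for one round: a constant fit to the current residuals.
double meanResidual(const std::vector<double>& iTar, const std::vector<double>& iPreds)
{
    double lSum = 0;
    for(std::size_t i = 0; i < iTar.size(); i++)
        lSum += iTar[i] - iPreds[i]; //residual the next learner must fit
    return lSum / iTar.size();
}

std::vector<double> boostSketch(const std::vector<double>& iTar, int iTreeN, double iShrinkage)
{
    std::vector<double> lPreds(iTar.size(), 0);
    for(int lTreeNo = 0; lTreeNo < iTreeN; lTreeNo++)
    {
        double lH = meanResidual(iTar, lPreds); //fit base learner to residuals
        for(std::size_t i = 0; i < lPreds.size(); i++)
            lPreds[i] += iShrinkage * lH;       //the same shrinkage update as above
    }
    return lPreds;
}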
int main(int argc, char* argv[])
{
try{
    //1. Analyze input parameters
    //convert input parameters to string from char*
    stringv args(argc);
    for(int argNo = 0; argNo < argc; argNo++)
        args[argNo] = string(argv[argNo]);

    //check that the number of arguments is even (flag + value pairs)
    if(argc % 2 == 0)
        throw INPUT_ERR;

#ifndef _WIN32
    int threadN = 6; //number of threads
#endif
    TrainInfo ti;    //model training parameters
    string modelFName = "model.bin"; //name of the output file for the model
    int topAttrN = 0; //how many top attributes to output and keep in the cut data
                      //(0 = do not do feature selection)
                      //(-1 = output all available features)
    bool doOut = true; //whether to output log information to stdout

    //parse and save input parameters
    //indicators of presence of required flags in the input
    bool hasTrain = false;
    bool hasVal = false;
    bool hasAttr = false;

    for(int argNo = 1; argNo < argc; argNo += 2)
    {
        if(!args[argNo].compare("-t"))
        {
            ti.trainFName = args[argNo + 1];
            hasTrain = true;
        }
        else if(!args[argNo].compare("-v"))
        {
            ti.validFName = args[argNo + 1];
            hasVal = true;
        }
        else if(!args[argNo].compare("-r"))
        {
            ti.attrFName = args[argNo + 1];
            hasAttr = true;
        }
        else if(!args[argNo].compare("-a"))
            ti.alpha = atofExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-b"))
            ti.bagN = atoiExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-i"))
            ti.seed = atoiExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-k"))
            topAttrN = atoiExt(argv[argNo + 1]);
        else if(!args[argNo].compare("-m"))
        {
            modelFName = args[argNo + 1];
            if(modelFName.empty())
                throw EMPTY_MODEL_NAME_ERR;
        }
        else if(!args[argNo].compare("-l"))
        {
            if(!args[argNo + 1].compare("log"))
                doOut = true;
            else if(!args[argNo + 1].compare("nolog"))
                doOut = false;
            else
                throw INPUT_ERR;
        }
        else if(!args[argNo].compare("-c"))
        {
            if(!args[argNo + 1].compare("roc"))
                ti.rms = false;
            else if(!args[argNo + 1].compare("rms"))
                ti.rms = true;
            else
                throw INPUT_ERR;
        }
        else if(!args[argNo].compare("-h"))
#ifndef _WIN32
            threadN = atoiExt(argv[argNo + 1]);
#else
            throw WIN_ERR;
#endif
        else
            throw INPUT_ERR;
    }//end for(int argNo = 1; argNo < argc; argNo += 2)

    //check that all required flags were present
    if(!(hasTrain && hasVal && hasAttr))
        throw INPUT_ERR;
    if((ti.alpha < 0) || (ti.alpha > 1))
        throw ALPHA_ERR;

    //1.a) Set log file
    LogStream clog;
    LogStream::init(doOut);
    clog << "\n-----\nbt_train ";
    for(int argNo = 1; argNo < argc; argNo++)
        clog << argv[argNo] << " ";
    clog << "\n\n";

    //1.b) Initialize random number generator.
    srand(ti.seed);

    //2. Load data
    INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(),
        ti.attrFName.c_str());
    CTree::setData(data);
    CTreeNode::setData(data);

    //2.a) Start thread pool
#ifndef _WIN32
    TThreadPool pool(threadN);
    CTree::setPool(pool);
#endif

    //3. Train models
    doublev validTar;
    int validN = data.getTargets(validTar, VALID);
    int itemN = data.getTrainN();

    //adjust alpha, if needed
    double newAlpha = adjustAlpha(ti.alpha, itemN);
    if(ti.alpha != newAlpha)
    {
        if(newAlpha == 0)
            clog << "Warning: due to the small train set size the value of alpha was changed to 0";
        else
            clog << "Warning: alpha value was rounded to the closest valid value " << newAlpha;
        clog << ".\n\n";
        ti.alpha = newAlpha;
    }
    clog << "Alpha = " << ti.alpha << "\n" << ti.bagN << " bagging iterations\n";

    doublev rmsV(ti.bagN, 0); //bagging curve of rms values for validation set
    doublev rocV;
    if(!ti.rms)
        rocV.resize(ti.bagN, 0); //bagging curve of roc values for validation set
    doublev predsumsV(validN, 0); //sums of predictions for each data point

    int attrN = data.getAttrN();
    if(topAttrN == -1)
        topAttrN = attrN;
    idpairv attrCounts; //counts of attribute importance
    bool doFS = (topAttrN != 0); //whether feature selection is requested
    if(doFS)
    {//initialize attrCounts
        attrCounts.resize(attrN);
        for(int attrNo = 0; attrNo < attrN; attrNo++)
        {
            attrCounts[attrNo].first = attrNo; //number of attribute
            attrCounts[attrNo].second = 0;     //counts
        }
    }

    fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::out);
    //header for compatibility with Additive Groves model
    AG_TRAIN_MODE modeStub = SLOW;
    fmodel.write((char*) &modeStub, sizeof(enum AG_TRAIN_MODE));
    int tigNStub = 1;
    fmodel.write((char*) &tigNStub, sizeof(int));
    fmodel.write((char*) &ti.alpha, sizeof(double));
    fmodel.close();

    fstream fbagrms("bagging_rms.txt", ios_base::out); //bagging curve (rms)
    fbagrms.close();
    fstream fbagroc;
    if(!ti.rms)
    {
        fbagroc.open("bagging_roc.txt", ios_base::out); //bagging curve (roc)
        fbagroc.close();
    }

    //make bags, build trees, collect predictions
    for(int bagNo = 0; bagNo < ti.bagN; bagNo++)
    {
        if(doOut)
            cout << "Iteration " << bagNo + 1 << " out of " << ti.bagN << endl;
        data.newBag();

        CTree tree(ti.alpha);
        tree.setRoot();
        tree.grow(doFS, attrCounts);
        tree.save(modelFName.c_str());

        //generate predictions for validation set
        doublev predictions(validN);
        for(int itemNo = 0; itemNo < validN; itemNo++)
        {
            predsumsV[itemNo] += tree.predict(itemNo, VALID);
            predictions[itemNo] = predsumsV[itemNo] / (bagNo + 1);
        }
        rmsV[bagNo] = rmse(predictions, validTar);
        if(!ti.rms)
            rocV[bagNo] = roc(predictions, validTar);

        //output an element of the bagging curve
        fbagrms.open("bagging_rms.txt", ios_base::out | ios_base::app);
        fbagrms << rmsV[bagNo] << endl;
        fbagrms.close();

        //same for roc, if needed
        if(!ti.rms)
        {
            fbagroc.open("bagging_roc.txt", ios_base::out | ios_base::app);
            fbagroc << rocV[bagNo] << endl;
            fbagroc.close();
        }
    }

    if(doFS) //sort attributes by counts
        sort(attrCounts.begin(), attrCounts.end(), idGreater);

    //4. Output
    //output results and recommendations
    if(ti.rms)
        clog << "RMSE on validation set = " << rmsV[ti.bagN - 1] << "\n";
    else
        clog << "ROC on validation set = " << rocV[ti.bagN - 1] << "\n";

    //analyze whether more bagging should be recommended based on the curve in the best point
    if(moreBag(rmsV))
    {
        int recBagN = ti.bagN + 100;
        clog << "\nRecommendation: a greater number of bagging iterations might produce a better model.\n"
            << "Suggested action: bt_train -b " << recBagN << "\n";
    }
    else
        clog << "\nThe bagging curve shows good convergence.\n";
    clog << "\n";

    //standard output in case of turned-off log output: final performance on validation set only
    if(!doOut)
    {
        if(ti.rms)
            cout << rmsV[ti.bagN - 1] << endl;
        else
            cout << rocV[ti.bagN - 1] << endl;
    }

    //output feature selection results
    if(doFS)
    {
        if(topAttrN > attrN)
            topAttrN = attrN;
        fstream ffeatures("feature_scores.txt", ios_base::out);
        ffeatures << "Top " << topAttrN << " features\n";
        for(int attrNo = 0; attrNo < topAttrN; attrNo++)
            ffeatures << data.getAttrName(attrCounts[attrNo].first) << "\t"
                << attrCounts[attrNo].second / ti.bagN / itemN << "\n";
        ffeatures << "\n\nColumn numbers (beginning with 1)\n";
        for(int attrNo = 0; attrNo < topAttrN; attrNo++)
            ffeatures << data.getColNo(attrCounts[attrNo].first) + 1 << " ";
        ffeatures << "\nLabel column number: " << data.getTarColNo() + 1;
        ffeatures.close();

        //output new attribute file
        for(int attrNo = topAttrN; attrNo < attrN; attrNo++)
            data.ignoreAttr(attrCounts[attrNo].first);
        data.outAttr(ti.attrFName);
    }

}catch(TE_ERROR err){