int main(int argc, char* argv[]) { try{ //1. Analyze input parameters //convert input parameters to string from char* stringv args(argc); for(int argNo = 0; argNo < argc; argNo++) args[argNo] = string(argv[argNo]); //check that the number of arguments is even (flags + value pairs) if(argc % 2 == 0) throw INPUT_ERR; #ifndef _WIN32 int threadN = 6; //number of threads #endif TrainInfo ti; //model training parameters string modelFName = "model.bin"; //name of the output file for the model int topAttrN = 0; //how many top attributes to output and keep in the cut data //(0 = do not do feature selection) //(-1 = output all available features) bool doOut = true; //whether to output log information to stdout //parse and save input parameters //indicators of presence of required flags in the input bool hasTrain = false; bool hasVal = false; bool hasAttr = false; for(int argNo = 1; argNo < argc; argNo += 2) { if(!args[argNo].compare("-t")) { ti.trainFName = args[argNo + 1]; hasTrain = true; } else if(!args[argNo].compare("-v")) { ti.validFName = args[argNo + 1]; hasVal = true; } else if(!args[argNo].compare("-r")) { ti.attrFName = args[argNo + 1]; hasAttr = true; } else if(!args[argNo].compare("-a")) ti.alpha = atofExt(argv[argNo + 1]); else if(!args[argNo].compare("-b")) ti.bagN = atoiExt(argv[argNo + 1]); else if(!args[argNo].compare("-i")) ti.seed = atoiExt(argv[argNo + 1]); else if(!args[argNo].compare("-k")) topAttrN = atoiExt(argv[argNo + 1]); else if(!args[argNo].compare("-m")) { modelFName = args[argNo + 1]; if(modelFName.empty()) throw EMPTY_MODEL_NAME_ERR; } else if(!args[argNo].compare("-l")) { if(!args[argNo + 1].compare("log")) doOut = true; else if(!args[argNo + 1].compare("nolog")) doOut = false; else throw INPUT_ERR; } else if(!args[argNo].compare("-c")) { if(!args[argNo + 1].compare("roc")) ti.rms = false; else if(!args[argNo + 1].compare("rms")) ti.rms = true; else throw INPUT_ERR; } else if(!args[argNo].compare("-h")) #ifndef _WIN32 threadN = atoiExt(argv[argNo + 1]); #else throw WIN_ERR; #endif else throw INPUT_ERR; }//end for(int argNo = 1; argNo < argc; argNo += 2) //parse and save input parameters if(!(hasTrain && hasVal && hasAttr)) throw INPUT_ERR; if((ti.alpha < 0) || (ti.alpha > 1)) throw ALPHA_ERR; //1.a) Set log file LogStream clog; LogStream::init(doOut); clog << "\n-----\nbt_train "; for(int argNo = 1; argNo < argc; argNo++) clog << argv[argNo] << " "; clog << "\n\n"; //1.b) Initialize random number generator. srand(ti.seed); //2. Load data INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), ti.attrFName.c_str()); CTree::setData(data); CTreeNode::setData(data); //2.a) Start thread pool #ifndef _WIN32 TThreadPool pool(threadN); CTree::setPool(pool); #endif //3. Train models doublev validTar; int validN = data.getTargets(validTar, VALID); int itemN = data.getTrainN(); //adjust minAlpha, if needed double newAlpha = adjustAlpha(ti.alpha, itemN); if(ti.alpha != newAlpha) { if(newAlpha == 0) clog << "Warning: due to small train set size value of alpha was changed to 0"; else clog << "Warning: alpha value was rounded to the closest valid value " << newAlpha; clog << ".\n\n"; ti.alpha = newAlpha; } clog << "Alpha = " << ti.alpha << "\n" << ti.bagN << " bagging iterations\n"; doublev rmsV(ti.bagN, 0); //bagging curve of rms values for validation set doublev rocV; if(!ti.rms) rocV.resize(ti.bagN, 0); //bagging curve of roc values for validation set doublev predsumsV(validN, 0); //sums of predictions for each data point int attrN = data.getAttrN(); if(topAttrN == -1) topAttrN = attrN; idpairv attrCounts; //counts of attribute importance bool doFS = (topAttrN != 0); //whether feature selection is requested if(doFS) {//initialize attrCounts attrCounts.resize(attrN); for(int attrNo = 0; attrNo < attrN; attrNo++) { attrCounts[attrNo].first = attrNo; //number of attribute attrCounts[attrNo].second = 0; //counts } } fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::out); //header for compatibility with Additive Groves model AG_TRAIN_MODE modeStub = SLOW; fmodel.write((char*) &modeStub, sizeof(enum AG_TRAIN_MODE)); int tigNStub = 1; fmodel.write((char*) &tigNStub, sizeof(int)); fmodel.write((char*) &ti.alpha, sizeof(double)); fmodel.close(); fstream fbagrms("bagging_rms.txt", ios_base::out); //bagging curve (rms) fbagrms.close(); fstream fbagroc; if(!ti.rms) { fbagroc.open("bagging_roc.txt", ios_base::out); //bagging curve (roc) fbagroc.close(); } //make bags, build trees, collect predictions for(int bagNo = 0; bagNo < ti.bagN; bagNo++) { if(doOut) cout << "Iteration " << bagNo + 1 << " out of " << ti.bagN << endl; data.newBag(); CTree tree(ti.alpha); tree.setRoot(); tree.grow(doFS, attrCounts); tree.save(modelFName.c_str()); //generate predictions for validation set doublev predictions(validN); for(int itemNo = 0; itemNo < validN; itemNo++) { predsumsV[itemNo] += tree.predict(itemNo, VALID); predictions[itemNo] = predsumsV[itemNo] / (bagNo + 1); } rmsV[bagNo] = rmse(predictions, validTar); if(!ti.rms) rocV[bagNo] = roc(predictions, validTar); //output an element of bagging curve fbagrms.open("bagging_rms.txt", ios_base::out | ios_base::app); fbagrms << rmsV[bagNo] << endl; fbagrms.close(); //same for roc, if needed if(!ti.rms) { fbagroc.open("bagging_roc.txt", ios_base::out | ios_base::app); fbagroc << rocV[bagNo] << endl; fbagroc.close(); } } if(doFS) //sort attributes by counts sort(attrCounts.begin(), attrCounts.end(), idGreater); //4. Output //output results and recommendations if(ti.rms) clog << "RMSE on validation set = " << rmsV[ti.bagN - 1] << "\n"; else clog << "ROC on validation set = " << rocV[ti.bagN - 1] << "\n"; //analyze whether more bagging should be recommended based on the curve in the best point if(moreBag(rmsV)) { int recBagN = ti.bagN + 100; clog << "\nRecommendation: a greater number of bagging iterations might produce a better model.\n" << "Suggested action: bt_train -b " << recBagN << "\n"; } else clog << "\nThe bagging curve shows good convergence. \n"; clog << "\n"; //standard output in case of turned off log output: final performance on validation set only if(!doOut) if(ti.rms) cout << rmsV[ti.bagN - 1] << endl; else cout << rocV[ti.bagN - 1] << endl; //output feature selection results if(doFS) { if(topAttrN > attrN) topAttrN = attrN; fstream ffeatures("feature_scores.txt", ios_base::out); ffeatures << "Top " << topAttrN << " features\n"; for(int attrNo = 0; attrNo < topAttrN; attrNo++) ffeatures << data.getAttrName(attrCounts[attrNo].first) << "\t" << attrCounts[attrNo].second / ti.bagN / itemN << "\n"; ffeatures << "\n\nColumn numbers (beginning with 1)\n"; for(int attrNo = 0; attrNo < topAttrN; attrNo++) ffeatures << data.getColNo(attrCounts[attrNo].first) + 1 << " "; ffeatures << "\nLabel column number: " << data.getTarColNo() + 1; ffeatures.close(); //output new attribute file for(int attrNo = topAttrN; attrNo < attrN; attrNo++) data.ignoreAttr(attrCounts[attrNo].first); data.outAttr(ti.attrFName); } }catch(TE_ERROR err){
//ag_mergepreds [-n _start_N_value_] [-a _start_alpha_value_] -d _directory1_ _directory2_ [_directory3_] //[_directory4_] ... int main(int argc, char* argv[]) { try{ //0. Set log file LogStream clog; LogStream::init(true); clog << "\n-----\nag_mergepreds "; for(int argNo = 1; argNo < argc; argNo++) clog << argv[argNo] << " "; clog << "\n\n"; //1. Set input parameters from command line int startTiGN = 1; double startAlpha = 0.5; int firstDirNo = 0; stringv args(argc); for(int argNo = 0; argNo < argc; argNo++) args[argNo] = string(argv[argNo]); //parse and save input parameters for(int argNo = 1; argNo < argc; argNo += 2) { if(!args[argNo].compare("-n")) startTiGN = atoiExt(argv[argNo + 1]); else if(!args[argNo].compare("-a")) startAlpha = atofExt(argv[argNo + 1]); else if(!args[argNo].compare("-d")) { firstDirNo = argNo + 1; break; } else throw INPUT_ERR; } //check that there are at least two directories if(argc < (firstDirNo + 2)) throw INPUT_ERR; //convert names of input directories to strings and check that they exist int folderN = argc - firstDirNo; stringv folders(folderN); for(int argNo = firstDirNo; argNo < argc; argNo++) { folders[argNo - firstDirNo] = string(argv[argNo]); struct stat status; if((stat(argv[argNo], &status) != 0) || !(status.st_mode & S_IFDIR)) throw DIR_ERR; } //1.a) delete all temp files from the previous run and create a directory AGTemp #ifdef WIN32 //in windows CreateDirectory("AGTemp", NULL); #else // in linux system("rm -rf ./AGTemp/"); system("mkdir ./AGTemp/"); #endif //2. Set parameters from AGTemp/params.txt from the first directory TrainInfo ti; //set of model parameters in the current directory double prevBest; //best value of performance achieved on the previous run fstream fparam; string paramPathName = folders[0] + "/AGTemp/params.txt"; fparam.open(paramPathName.c_str(), ios_base::in); string modeStr, metric; fparam >> ti.seed >> ti.trainFName >> ti.validFName >> ti.attrFName >> ti.minAlpha >> ti.maxTiGN >> ti.bagN >> modeStr >> metric; //modeStr should be "fast" or "slow" or "layered" if(modeStr.compare("fast") == 0) ti.mode = FAST; else if(modeStr.compare("slow") == 0) ti.mode = SLOW; else if(modeStr.compare("layered") == 0) ti.mode = LAYERED; else throw TEMP_ERR; //metric should be "roc" or "rms" if(metric.compare("rms") == 0) ti.rms = true; else if(metric.compare("roc") == 0) ti.rms = false; else throw TEMP_ERR; if(fparam.fail()) throw TEMP_ERR; fparam.close(); fparam.clear(); //read best value of performance on previous run fstream fbest; double stub; double trainV; // number of data points in the train set, need to calculate possible values of alpha string fbestPathName = folders[0] + "/AGTemp/best.txt"; fbest.open(fbestPathName.c_str(), ios_base::in); fbest >> prevBest >> stub >> stub >> stub >> trainV; if(fbest.fail()) throw TEMP_ERR; fbest.close(); int alphaN = getAlphaN(ti.minAlpha, trainV); //number of different alpha values int tigNN = getTiGNN(ti.maxTiGN); //direction of initialization (1 - up, 0 - right), used in fast mode only doublevv dir(tigNN, doublev(alphaN, 0)); //outer array: column (by TiGN) //middle array: row (by alpha) //direction of initialization (1 - up, 0 - right), collects average in the slow mode doublevv dirStat(tigNN, doublev(alphaN, 0)); if(ti.mode == FAST) {//read part of the directions table from file fstream fdir; string fdirPathName = folders[0] + "/AGTemp/dir.txt"; fdir.open(fdirPathName.c_str(), ios_base::in); for(int tigNNo = 0; tigNNo < tigNN; tigNNo++) for(int alphaNo = 0; alphaNo < alphaN; alphaNo++) fdir >> dir[tigNNo][alphaNo]; if(fdir.fail()) throw TEMP_ERR; fdir.close(); } //3. Read main parameters from all other directories and check that they match int allBagN = ti.bagN; int lastSeed = ti.seed; for(int folderNo = 1; folderNo < folderN; folderNo++) { TrainInfo extraTI; //set of model parameters in the additional directory string fparamPathName = folders[folderNo] + "/AGTemp/params.txt"; fparam.open(fparamPathName.c_str(), ios_base::in); fparam >> extraTI.seed >> extraTI.trainFName >> extraTI.validFName >> extraTI.attrFName >> extraTI.minAlpha >> extraTI.maxTiGN >> extraTI.bagN; if(fparam.fail()) { clog << fparamPathName << '\n'; throw TEMP_ERR; } fparam.close(); if((ti.minAlpha != extraTI.minAlpha) || (ti.maxTiGN != extraTI.maxTiGN)) { clog << fparamPathName << '\n'; throw MERGE_MISMATCH_ERR; } if(extraTI.seed == ti.seed) throw SAME_SEED_ERR; if(folderNo == (folderN - 1)) lastSeed = extraTI.seed; allBagN += extraTI.bagN; string fdirStatPathName = folders[folderNo] + "/AGTemp/dirstat.txt"; fstream fdirStat; fdirStat.open("./AGTemp/dirstat.txt", ios_base::in); for(int alphaNo = 0; alphaNo < alphaN; alphaNo++) for(int tigNNo = 0; tigNNo < tigNN; tigNNo++) { double ds; fdirStat >> ds; dirStat[tigNNo][alphaNo] += ds * extraTI.bagN; } } //4. Load data INDdata data("", ti.validFName.c_str(), "", ti.attrFName.c_str()); doublev validTar; int validN = data.getTargets(validTar, VALID); clog << "Alpha = " << ti.minAlpha << "\nN = " << ti.maxTiGN << "\n" << allBagN << " bagging iterations\n"; if(ti.mode == FAST) clog << "fast mode\n\n"; else if(ti.mode == SLOW) clog << "slow mode\n\n"; else //if(ti.mode == LAYERED) clog << "layered mode\n\n"; //5. Initialize some internal process variables //surfaces of performance values for validation set. //Always calculate rms (for convergence analysis), if needed, calculate roc doublevvv rmsV(tigNN, doublevv(alphaN, doublev(folderN, 0))); doublevvv rocV; if(!ti.rms) rocV.resize(tigNN, doublevv(alphaN, doublev(folderN, 0))); //outer array: column (by TiGN) //middle array: row (by alpha) //inner array: bagging iterations. Performance is kept for all iterations to create bagging curves //sums of predictions for each data point (raw material to calculate performance) doublevvv predsumsV(tigNN, doublevv(alphaN, doublev(validN, 0))); //outer array: column (by TiGN) //middle array: row (by alpha) //inner array: data points in the validation set //6. Read and merge models from the directories int startAlphaNo = getAlphaN(startAlpha, trainV) - 1; int startTiGNNo = getTiGNN(startTiGN) - 1; for(int alphaNo = startAlphaNo; alphaNo < alphaN; alphaNo++) { double alpha; if(alphaNo < alphaN - 1) alpha = alphaVal(alphaNo); else //this is a special case because minAlpha can be zero alpha = ti.minAlpha; cout << "Merging predictions with alpha = " << alpha << endl; for(int tigNNo = startTiGNNo; tigNNo < tigNN; tigNNo++) { int tigN = tigVal(tigNNo); //number of trees in the current grove //temp file in the extra directory that keeps models corresponding to alpha and tigN string prefix = string("/AGTemp/ag.a.") + alphaToStr(alpha) + ".n." + itoa(tigN, 10); string predsFName = prefix + ".preds.txt"; for(int folderNo = 0; folderNo < folderN; folderNo++) { string inPredsFName = folders[folderNo] + predsFName; fstream finpreds((inPredsFName).c_str(), ios_base::in); if(finpreds.fail()) { clog << inPredsFName << '\n'; throw TEMP_ERR; } //generate predictions and performance for validation set doublev predictions(validN); for(int itemNo = 0; itemNo < validN; itemNo++) { double sinpred; finpreds >> sinpred; predsumsV[tigNNo][alphaNo][itemNo] += sinpred;//extraGrove.predict(itemNo, VALID); predictions[itemNo] = predsumsV[tigNNo][alphaNo][itemNo] / (folderNo + 1); } if(folderNo == folderN - 1) { fstream fpreds((string(".") + predsFName).c_str(), ios_base::out); for(int itemNo = 0; itemNo < validN; itemNo++) fpreds << predictions[itemNo] << endl; fpreds.close(); } rmsV[tigNNo][alphaNo][folderNo] = rmse(predictions, validTar); if(!ti.rms) rocV[tigNNo][alphaNo][folderNo] = roc(predictions, validTar); finpreds.close(); }//end for(int folderNo = 0; folderNo < folderN; folderNo++) }//end for(int tigNNo = 0; tigNNo < tigNN; tigNNo++) }//end for(int alphaNo = 0; alphaNo < alphaN; alphaNo++) //4. Output ti.bagN = folderN; ti.seed = lastSeed; if(ti.rms) { double validStD = data.getTarStD(VALID); trainOut(ti, dir, rmsV, rmsV, predsumsV, trainV, dirStat, validStD, startAlphaNo, startTiGNNo); } else trainOut(ti, dir, rmsV, rocV, predsumsV, trainV, dirStat, -1.0, startAlphaNo, startTiGNNo); if(folderN != allBagN) clog << "Warning: bagging curve and -b recommendations could not be calculated correctly " << "in this mode. Each visible bagging step corresponds to several real steps.\n"; }catch(TE_ERROR err){