Exemplo n.º 1
0
//ag_merge [-n _start_N_value_] [-a _start_alpha_value_] -d _directory1_ _directory2_ [_directory3_] 
//[_directory4_] ...
int main(int argc, char* argv[])
{	
	try{
//0. Set log file
	LogStream clog;
	LogStream::init(true);
	clog << "\n-----\nag_merge ";
	for(int argNo = 1; argNo < argc; argNo++)
		clog << argv[argNo] << " ";
	clog << "\n\n";

//1. Set input parameters from command line 

	int startTiGN = 1;
	double startAlpha = 0.5;
	int firstDirNo = 0;

	stringv args(argc); 
	for(int argNo = 0; argNo < argc; argNo++)
		args[argNo] = string(argv[argNo]);

	 //parse and save input parameters
	for(int argNo = 1; argNo < argc; argNo += 2)
	{
		if(!args[argNo].compare("-n"))
			startTiGN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-a"))
			startAlpha = atofExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-d"))
		{
			firstDirNo = argNo + 1;
			break;
		}
		else
			throw INPUT_ERR;
	}

	//check that there are at least two directories 
	if(argc < (firstDirNo + 2))
		throw INPUT_ERR;

	//convert names of input directories to strings and check that they exist
	int folderN = argc - firstDirNo;
	stringv folders(folderN); 
	for(int argNo = firstDirNo; argNo < argc; argNo++)
	{
		folders[argNo - firstDirNo] = string(argv[argNo]);
		struct stat status;
		if((stat(argv[argNo], &status) != 0) || !(status.st_mode & S_IFDIR))
			throw DIR_ERR;
	}

//1.a) delete all temp files from the previous run and create a directory AGTemp
#ifdef _WIN32	//in windows
	CreateDirectory("AGTemp", NULL);
#else // in linux
	system("rm -rf ./AGTemp/");
	system("mkdir ./AGTemp/");
#endif

//2. Set parameters from AGTemp/params.txt from the first directory
	TrainInfo ti;			//set of model parameters in the current directory
	double prevBest;		//best value of performance achieved on the previous run
			
	fstream fparam;
	string paramPathName = folders[0] + "/AGTemp/params.txt";
	fparam.open(paramPathName.c_str(), ios_base::in); 
	string modeStr, metric;
	fparam >> ti.seed >> ti.trainFName >> ti.validFName >> ti.attrFName >> ti.minAlpha >> ti.maxTiGN 
		>> ti.bagN >> modeStr >> metric;	

	//modeStr should be "fast" or "slow" or "layered"	
	if(modeStr.compare("fast") == 0)
		ti.mode = FAST;
	else if(modeStr.compare("slow") == 0)
		ti.mode = SLOW;
	else if(modeStr.compare("layered") == 0)
		ti.mode = LAYERED;
	else
		throw TEMP_ERR;

	//metric should be "roc" or "rms"
	if(metric.compare("rms") == 0)
		ti.rms = true;
	else if(metric.compare("roc") == 0)
		ti.rms = false;
	else
		throw TEMP_ERR;

	if(fparam.fail())
		throw TEMP_ERR;
	fparam.close();
	fparam.clear();

	//read best value of performance on previous run
	fstream fbest;
	double stub;
	int itemN; // number of data points in the train set, need to calculate possible values of alpha
	string fbestPathName = folders[0] + "/AGTemp/best.txt";
	fbest.open(fbestPathName.c_str(), ios_base::in); 
	fbest >> prevBest >> stub >> stub >> stub >> itemN;
	if(fbest.fail())
		throw TEMP_ERR;
	fbest.close();

	int alphaN = getAlphaN(ti.minAlpha, itemN); //number of different alpha values
	int tigNN = getTiGNN(ti.maxTiGN);

	//direction of initialization (1 - up, 0 - right), used in fast mode only
	doublevv dir(tigNN, doublev(alphaN, 0)); 
	//outer array: column (by TiGN)
	//middle array: row	(by alpha)
	
	//direction of initialization (1 - up, 0 - right), collects average in the slow mode
	doublevv dirStat(tigNN, doublev(alphaN, 0));

	if(ti.mode == FAST)
	{//read part of the directions table from file
		fstream fdir;
		string fdirPathName = folders[0] + "/AGTemp/dir.txt";
		fdir.open(fdirPathName.c_str(), ios_base::in); 
		for(int tigNNo = 0; tigNNo < tigNN; tigNNo++)
			for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)
				fdir >> dir[tigNNo][alphaNo];
		if(fdir.fail())
			throw TEMP_ERR;
		fdir.close();
	}

//3. Read main parameters from all other directories and check that they match

	int allBagN = ti.bagN;
	intv bagNs(folderN, 0);
	bagNs[0] = ti.bagN;
	intv prevBagNs(folderN + 1, 0); //sums of bagNs of all previous directories
	prevBagNs[1] = ti.bagN;
	int lastSeed = ti.seed;
	for(int folderNo = 1; folderNo < folderN; folderNo++)
	{
		TrainInfo extraTI;	//set of model parameters in the additional directory
		
		string fparamPathName = folders[folderNo] + "/AGTemp/params.txt";
		fparam.open(fparamPathName.c_str(), ios_base::in); 
		fparam >> extraTI.seed >> extraTI.trainFName >> extraTI.validFName >> extraTI.attrFName 
			>> extraTI.minAlpha >> extraTI.maxTiGN >> extraTI.bagN;	

		if(fparam.fail())
		{
			clog << fparamPathName << '\n';
			throw TEMP_ERR;
		}
		fparam.close();

		if((ti.minAlpha != extraTI.minAlpha) || (ti.maxTiGN != extraTI.maxTiGN))
		  {
		    clog << fparamPathName << '\n';
			throw MERGE_MISMATCH_ERR;
		  }
		if(extraTI.seed == ti.seed)
			throw SAME_SEED_ERR;
		if(folderNo == (folderN - 1))
			lastSeed = extraTI.seed;

		allBagN += extraTI.bagN;
		bagNs[folderNo] = extraTI.bagN;
		prevBagNs[folderNo + 1] = allBagN;

		string fdirStatPathName = folders[folderNo] + "/AGTemp/dirstat.txt";
		fstream fdirStat;	
		fdirStat.open("./AGTemp/dirstat.txt", ios_base::in);
		for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)
			for(int tigNNo = 0; tigNNo < tigNN; tigNNo++)
			{
				double ds;
				fdirStat >> ds;
				dirStat[tigNNo][alphaNo] += ds * extraTI.bagN;
			}
	}

//4. Load data
	INDdata data("", ti.validFName.c_str(), "", ti.attrFName.c_str());
	CGrove::setData(data);
	CTreeNode::setData(data);

	doublev validTar;
	int validN = data.getTargets(validTar, VALID);

	clog << "Alpha = " << ti.minAlpha << "\nN = " << ti.maxTiGN << "\n" 
		<< allBagN << " bagging iterations\n";
	if(ti.mode == FAST)
		clog << "fast mode\n\n";
	else if(ti.mode == SLOW)
		clog << "slow mode\n\n";
	else //if(ti.mode == LAYERED)
		clog << "layered mode\n\n";

	//5. Initialize some internal process variables

	//surfaces of performance values for validation set. 
	//Always calculate rms (for convergence analysis), if needed, calculate roc
	doublevvv rmsV(tigNN, doublevv(alphaN, doublev(allBagN, 0))); 
	doublevvv rocV;
	if(!ti.rms)
		rocV.resize(tigNN, doublevv(alphaN, doublev(allBagN, 0))); 
	//outer array: column (by TiGN)
	//middle array: row (by alpha)
	//inner array: bagging iterations. Performance is kept for all iterations to create bagging curves

	//sums of predictions for each data point (raw material to calculate performance)
	doublevvv predsumsV(tigNN, doublevv(alphaN, doublev(validN, 0)));
	//outer array: column (by TiGN)
	//middle array: row	(by alpha)
	//inner array: data points in the validation set
	

//6. Read and merge models from the directories
	int startAlphaNo = getAlphaN(startAlpha, itemN) - 1; 
	int startTiGNNo = getTiGNN(startTiGN) - 1;

	for(int alphaNo = startAlphaNo; alphaNo < alphaN; alphaNo++)
	{
		double alpha;
		if(alphaNo < alphaN - 1)
			alpha = alphaVal(alphaNo);
		else	//this is a special case because minAlpha can be zero
			alpha = ti.minAlpha;

		cout << "Merging models with alpha = " << alpha << endl;

		for(int tigNNo = startTiGNNo; tigNNo < tigNN; tigNNo++) 
		{
			int tigN = tigVal(tigNNo);	//number of trees in the current grove

			//temp file in the extra directory that keeps models corresponding to alpha and tigN
			string prefix = string("/AGTemp/ag.a.") 
								+ alphaToStr(alpha)
								+ ".n." 
								+ itoa(tigN, 10);
			string tempFName = prefix + ".tmp";

			//this will kill the pre-existing file in the output directory
			fstream fsave((string(".") + tempFName).c_str(), ios_base::binary | ios_base::out);	

			for(int folderNo = 0; folderNo < folderN; folderNo++)
			{
				string inTempFName = folders[folderNo] + tempFName;
				fstream ftemp((inTempFName).c_str(), ios_base::binary | ios_base::in);
				if(ftemp.fail())
				{
				    clog << inTempFName << '\n';
					throw TEMP_ERR;
				}
			
				//merge all extra models with the same (alpha, tigN) parameter values into existing models
				for(int bagNo = prevBagNs[folderNo]; bagNo < prevBagNs[folderNo + 1]; bagNo++)
				{
					//retrieve next grove
					CGrove extraGrove(alpha, tigN);
					try{
					extraGrove.load(ftemp);
					}catch(TE_ERROR err){
					  clog << inTempFName << '\n';
					  throw err;
					}
					//add the loaded grove to a model file with alpha and tigN values in the name
					extraGrove.save((string(".") + tempFName).c_str());

					//generate predictions and performance for validation set
					doublev predictions(validN);
					for(int itemNo = 0; itemNo < validN; itemNo++)
					{
						predsumsV[tigNNo][alphaNo][itemNo] += extraGrove.predict(itemNo, VALID);
						predictions[itemNo] = predsumsV[tigNNo][alphaNo][itemNo] / (bagNo + 1);
					}
					if(bagNo == allBagN - 1)
					{
						string predsFName = prefix + ".preds.txt";
						fstream fpreds((string(".") + predsFName).c_str(), ios_base::out);
						for(int itemNo = 0; itemNo < validN; itemNo++)
							fpreds << predictions[itemNo] << endl;
						fpreds.close();
					}

					rmsV[tigNNo][alphaNo][bagNo] = rmse(predictions, validTar);
					if(!ti.rms)
						rocV[tigNNo][alphaNo][bagNo] = roc(predictions, validTar);

				}// end for(int bagNo = ti.bagN; bagNo < ti.bagN + extraTI.bagN; bagNo++)
				ftemp.close();
			}//end for(int folderNo = 0; folderNo < folderN; folderNo++)
		}//end for(int tigNNo = 0; tigNNo < tigNN; tigNNo++) 
	}//end for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)

	//4. Output
	ti.bagN = allBagN;
	ti.seed = lastSeed;
	if(ti.rms)
		trainOut(ti, dir, rmsV, rmsV, predsumsV, itemN, dirStat, startAlphaNo, startTiGNNo);
	else
		trainOut(ti, dir, rmsV, rocV, predsumsV, itemN, dirStat, startAlphaNo, startTiGNNo);

	}catch(TE_ERROR err){
Exemplo n.º 2
0
int main(int argc, char* argv[])
{	
	try{
//0. Set log file
	LogStream clog;
	clog << "\n-----\nag_nway ";
	for(int argNo = 1; argNo < argc; argNo++)
		clog << argv[argNo] << " ";
	clog << "\n\n";

//1. Analyze input parameters
	//convert input parameters to string from char*
	stringv args(argc); 
	for(int argNo = 0; argNo < argc; argNo++)
		args[argNo] = string(argv[argNo]);
	
	//check that the number of arguments is even (flags + value pairs)
	if(argc % 2 == 0)
		throw INPUT_ERR;

	TrainInfo ti;
	double meanPerf = -1;
	double stdPerf = -1;
	string nwayFName; //file with feature names that are involved in the interaction to test
	string modelFName = "restricted_model.bin";	//name of the output file for the model
	ti.mode = LAYERED;

#ifndef _WIN32
	int threadN = 6;	//number of threads
#endif

	//parse and save input parameters
	//indicators of presence of required flags in the input
	bool hasTrain = false;
	bool hasVal = false; 
	bool hasAttr = false; 
	bool hasAlpha = false;
	bool hasTiGN = false;
	bool hasBagN = false;
	bool hasMean = false;
	bool hasStD = false;
	bool hasNWay = false;

	for(int argNo = 1; argNo < argc; argNo += 2)
	{
		if(!args[argNo].compare("-t"))
		{
			ti.trainFName = args[argNo + 1];
			hasTrain = true;
		}
		else if(!args[argNo].compare("-v"))
		{
			ti.validFName = args[argNo + 1];
			hasVal = true;
		}
		else if(!args[argNo].compare("-r"))
		{
			ti.attrFName = args[argNo + 1];
			hasAttr = true;
		}
		else if(!args[argNo].compare("-a"))
		{
			ti.minAlpha = atofExt(argv[argNo + 1]);
			hasAlpha = true;
		}
		else if(!args[argNo].compare("-n"))
		{
			ti.maxTiGN = atoiExt(argv[argNo + 1]);
			hasTiGN = true;
		}
		else if(!args[argNo].compare("-b"))
		{
			ti.bagN = atoiExt(argv[argNo + 1]);
			hasBagN = true;
		}
		else if(!args[argNo].compare("-ave"))
		{
			meanPerf = atofExt(argv[argNo + 1]);
			hasMean = true;
		}
		else if(!args[argNo].compare("-std"))
		{
			stdPerf = atofExt(argv[argNo + 1]);
			hasStD = true;
		}
		else if(!args[argNo].compare("-w"))
		{
			nwayFName = args[argNo + 1];
			hasNWay = true;
		}
		else if(!args[argNo].compare("-c"))
		{
			if(!args[argNo + 1].compare("roc"))
				ti.rms = false;
			else if(!args[argNo + 1].compare("rms"))
				ti.rms = true;
			else
				throw INPUT_ERR;
		}
		else if(!args[argNo].compare("-i"))
			ti.seed = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-m"))
		{
			modelFName = args[argNo + 1];
			if(modelFName.empty())
				throw EMPTY_MODEL_NAME_ERR;
		}
		else if(!args[argNo].compare("-h"))
#ifndef _WIN32 
			threadN = atoiExt(argv[argNo + 1]);
#else
			throw WIN_ERR;
#endif
		else
			throw INPUT_ERR;
	}//end for(int argNo = 1; argNo < argc; argNo += 2) //parse and save input parameters

	if(!(hasTrain && hasVal && hasAttr && hasAlpha && hasTiGN && hasBagN && hasMean && hasStD && hasNWay))
		throw INPUT_ERR;

	if(ti.trainFName.compare(ti.validFName) == 0)
		throw TRAIN_EQ_VALID_ERR;

	if((ti.minAlpha < 0) || (ti.minAlpha > 1))
		throw ALPHA_ERR;

	if(ti.maxTiGN < 1)
		throw TIGN_ERR;

//1.b) Initialize random number generator. 
	srand(ti.seed);

//2. Load data
	INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), ti.attrFName.c_str());
	CGrove::setData(data);
	CTreeNode::setData(data);

	doublev validTar;
	int validN = data.getTargets(validTar, VALID);
	int itemN = data.getTrainN();
	double trainV = data.getTrainV();

	//adjust minAlpha, if needed
	double newAlpha = adjustAlpha(ti.minAlpha, trainV);
	if(ti.minAlpha != newAlpha)
	{
		if(newAlpha == 0)
			clog << "Warning: due to small train set size value of alpha was changed to 0"; 
		else 
			clog << "Warning: alpha value was rounded to the closest valid value " << newAlpha;
		clog << ".\n\n";
		ti.minAlpha = newAlpha;	
	}
	//adjust maxTiGN, if needed
	int newTiGN = adjustTiGN(ti.maxTiGN);
	if(ti.maxTiGN != newTiGN)
	{
		clog << "Warning: N value was rounded to the closest smaller valid value " << newTiGN << ".\n\n";
		ti.maxTiGN = newTiGN;	
	}
	clog << "Alpha = " << ti.minAlpha << "\nN = " << ti.maxTiGN << "\n" 
		<< ti.bagN << " bagging iterations\n";

//2.a) Start thread pool
#ifndef _WIN32
	TThreadPool pool(threadN);
	CGrove::setPool(pool);
#endif

//3. Main part - run interaction detection

	//read interaction features
	fstream fnway;
	fnway.open(nwayFName.c_str(), ios_base::in);
	if(!fnway) 
		throw OPEN_NWAY_ERR;
	string nwayAttr;
	fnway >> nwayAttr;
	while(!fnway.fail())
	{
		int nwayAttrNo = data.getAttrId(nwayAttr);
		if(!data.isActive(nwayAttrNo))
			throw ATTR_NAME_ERR;
		ti.interaction.push_back(nwayAttrNo);
		fnway >> nwayAttr;
	}

	//test interaction
	clog << "\nRestricting interaction between "; 
	for(int nwayNo = 0; nwayNo < (int)ti.interaction.size() - 1; nwayNo++)
		clog << data.getAttrName(ti.interaction[nwayNo]) << ", "; 
	clog << data.getAttrName(ti.interaction[ti.interaction.size() - 1]) << "\n";
		
	double rPerf = layeredGroves(data, ti, modelFName);
	double score = (meanPerf - rPerf) / stdPerf;
	if(ti.rms)
		score *= -1; 
	clog << "\tPerformance: " << rPerf << ". " << score << " standard deviations from the mean. ";
	if(score > 3)
		clog << "Interaction is present.\n";
	else
		clog << "Interaction is absent.\n";

	}catch(TE_ERROR err){
Exemplo n.º 3
0
int main(int argc, char* argv[])
{	
	try{

//1. Analyze input parameters
	//convert input parameters to string from char*
	stringv args(argc); 
	for(int argNo = 0; argNo < argc; argNo++)
		args[argNo] = string(argv[argNo]);
	
	//check that the number of arguments is even (flags + value pairs)
	if(argc % 2 == 0)
		throw INPUT_ERR;

#ifndef _WIN32
	int threadN = 6;	//number of threads
#endif

	TrainInfo ti; //model training parameters
	int topAttrN = 0;  //how many top attributes to output and keep in the cut data 
						//(0 = do not do feature selection)
						//(-1 = output all available features)

	//parse and save input parameters
	//indicators of presence of required flags in the input
	bool hasTrain = false;
	bool hasVal = false; 
	bool hasAttr = false; 

	int treeN = 100;
	double shrinkage = 0.01;
	double subsample = -1;

	for(int argNo = 1; argNo < argc; argNo += 2)
	{
		if(!args[argNo].compare("-t"))
		{
			ti.trainFName = args[argNo + 1];
			hasTrain = true;
		}
		else if(!args[argNo].compare("-v"))
		{
			ti.validFName = args[argNo + 1];
			hasVal = true;
		}
		else if(!args[argNo].compare("-r"))
		{
			ti.attrFName = args[argNo + 1];
			hasAttr = true;
		}
		else if(!args[argNo].compare("-a"))
			ti.alpha = atofExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-n"))
			treeN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-i"))
			ti.seed = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-k"))
			topAttrN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-sh"))
			shrinkage = atofExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-sub"))
			subsample = atofExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-c"))
		{
			if(!args[argNo + 1].compare("roc"))
				ti.rms = false;
			else if(!args[argNo + 1].compare("rms"))
				ti.rms = true;
			else
				throw INPUT_ERR;
		}
		else if(!args[argNo].compare("-h"))
#ifndef _WIN32 
			threadN = atoiExt(argv[argNo + 1]);
#else
			throw WIN_ERR;
#endif
		else
			throw INPUT_ERR;
	}//end for(int argNo = 1; argNo < argc; argNo += 2) //parse and save input parameters

	if(!(hasTrain && hasVal && hasAttr))
		throw INPUT_ERR;

	if((ti.alpha < 0) || (ti.alpha > 1))
		throw ALPHA_ERR;

//1.a) Set log file
	LogStream clog;
	LogStream::init(true);
	clog << "\n-----\ngbt_train ";
	for(int argNo = 1; argNo < argc; argNo++)
		clog << argv[argNo] << " ";
	clog << "\n\n";

//1.b) Initialize random number generator. 
	srand(ti.seed);

//2. Load data
	INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), 
				 ti.attrFName.c_str());
	CTree::setData(data);
	CTreeNode::setData(data);

//2.a) Start thread pool
#ifndef _WIN32
	TThreadPool pool(threadN);
	CTree::setPool(pool);
#endif

//------------------
	int attrN = data.getAttrN();
	if(topAttrN == -1)
		topAttrN = attrN;
	idpairv attrCounts;	//counts of attribute importance
	bool doFS = (topAttrN != 0);	//whether feature selection is requested
	if(doFS)
	{//initialize attrCounts
		attrCounts.resize(attrN);
		for(int attrNo = 0; attrNo < attrN; attrNo++)
		{
			attrCounts[attrNo].first = attrNo;	//number of attribute	
			attrCounts[attrNo].second = 0;		//counts
		}
	}

	fstream frmscurve("boosting_rms.txt", ios_base::out); //bagging curve (rms)
	frmscurve.close();
	fstream froccurve;
	if(!ti.rms)
	{
		froccurve.open("boosting_roc.txt", ios_base::out); //bagging curve (roc) 
		froccurve.close();
	}

	doublev validTar;
	int validN = data.getTargets(validTar, VALID);

	doublev trainTar;
	int trainN = data.getTargets(trainTar, TRAIN);

	int sampleN;
	if(subsample == -1)
		sampleN = trainN;
	else
		sampleN = (int) (trainN * subsample);
	
	doublev validPreds(validN, 0);
	doublev trainPreds(trainN, 0);
	
	for(int treeNo = 0; treeNo < treeN; treeNo++)
	{
		if(treeNo % 10 == 0)
			cout << "\titeration " << treeNo + 1 << " out of " << treeN << endl;

		if(subsample == -1)
			data.newBag();
		else
			data.newSample(sampleN);

		CTree tree(ti.alpha);
		tree.setRoot();
		tree.resetRoot(trainPreds);
		idpairv stub;
		tree.grow(doFS, attrCounts);

		//update predictions
		for(int itemNo = 0; itemNo < trainN; itemNo++)
			trainPreds[itemNo] += shrinkage * tree.predict(itemNo, TRAIN);
		for(int itemNo = 0; itemNo < validN; itemNo++)
			validPreds[itemNo] += shrinkage * tree.predict(itemNo, VALID);

		//output
		frmscurve.open("boosting_rms.txt", ios_base::out | ios_base::app); 
		frmscurve << rmse(validPreds, validTar) << endl;
		frmscurve.close();
		
		if(!ti.rms)
		{
			froccurve.open("boosting_roc.txt", ios_base::out | ios_base::app); 
			froccurve << roc(validPreds, validTar) << endl;
			froccurve.close();
		}

	}

	//output feature selection results
	if(doFS)
	{
		sort(attrCounts.begin(), attrCounts.end(), idGreater);
		if(topAttrN > attrN)
			topAttrN = attrN;

		fstream ffeatures("feature_scores.txt", ios_base::out);
		ffeatures << "Top " << topAttrN << " features\n";
		for(int attrNo = 0; attrNo < topAttrN; attrNo++)
			ffeatures << data.getAttrName(attrCounts[attrNo].first) << "\t"
			<< attrCounts[attrNo].second / ti.bagN / trainN << "\n";
		ffeatures << "\n\nColumn numbers (beginning with 1)\n";
		for(int attrNo = 0; attrNo < topAttrN; attrNo++)
			ffeatures << data.getColNo(attrCounts[attrNo].first) + 1 << " ";
		ffeatures << "\nLabel column number: " << data.getTarColNo() + 1;
		ffeatures.close();

		//output new attribute file
		for(int attrNo = topAttrN; attrNo < attrN; attrNo++)
			data.ignoreAttr(attrCounts[attrNo].first);
		data.outAttr(ti.attrFName);
	}

	//output predictions
	fstream fpreds;
	fpreds.open("preds.txt", ios_base::out);
	for(int itemNo = 0; itemNo < validN; itemNo++)
		fpreds << validPreds[itemNo] << endl;
	fpreds.close();

//------------------

	}catch(TE_ERROR err){
Exemplo n.º 4
0
int main(int argc, char* argv[])
{	
	try{

//1. Analyze input parameters
	//convert input parameters to string from char*
	stringv args(argc); 
	for(int argNo = 0; argNo < argc; argNo++)
		args[argNo] = string(argv[argNo]);
	
	//check that the number of arguments is even (flags + value pairs)
	if(argc % 2 == 0)
		throw INPUT_ERR;

#ifndef _WIN32
	int threadN = 6;	//number of threads
#endif

	TrainInfo ti; //model training parameters
	string modelFName = "model.bin";	//name of the output file for the model
	int topAttrN = 0;  //how many top attributes to output and keep in the cut data 
							//(0 = do not do feature selection)
							//(-1 = output all available features)
	bool doOut = true; //whether to output log information to stdout

	//parse and save input parameters
	//indicators of presence of required flags in the input
	bool hasTrain = false;
	bool hasVal = false; 
	bool hasAttr = false; 

	for(int argNo = 1; argNo < argc; argNo += 2)
	{
		if(!args[argNo].compare("-t"))
		{
			ti.trainFName = args[argNo + 1];
			hasTrain = true;
		}
		else if(!args[argNo].compare("-v"))
		{
			ti.validFName = args[argNo + 1];
			hasVal = true;
		}
		else if(!args[argNo].compare("-r"))
		{
			ti.attrFName = args[argNo + 1];
			hasAttr = true;
		}
		else if(!args[argNo].compare("-a"))
			ti.alpha = atofExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-b"))
			ti.bagN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-i"))
			ti.seed = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-k"))
			topAttrN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-m"))
		{
			modelFName = args[argNo + 1];
			if(modelFName.empty())
				throw EMPTY_MODEL_NAME_ERR;
		}
		else if(!args[argNo].compare("-l"))
		{
			if(!args[argNo + 1].compare("log"))
				doOut = true;
			else if(!args[argNo + 1].compare("nolog"))
				doOut = false;
			else
				throw INPUT_ERR;
		}
		else if(!args[argNo].compare("-c"))
		{
			if(!args[argNo + 1].compare("roc"))
				ti.rms = false;
			else if(!args[argNo + 1].compare("rms"))
				ti.rms = true;
			else
				throw INPUT_ERR;
		}
		else if(!args[argNo].compare("-h"))
#ifndef _WIN32 
			threadN = atoiExt(argv[argNo + 1]);
#else
			throw WIN_ERR;
#endif
		else
			throw INPUT_ERR;
	}//end for(int argNo = 1; argNo < argc; argNo += 2) //parse and save input parameters

	if(!(hasTrain && hasVal && hasAttr))
		throw INPUT_ERR;

	if((ti.alpha < 0) || (ti.alpha > 1))
		throw ALPHA_ERR;
	
//1.a) Set log file
	LogStream clog;
	LogStream::init(doOut);
	clog << "\n-----\nbt_train ";
	for(int argNo = 1; argNo < argc; argNo++)
		clog << argv[argNo] << " ";
	clog << "\n\n";

//1.b) Initialize random number generator. 
	srand(ti.seed);

//2. Load data
	INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), 
				 ti.attrFName.c_str());
	CTree::setData(data);
	CTreeNode::setData(data);

//2.a) Start thread pool
#ifndef _WIN32
	TThreadPool pool(threadN);
	CTree::setPool(pool);
#endif

//3. Train models
	doublev validTar;
	int validN = data.getTargets(validTar, VALID);
	int itemN = data.getTrainN();

	//adjust minAlpha, if needed
	double newAlpha = adjustAlpha(ti.alpha, itemN);
	if(ti.alpha != newAlpha)
	{
		if(newAlpha == 0)
			clog << "Warning: due to small train set size value of alpha was changed to 0"; 
		else 
			clog << "Warning: alpha value was rounded to the closest valid value " << newAlpha;
		clog << ".\n\n";
		ti.alpha = newAlpha;	
	}
	clog << "Alpha = " << ti.alpha << "\n" 
		<< ti.bagN << " bagging iterations\n";

	doublev rmsV(ti.bagN, 0); 				//bagging curve of rms values for validation set
	doublev rocV;							 
	if(!ti.rms)
		rocV.resize(ti.bagN, 0);			//bagging curve of roc values for validation set
	doublev predsumsV(validN, 0); 			//sums of predictions for each data point

	int attrN = data.getAttrN();
	if(topAttrN == -1)
		topAttrN = attrN;
	idpairv attrCounts;	//counts of attribute importance
	bool doFS = (topAttrN != 0);	//whether feature selection is requested
	if(doFS)
	{//initialize attrCounts
		attrCounts.resize(attrN);
		for(int attrNo = 0; attrNo < attrN; attrNo++)
		{
			attrCounts[attrNo].first = attrNo;	//number of attribute	
			attrCounts[attrNo].second = 0;		//counts
		}
	}
	fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::out);
	//header for compatibility with Additive Groves model
	AG_TRAIN_MODE modeStub = SLOW;
	fmodel.write((char*) &modeStub, sizeof(enum AG_TRAIN_MODE));
	int tigNStub = 1;
	fmodel.write((char*) &tigNStub, sizeof(int));
	fmodel.write((char*) &ti.alpha, sizeof(double));
	fmodel.close();
	
	fstream fbagrms("bagging_rms.txt", ios_base::out); //bagging curve (rms)
	fbagrms.close();
	fstream fbagroc;
	if(!ti.rms)
	{
		fbagroc.open("bagging_roc.txt", ios_base::out); //bagging curve (roc) 
		fbagroc.close();
	}

	//make bags, build trees, collect predictions
	for(int bagNo = 0; bagNo < ti.bagN; bagNo++)
	{
		if(doOut)
			cout << "Iteration " << bagNo + 1 << " out of " << ti.bagN << endl;

		data.newBag();
		CTree tree(ti.alpha);
		tree.setRoot();
		tree.grow(doFS, attrCounts);
		tree.save(modelFName.c_str());

		//generate predictions for validation set
		doublev predictions(validN);
		for(int itemNo = 0; itemNo < validN; itemNo++)
		{
			predsumsV[itemNo] += tree.predict(itemNo, VALID);
			predictions[itemNo] = predsumsV[itemNo] / (bagNo + 1);
		}
		rmsV[bagNo] = rmse(predictions, validTar);
		if(!ti.rms)
			rocV[bagNo] = roc(predictions, validTar);

		//output an element of bagging curve 
		fbagrms.open("bagging_rms.txt", ios_base::out | ios_base::app); 
		fbagrms << rmsV[bagNo] << endl;
		fbagrms.close();

		//same for roc, if needed
		if(!ti.rms)
		{
			fbagroc.open("bagging_roc.txt", ios_base::out | ios_base::app); 
			fbagroc << rocV[bagNo] << endl;
			fbagroc.close();
		}
	}

	if(doFS)	//sort attributes by counts
		sort(attrCounts.begin(), attrCounts.end(), idGreater);
	
//4. Output
		
	//output results and recommendations
	if(ti.rms)
		clog << "RMSE on validation set = " << rmsV[ti.bagN - 1] << "\n";
	else
		clog << "ROC on validation set = " << rocV[ti.bagN - 1] << "\n";


	//analyze whether more bagging should be recommended based on the curve in the best point
	if(moreBag(rmsV))
	{
		int recBagN = ti.bagN + 100;
		clog << "\nRecommendation: a greater number of bagging iterations might produce a better model.\n"
			<< "Suggested action: bt_train -b " << recBagN << "\n";
	}
	else
		clog << "\nThe bagging curve shows good convergence. \n"; 
	clog << "\n";

	//standard output in case of turned off log output: final performance on validation set only
	if(!doOut)
		if(ti.rms)
			cout << rmsV[ti.bagN - 1] << endl;
		else
			cout << rocV[ti.bagN - 1] << endl;

	//output feature selection results
	if(doFS)
	{
		if(topAttrN > attrN)
			topAttrN = attrN;

		fstream ffeatures("feature_scores.txt", ios_base::out);	
		ffeatures << "Top " << topAttrN << " features\n";
		for(int attrNo = 0; attrNo < topAttrN; attrNo++)
			ffeatures << data.getAttrName(attrCounts[attrNo].first) << "\t" 
				<< attrCounts[attrNo].second / ti.bagN / itemN << "\n";
		ffeatures << "\n\nColumn numbers (beginning with 1)\n";
		for(int attrNo = 0; attrNo < topAttrN; attrNo++)
			ffeatures << data.getColNo(attrCounts[attrNo].first) + 1 << " ";
		ffeatures << "\nLabel column number: " << data.getTarColNo() + 1;
		ffeatures.close();

		//output new attribute file
		for(int attrNo = topAttrN; attrNo < attrN; attrNo++)
			data.ignoreAttr(attrCounts[attrNo].first);
		data.outAttr(ti.attrFName);
	}

	}catch(TE_ERROR err){