示例#1
0
// bt_train entry point: trains a bagged ensemble of single trees.
// Parses flag/value command-line pairs, loads train/validation data,
// builds ti.bagN trees on bootstrap bags, records the bagging curve
// (rms always; roc too when requested) on the validation set, and
// optionally performs feature selection, writing the model, curve
// files and feature scores to disk.
// NOTE(review): the catch body of the enclosing try/catch lies outside
// this visible span.
int main(int argc, char* argv[])
{	
	try{

//1. Analyze input parameters
	//convert input parameters to string from char*
	stringv args(argc); 
	for(int argNo = 0; argNo < argc; argNo++)
		args[argNo] = string(argv[argNo]);
	
	//check that the number of arguments is even (flags + value pairs)
	//(argv[0] is the program name, so a valid call has an odd argc)
	if(argc % 2 == 0)
		throw INPUT_ERR;

#ifndef _WIN32
	int threadN = 6;	//number of threads (multithreading not supported on Windows builds)
#endif

	TrainInfo ti; //model training parameters
	string modelFName = "model.bin";	//name of the output file for the model
	int topAttrN = 0;  //how many top attributes to output and keep in the cut data 
							//(0 = do not do feature selection)
							//(-1 = output all available features)
	bool doOut = true; //whether to output log information to stdout

	//parse and save input parameters
	//indicators of presence of required flags in the input
	bool hasTrain = false;
	bool hasVal = false; 
	bool hasAttr = false; 

	//each iteration consumes one "-flag value" pair; unknown flags are rejected
	for(int argNo = 1; argNo < argc; argNo += 2)
	{
		if(!args[argNo].compare("-t"))	//compare() returns 0 on match
		{
			ti.trainFName = args[argNo + 1];
			hasTrain = true;
		}
		else if(!args[argNo].compare("-v"))
		{
			ti.validFName = args[argNo + 1];
			hasVal = true;
		}
		else if(!args[argNo].compare("-r"))
		{
			ti.attrFName = args[argNo + 1];
			hasAttr = true;
		}
		//numeric flags parse the raw char* via the atoiExt/atofExt helpers
		//(presumably stricter variants of atoi/atof that throw on bad input -- TODO confirm)
		else if(!args[argNo].compare("-a"))
			ti.alpha = atofExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-b"))
			ti.bagN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-i"))
			ti.seed = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-k"))
			topAttrN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-m"))
		{
			modelFName = args[argNo + 1];
			if(modelFName.empty())
				throw EMPTY_MODEL_NAME_ERR;
		}
		else if(!args[argNo].compare("-l"))
		{
			if(!args[argNo + 1].compare("log"))
				doOut = true;
			else if(!args[argNo + 1].compare("nolog"))
				doOut = false;
			else
				throw INPUT_ERR;
		}
		else if(!args[argNo].compare("-c"))	//choice of performance metric
		{
			if(!args[argNo + 1].compare("roc"))
				ti.rms = false;
			else if(!args[argNo + 1].compare("rms"))
				ti.rms = true;
			else
				throw INPUT_ERR;
		}
		//-h (thread count) is a single statement under the #ifdef, so no braces needed
		else if(!args[argNo].compare("-h"))
#ifndef _WIN32 
			threadN = atoiExt(argv[argNo + 1]);
#else
			throw WIN_ERR;
#endif
		else
			throw INPUT_ERR;
	}//end for(int argNo = 1; argNo < argc; argNo += 2) //parse and save input parameters

	//train, validation and attribute files are all required
	if(!(hasTrain && hasVal && hasAttr))
		throw INPUT_ERR;

	if((ti.alpha < 0) || (ti.alpha > 1))
		throw ALPHA_ERR;
	
//1.a) Set log file
	LogStream clog;
	LogStream::init(doOut);
	//echo the full command line into the log for reproducibility
	clog << "\n-----\nbt_train ";
	for(int argNo = 1; argNo < argc; argNo++)
		clog << argv[argNo] << " ";
	clog << "\n\n";

//1.b) Initialize random number generator. 
	srand(ti.seed);

//2. Load data
	//NOTE(review): ti.testFName is never assigned in this function --
	//presumably it defaults to an empty string (no test set). TODO confirm.
	INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), 
				 ti.attrFName.c_str());
	CTree::setData(data);
	CTreeNode::setData(data);

//2.a) Start thread pool
#ifndef _WIN32
	TThreadPool pool(threadN);
	CTree::setPool(pool);
#endif

//3. Train models
	doublev validTar;
	int validN = data.getTargets(validTar, VALID);
	int itemN = data.getTrainN();

	//adjust minAlpha, if needed
	//(alpha must correspond to an achievable leaf size given the train set size)
	double newAlpha = adjustAlpha(ti.alpha, itemN);
	if(ti.alpha != newAlpha)
	{
		if(newAlpha == 0)
			clog << "Warning: due to small train set size value of alpha was changed to 0"; 
		else 
			clog << "Warning: alpha value was rounded to the closest valid value " << newAlpha;
		clog << ".\n\n";
		ti.alpha = newAlpha;	
	}
	clog << "Alpha = " << ti.alpha << "\n" 
		<< ti.bagN << " bagging iterations\n";

	doublev rmsV(ti.bagN, 0); 				//bagging curve of rms values for validation set
	doublev rocV;							 
	if(!ti.rms)
		rocV.resize(ti.bagN, 0);			//bagging curve of roc values for validation set
	doublev predsumsV(validN, 0); 			//sums of predictions for each data point

	int attrN = data.getAttrN();
	if(topAttrN == -1)
		topAttrN = attrN;	//-1 means "output all features"
	idpairv attrCounts;	//counts of attribute importance
	bool doFS = (topAttrN != 0);	//whether feature selection is requested
	if(doFS)
	{//initialize attrCounts
		attrCounts.resize(attrN);
		for(int attrNo = 0; attrNo < attrN; attrNo++)
		{
			attrCounts[attrNo].first = attrNo;	//number of attribute	
			attrCounts[attrNo].second = 0;		//counts
		}
	}
	//write the model file header, then close; individual trees are written later
	//by tree.save() -- presumably in append mode. TODO confirm.
	fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::out);
	//header for compatibility with Additive Groves model
	AG_TRAIN_MODE modeStub = SLOW;
	fmodel.write((char*) &modeStub, sizeof(enum AG_TRAIN_MODE));
	int tigNStub = 1;
	fmodel.write((char*) &tigNStub, sizeof(int));
	fmodel.write((char*) &ti.alpha, sizeof(double));
	fmodel.close();
	
	//truncate (create empty) bagging-curve files; they are appended to per iteration
	fstream fbagrms("bagging_rms.txt", ios_base::out); //bagging curve (rms)
	fbagrms.close();
	fstream fbagroc;
	if(!ti.rms)
	{
		fbagroc.open("bagging_roc.txt", ios_base::out); //bagging curve (roc) 
		fbagroc.close();
	}

	//make bags, build trees, collect predictions
	for(int bagNo = 0; bagNo < ti.bagN; bagNo++)
	{
		if(doOut)
			cout << "Iteration " << bagNo + 1 << " out of " << ti.bagN << endl;

		data.newBag();	//draw a new bootstrap sample of the train set
		CTree tree(ti.alpha);
		tree.setRoot();
		tree.grow(doFS, attrCounts);	//also accumulates feature-importance counts when doFS
		tree.save(modelFName.c_str());

		//generate predictions for validation set
		//(ensemble prediction = running mean of per-tree predictions)
		doublev predictions(validN);
		for(int itemNo = 0; itemNo < validN; itemNo++)
		{
			predsumsV[itemNo] += tree.predict(itemNo, VALID);
			predictions[itemNo] = predsumsV[itemNo] / (bagNo + 1);
		}
		rmsV[bagNo] = rmse(predictions, validTar);
		if(!ti.rms)
			rocV[bagNo] = roc(predictions, validTar);

		//output an element of bagging curve 
		//NOTE(review): reopening a closed fstream without clear() relies on
		//open() resetting the error state -- guaranteed since C++11, not before.
		//TODO confirm the project's minimum language standard.
		fbagrms.open("bagging_rms.txt", ios_base::out | ios_base::app); 
		fbagrms << rmsV[bagNo] << endl;
		fbagrms.close();

		//same for roc, if needed
		if(!ti.rms)
		{
			fbagroc.open("bagging_roc.txt", ios_base::out | ios_base::app); 
			fbagroc << rocV[bagNo] << endl;
			fbagroc.close();
		}
	}

	if(doFS)	//sort attributes by counts
		sort(attrCounts.begin(), attrCounts.end(), idGreater);
	
//4. Output
		
	//output results and recommendations
	if(ti.rms)
		clog << "RMSE on validation set = " << rmsV[ti.bagN - 1] << "\n";
	else
		clog << "ROC on validation set = " << rocV[ti.bagN - 1] << "\n";


	//analyze whether more bagging should be recommended based on the curve in the best point
	//NOTE(review): convergence is always judged on the rms curve, even when the
	//roc metric was selected -- presumably intentional. TODO confirm.
	if(moreBag(rmsV))
	{
		int recBagN = ti.bagN + 100;
		clog << "\nRecommendation: a greater number of bagging iterations might produce a better model.\n"
			<< "Suggested action: bt_train -b " << recBagN << "\n";
	}
	else
		clog << "\nThe bagging curve shows good convergence. \n"; 
	clog << "\n";

	//standard output in case of turned off log output: final performance on validation set only
	if(!doOut)
		if(ti.rms)
			cout << rmsV[ti.bagN - 1] << endl;
		else
			cout << rocV[ti.bagN - 1] << endl;

	//output feature selection results
	if(doFS)
	{
		if(topAttrN > attrN)
			topAttrN = attrN;	//clamp the request to the number of available attributes

		fstream ffeatures("feature_scores.txt", ios_base::out);	
		ffeatures << "Top " << topAttrN << " features\n";
		//score = accumulated counts normalized by bags and train-set size
		//NOTE(review): if idpair's second member is an integral type this is
		//integer division and truncates -- TODO confirm idpairv's definition.
		for(int attrNo = 0; attrNo < topAttrN; attrNo++)
			ffeatures << data.getAttrName(attrCounts[attrNo].first) << "\t" 
				<< attrCounts[attrNo].second / ti.bagN / itemN << "\n";
		ffeatures << "\n\nColumn numbers (beginning with 1)\n";
		for(int attrNo = 0; attrNo < topAttrN; attrNo++)
			ffeatures << data.getColNo(attrCounts[attrNo].first) + 1 << " ";
		ffeatures << "\nLabel column number: " << data.getTarColNo() + 1;
		ffeatures.close();

		//output new attribute file
		//(mark everything below the cut as ignored, then rewrite the attr file)
		for(int attrNo = topAttrN; attrNo < attrN; attrNo++)
			data.ignoreAttr(attrCounts[attrNo].first);
		data.outAttr(ti.attrFName);
	}

	}catch(TE_ERROR err){
示例#2
0
//ag_mergepreds [-n _start_N_value_] [-a _start_alpha_value_] -d _directory1_ _directory2_ [_directory3_] 
//[_directory4_] ...
//
// ag_mergepreds entry point: merges saved per-grove prediction files from
// several independent Additive Groves training runs (one directory each,
// trained with different seeds) into a single combined run. Reads each
// directory's AGTemp state, averages the stored validation-set predictions
// across directories for every (alpha, tigN) grid cell, recomputes the
// performance surfaces, and writes the merged output via trainOut().
// NOTE(review): the catch body of the enclosing try/catch lies outside
// this visible span.
int main(int argc, char* argv[])
{	
	try{
//0. Set log file
	LogStream clog;
	LogStream::init(true);
	//echo the full command line into the log for reproducibility
	clog << "\n-----\nag_mergepreds ";
	for(int argNo = 1; argNo < argc; argNo++)
		clog << argv[argNo] << " ";
	clog << "\n\n";

//1. Set input parameters from command line 

	int startTiGN = 1;		//grid cell (N axis) to start merging from
	double startAlpha = 0.5;	//grid cell (alpha axis) to start merging from
	int firstDirNo = 0;		//argv index of the first directory (set when -d is seen)

	stringv args(argc); 
	for(int argNo = 0; argNo < argc; argNo++)
		args[argNo] = string(argv[argNo]);

	//parse and save input parameters
	//(-d terminates flag parsing; everything after it is a directory name)
	for(int argNo = 1; argNo < argc; argNo += 2)
	{
		if(!args[argNo].compare("-n"))	//compare() returns 0 on match
			startTiGN = atoiExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-a"))
			startAlpha = atofExt(argv[argNo + 1]);
		else if(!args[argNo].compare("-d"))
		{
			firstDirNo = argNo + 1;
			break;
		}
		else
			throw INPUT_ERR;
	}

	//check that there are at least two directories 
	if(argc < (firstDirNo + 2))
		throw INPUT_ERR;
	//convert names of input directories to strings and check that they exist
	int folderN = argc - firstDirNo;
	stringv folders(folderN); 
	for(int argNo = firstDirNo; argNo < argc; argNo++)
	{
		folders[argNo - firstDirNo] = string(argv[argNo]);
		struct stat status;
		//stat() != 0 means the path does not exist; S_IFDIR bit means it is a directory
		if((stat(argv[argNo], &status) != 0) || !(status.st_mode & S_IFDIR))
			throw DIR_ERR;
	}

//1.a) delete all temp files from the previous run and create a directory AGTemp
//NOTE(review): this checks WIN32 while other code in the project checks _WIN32 --
//confirm the build defines both, otherwise this branch may never be taken.
#ifdef WIN32	//in windows
	CreateDirectory("AGTemp", NULL);
#else // in linux
	system("rm -rf ./AGTemp/");
	system("mkdir ./AGTemp/");
#endif

//2. Set parameters from AGTemp/params.txt from the first directory
	TrainInfo ti;			//set of model parameters in the current directory
	double prevBest;		//best value of performance achieved on the previous run
			
	fstream fparam;
	string paramPathName = folders[0] + "/AGTemp/params.txt";
	fparam.open(paramPathName.c_str(), ios_base::in); 
	string modeStr, metric;
	//params.txt layout: seed, file names, minAlpha, maxTiGN, bagN, mode, metric
	fparam >> ti.seed >> ti.trainFName >> ti.validFName >> ti.attrFName >> ti.minAlpha >> ti.maxTiGN 
		>> ti.bagN >> modeStr >> metric;	

	//modeStr should be "fast" or "slow" or "layered"	
	if(modeStr.compare("fast") == 0)
		ti.mode = FAST;
	else if(modeStr.compare("slow") == 0)
		ti.mode = SLOW;
	else if(modeStr.compare("layered") == 0)
		ti.mode = LAYERED;
	else
		throw TEMP_ERR;

	//metric should be "roc" or "rms"
	if(metric.compare("rms") == 0)
		ti.rms = true;
	else if(metric.compare("roc") == 0)
		ti.rms = false;
	else
		throw TEMP_ERR;

	if(fparam.fail())
		throw TEMP_ERR;
	fparam.close();
	fparam.clear();	//reset state flags so the stream can be reopened below

	//read best value of performance on previous run
	fstream fbest;
	double stub;	//used to skip over three unneeded values in best.txt
	double trainV; // number of data points in the train set, need to calculate possible values of alpha
	string fbestPathName = folders[0] + "/AGTemp/best.txt";
	fbest.open(fbestPathName.c_str(), ios_base::in); 
	fbest >> prevBest >> stub >> stub >> stub >> trainV;
	if(fbest.fail())
		throw TEMP_ERR;
	fbest.close();

	int alphaN = getAlphaN(ti.minAlpha, trainV); //number of different alpha values
	int tigNN = getTiGNN(ti.maxTiGN);

	//direction of initialization (1 - up, 0 - right), used in fast mode only
	doublevv dir(tigNN, doublev(alphaN, 0)); 
	//outer array: column (by TiGN)
	//middle array: row	(by alpha)

	//direction of initialization (1 - up, 0 - right), collects average in the slow mode
	doublevv dirStat(tigNN, doublev(alphaN, 0));

	if(ti.mode == FAST)
	{//read part of the directions table from file
		fstream fdir;
		string fdirPathName = folders[0] + "/AGTemp/dir.txt";
		fdir.open(fdirPathName.c_str(), ios_base::in); 
		for(int tigNNo = 0; tigNNo < tigNN; tigNNo++)
			for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)
				fdir >> dir[tigNNo][alphaNo];
		if(fdir.fail())
			throw TEMP_ERR;
		fdir.close();
	}

//3. Read main parameters from all other directories and check that they match

	int allBagN = ti.bagN;		//total bagging iterations across all directories
	int lastSeed = ti.seed;		//seed of the last directory, reported in the merged output
	for(int folderNo = 1; folderNo < folderN; folderNo++)
	{
		TrainInfo extraTI;	//set of model parameters in the additional directory
		
		string fparamPathName = folders[folderNo] + "/AGTemp/params.txt";
		fparam.open(fparamPathName.c_str(), ios_base::in); 
		fparam >> extraTI.seed >> extraTI.trainFName >> extraTI.validFName >> extraTI.attrFName 
			>> extraTI.minAlpha >> extraTI.maxTiGN >> extraTI.bagN;	

		if(fparam.fail())
		{
			clog << fparamPathName << '\n';
			throw TEMP_ERR;
		}
		fparam.close();

		//all runs must share the same (alpha, TiGN) grid to be mergeable
		if((ti.minAlpha != extraTI.minAlpha) || (ti.maxTiGN != extraTI.maxTiGN))
		{
		    clog << fparamPathName << '\n';
			throw MERGE_MISMATCH_ERR;
		}
		//identical seeds would mean identical (not independent) runs
		if(extraTI.seed == ti.seed)
			throw SAME_SEED_ERR;
		if(folderNo == (folderN - 1))
			lastSeed = extraTI.seed;

		allBagN += extraTI.bagN;

		//accumulate direction statistics weighted by each run's bag count
		//NOTE(review): BUG? fdirStatPathName (the per-folder path built on the
		//next line) is never used -- the open() below reads the hardcoded local
		//"./AGTemp/dirstat.txt" instead of the extra directory's file, and
		//fdirStat.fail() is never checked after the reads. Presumably
		//fdirStatPathName.c_str() was intended here; verify against the
		//dirstat-writing code.
		string fdirStatPathName = folders[folderNo] + "/AGTemp/dirstat.txt";
		fstream fdirStat;	
		fdirStat.open("./AGTemp/dirstat.txt", ios_base::in);
		//NOTE(review): this reads alphaNo-outer/tigNNo-inner, while dir.txt above
		//is read tigNNo-outer -- the two on-disk layouts must differ accordingly.
		//TODO confirm against the code that writes dirstat.txt.
		for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)
			for(int tigNNo = 0; tigNNo < tigNN; tigNNo++)
			{
				double ds;
				fdirStat >> ds;
				dirStat[tigNNo][alphaNo] += ds * extraTI.bagN;
			}
	}

//4. Load data
	//only the validation set is needed for merging (no train/test file names given)
	INDdata data("", ti.validFName.c_str(), "", ti.attrFName.c_str());
	doublev validTar;
	int validN = data.getTargets(validTar, VALID);

	clog << "Alpha = " << ti.minAlpha << "\nN = " << ti.maxTiGN << "\n" 
		<< allBagN << " bagging iterations\n";
	if(ti.mode == FAST)
		clog << "fast mode\n\n";
	else if(ti.mode == SLOW)
		clog << "slow mode\n\n";
	else //if(ti.mode == LAYERED)
		clog << "layered mode\n\n";

	//5. Initialize some internal process variables

	//surfaces of performance values for validation set. 
	//Always calculate rms (for convergence analysis), if needed, calculate roc
	doublevvv rmsV(tigNN, doublevv(alphaN, doublev(folderN, 0))); 
	doublevvv rocV;
	if(!ti.rms)
		rocV.resize(tigNN, doublevv(alphaN, doublev(folderN, 0))); 
	//outer array: column (by TiGN)
	//middle array: row (by alpha)
	//inner array: bagging iterations. Performance is kept for all iterations to create bagging curves

	//sums of predictions for each data point (raw material to calculate performance)
	doublevvv predsumsV(tigNN, doublevv(alphaN, doublev(validN, 0)));
	//outer array: column (by TiGN)
	//middle array: row	(by alpha)
	//inner array: data points in the validation set
	

//6. Read and merge models from the directories
	//convert the start values into 0-based grid indices
	int startAlphaNo = getAlphaN(startAlpha, trainV) - 1; 
	int startTiGNNo = getTiGNN(startTiGN) - 1;

	for(int alphaNo = startAlphaNo; alphaNo < alphaN; alphaNo++)
	{
		double alpha;
		if(alphaNo < alphaN - 1)
			alpha = alphaVal(alphaNo);
		else	//this is a special case because minAlpha can be zero
			alpha = ti.minAlpha;

		cout << "Merging predictions with alpha = " << alpha << endl;

		for(int tigNNo = startTiGNNo; tigNNo < tigNN; tigNNo++) 
		{
			int tigN = tigVal(tigNNo);	//number of trees in the current grove

			//temp file in the extra directory that keeps models corresponding to alpha and tigN
			string prefix = string("/AGTemp/ag.a.") 
								+ alphaToStr(alpha)
								+ ".n." 
								+ itoa(tigN, 10);
			string predsFName = prefix + ".preds.txt";

			for(int folderNo = 0; folderNo < folderN; folderNo++)
			{
				string inPredsFName = folders[folderNo] + predsFName;
				fstream finpreds((inPredsFName).c_str(), ios_base::in);
				if(finpreds.fail())
				{
				    clog << inPredsFName << '\n';
					throw TEMP_ERR;
				}
				//generate predictions and performance for validation set
				//(merged prediction = running mean over directories)
				doublev predictions(validN);
				for(int itemNo = 0; itemNo < validN; itemNo++)
				{
					double sinpred;
					finpreds >> sinpred;

					predsumsV[tigNNo][alphaNo][itemNo] += sinpred;//extraGrove.predict(itemNo, VALID);
					predictions[itemNo] = predsumsV[tigNNo][alphaNo][itemNo] / (folderNo + 1);
				}
				//after the last directory, write the fully merged predictions
				//into the local ./AGTemp/ copy of the preds file
				if(folderNo == folderN - 1)
				{
					fstream fpreds((string(".") + predsFName).c_str(), ios_base::out);
					for(int itemNo = 0; itemNo < validN; itemNo++)
						fpreds << predictions[itemNo] << endl;
					fpreds.close();
				}

				rmsV[tigNNo][alphaNo][folderNo] = rmse(predictions, validTar);
				if(!ti.rms)
					rocV[tigNNo][alphaNo][folderNo] = roc(predictions, validTar);

				finpreds.close();
			}//end for(int folderNo = 0; folderNo < folderN; folderNo++)
		}//end for(int tigNNo = 0; tigNNo < tigNN; tigNNo++) 
	}//end for(int alphaNo = 0; alphaNo < alphaN; alphaNo++)

	//4. Output
	//each directory counts as one merged "bagging iteration" in the output
	ti.bagN = folderN;
	ti.seed = lastSeed;
	if(ti.rms)
	{
		double validStD = data.getTarStD(VALID);
		//rms metric: pass the rms surface for both the rms and the metric slot
		trainOut(ti, dir, rmsV, rmsV, predsumsV, trainV, dirStat, validStD, startAlphaNo, startTiGNNo);
	}
	else
		trainOut(ti, dir, rmsV, rocV, predsumsV, trainV, dirStat, -1.0, startAlphaNo, startTiGNNo);

	if(folderN != allBagN)
		clog << "Warning: bagging curve and -b recommendations could not be calculated correctly "
			<< "in this mode. Each visible bagging step corresponds to several real steps.\n";

	}catch(TE_ERROR err){