Example #1
0
//Train model with a regression dataset
void CARTTrainer::train(ModelType& model, RegressionDataset const& dataset)
{
	//Record the number of input attributes and pass it on to the tree model.
	m_inputDimension = inputDimension(dataset);
	model.setInputDimension(m_inputDimension);

	//Record the dimensionality of the regression targets.
	m_labelDimension = labelDimension(dataset);

	//Split the data into cross-validation folds (helper needs a mutable copy).
	RegressionDataset mutableCopy = dataset;
	CVFolds<RegressionDataset > cvFolds = createCVSameSize(mutableCopy, m_numberOfFolds);

	//Best (lowest validation error) tree seen over all folds and pruning steps.
	double lowestError = std::numeric_limits<double>::max();
	CARTClassifier<RealVector>::TreeType championTree;

	for (unsigned foldIndex = 0; foldIndex < m_numberOfFolds; ++foldIndex){
		//Training/validation split for this fold.
		RegressionDataset trainingFold = cvFolds.training(foldIndex);
		RegressionDataset validationFold = cvFolds.validation(foldIndex);
		std::size_t trainingSize = trainingFold.numberOfElements();

		AttributeTables attributeTables = createAttributeTables(trainingFold.inputs());

		//Collect the training labels into a flat vector for buildTree.
		std::vector < RealVector > targetVectors(trainingSize);
		boost::copy(trainingFold.labels().elements(),targetVectors.begin());

		//Build a full tree from this fold, then prune it back step by step,
		//keeping whichever subtree validates best.
		CARTClassifier<RealVector>::TreeType candidateTree = buildTree(attributeTables, trainingFold, targetVectors, 0, trainingFold.numberOfElements());
		model.setTree(candidateTree);

		SquaredLoss<> loss;
		for (;;){
			//Validation error of the current (possibly pruned) tree.
			double foldError = loss.eval(validationFold.labels(), model(validationFold.inputs()));

			if(foldError < lowestError){
				//New best subtree found on held-out data.
				lowestError = foldError;
				championTree = candidateTree;
			}
			//A single node cannot be pruned further.
			if(candidateTree.size() == 1) break;
			pruneTree(candidateTree);
			model.setTree(candidateTree);
		}
	}
	SHARK_CHECK(championTree.size() > 0, "We should never set a tree that is empty.");
	model.setTree(championTree);
}
Example #2
0
int main(int argc, char **argv) {
	RegressionDataset data;
	importCSV(data, "blogData_train.csv", LAST_COLUMN,1,',','#', 2<<16);

	LinearRegression trainer(100);
	LinearModel<> model;
	
	Timer time;
	trainer.train(model, data);
	double time_taken = time.stop();

	SquaredLoss<> loss;
	cout << "Residual sum of squares:" << loss(data.labels(),model(data.inputs()))<<std::endl;
	cout << "Time:\n" << time_taken << endl;
	cout << time_taken << endl;
}
Example #3
0
// Regression
//Trains a random forest for regression: builds m_B CART trees, each on a
//random subset of the data, and optionally computes out-of-bag (OOB) error
//and feature importances from the held-out elements of each subset.
void RFTrainer::train(RFClassifier& model, RegressionDataset const& dataset)
{
	model.clearModels();   // added by TG 23.02.2015

	//TODO O.K.: i am just fixing these things for now so that they are working.

	//Store the number of input dimensions
	m_inputDimension = inputDimension(dataset);

	//Store the size of the labels
	m_labelDimension = labelDimension(dataset);

	model.setInputDimension(m_inputDimension);
	model.setLabelDimension(m_labelDimension);

	m_regressionLearner = true;
	setDefaults();
	
	//we need direct element access since we need to generate elementwise subsets
	std::size_t subsetSize = static_cast<std::size_t>(dataset.numberOfElements()*m_OOBratio);
	DataView<RegressionDataset const> elements(dataset);

	//Generate m_B trees
	SHARK_PARALLEL_FOR(int i = 0; i < (int)m_B; ++i){
		//For each tree generate a subset of the dataset
		//generate indices of the dataset (pick k out of n elements)
		std::vector<std::size_t> subsetIndices(dataset.numberOfElements());
		boost::iota(subsetIndices,0);
		boost::random_shuffle(subsetIndices);

		// create oob indices (the elements NOT used for training this tree)
		std::vector<std::size_t>::iterator oobStart = subsetIndices.begin() + subsetSize;
		std::vector<std::size_t>::iterator oobEnd   = subsetIndices.end();
		
		//generate the dataset by copying (TODO: this is a quick fix!)
		subsetIndices.erase(oobStart, oobEnd);
		RegressionDataset dataTrain = toDataset(subset(elements,subsetIndices));

		AttributeTables tables;
		createAttributeTables(dataTrain.inputs(), tables);

		//Collect the training labels for buildTree.
		//Fix: the loop index is named 'elem' instead of 'i' so it no longer
		//shadows the parallel-for index above; reserve avoids reallocations.
		std::size_t dataTrainSize = dataTrain.numberOfElements();
		std::vector<RealVector> labels;
		labels.reserve(dataTrainSize);
		for(std::size_t elem = 0; elem < dataTrainSize; elem++){
			labels.push_back(dataTrain.element(elem).label);
		}

		CARTClassifier<RealVector>::TreeType tree = buildTree(tables, dataTrain, labels, 0);
		CARTClassifier<RealVector> cart(tree, m_inputDimension);

		// if oob error or importances have to be computed, create an oob sample
		if(m_computeOOBerror || m_computeFeatureImportances){
			std::vector<std::size_t> subsetIndicesOOB(oobStart, oobEnd);
			RegressionDataset dataOOB = toDataset(subset(elements, subsetIndicesOOB));

			// if importances should be computed, oob errors are computed implicitly
			if(m_computeFeatureImportances){
				cart.computeFeatureImportances(dataOOB);
			} // if importances should not be computed, only compute the oob errors
			else{
				cart.computeOOBerror(dataOOB);
			}
		}

		//Model container is shared across threads: guard the insertion.
		SHARK_CRITICAL_REGION{
			model.addModel(cart);
		}
	}

	if(m_computeOOBerror){
		model.computeOOBerror();
	}

	if(m_computeFeatureImportances){
		model.computeFeatureImportances();
	}
}