Exemplo n.º 1
0
int main (int argc, char* argv[]){
  RegressionDataset dataset;
  dataset.setName("IAM-sequenced10px");
  
  fillImageVector(argv[1], 0, dataset, 10);
  cout << dataset.getMean();
  cout << dataset.getStandardDeviation();
  dataset.save("../xml/IAM-10.xml");
  return EXIT_SUCCESS;
}
Exemplo n.º 2
0
int main(int argc, char **argv) {
	RegressionDataset data;
	importCSV(data, "blogData_train.csv", LAST_COLUMN,1,',','#', 2<<16);

	LinearRegression trainer(100);
	LinearModel<> model;
	
	Timer time;
	trainer.train(model, data);
	double time_taken = time.stop();

	SquaredLoss<> loss;
	cout << "Residual sum of squares:" << loss(data.labels(),model(data.inputs()))<<std::endl;
	cout << "Time:\n" << time_taken << endl;
	cout << time_taken << endl;
}
Exemplo n.º 3
0
/// Trains the model on a regression dataset.
///
/// For every cross-validation fold a tree is built from the fold's training
/// part and then pruned step by step down to a single node. Each intermediate
/// tree is scored with the squared loss on the fold's validation part; the
/// tree with the lowest score over all folds is installed in the model.
void CARTTrainer::train(ModelType& model, RegressionDataset const& dataset)
{
	// Record the number of attributes and forward it to the tree model.
	m_inputDimension = inputDimension(dataset);
	model.setInputDimension(m_inputDimension);

	// Record the size of a label.
	m_labelDimension = labelDimension(dataset);

	// The folds are created from a local copy of the (const) input dataset.
	RegressionDataset workingCopy = dataset;
	CVFolds<RegressionDataset> cvFolds = createCVSameSize(workingCopy, m_numberOfFolds);

	CARTClassifier<RealVector>::TreeType winner;
	double lowestError = std::numeric_limits<double>::max();

	for (unsigned foldIdx = 0; foldIdx < m_numberOfFolds; ++foldIdx) {
		RegressionDataset trainPart = cvFolds.training(foldIdx);
		RegressionDataset validationPart = cvFolds.validation(foldIdx);
		std::size_t trainCount = trainPart.numberOfElements();

		AttributeTables attributeTables = createAttributeTables(trainPart.inputs());

		// Flatten the training labels into a plain vector for buildTree.
		std::vector<RealVector> trainLabels(trainCount);
		boost::copy(trainPart.labels().elements(), trainLabels.begin());

		// Grow the full tree for this fold and hand it to the model.
		CARTClassifier<RealVector>::TreeType candidate =
			buildTree(attributeTables, trainPart, trainLabels, 0, trainPart.numberOfElements());
		model.setTree(candidate);

		// Score the current tree, then prune and repeat until only one node is left.
		for (;;) {
			SquaredLoss<> squaredLoss;
			double validationError = squaredLoss.eval(validationPart.labels(), model(validationPart.inputs()));

			if (validationError < lowestError) {
				// This (sub)tree beats everything seen so far, in any fold.
				lowestError = validationError;
				winner = candidate;
			}

			if (candidate.size() == 1) {
				break;
			}

			pruneTree(candidate);
			model.setTree(candidate);
		}
	}

	SHARK_CHECK(winner.size() > 0, "We should never set a tree that is empty.");
	model.setTree(winner);
}
Exemplo n.º 4
0
void fillImageVector(char* _directoryName,int _mode, RegressionDataset& _dataset, int _sectionLength){
  DIR *dp;
  struct dirent *ep;
  dp = opendir (_directoryName);
  if (dp != NULL){
    while (ep = readdir (dp)){
      //append an image to the vector only if it has a .png extension
      if(strstr(ep->d_name,".png")!=NULL){
	char str[200]="";
	strcat(str,_directoryName);
	strcat(str,"/");
	strcat(str,ep->d_name);
	Mat image = imread(str,_mode);
	int subparts=floor(((float)image.cols)/((float)_sectionLength));
	vector<FeatureVector> features;
	for(int i=0;i<subparts;i++){
	  FeatureVector fv(_sectionLength*image.rows);
	  for(int j=0;j<_sectionLength;j++){
	    for(int k=0;k<image.rows;k++){
	      if((int)image.at<uchar>(k,i*_sectionLength+j)==255){
		fv[j*image.rows+k]=1.0;
	      }
	      else{
		fv[j*image.rows+k]=0.0;
	      }
	    }
	  }
	  features.push_back(fv);
	}
	_dataset.addSequence(features,features);
      }
    }
    (void) closedir (dp);
  }
  else{
    cerr << "ERROR in CreateImageVector : Couldn't open the directory";
    exit(1);
  }
}
Exemplo n.º 5
0
// Regression
/// Trains a random forest on a regression dataset.
///
/// Clears the models already stored in \p model, then builds m_B trees. Each
/// tree is built from a random subset holding numberOfElements * m_OOBratio
/// elements; the remaining elements form that tree's out-of-bag (OOB) sample,
/// which is only materialised when OOB errors or feature importances are
/// requested. Trees are added to the model inside a critical region, and the
/// model-level OOB error / feature importances are computed afterwards when
/// enabled.
void RFTrainer::train(RFClassifier& model, RegressionDataset const& dataset)
{
	model.clearModels();   // added by TG 23.02.2015

	//TODO O.K.: i am just fixing these things for now so that they are working.

	//Store the number of input dimensions
	m_inputDimension = inputDimension(dataset);

	//Store the size of the labels
	m_labelDimension = labelDimension(dataset);

	model.setInputDimension(m_inputDimension);
	model.setLabelDimension(m_labelDimension);

	m_regressionLearner = true;
	setDefaults();

	//we need direct element access since we need to generate elementwise subsets
	std::size_t subsetSize = static_cast<std::size_t>(dataset.numberOfElements()*m_OOBratio);
	//never step past the end of the index vector, whatever the ratio is
	if(subsetSize > dataset.numberOfElements()){
		subsetSize = dataset.numberOfElements();
	}
	DataView<RegressionDataset const> elements(dataset);

	bool const needOOBSample = m_computeOOBerror || m_computeFeatureImportances;

	//Generate m_B trees
	SHARK_PARALLEL_FOR(int i = 0; i < (int)m_B; ++i){
		//For each tree generate a subset of the dataset
		//generate indices of the dataset (pick k out of n elements)
		std::vector<std::size_t> subsetIndices(dataset.numberOfElements());
		boost::iota(subsetIndices,0);
		boost::random_shuffle(subsetIndices);

		//Copy the OOB indices BEFORE shrinking subsetIndices: erase() invalidates
		//every iterator at or after the erased range, so they cannot be read later.
		std::vector<std::size_t> subsetIndicesOOB;
		if(needOOBSample){
			subsetIndicesOOB.assign(subsetIndices.begin() + subsetSize, subsetIndices.end());
		}

		//generate the dataset by copying (TODO: this is a quick fix!)
		subsetIndices.erase(subsetIndices.begin() + subsetSize, subsetIndices.end());
		RegressionDataset dataTrain = toDataset(subset(elements,subsetIndices));

		AttributeTables tables;
		createAttributeTables(dataTrain.inputs(), tables);

		//Collect the training labels in element order
		std::size_t dataTrainSize = dataTrain.numberOfElements();
		std::vector<RealVector> labels;
		labels.reserve(dataTrainSize);
		for(std::size_t j = 0; j < dataTrainSize; j++){
			labels.push_back(dataTrain.element(j).label);
		}

		CARTClassifier<RealVector>::TreeType tree = buildTree(tables, dataTrain, labels, 0);
		CARTClassifier<RealVector> cart(tree, m_inputDimension);

		// if oob error or importances have to be computed, create an oob sample
		if(needOOBSample){
			RegressionDataset dataOOB = toDataset(subset(elements, subsetIndicesOOB));

			// if importances should be computed, oob errors are computed implicitly
			if(m_computeFeatureImportances){
				cart.computeFeatureImportances(dataOOB);
			} // if importances should not be computed, only compute the oob errors
			else{
				cart.computeOOBerror(dataOOB);
			}
		}

		//the model is shared between the parallel iterations
		SHARK_CRITICAL_REGION{
			model.addModel(cart);
		}
	}

	if(m_computeOOBerror){
		model.computeOOBerror();
	}

	if(m_computeFeatureImportances){
		model.computeFeatureImportances();
	}
}
Exemplo n.º 6
0
/// Recursively builds a regression tree for the random forest.
///
/// The node's label is average(labels). The node becomes a leaf when it holds
/// at most m_nodeSize cases or when no split position is found among the
/// attributes chosen by generateRandomTableIndicies. Otherwise the split with
/// the lowest weighted total sum of squares is taken and both children are
/// built recursively with the ids 2*nodeId+1 (left) and 2*nodeId+2 (right).
///
/// \param tables  attribute tables of the cases in this node; NOTE: cleared
///                by this function once they have been split
/// \param dataset dataset the table ids refer to
/// \param labels  labels of the cases in this node
/// \param nodeId  id assigned to the node being built
/// \return the node followed by its left subtree and then its right subtree
CARTClassifier<RealVector>::TreeType RFTrainer::buildTree(AttributeTables& tables, RegressionDataset const& dataset, std::vector<RealVector> const& labels, std::size_t nodeId ){

	//Construct tree
	CARTClassifier<RealVector>::NodeInfo nodeInfo;

	//Defaults describe a leaf; the split fields are overwritten further down
	//if the node is actually split.
	nodeInfo.nodeId = nodeId;
	nodeInfo.attributeIndex = 0;
	nodeInfo.attributeValue = 0.0;
	nodeInfo.leftNodeId = 0;
	nodeInfo.rightNodeId = 0;
	nodeInfo.label = average(labels);
	nodeInfo.misclassProp = 0.0;
	nodeInfo.r = 0;
	nodeInfo.g = 0.0;

	CARTClassifier<RealVector>::TreeType tree, lTree, rTree;

	//n = Total number of cases in the dataset
	//(read from the first attribute table, so tables must not be empty)
	std::size_t n = tables[0].size();
	bool isLeaf = false;
	if(n <= m_nodeSize){
		isLeaf = true;
	}else{

		//label vectors
		std::vector<RealVector> bestLabels, tmpLabels;
		RealVector labelSumAbove(m_labelDimension), labelSumBelow(m_labelDimension);

		//Randomly select the attributes to test for split
		set<std::size_t> tableIndicies;
		generateRandomTableIndicies(tableIndicies);

		//Index of attributes
		std::size_t attributeIndex, bestAttributeIndex, bestAttributeValIndex;

		//Attribute values
		double bestAttributeVal;
		//A negative value means "no split found yet"
		double bestImpurity = -1;

		std::size_t prev;
		bool doSplit = false;
		for (set<std::size_t>::iterator it=tableIndicies.begin() ; it != tableIndicies.end(); it++ ){
			attributeIndex = *it;

			labelSumBelow.clear();
			labelSumAbove.clear();
			tmpLabels.clear();

			//Create a labels table, that corresponds to the sorted attribute
			for(std::size_t k=0; k<tables[attributeIndex].size(); k++){
				tmpLabels.push_back(dataset.element(tables[attributeIndex][k].id).label);
				labelSumBelow += dataset.element(tables[attributeIndex][k].id).label;
			}
			//Move the first case to the "above" side: at position i the running
			//sums cover [0,i) (above) and [i,n) (below).
			labelSumAbove += tmpLabels[0];
			labelSumBelow -= tmpLabels[0];

			for(std::size_t i=1; i<n; i++){
				prev = i-1;
				//Only positions where the attribute value changes are split candidates
				if(tables[attributeIndex][prev].value!=tables[attributeIndex][i].value){
					std::size_t n1=i;
					std::size_t n2 = n-n1;
					//Calculate the squared error of the split
					double impurity = (n1*totalSumOfSquares(tmpLabels,0,n1,labelSumAbove)+n2*totalSumOfSquares(tmpLabels,n1,n2,labelSumBelow))/(double)(n);

					if(impurity<bestImpurity || bestImpurity<0){
						//Found a more pure split, store the attribute index and value
						doSplit = true;
						bestImpurity = impurity;
						bestAttributeIndex = attributeIndex;
						//index of the last case that goes to the left child
						bestAttributeValIndex = prev;
						bestAttributeVal = tables[attributeIndex][bestAttributeValIndex].value;
						bestLabels = tmpLabels;
					}
				}

				//Advance the running sums past case i
				labelSumAbove += tmpLabels[i];
				labelSumBelow -= tmpLabels[i];
			}
		}

		if(doSplit){

			//Split the attribute tables
			AttributeTables rTables, lTables;
			splitAttributeTables(tables, bestAttributeIndex, bestAttributeValIndex, lTables, rTables);
			//The caller's tables are emptied here; they must not be used afterwards
			tables.clear();//save memory

			//Split the labels
			std::vector<RealVector> lLabels, rLabels;
			for(std::size_t i = 0; i <= bestAttributeValIndex; i++){
				lLabels.push_back(bestLabels[i]);
			}
			for(std::size_t i = bestAttributeValIndex+1; i < bestLabels.size(); i++){
				rLabels.push_back(bestLabels[i]);
			}

			//Continue recursively
			nodeInfo.attributeIndex = bestAttributeIndex;
			nodeInfo.attributeValue = bestAttributeVal;
			nodeInfo.leftNodeId = 2*nodeId+1;
			nodeInfo.rightNodeId = 2*nodeId+2;

			lTree = buildTree(lTables, dataset, lLabels, nodeInfo.leftNodeId);
			rTree = buildTree(rTables, dataset, rLabels, nodeInfo.rightNodeId);
		}else{
			//Leaf node
			isLeaf = true;
		}

	}

	if(isLeaf){
		tree.push_back(nodeInfo);
		return tree;
	}

	//Inner node: this node first, then the left and the right subtree
	tree.push_back(nodeInfo);
	tree.insert(tree.end(), lTree.begin(), lTree.end());
	tree.insert(tree.end(), rTree.begin(), rTree.end());

	//Store entry in the tree
	return tree;

}
Exemplo n.º 7
0
/// Builds a CART tree for the regression case, breadth-first.
///
/// Nodes waiting to be processed are kept in a queue, starting with the root
/// (id 0). Every processed node is appended to the returned tree; when a node
/// is split its two children receive the next two free ids and are queued.
///
/// A node is only considered for splitting when it holds more than m_nodeSize
/// cases. Split positions are tried every `splitcount` cases (n / m_numSplits,
/// at least 1), and a split is accepted only if its relative impurity
/// improvement, in percent, reaches m_splitImpurityGain.
///
/// \param tables    attribute tables of the training data
/// \param dataset   dataset the table ids refer to
/// \param labels    labels of the training data
/// \param nodeId    not used by this implementation; the root always gets id 0
/// \param trainSize divisor used to scale the node's total sum of squares
/// \return the nodes of the tree in breadth-first order
CARTTrainer::TreeType CARTTrainer::buildTree(AttributeTables const& tables, RegressionDataset const& dataset, std::vector<RealVector> const& labels, std::size_t nodeId, std::size_t trainSize){
	size_t nextId = 0;
	std::queue<BuildData> bd;
	bd.emplace(tables, labels, nextId++);

	TreeType tree;
	while(!bd.empty())
	{
		BuildData current(std::move(bd.front()));
		bd.pop();

		//Construct the node; it stays a leaf unless a split is found below
		CARTClassifier<RealVector>::NodeInfo nodeInfo;

		nodeInfo.nodeId = current.nodeId;
		nodeInfo.label = mean(current.labels);
		nodeInfo.leftNodeId = 0;
		nodeInfo.rightNodeId = 0;

		//Store the Total Sum of Squares (TSS).
		//The sum has to run over ALL labels of the node; adding labels[0]
		//repeatedly would yield n*labels[0] instead of the label sum.
		RealVector labelSum = current.labels[0];
		for(std::size_t i=1; i< current.labels.size(); i++){
			labelSum += current.labels[i];
		}

		nodeInfo.misclassProp = totalSumOfSquares(current.labels, 0, current.labels.size(), labelSum)*((double)dataset.numberOfElements()/trainSize);

		//n = Total number of cases in the dataset
		//n1 = Number of cases to the left child node
		//n2 = number of cases to the right child node
		std::size_t n, n1, n2;

		n = current.tables[0].size();

		size_t splitcount =  n/m_numSplits;
		splitcount = splitcount ? splitcount : 1; // Make sure splitcount is never 0

		//NOTE(review): looks like leftover debug output (prints the size of the
		//root label vector for every node) - kept because it is runtime output.
		std::cout << labels.size() << " " << splitcount << " " << m_nodeSize << std::endl;

		if(n > m_nodeSize){
			//label vectors
			std::vector<RealVector> bestLabels, tmpLabels;
			RealVector labelSumAbove(m_labelDimension), labelSumBelow(m_labelDimension);

			//Best split found so far. These are only read after doSplit has been
			//set, at which point all three have been overwritten, so plain
			//defaults suffice (and avoid indexing [m_nodeSize-1] when m_nodeSize is 0).
			std::size_t bestAttributeIndex = 0;
			std::size_t bestAttributeValIndex = 0;
			double bestAttributeVal = 0.0;

			//A negative bestImpurity means "no split found yet"
			double impurity, fullImpurity, bestImpurity = -1;

			bool doSplit = false;
			for (size_t attributeIndex = 0; attributeIndex< m_inputDimension; attributeIndex++){

				labelSumBelow.clear();
				labelSumAbove.clear();

				tmpLabels.clear();
				//Create a labels table, that corresponds to the sorted attribute
				for(std::size_t k=0; k<current.tables[attributeIndex].size(); k++){
					tmpLabels.push_back(dataset.element(current.tables[attributeIndex][k].id).label);
					noalias(labelSumBelow) += dataset.element(current.tables[attributeIndex][k].id).label;
				}

				for(std::size_t i=splitcount; i<n; i += splitcount){
					//Move the cases [i-splitcount, i) from the "below" to the
					//"above" sum, so the sums cover [0,i) and [i,n).
					for(std::size_t j = i-splitcount; j < i; j++)
					{
						noalias(labelSumAbove) += tmpLabels[j];
						noalias(labelSumBelow) -= tmpLabels[j];
					}

					if(current.tables[attributeIndex][i-splitcount].value!=current.tables[attributeIndex][i].value){
						n1=i;
						n2 = n-n1;
						//Calculate the squared error of the split
						fullImpurity = totalSumOfSquares(tmpLabels,0,n,labelSumBelow + labelSumAbove);
						impurity = (n1*totalSumOfSquares(tmpLabels,0,n1,labelSumAbove)+n2*totalSumOfSquares(tmpLabels,n1,n2,labelSumBelow))/(double)(n);

						double improvement = (fullImpurity - impurity) / fullImpurity;

						if(improvement*100 >= m_splitImpurityGain && (impurity<bestImpurity || bestImpurity<0)){
							//Found a more pure split, store the attribute index and value
							doSplit = true;
							bestImpurity = impurity;
							bestAttributeIndex = attributeIndex;
							//number of cases that go to the left child
							bestAttributeValIndex = i;
							bestAttributeVal = current.tables[attributeIndex][bestAttributeValIndex-1].value;
							bestLabels = tmpLabels;
						}
					}
				}
			}

			if(doSplit){

				BuildData leftNode;
				BuildData rightNode;

				//Split the attribute tables
				splitAttributeTables(current.tables, bestAttributeIndex, bestAttributeValIndex-1, leftNode.tables, rightNode.tables);

				//Split the labels
				for(std::size_t i = 0; i < bestAttributeValIndex; i++){
					leftNode.labels.push_back(bestLabels[i]);
				}
				for(std::size_t i = bestAttributeValIndex; i < bestLabels.size(); i++){
					rightNode.labels.push_back(bestLabels[i]);
				}

				//Record the split and queue both children for processing
				nodeInfo.attributeIndex = bestAttributeIndex;
				nodeInfo.attributeValue = bestAttributeVal;
				nodeInfo.leftNodeId = nextId++;
				nodeInfo.rightNodeId = nextId++;

				leftNode.nodeId = nodeInfo.leftNodeId;
				rightNode.nodeId = nodeInfo.rightNodeId;

				bd.push(std::move(leftNode));
				bd.push(std::move(rightNode));
			}
		}

		tree.push_back(nodeInfo);

	}
	std::cerr << "Tree size: " << tree.size() << std::endl;
	cerr << "Will return\n";
	return tree;

}
int main(int argc, char* argv[]) {
	RegressionDataset dataset;
	dataset.load(argv[2]);

	RegressionDataset dataset2;
	dataset2.load(argv[3]);


	cout << "Dataset loaded, total elements : " << dataset.getNumSamples() << endl;
	Mask mask;
	LearningParams params;
	PBDNN pop;
	ifstream inStream(argv[1]);
	inStream >> pop;
	inStream >> params;

	vector<Vec3b> colors = createColorRepartition(pop.getPopulation().size());
	ofstream log("training.log");
	PopulationClusterBP pbp(pop, dataset, params, dataset2,mask, mask,log);
	AEMeasurer mae;
	DiversityMeasurer diversity(pop, dataset, mae);
	diversity.measurePerformance();
	cout << "Starting diversity" << endl << diversity.getDisagreementMatrix() << endl;
	cout << "Starting overall diversity" << endl << diversity.getDisagreementScalar() << endl;
	double t = (double) getTickCount();
	pbp.train();
	t = ((double) getTickCount() - t) / getTickFrequency();
	log << "Time :" << t << endl;
	cout << endl << "Saving network" << endl;
	ofstream outStream("IAMpop.txt");
	outStream << pop;
	DiversityMeasurer diversity2(pop, dataset2, mae);
	vector<NeuralNetworkPtr> population = pop.getPopulation();

	vector<vector<int> > assignedTo = diversity2.findBestNetwork();
	vector<vector<FeatureVector> > recomposed = diversity2.buildBestOutput();
	vector<int> pngParams = vector<int>();
	pngParams.push_back(CV_IMWRITE_PNG_COMPRESSION);
	pngParams.push_back(3);
	cout << "Recording Data" << endl;
	for (uint i = 0; i < population.size(); i++) {
		ostringstream dir;
		dir << "network" << i;
		if (mkdir(dir.str().c_str(), S_IRWXU) == 0) {
			for (uint j = 0; j < dataset2.getNumSequences(); j++) {
				ostringstream name;
				name << "network" << i << "\/neuralNet" << i << "sample" << j << ".png";
				vector<FeatureVector> features;
				for (uint k = 0; k < dataset2[j].size(); k++) {
					population[i]->forward(dataset2[j][k]);
					features.push_back(population[i]->getOutputSignal());
				}
				vector<int> color = vector<int>(features.size(), i);
				Mat image = buildColorMapImage(features, 3, color, colors);
				imwrite(name.str(), image, pngParams);
			}
		} else {
			throw invalid_argument("pbdnnCluster : could not create directory");
		}
	}
	ostringstream dirR;
	dirR << "recomposed";
	if (mkdir(dirR.str().c_str(), S_IRWXU) == 0) {
		for (uint j = 0; j < recomposed.size(); j++) {
			ostringstream name;
			name << "recomposed\/recomposedSample" << j << ".png";
			vector<FeatureVector> features;
			Mat image = buildColorMapImage(recomposed[j], 3, assignedTo[j], colors);
			imwrite(name.str(), image, pngParams);
		}
	}
	log.close();
	return EXIT_SUCCESS;
}
Exemplo n.º 9
0
int main(int argc, char* argv[]) {
  vector<string> arguments;
  arguments.push_back("population size");
  arguments.push_back("number of hidden units");
  arguments.push_back("number of iterations");
  arguments.push_back("learning dataset");
  arguments.push_back("validation dataset");
  arguments.push_back("simple load mode");
  cout << helper("Pbdnn cluster", "Train a population of neural networks on a regression task.", arguments) << endl;
  if (argc != arguments.size() + 1) {
    cerr << "Not enough arguments, " << argc - 1 << " given and " << arguments.size() << " required" << endl;
    return EXIT_FAILURE;
  }
  int simpleMode = atoi(argv[6]);
  RegressionDataset dataset;
  RegressionDataset dataset2;
  if(simpleMode!=0) {
    dataset.simpleLoad(argv[4]);
    dataset2.simpleLoad(argv[5]);
  }
  else {
    dataset.load(argv[4]);
    dataset2.load(argv[5]);
  }
  cout << "Learning dataset loaded, total elements : " << dataset.getNumSamples() << endl;
  cout << "Validation dataset loaded, total elements : " << dataset2.getNumSamples() << endl;
  int populationSize = atoi(argv[1]);
  int numberOfHiddenUnits = atoi(argv[2]);
  int iterations = atoi(argv[3]);
  vector<Vec3b> colors = createColorRepartition(populationSize);
  AEMeasurer mae;
  PBDNN pop = PBDNN(populationSize, dataset.getFeatureVectorLength(), numberOfHiddenUnits, dataset.getMean(), dataset.getStandardDeviation());
  DiversityMeasurer diversity(pop, dataset2, mae,0.01);

  // 07/02/13 : Not sure if useful or not so stop doing it
  /*do {
    pop = PBDNN(populationSize, dataset.getFeatureVectorLength(), numberOfHiddenUnits, dataset.getMean(), dataset.getStandardDeviation());
    diversity.measurePerformance();
    } while (diversity.getDisagreementScalar() < 0.17);*/
  Mask mask;
  LearningParams params;
  params.setActualIteration(0);
  params.setMaxIterations(iterations);
  params.setLearningRate(0.001);
  params.setMaxTrainedPercentage(0.1);
  params.setSavedDuringProcess(true);
  params.setValidateEveryNIteration(100);
  ofstream log("training.log");
  PopulationClusterBP pbp(pop, dataset, params, dataset2, mask, mask, log);
  // 07/02/13 : Not sure if useful or not so stop doing it
  /*cout << "Starting diversity" << endl << diversity.getDisagreementMatrix() << endl;
    cout << "Starting overall diversity : " << diversity.getDisagreementScalar() << endl;*/

  cout << "Training" << endl;
  double t = (double) getTickCount();
  pbp.train();
  t = ((double) getTickCount() - t) / getTickFrequency();
  cout << "Time :" << t << endl;

  cout << endl << "Saving network" << endl;
  ofstream outStream("IAMpop.pop");
  outStream << pop;

  if(simpleMode == 0) {
    cout << "Recording Data" << endl;
    vector<NeuralNetworkPtr> population = pop.getPopulation();
    vector<vector<int> > assignedTo = diversity.findBestNetwork();
    vector<vector<FeatureVector> > recomposed = diversity.buildBestOutput();
    vector<int> pngParams = vector<int>();
    pngParams.push_back(CV_IMWRITE_PNG_COMPRESSION);
    pngParams.push_back(3);

    for (uint i = 0; i < population.size(); i++) {
      ostringstream dir;
      dir << "network" << i;
      if (mkdir(dir.str().c_str(), S_IRWXU) == 0) {
	for (uint j = 0; j < dataset2.getNumSequences(); j++) {
	  ostringstream name;
	  name << "network" << i << "\/neuralNet" << i << "sample" << j << ".png";
	  vector<FeatureVector> features;
	  for (uint k = 0; k < dataset2[j].size(); k++) {
	    population[i]->forward(dataset2[j][k]);
	    features.push_back(population[i]->getOutputSignal());
	  }
	  vector<int> color = vector<int>(features.size(), i);
	  Mat image = buildColorMapImage(features, 3, color, colors);
	  imwrite(name.str(), image, pngParams);
	}
      }
      else {
	throw invalid_argument("pbdnnCluster : could not create directory");
      }
    }

    ostringstream dirR;
    dirR << "recomposed";
    if (mkdir(dirR.str().c_str(), S_IRWXU) == 0) {
      for (uint j = 0; j < recomposed.size(); j++) {
	ostringstream name;
	name << "recomposed\/recomposedSample" << j << ".png";
	vector<FeatureVector> features;
	Mat image = buildColorMapImage(recomposed[j], 3, assignedTo[j], colors);
	imwrite(name.str(), image, pngParams);
      }
    }
  }
  return EXIT_SUCCESS;
}
Exemplo n.º 10
0
/// Recursively builds a CART tree for the regression case.
///
/// The node's label is mean(labels) and its misclassProp is the total sum of
/// squares of its labels, scaled by dataset.numberOfElements()/trainSize. A
/// node holding more than m_nodeSize cases is split at the position with the
/// lowest weighted total sum of squares over all attributes; positions are
/// only considered where the attribute value changes. Node ids are assigned
/// in pre-order: the left child gets nodeId+1, the right child the first id
/// after the complete left subtree.
///
/// \param tables    attribute tables of the cases in this node
/// \param dataset   dataset the table ids refer to
/// \param labels    labels of the cases in this node
/// \param nodeId    id assigned to the node being built
/// \param trainSize divisor used to scale the node's total sum of squares
/// \return the node followed by its left subtree and then its right subtree
CARTTrainer::TreeType CARTTrainer::buildTree(AttributeTables const& tables, RegressionDataset const& dataset, std::vector<RealVector> const& labels, std::size_t nodeId, std::size_t trainSize){

	//Construct the node; it stays a leaf unless a split is found below
	CARTClassifier<RealVector>::NodeInfo nodeInfo;

	nodeInfo.nodeId = nodeId;
	nodeInfo.label = mean(labels);
	nodeInfo.leftNodeId = 0;
	nodeInfo.rightNodeId = 0;

	//Store the Total Sum of Squares (TSS).
	//The sum has to run over ALL labels of the node; adding labels[0]
	//repeatedly would yield n*labels[0] instead of the label sum.
	RealVector labelSum = labels[0];
	for(std::size_t i=1; i< labels.size(); i++){
		labelSum += labels[i];
	}

	nodeInfo.misclassProp = totalSumOfSquares(labels, 0, labels.size(), labelSum)*((double)dataset.numberOfElements()/trainSize);

	TreeType tree, lTree, rTree;

	//n = Total number of cases in the dataset
	//n1 = Number of cases to the left child node
	//n2 = number of cases to the right child node
	std::size_t n, n1, n2;

	n = tables[0].size();

	if(n > m_nodeSize){
		//label vectors
		std::vector<RealVector> bestLabels, tmpLabels;
		RealVector labelSumAbove(m_labelDimension), labelSumBelow(m_labelDimension);

		//Best split found so far; only read once doSplit has been set
		std::size_t attributeIndex;
		std::size_t bestAttributeIndex = 0;
		std::size_t bestAttributeValIndex = 0;
		double bestAttributeVal = 0.0;

		//A negative bestImpurity means "no split found yet"
		double impurity, bestImpurity = -1;

		std::size_t prev;
		bool doSplit = false;
		for ( attributeIndex = 0; attributeIndex< m_inputDimension; attributeIndex++){

			labelSumBelow.clear();
			labelSumAbove.clear();

			tmpLabels.clear();
			//Create a labels table, that corresponds to the sorted attribute
			for(std::size_t k=0; k<tables[attributeIndex].size(); k++){
				tmpLabels.push_back(dataset.element(tables[attributeIndex][k].id).label);
				labelSumBelow += dataset.element(tables[attributeIndex][k].id).label;
			}
			//Move the first case to the "above" side: at position i the running
			//sums cover [0,i) (above) and [i,n) (below).
			labelSumAbove += tmpLabels[0];
			labelSumBelow -= tmpLabels[0];

			for(std::size_t i=1; i<n; i++){
				prev = i-1;
				//Only positions where the attribute value changes are split candidates
				if(tables[attributeIndex][prev].value!=tables[attributeIndex][i].value){
					n1=i;
					n2 = n-n1;
					//Calculate the squared error of the split
					impurity = (n1*totalSumOfSquares(tmpLabels,0,n1,labelSumAbove)+n2*totalSumOfSquares(tmpLabels,n1,n2,labelSumBelow))/(double)(n);

					if(impurity<bestImpurity || bestImpurity<0){
						//Found a more pure split, store the attribute index and value
						doSplit = true;
						bestImpurity = impurity;
						bestAttributeIndex = attributeIndex;
						//index of the last case that goes to the left child
						bestAttributeValIndex = prev;
						bestAttributeVal = tables[attributeIndex][bestAttributeValIndex].value;
						bestLabels = tmpLabels;
					}
				}

				//Advance the running sums past case i
				labelSumAbove += tmpLabels[i];
				labelSumBelow -= tmpLabels[i];
			}
		}

		if(doSplit){

			//Split the attribute tables
			AttributeTables rTables, lTables;
			splitAttributeTables(tables, bestAttributeIndex, bestAttributeValIndex, lTables, rTables);

			//Split the labels
			std::vector<RealVector> lLabels, rLabels;
			for(std::size_t i = 0; i <= bestAttributeValIndex; i++){
				lLabels.push_back(bestLabels[i]);
			}
			for(std::size_t i = bestAttributeValIndex+1; i < bestLabels.size(); i++){
				rLabels.push_back(bestLabels[i]);
			}

			//Continue recursively. The right child's id is only known once the
			//left subtree has been built, because ids are handed out in pre-order.
			nodeInfo.attributeIndex = bestAttributeIndex;
			nodeInfo.attributeValue = bestAttributeVal;
			nodeInfo.leftNodeId = nodeId+1;
			lTree = buildTree(lTables, dataset, lLabels, nodeInfo.leftNodeId, trainSize);
			nodeInfo.rightNodeId = nodeInfo.leftNodeId + lTree.size();
			rTree = buildTree(rTables, dataset, rLabels, nodeInfo.rightNodeId, trainSize);
		}
	}


	//This node first, then the left and the right subtree (both empty for a leaf)
	tree.push_back(nodeInfo);
	tree.insert(tree.end(), lTree.begin(), lTree.end());
	tree.insert(tree.end(), rTree.begin(), rTree.end());

	//Store entry in the tree
	return tree;

}