//Train model with a regression dataset void CARTTrainer::train(ModelType& model, RegressionDataset const& dataset) { //Store the number of input dimensions m_inputDimension = inputDimension(dataset); //Pass input dimension (i.e., number of attributes) to tree model model.setInputDimension(m_inputDimension); //Store the size of the labels m_labelDimension = labelDimension(dataset); // create cross-validation folds RegressionDataset set=dataset; CVFolds<RegressionDataset > folds = createCVSameSize(set, m_numberOfFolds); double bestErrorRate = std::numeric_limits<double>::max(); CARTClassifier<RealVector>::TreeType bestTree; for (unsigned fold = 0; fold < m_numberOfFolds; ++fold){ //Run through all the cross validation sets RegressionDataset dataTrain = folds.training(fold); RegressionDataset dataTest = folds.validation(fold); std::size_t numTrainElements = dataTrain.numberOfElements(); AttributeTables tables = createAttributeTables(dataTrain.inputs()); std::vector < RealVector > labels(numTrainElements); boost::copy(dataTrain.labels().elements(),labels.begin()); //Build tree form this fold CARTClassifier<RealVector>::TreeType tree = buildTree(tables, dataTrain, labels, 0, dataTrain.numberOfElements()); //Add the tree to the model and prune model.setTree(tree); while(true){ //evaluate the error of current tree SquaredLoss<> loss; double error = loss.eval(dataTest.labels(), model(dataTest.inputs())); if(error < bestErrorRate){ //We have found a subtree that has a smaller error rate when tested! bestErrorRate = error; bestTree = tree; } if(tree.size() == 1) break; pruneTree(tree); model.setTree(tree); } } SHARK_CHECK(bestTree.size() > 0, "We should never set a tree that is empty."); model.setTree(bestTree); }
int main(int argc, char **argv) { RegressionDataset data; importCSV(data, "blogData_train.csv", LAST_COLUMN,1,',','#', 2<<16); LinearRegression trainer(100); LinearModel<> model; Timer time; trainer.train(model, data); double time_taken = time.stop(); SquaredLoss<> loss; cout << "Residual sum of squares:" << loss(data.labels(),model(data.inputs()))<<std::endl; cout << "Time:\n" << time_taken << endl; cout << time_taken << endl; }
// Regression void RFTrainer::train(RFClassifier& model, RegressionDataset const& dataset) { model.clearModels(); // added by TG 23.02.2015 //TODO O.K.: i am just fixing these things for now so that they are working. //Store the number of input dimensions m_inputDimension = inputDimension(dataset); //Store the size of the labels m_labelDimension = labelDimension(dataset); model.setInputDimension(m_inputDimension); model.setLabelDimension(m_labelDimension); m_regressionLearner = true; setDefaults(); //we need direct element access sicne we need to generate elementwise subsets std::size_t subsetSize = static_cast<std::size_t>(dataset.numberOfElements()*m_OOBratio); DataView<RegressionDataset const> elements(dataset); //Generate m_B trees SHARK_PARALLEL_FOR(int i = 0; i < (int)m_B; ++i){ //For each tree generate a subset of the dataset //generate indices of the dataset (pick k out of n elements) std::vector<std::size_t> subsetIndices(dataset.numberOfElements()); boost::iota(subsetIndices,0); boost::random_shuffle(subsetIndices); // create oob indices std::vector<std::size_t>::iterator oobStart = subsetIndices.begin() + subsetSize; std::vector<std::size_t>::iterator oobEnd = subsetIndices.end(); //generate the dataset by copying (TODO: this is a quick fix! subsetIndices.erase(oobStart, oobEnd); RegressionDataset dataTrain = toDataset(subset(elements,subsetIndices)); AttributeTables tables; createAttributeTables(dataTrain.inputs(), tables); std::size_t dataTrainSize = dataTrain.numberOfElements(); std::vector<RealVector> labels; for(std::size_t i = 0; i < dataTrainSize; i++){ labels.push_back(dataTrain.element(i).label); } CARTClassifier<RealVector>::TreeType tree = buildTree(tables, dataTrain, labels, 0); CARTClassifier<RealVector> cart(tree, m_inputDimension); // if oob error or importances have to be computed, create an oob sample if(m_computeOOBerror || m_computeFeatureImportances){ std::vector<std::size_t> subsetIndicesOOB(oobStart, oobEnd); RegressionDataset dataOOB = toDataset(subset(elements, subsetIndicesOOB)); // if importances should be computed, oob errors are computed implicitly if(m_computeFeatureImportances){ cart.computeFeatureImportances(dataOOB); } // if importances should not be computed, only compute the oob errors else{ cart.computeOOBerror(dataOOB); } } SHARK_CRITICAL_REGION{ model.addModel(cart); } } if(m_computeOOBerror){ model.computeOOBerror(); } if(m_computeFeatureImportances){ model.computeFeatureImportances(); } }