//check inputs and labels of a loaded regression dataset against the raw csv values
void checkDataRegression(double* values, LabeledData<RealVector,RealVector> const& loaded, std::size_t labelStart, std::size_t labelEnd){
	BOOST_REQUIRE_EQUAL(loaded.numberOfElements(), numInputs);
	std::size_t inputStart = 0;
	std::size_t inputEnd = labelStart;
	if(labelStart == 0){
		inputStart = labelEnd;
		inputEnd = numDimensions + 1;
	}
	for (std::size_t i = 0; i != numInputs; ++i){
		for (std::size_t j = 0; j != numDimensions + 1; ++j) {
			//pick the element from the label or input columns, depending on j
			double element = 0;
			if(j >= labelStart && j < labelEnd){
				element = loaded.element(i).label(j - labelStart);
			}
			if(j >= inputStart && j < inputEnd){
				element = loaded.element(i).input(j - inputStart);
			}
			//NaNs never compare equal, so they need a separate check
			if(boost::math::isnan(values[i * (numDimensions + 1) + j])){
				BOOST_CHECK(boost::math::isnan(element));
			} else {
				BOOST_CHECK_EQUAL(element, values[i * (numDimensions + 1) + j]);
			}
		}
	}
}
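// Usage sketch for the helper above (hypothetical, not from the original suite):
// load a regression csv whose label is the last column and compare it cell by
// cell against the raw values. Assumes Shark's importCSV from shark/Data/Csv.h
// and the numInputs/numDimensions globals used above; the file name and the
// contents of values are placeholders.
void checkRegressionCsvSketch(){
	std::vector<double> values(numInputs * (numDimensions + 1)); //the file's cells, row-major
	LabeledData<RealVector, RealVector> loaded;
	importCSV(loaded, "regression.csv", LAST_COLUMN);
	//the label occupies the half-open column range [numDimensions, numDimensions + 1)
	checkDataRegression(values.data(), loaded, numDimensions, numDimensions + 1);
}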
TEST(Csv, ReadCsv) {
  LabeledData<RealVector, uint32_t> data;
  EXPECT_EQ(0U, data.size());

  // Reading a non-existent file must fail.
  EXPECT_FALSE(ReadCsv("null/null", &data));

  ASSERT_TRUE(ReadCsv("testdata/data/cls.10.csv", &data, LAST_COLUMN));
  const std::size_t kN = 10;
  EXPECT_EQ(kN, data.size());
  uint32_t expected_labels_arr[kN] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  Data<uint32_t> expected_labels;
  ToData(expected_labels_arr, kN, &expected_labels);
  EXPECT_EQ(expected_labels, data.labels());
  EXPECT_EQ(2U, data.input(0).size());
  EXPECT_DOUBLE_EQ(2.4114, data.input(0)(0));
  EXPECT_DOUBLE_EQ(-3.8901, data.input(0)(1));

  ASSERT_TRUE(ReadCsv("testdata/data/cls.csv", &data));
  EXPECT_EQ(1000U, data.size());

  // With FIRST_COLUMN the leading value becomes the label instead.
  LabeledData<RealVector, double> data2;
  ASSERT_TRUE(ReadCsv("testdata/data/cls.10.csv", &data2, FIRST_COLUMN));
  EXPECT_EQ(10U, data2.size());
  EXPECT_DOUBLE_EQ(2.4114, data2.label(0));
  EXPECT_EQ(2U, data2.input(0).size());
  EXPECT_DOUBLE_EQ(-3.8901, data2.input(0)(0));
  EXPECT_DOUBLE_EQ(0, data2.input(0)(1));
}
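// ToData above is a test utility defined elsewhere; this is a hypothetical
// sketch of the behaviour the assertions rely on: copy a raw array into a
// Data<T> container, one element per entry. The construction from a
// std::vector<T> is an assumption; the real helper may build the container
// differently.
template <typename T>
void ToData(const T* values, std::size_t n, Data<T>* out) {
  *out = Data<T>(std::vector<T>(values, values + n));
}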
void LinearRegression::train(LinearModel<>& model, LabeledData<RealVector, RealVector> const& dataset){
	std::size_t inputDim = inputDimension(dataset);
	std::size_t outputDim = labelDimension(dataset);
	std::size_t numInputs = dataset.numberOfElements();
	std::size_t numBatches = dataset.numberOfBatches();

	//Let P be the matrix of points with n rows and X = (P|1). The 1 represents the bias weight.
	//Let A = X^T X + lambda * I
	//then we have (for lambda = 0)
	//A = ( P^T P  P^T 1 )
	//    ( 1^T P  1^T 1 )
	RealMatrix matA(inputDim + 1, inputDim + 1, 0.0);
	blas::Blocking<RealMatrix> Ablocks(matA, inputDim, inputDim);

	//compute A batchwise
	typedef LabeledData<RealVector, RealVector>::const_batch_reference BatchRef;
	for (std::size_t b = 0; b != numBatches; b++){
		BatchRef batch = dataset.batch(b);
		symm_prod(trans(batch.input), Ablocks.upperLeft(), false);
		noalias(column(Ablocks.upperRight(), 0)) += sum_rows(batch.input);
	}
	row(Ablocks.lowerLeft(), 0) = column(Ablocks.upperRight(), 0);
	matA(inputDim, inputDim) = numInputs;
	//X^T X += lambda * I
	diag(Ablocks.upperLeft()) += blas::repeat(m_regularization, inputDim);

	//we also need to compute X^T L = (P^T L, 1^T L) where L is the matrix of labels
	RealMatrix XTL(inputDim + 1, outputDim, 0.0);
	for (std::size_t b = 0; b != numBatches; b++){
		BatchRef batch = dataset.batch(b);
		RealSubMatrix PTL = subrange(XTL, 0, inputDim, 0, outputDim);
		axpy_prod(trans(batch.input), batch.label, PTL, false);
		noalias(row(XTL, inputDim)) += sum_rows(batch.label);
	}

	//we solve the system A Beta = X^T L.
	//usually this is written via the Moore-Penrose inverse:
	//Beta = A^-1 X^T L
	//but it is faster and numerically more stable to solve it as a symmetric system.
	//we can use the in-place solver
	RealMatrix& beta = XTL;
	blas::solveSymmSemiDefiniteSystemInPlace<blas::SolveAXB>(matA, beta);

	RealMatrix matrix = subrange(trans(beta), 0, outputDim, 0, inputDim);
	RealVector offset = row(beta, inputDim);

	//write parameters into the model
	model.setStructure(matrix, offset);
}
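// Usage sketch for the trainer above (a minimal example, assuming Shark 3's
// LinearRegression, LinearModel and SquaredLoss; "data" is any regression
// dataset of type LabeledData<RealVector, RealVector>):
void trainAndEvaluateSketch(LabeledData<RealVector, RealVector> const& data){
	LinearRegression trainer(0.1); //regularization strength lambda = 0.1
	LinearModel<> model;
	trainer.train(model, data);

	//mean squared error on the training data
	SquaredLoss<> loss;
	Data<RealVector> predictions = model(data.inputs());
	std::cout << "squared loss: " << loss.eval(data.labels(), predictions) << std::endl;
}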
void CMySharkML::Features2SharkData(LabeledData<RealVector, unsigned int> &dataset, cv::Mat &features, std::vector<int> &v_label)
{
	//copy the rows of the feature matrix into the dataset
	std::size_t rows = features.rows;
	std::size_t dimensions = features.cols;
	std::vector<std::size_t> batchSizes = shark::detail::optimalBatchSizes(rows, 256);

	//create a dataset with the precomputed batch structure
	dataset = LabeledData<RealVector, unsigned int>(batchSizes.size());
	std::size_t currentRow = 0;
	for(std::size_t b = 0; b != batchSizes.size(); ++b) {
		RealMatrix& inputs = dataset.batch(b).input;
		UIntVector& labels = dataset.batch(b).label;
		inputs.resize(batchSizes[b], dimensions);
		labels.resize(batchSizes[b]);
		//copy the rows into the batch
		for(std::size_t i = 0; i != batchSizes[b]; ++i, ++currentRow){
			int rawLabel = v_label[currentRow];
			labels[i] = rawLabel;
			for(std::size_t j = 0; j != dimensions; ++j){
				inputs(i,j) = features.at<float>(currentRow, j);
			}
		}
	}
}
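// Usage sketch for the conversion above (hypothetical glue code, not from the
// original class): build a small random OpenCV feature matrix with alternating
// labels and convert it. Assumes features are CV_32F with one row per sample.
void exampleConversionSketch(CMySharkML& converter){
	cv::Mat features(100, 16, CV_32F);
	cv::randu(features, cv::Scalar::all(0), cv::Scalar::all(1)); //random features for illustration
	std::vector<int> labels(100);
	for(std::size_t i = 0; i != labels.size(); ++i)
		labels[i] = static_cast<int>(i % 2); //two alternating classes

	LabeledData<RealVector, unsigned int> dataset;
	converter.Features2SharkData(dataset, features, labels);
	std::cout << "elements: " << dataset.numberOfElements()
	          << ", classes: " << numberOfClasses(dataset) << std::endl;
}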
template<class T, class V, class U>
void checkDataEquality(T* values, unsigned int* labels, LabeledData<V,U> const& loaded){
	BOOST_REQUIRE_EQUAL(loaded.numberOfElements(), numInputs);
	BOOST_REQUIRE_EQUAL(inputDimension(loaded), numDimensions);
	for (std::size_t i = 0; i != numInputs; ++i){
		for (std::size_t j = 0; j != numDimensions; ++j) {
			//NaNs never compare equal, so they need a separate check
			if(boost::math::isnan(values[i * numDimensions + j])){
				BOOST_CHECK(boost::math::isnan(loaded.element(i).input(j)));
			} else {
				BOOST_CHECK_EQUAL(loaded.element(i).input(j), values[i * numDimensions + j]);
			}
		}
		BOOST_CHECK_EQUAL(loaded.element(i).label, labels[i]);
	}
}
/// append new data to the training data
void TrainingData::appendData(const LabeledData &labeledData)
{
	assert((labeledData.getNumPosExamples() + labeledData.getNumNegExamples())
	       == labeledData.getNumExamples());

	const size_t
			initialNumberOfTrainingSamples = getNumExamples(),
			finalNumberOfTrainingSamples = initialNumberOfTrainingSamples + labeledData.getNumExamples();

	if(finalNumberOfTrainingSamples > getMaxNumExamples())
	{
		throw std::runtime_error("TrainingData::appendData is trying to add more data than initially specified");
	}

	// compute and save the feature responses
#pragma omp parallel for default(none) ordered shared(labeledData)
	for (size_t labelDataIndex = 0; labelDataIndex < labeledData.getNumExamples(); ++labelDataIndex)
	{
		const meta_datum_t &metaDatum = labeledData.getMetaDatum(labelDataIndex);
		const LabeledData::integral_channels_t &integralImage = labeledData.getIntegralImage(labelDataIndex);

		setDatum(initialNumberOfTrainingSamples + labelDataIndex, metaDatum, integralImage);
	} // end of "for each labeled datum"

	return;
}
int main(){
	//get problem data
	Problem problem(1.0);
	LabeledData<RealVector, unsigned int> training = problem.generateDataset(1000);
	LabeledData<RealVector, unsigned int> test = problem.generateDataset(100);

	std::size_t inputs = inputDimension(training);
	std::size_t outputs = numberOfClasses(training);
	std::size_t hiddens = 10;
	unsigned numberOfSteps = 1000;

	//create network and initialize weights uniformly at random
	FFNet<LogisticNeuron, LinearNeuron> network;
	network.setStructure(inputs, hiddens, outputs);
	initRandomUniform(network, -0.1, 0.1);

	//create error function
	CrossEntropy loss;
	ErrorFunction error(training, &network, &loss);

	//loss for evaluation
	//the ZeroOneLoss for multiclass problems assigns the class corresponding to the highest output
	ZeroOneLoss<unsigned int, RealVector> loss01;

	//evaluate initial network
	Data<RealVector> prediction = network(training.inputs());
	cout << "classification error before learning:\t" << loss01.eval(training.labels(), prediction) << endl;

	//initialize Rprop and train
	IRpropPlus optimizer;
	optimizer.init(error);
	for(unsigned step = 0; step != numberOfSteps; ++step)
		optimizer.step(error);

	//evaluate the solution found by training
	network.setParameterVector(optimizer.solution().point); //set weights to the weights found by learning
	prediction = network(training.inputs());
	cout << "classification error after learning:\t" << loss01.eval(training.labels(), prediction) << endl;
}
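// The test set generated above is never evaluated; a sketch of the missing
// test-set evaluation, using only the types already present in main:
void evaluateOnTest(FFNet<LogisticNeuron, LinearNeuron>& network,
                    LabeledData<RealVector, unsigned int> const& test){
	ZeroOneLoss<unsigned int, RealVector> loss01;
	Data<RealVector> prediction = network(test.inputs());
	std::cout << "classification error on test set:\t"
	          << loss01.eval(test.labels(), prediction) << std::endl;
}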
void LDA::train(LinearClassifier<>& model, LabeledData<RealVector, unsigned int> const& dataset){
	if(dataset.empty()){
		throw SHARKEXCEPTION("[LDA::train] the dataset must not be empty");
	}
	std::size_t inputs = dataset.numberOfElements();
	std::size_t dim = inputDimension(dataset);
	std::size_t classes = numberOfClasses(dataset);

	//required statistics
	UIntVector num(classes, 0);
	RealMatrix means(classes, dim, 0.0);
	RealMatrix covariance(dim, dim, 0.0);

	//we compute the statistics batch-wise
	for(auto const& batch: dataset.batches()){
		UIntVector const& labels = batch.label;
		RealMatrix const& points = batch.input;

		//update class counts and sums of class members
		std::size_t currentBatchSize = points.size1();
		for (std::size_t e = 0; e != currentBatchSize; e++){
			std::size_t c = labels(e);
			++num(c);
			noalias(row(means, c)) += row(points, e);
		}
		//update second moment matrix
		noalias(covariance) += prod(trans(points), points);
	}
	covariance /= inputs - classes;

	//calculate the means and the covariance matrix from the second moment
	for (std::size_t c = 0; c != classes; c++){
		if (num(c) == 0)
			throw SHARKEXCEPTION("[LDA::train] LDA can not handle a class without examples");
		row(means, c) /= num(c);
		double factor = num(c);
		factor /= inputs - classes;
		noalias(covariance) -= factor * outer_prod(row(means, c), row(means, c));
	}

	//add regularization
	if(m_regularization > 0){
		for(std::size_t i = 0; i != dim; ++i)
			covariance(i,i) += m_regularization;
	}

	//the formula for the linear classifier is
	// arg max_i log(P(x|i) * P(i))
	//= arg max_i log(P(x|i)) + log(P(i))
	//= arg max_i -0.5*(x-m_i)^T C^-1 (x-m_i) + log(P(i))
	//= arg max_i x^T C^-1 m_i - 0.5*m_i^T C^-1 m_i + log(P(i))
	//where the term -0.5*x^T C^-1 x was dropped as it is the same for all classes.
	//so we first compute z_i^T = m_i^T C^-1 <=> z_i^T C = m_i^T
	//this is the expensive step of the calculation.
	RealMatrix transformedMeans = means;
	blas::solveSymmSemiDefiniteSystemInPlace<blas::SolveXAB>(covariance, transformedMeans);

	//compute the bias terms -0.5*m_i^T C^-1 m_i + log(P(i))
	RealVector bias(classes);
	for(std::size_t c = 0; c != classes; ++c){
		double prior = std::log(double(num(c)) / inputs);
		bias(c) = -0.5 * inner_prod(row(means, c), row(transformedMeans, c)) + prior;
	}

	//fill the model
	model.decisionFunction().setStructure(transformedMeans, bias);
}
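// Usage sketch for the trainer above (a minimal example; setRegularization is
// assumed to be the setter behind m_regularization used in the code):
void trainLDASketch(LabeledData<RealVector, unsigned int> const& data){
	LDA trainer;
	trainer.setRegularization(1.e-6); //a small ridge keeps C invertible
	LinearClassifier<> model;
	trainer.train(model, data);

	ZeroOneLoss<unsigned int> loss01;
	Data<unsigned int> predictions = model(data.inputs());
	std::cout << "training error: " << loss01.eval(data.labels(), predictions) << std::endl;
}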
// optimize the sigmoid using Platt's method
void SigmoidFitPlatt::train(SigmoidModel& model, LabeledData<RealVector, unsigned int> const& dataset){
	SIZE_CHECK(model.numberOfParameters() == 2);
	typedef LabeledData<RealVector, unsigned int>::const_element_range Elements;
	typedef IndexedIterator<boost::range_iterator<Elements>::type> Iterator;

	double a, b, c, d, e, d1, d2;
	double t = 0.0;
	double oldA, oldB, diff, scale, det;
	double err = 0.0;
	double value, p;
	double lambda = 0.001;
	double olderr = 1e100;

	std::vector<std::size_t> classCount = classSizes(dataset);
	std::size_t pos = classCount[1];
	std::size_t neg = classCount[0];
	std::size_t ic = pos + neg;
	Iterator end(dataset.elements().end(), ic);

	double A = 0.0;
	double B = std::log((neg + 1.0) / (pos + 1.0));
	double lowTarget = 1.0 / (neg + 2.0);
	double highTarget = (pos + 1.0) / (pos + 2.0);
	RealVector pp(ic, (pos + 1.0) / (pos + neg + 2.0));
	int count = 0;
	for (std::size_t iteration = 0; iteration < 100; iteration++) {
		//accumulate gradient (d, e) and Hessian entries (a, b, c) of the cross-entropy error
		a = b = c = d = e = 0.0;
		for (Iterator it(dataset.elements().begin(), 0); it != end; ++it){
			std::size_t i = it.index();
			t = (it->label == 1) ? highTarget : lowTarget;
			d1 = pp(i) - t;
			d2 = pp(i) * (1.0 - pp(i));
			value = it->input(0);
			a += d2 * value * value;
			b += d2;
			c += d2 * value;
			d += d1 * value;
			e += d1;
		}
		if (std::abs(d) < 1e-9 && std::abs(e) < 1e-9)
			break;
		oldA = A;
		oldB = B;
		err = 0.0;
		//Levenberg-Marquardt style step: increase lambda until the error decreases
		while (true) {
			det = (a + lambda) * (b + lambda) - c * c;
			if (det == 0.0) {
				lambda *= 10.0;
				continue;
			}
			A = oldA + ((b + lambda) * d - c * e) / det;
			B = oldB + ((a + lambda) * e - c * d) / det;
			err = 0.0;
			for (Iterator it(dataset.elements().begin(), 0); it != end; ++it){
				std::size_t i = it.index();
				p = 1.0 / (1.0 + std::exp(A * it->input(0) + B));
				pp(i) = p;
				t = (it->label == 1) ? highTarget : lowTarget; //per-sample target
				err -= t * safeLog(p) + (1.0 - t) * safeLog(1.0 - p);
			}
			if (err < 1.0000001 * olderr) {
				lambda *= 0.1;
				break;
			}
			lambda *= 10.0;
			if (lambda >= 1e6) {
				// something is broken; give up
				break;
			}
		}
		diff = err - olderr;
		scale = 0.5 * (err + olderr + 1.0);
		if (diff > -1e-3 * scale && diff < 1e-7 * scale)
			count++;
		else
			count = 0;
		olderr = err;
		if (count == 3)
			break;
	}
	RealVector params(2);
	params(0) = A;
	params(1) = B;
	model.setParameterVector(params);
}
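// Usage sketch for the fitter above (a minimal example; the dataset must hold
// one-dimensional decision values, e.g. raw SVM outputs, with 0/1 labels, and
// SigmoidModel is assumed to be default constructible):
void calibrateSketch(LabeledData<RealVector, unsigned int> const& decisionValues){
	SigmoidModel sigmoid;
	SigmoidFitPlatt fitter;
	fitter.train(sigmoid, decisionValues);
	//sigmoid now maps a decision value to a probability estimate of label 1
}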
void testDatasetEquality(LabeledData<int, int> const& set1, LabeledData<int, int> const& set2){
	BOOST_REQUIRE_EQUAL(set1.numberOfBatches(), set2.numberOfBatches());
	BOOST_REQUIRE_EQUAL(set1.numberOfElements(), set2.numberOfElements());
	for(std::size_t i = 0; i != set1.numberOfBatches(); ++i){
		BOOST_REQUIRE_EQUAL(set1.batch(i).input.size(), set1.batch(i).label.size());
		BOOST_REQUIRE_EQUAL(set2.batch(i).input.size(), set2.batch(i).label.size());
	}
	testSetEquality(set1.inputs(), set2.inputs());
	testSetEquality(set1.labels(), set2.labels());
}
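// testSetEquality is a helper defined elsewhere in the test utilities; a
// minimal sketch of the element-wise check it plausibly performs, under a
// distinct name to avoid claiming it is the real implementation:
template<class T>
void testSetEqualitySketch(Data<T> const& set1, Data<T> const& set2){
	BOOST_REQUIRE_EQUAL(set1.numberOfElements(), set2.numberOfElements());
	for(std::size_t i = 0; i != set1.numberOfElements(); ++i){
		BOOST_CHECK_EQUAL(set1.element(i), set2.element(i));
	}
}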