void testDatasetEquality(LabeledData<int, int> const& set1, LabeledData<int, int> const& set2){ BOOST_REQUIRE_EQUAL(set1.numberOfBatches(),set2.numberOfBatches()); BOOST_REQUIRE_EQUAL(set1.numberOfElements(),set2.numberOfElements()); for(std::size_t i = 0; i != set1.numberOfBatches(); ++i){ BOOST_REQUIRE_EQUAL(set1.batch(i).input.size(),set1.batch(i).label.size()); BOOST_REQUIRE_EQUAL(set2.batch(i).input.size(),set2.batch(i).label.size()); } testSetEquality(set1.inputs(),set2.inputs()); testSetEquality(set1.labels(),set2.labels()); }
//numInputs and numDimensions are file-scope constants of the test; the label
//block [labelStart, labelEnd) is assumed to sit either at the front or at the
//end of every row of values.
void checkDataRegression(double* values, LabeledData<RealVector,RealVector> const& loaded, std::size_t labelStart, std::size_t labelEnd){
	BOOST_REQUIRE_EQUAL(loaded.numberOfElements(),numInputs);
	//by default the inputs are the columns in front of the label block...
	std::size_t inputStart = 0;
	std::size_t inputEnd = labelStart;
	//...unless the labels come first, in which case the inputs follow them
	if(labelStart == 0){
		inputStart = labelEnd;
		inputEnd = numDimensions+1;
	}
	for (size_t i=0; i != numInputs; ++i){
		for (size_t j=0; j != numDimensions+1; ++j){
			//pick the dataset entry that corresponds to column j
			double element = 0;
			if(j >= labelStart && j < labelEnd){
				element = loaded.element(i).label(j-labelStart);
			}
			if(j >= inputStart && j < inputEnd){
				element = loaded.element(i).input(j-inputStart);
			}
			//NaNs do not compare equal, so they need a separate check
			if(boost::math::isnan(values[i*(numDimensions+1)+j])){
				BOOST_CHECK(boost::math::isnan(element));
			}else{
				BOOST_CHECK_EQUAL(element, values[i*(numDimensions+1)+j]);
			}
		}
	}
}
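//Worked example, hypothetical: with the test constants numInputs == 1 and
//numDimensions == 3 there are four columns per row, and labelStart == 0,
//labelEnd == 1 places the label in column 0 and the inputs in columns 1..3.
void exampleCheckDataRegression(){
	std::vector<RealVector> in(1, RealVector(3));
	std::vector<RealVector> lab(1, RealVector(1));
	in[0](0) = 1.0; in[0](1) = 2.0; in[0](2) = 3.0;
	lab[0](0) = 7.0;
	LabeledData<RealVector, RealVector> loaded = createLabeledDataFromRange(in, lab);
	double values[] = {7.0, 1.0, 2.0, 3.0}; //the same row flattened, label first
	checkDataRegression(values, loaded, 0, 1); //all checks pass
}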
void LinearRegression::train(LinearModel<>& model, LabeledData<RealVector, RealVector> const& dataset){
	std::size_t inputDim = inputDimension(dataset);
	std::size_t outputDim = labelDimension(dataset);
	std::size_t numInputs = dataset.numberOfElements();
	std::size_t numBatches = dataset.numberOfBatches();

	//Let P be the matrix of points with n rows and X = (P|1); the 1 represents the bias weight.
	//Let A = X^T X + lambda * I
	//then we have (for lambda = 0)
	//A = ( P^T P  P^T 1 )
	//    ( 1^T P  1^T 1 )
	RealMatrix matA(inputDim+1,inputDim+1,0.0);
	blas::Blocking<RealMatrix> Ablocks(matA,inputDim,inputDim);
	//compute A batch-wise
	typedef LabeledData<RealVector, RealVector>::const_batch_reference BatchRef;
	for (std::size_t b=0; b != numBatches; b++){
		BatchRef batch = dataset.batch(b);
		symm_prod(trans(batch.input),Ablocks.upperLeft(),false);
		noalias(column(Ablocks.upperRight(),0)) += sum_rows(batch.input);
	}
	row(Ablocks.lowerLeft(),0) = column(Ablocks.upperRight(),0);
	matA(inputDim,inputDim) = numInputs;
	//X^T X += lambda * I
	diag(Ablocks.upperLeft()) += blas::repeat(m_regularization,inputDim);

	//we also need to compute X^T L = (P^T L, 1^T L) where L is the matrix of labels
	RealMatrix XTL(inputDim + 1,outputDim,0.0);
	for (std::size_t b=0; b != numBatches; b++){
		BatchRef batch = dataset.batch(b);
		RealSubMatrix PTL = subrange(XTL,0,inputDim,0,outputDim);
		axpy_prod(trans(batch.input),batch.label,PTL,false);
		noalias(row(XTL,inputDim)) += sum_rows(batch.label);
	}

	//we solve the system A Beta = X^T L
	//usually this is solved via the Moore-Penrose inverse:
	//Beta = A^-1 X^T L
	//but it is faster and numerically more stable to solve it as a symmetric system;
	//since X^T L is not needed afterwards, we can solve in place
	RealMatrix& beta = XTL;
	blas::solveSymmSemiDefiniteSystemInPlace<blas::SolveAXB>(matA,beta);

	RealMatrix matrix = subrange(trans(beta), 0, outputDim, 0, inputDim);
	RealVector offset = row(beta,inputDim);

	//write parameters into the model
	model.setStructure(matrix, offset);
}
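//Usage sketch, not part of the library: the trainer on a toy 1D problem.
//setRegularization and createLabeledDataFromRange follow the Shark 3 API as
//I understand it; the data values are made up.
#include <shark/Algorithms/Trainers/LinearRegression.h>
#include <shark/Models/LinearModel.h>
#include <shark/Data/Dataset.h>
using namespace shark;

void exampleLinearRegression(){
	//four points on the line y = 2x + 1
	std::vector<RealVector> x(4, RealVector(1));
	std::vector<RealVector> y(4, RealVector(1));
	for(std::size_t i = 0; i != 4; ++i){
		x[i](0) = double(i);
		y[i](0) = 2.0*i + 1.0;
	}
	LabeledData<RealVector, RealVector> data = createLabeledDataFromRange(x, y);

	LinearModel<> model;
	LinearRegression trainer;
	trainer.setRegularization(0.0); //plain least squares, no ridge penalty
	trainer.train(model, data);
	//model now realizes f(x) = matrix * x + offset with matrix ~ 2, offset ~ 1
}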
//numInputs and numDimensions are file-scope constants of the test
template<class T, class V, class U>
void checkDataEquality(T* values, unsigned int* labels, LabeledData<V,U> const& loaded){
	BOOST_REQUIRE_EQUAL(loaded.numberOfElements(),numInputs);
	BOOST_REQUIRE_EQUAL(inputDimension(loaded),numDimensions);
	for (size_t i=0; i != numInputs; ++i){
		for (size_t j=0; j != numDimensions; ++j){
			//NaNs do not compare equal, so they need a separate check
			if(boost::math::isnan(values[i*numDimensions+j])){
				BOOST_CHECK(boost::math::isnan(loaded.element(i).input(j)));
			}else{
				BOOST_CHECK_EQUAL(loaded.element(i).input(j), values[i*numDimensions+j]);
			}
		}
		BOOST_CHECK_EQUAL(loaded.element(i).label, labels[i]);
	}
}
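//Usage sketch, hypothetical: build a dataset from the same raw arrays the
//helper compares against. Passes only if the file-scope test constants are
//numInputs == 2 and numDimensions == 2.
void exampleCheckDataEquality(){
	double values[] = {0.0, 1.0, 2.0, 3.0}; //row-major, numInputs x numDimensions
	unsigned int labels[] = {0, 1};
	std::vector<RealVector> in(2, RealVector(2));
	std::vector<unsigned int> lab(2);
	for(std::size_t i = 0; i != 2; ++i){
		for(std::size_t j = 0; j != 2; ++j) in[i](j) = values[2*i + j];
		lab[i] = labels[i];
	}
	LabeledData<RealVector, unsigned int> loaded = createLabeledDataFromRange(in, lab);
	checkDataEquality(values, labels, loaded); //T=double, V=RealVector, U=unsigned int
}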
void LDA::train(LinearClassifier<>& model, LabeledData<RealVector,unsigned int> const& dataset){
	if(dataset.empty()){
		throw SHARKEXCEPTION("[LDA::train] the dataset must not be empty");
	}
	std::size_t inputs = dataset.numberOfElements();
	std::size_t dim = inputDimension(dataset);
	std::size_t classes = numberOfClasses(dataset);

	//required statistics
	UIntVector num(classes,0);
	RealMatrix means(classes, dim,0.0);
	RealMatrix covariance(dim, dim,0.0);

	//we compute the statistics batch-wise
	for(auto const& batch: dataset.batches()){
		UIntVector const& labels = batch.label;
		RealMatrix const& points = batch.input;
		//load batch and update mean
		std::size_t currentBatchSize = points.size1();
		for (std::size_t e=0; e != currentBatchSize; e++){
			//update mean and class count for this sample
			std::size_t c = labels(e);
			++num(c);
			noalias(row(means,c)) += row(points,e);
		}
		//update second moment matrix
		noalias(covariance) += prod(trans(points),points);
	}
	covariance /= inputs-classes;
	//calculate the class means and turn the second moment into the
	//pooled within-class covariance
	for (std::size_t c = 0; c != classes; c++){
		if (num[c] == 0)
			throw SHARKEXCEPTION("[LDA::train] LDA cannot handle a class without examples");
		row(means,c) /= num(c);
		double factor = num(c);
		factor /= inputs-classes;
		noalias(covariance) -= factor*outer_prod(row(means,c),row(means,c));
	}
	//add regularization
	if(m_regularization > 0){
		for(std::size_t i=0; i != dim; ++i)
			covariance(i,i) += m_regularization;
	}

	//the formula for the linear classifier is
	// arg max_i log(P(x|i) * P(i))
	//= arg max_i log(P(x|i)) + log(P(i))
	//= arg max_i -0.5 (x-m_i)^T C^-1 (x-m_i) + log(P(i))
	//= arg max_i x^T C^-1 m_i - 0.5 m_i^T C^-1 m_i + log(P(i))
	//where the term -0.5 x^T C^-1 x is the same for all classes and was dropped.
	//so we first compute C^-1 m_i and then the bias terms

	//compute z_i = m_i^T C^-1 <=> z_i C = m_i
	//this is the expensive step of the calculation.
	RealMatrix transformedMeans = means;
	blas::solveSymmSemiDefiniteSystemInPlace<blas::SolveXAB>(covariance,transformedMeans);

	//compute bias terms -0.5 m_i^T C^-1 m_i + log(P(i))
	RealVector bias(classes);
	for(std::size_t c = 0; c != classes; ++c){
		double prior = std::log(double(num(c))/inputs);
		bias(c) = -0.5*inner_prod(row(means,c),row(transformedMeans,c)) + prior;
	}

	//fill the model
	model.decisionFunction().setStructure(transformedMeans,bias);
}
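//Usage sketch, not part of the library: LDA on a toy two-class problem.
//ClassificationDataset is Shark's typedef for LabeledData<RealVector, unsigned int>;
//the data values are made up for illustration.
#include <shark/Algorithms/Trainers/LDA.h>
#include <shark/Models/LinearClassifier.h>
#include <shark/Data/Dataset.h>
using namespace shark;

void exampleLDA(){
	//two well-separated clusters in 2D
	std::vector<RealVector> x(4, RealVector(2));
	std::vector<unsigned int> y(4);
	x[0](0) = 0.0; x[0](1) = 0.1; y[0] = 0;
	x[1](0) = 0.2; x[1](1) = 0.0; y[1] = 0;
	x[2](0) = 2.0; x[2](1) = 2.1; y[2] = 1;
	x[3](0) = 2.2; x[3](1) = 1.9; y[3] = 1;
	ClassificationDataset data = createLabeledDataFromRange(x, y);

	LinearClassifier<> model;
	LDA trainer;
	trainer.setRegularization(1.e-8); //a tiny ridge keeps the covariance invertible
	trainer.train(model, data);
	unsigned int c = model(x[0]); //predicted class of the first point, here 0
	(void)c;
}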