//' Simple KMeans Train
//'
//' @export
//'
// [[Rcpp::depends(BH)]]
// [[Rcpp::export]]
List SharkKMeansTrain (NumericMatrix X, ssize_t k) {
    // convert data
    UnlabeledData<RealVector> data = NumericMatrixToUnlabeledData (X);

    // compute centroids using k-means clustering
    Centroids centroids;
    kMeans (data, k, centroids);

    // convert cluster centers/centroids
    Data<RealVector> const& c = centroids.centroids();
    NumericMatrix cM = DataRealVectorToNumericMatrix (c);

    // cluster the training data we were given and convert to an Rcpp object
    HardClusteringModel<RealVector> model (&centroids);
    Data<unsigned> clusters = model (data);
    NumericVector labels = LabelsToNumericVector (clusters);

    // return the solution found
    return Rcpp::List::create (
        Rcpp::Named ("labels") = labels,
        Rcpp::Named ("centroids") = cM);
}
//' Simple KMeans Predict
//'
//' @export
//'
// [[Rcpp::depends(BH)]]
// [[Rcpp::export]]
List SharkKMeansPredict (NumericMatrix X, NumericMatrix centroids) {
    // convert data
    UnlabeledData<RealVector> data = NumericMatrixToUnlabeledData (X);

    // convert centroids
    Centroids c (NumericMatrixToDataRealVector (centroids));

    // cluster the data we were given and convert to an Rcpp object
    HardClusteringModel<RealVector> model (&c);
    Data<unsigned> clusters = model (data);
    NumericVector labels = LabelsToNumericVector (clusters);

    // return the solution found
    return Rcpp::List::create (Rcpp::Named ("labels") = labels);
}
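// The helpers NumericMatrixToUnlabeledData, NumericMatrixToDataRealVector,
// DataRealVectorToNumericMatrix and LabelsToNumericVector used above are part of
// the package's glue code. As a minimal sketch (an assumption, not the actual
// implementation), the NumericMatrix -> UnlabeledData direction could copy each
// row of the R matrix into a RealVector and build a Shark data container from
// the resulting range:
UnlabeledData<RealVector> NumericMatrixToUnlabeledDataSketch (NumericMatrix X) {
    std::vector<RealVector> rows (X.nrow (), RealVector (X.ncol ()));
    for (int i = 0; i < X.nrow (); ++i)
        for (int j = 0; j < X.ncol (); ++j)
            rows[i](j) = X (i, j);
    // copies the points into Shark's batched data format
    return createUnlabeledDataFromRange (rows);
}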
void DiscreteLoss::defineBalancedCost(UnlabeledData<unsigned int> const& labels){
    std::size_t classes = numberOfClasses(labels);
    std::size_t ic = labels.numberOfElements();

    // count the number of occurrences of each class label
    std::vector<unsigned int> freq(classes);
    BOOST_FOREACH(unsigned int label, labels.elements()){
        freq[label]++;
    }

    // the cost of misclassifying class i is inversely proportional to its frequency
    m_cost.resize(classes, classes);
    for (std::size_t i = 0; i != classes; i++){
        double c = (freq[i] == 0) ? 1.0 : ic / (double)(classes * freq[i]);
        for (std::size_t j = 0; j != classes; j++)
            m_cost(i, j) = c;
        m_cost(i, i) = 0.0;
    }
}
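// Worked example of the balancing rule above (a standalone sketch, independent
// of Shark): with ic = 100 samples in classes = 2 classes of sizes 90 and 10,
// misclassifying an instance of class i costs ic / (classes * freq[i]), so both
// classes carry the same total weight freq[i] * cost_i == ic / classes == 50,
// regardless of how imbalanced they are.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<unsigned int> freq = {90, 10}; // class frequencies, ic = 100
    double ic = 100.0, classes = 2.0;
    for (std::size_t i = 0; i != freq.size(); i++) {
        double cost = ic / (classes * freq[i]); // 0.556 for class 0, 5.0 for class 1
        std::printf("class %zu: cost %.3f, total weight %.1f\n",
                    i, cost, cost * freq[i]);   // both totals are 50
    }
    return 0;
}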
void NormalizeComponentsWhitening::train(ModelType& model, UnlabeledData<RealVector> const& input){
    std::size_t dc = dataDimension(input);
    SHARK_CHECK(input.numberOfElements() >= dc + 1, "[NormalizeComponentsWhitening::train] input needs to contain more points than there are input dimensions");
    SHARK_CHECK(m_targetVariance > 0.0, "[NormalizeComponentsWhitening::train] target variance must be positive");

    // dense model with bias, input and output dimension both equal to the data dimension
    model.setStructure(dc, dc, true);

    // estimate mean and covariance of the input data
    RealVector mean;
    RealMatrix covariance;
    meanvar(input, mean, covariance);

    // scale the whitening matrix so the output components have the target variance
    RealMatrix whiteningMatrix = createWhiteningMatrix(covariance);
    whiteningMatrix *= std::sqrt(m_targetVariance);

    // the affine map x |-> W^T x + offset centers the data at zero
    RealVector offset = -prod(trans(whiteningMatrix), mean);
    model.setStructure(RealMatrix(trans(whiteningMatrix)), offset);
}
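// Usage sketch for the trainer above (assuming Shark's LinearModel, transform
// and meanvar as used elsewhere in this code; 'data' is a hypothetical input
// dataset): after training, the transformed data should have approximately
// zero mean and covariance m_targetVariance times the identity, which meanvar
// makes easy to check.
void whiteningUsageSketch(UnlabeledData<RealVector> const& data) {
    LinearModel<> whitener;
    NormalizeComponentsWhitening trainer(1.0); // target variance 1.0
    trainer.train(whitener, data);
    UnlabeledData<RealVector> whitened = transform(data, whitener);

    RealVector m;
    RealMatrix cov;
    meanvar(whitened, m, cov); // expect m ~ 0 and cov ~ identity
}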
/// Set the input data, which is stored in the PCA object.
void PCA::setData(UnlabeledData<RealVector> const& inputs) {
    SHARK_CHECK(inputs.numberOfElements() >= 2, "[PCA::train] input needs to contain at least two points");
    m_l = inputs.numberOfElements(); // number of data points
    m_n = dataDimension(inputs);     // input dimension

    PCAAlgorithm algorithm = m_algorithm;
    if(algorithm == AUTO) {
        if(m_n > m_l)
            algorithm = SMALL_SAMPLE; // more attributes than data points
        else
            algorithm = STANDARD;
    }

    // decompose the covariance matrix
    if(algorithm == STANDARD) {
        // standard case: eigendecomposition of the n x n covariance matrix
        RealMatrix S(m_n, m_n); // covariance matrix
        meanvar(inputs, m_mean, S);
        m_eigenvalues.resize(m_n);
        m_eigenvectors.resize(m_n, m_n);
        eigensymm(S, m_eigenvectors, m_eigenvalues);
    }
    else {
        // let X0 be the design matrix with all inputs as rows;
        // we want to avoid forming it directly and instead use its batch representation in the dataset
        m_mean = shark::mean(inputs);
        RealMatrix S(m_l, m_l, 0.0); // S = X0 X0^T

        // as every batch is only a fraction of all samples, we have to loop through all
        // combinations of batches of the first and second argument and calculate a block of S.
        // So if X0^T = (B0^T, B1^T),
        // then S = B0 B0^T  B0 B1^T
        //          B1 B0^T  B1 B1^T
        std::size_t start1 = 0;
        for(std::size_t b1 = 0; b1 != inputs.numberOfBatches(); ++b1){
            std::size_t batchSize1 = inputs.batch(b1).size1();
            RealMatrix X1 = inputs.batch(b1) - repeat(m_mean, batchSize1);
            std::size_t start2 = 0;
            // calculate the off-diagonal blocks;
            // the block X2 X1^T is the transpose of X1 X2^T and can thus be calculated for free
            for(std::size_t b2 = 0; b2 != b1; ++b2){
                std::size_t batchSize2 = inputs.batch(b2).size1();
                RealMatrix X2 = inputs.batch(b2) - repeat(m_mean, batchSize2);
                RealSubMatrix X1X2T = subrange(S, start1, start1 + batchSize1, start2, start2 + batchSize2);
                RealSubMatrix X2X1T = subrange(S, start2, start2 + batchSize2, start1, start1 + batchSize1);
                noalias(X1X2T) = prod(X1, trans(X2)); // X1 X2^T
                noalias(X2X1T) = trans(X1X2T);        // X2 X1^T
                start2 += batchSize2;
            }
            // diagonal block
            RealSubMatrix X1X1T = subrange(S, start1, start1 + batchSize1, start1, start1 + batchSize1);
            symm_prod(X1, X1X1T);
            start1 += batchSize1;
        }
        S /= m_l;

        m_eigenvalues.resize(m_l);
        m_eigenvectors.resize(m_n, m_l);
        m_eigenvectors.clear();
        RealMatrix U(m_l, m_l);
        eigensymm(S, U, m_eigenvalues);

        // compute the true eigenvectors: eigenvectors = X0^T U
        std::size_t batchStart = 0;
        for(std::size_t b = 0; b != inputs.numberOfBatches(); ++b){
            std::size_t batchSize = inputs.batch(b).size1();
            std::size_t batchEnd = batchStart + batchSize;
            RealMatrix X = inputs.batch(b) - repeat(m_mean, batchSize);
            noalias(m_eigenvectors) += prod(trans(X), rows(U, batchStart, batchEnd));
            batchStart = batchEnd;
        }
        // normalize the eigenvectors to unit length
        for(std::size_t i = 0; i != m_l; i++)
            column(m_eigenvectors, i) /= norm_2(column(m_eigenvectors, i));
    }
}
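// Why the SMALL_SAMPLE branch works (sketch of the reasoning): if u is an
// eigenvector of S = X0 X0^T with eigenvalue lambda, then X0^T u is an
// eigenvector of X0^T X0 with the same eigenvalue, since
// (X0^T X0)(X0^T u) = X0^T (X0 X0^T) u = lambda (X0^T u).
// That is why the code decomposes the cheaper l x l matrix S and then maps
// the eigenvectors back via X0^T U.
//
// Usage sketch (assuming Shark's public PCA interface; 'data' is a hypothetical
// input dataset): setData is invoked by the constructor, and an encoder model
// then projects onto the leading principal components.
void pcaUsageSketch(UnlabeledData<RealVector> const& data) {
    PCA pca(data);       // runs setData and the eigendecomposition
    LinearModel<> enc;
    pca.encoder(enc, 2); // affine map onto the first two principal components
    Data<RealVector> projected = enc(data);
}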