Example #1
//' Simple KMeans Train
//' 
//' @export
//'
// [[Rcpp::depends(BH)]]    
// [[Rcpp::export]]	
List SharkKMeansTrain (NumericMatrix X, ssize_t k) {

	// convert data
	UnlabeledData<RealVector> data = NumericMatrixToUnlabeledData(X);
	std::size_t elements = data.numberOfElements();

	// compute centroids using k-means clustering
	Centroids centroids;
	kMeans (data, k, centroids);
	
	// convert cluster centers/centroids
	Data<RealVector> const& c = centroids.centroids();
	NumericMatrix cM = DataRealVectorToNumericMatrix (c);
	
	// cluster training data we are given and convert to Rcpp object
	HardClusteringModel<RealVector> model(&centroids);
	Data<unsigned> clusters = model(data);
	NumericVector labels = LabelsToNumericVector (clusters);

	// return solution found
	return Rcpp::List::create(
		Rcpp::Named("labels") = labels,
		Rcpp::Named("centroids") = cM);
}
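For reference, the same pipeline can be run in plain Shark without the Rcpp conversion helpers. A minimal, self-contained sketch, assuming Shark 3 header paths and the kMeans/HardClusteringModel API used above; the toy input points are invented for illustration:

#include <shark/Algorithms/KMeans.h>
#include <shark/Data/Dataset.h>
#include <shark/Models/Clustering/Centroids.h>
#include <shark/Models/Clustering/HardClusteringModel.h>
#include <iostream>
#include <vector>

using namespace shark;

int main() {
	// toy 2-D input: two well-separated groups (made-up data)
	double raw[6][2] = {{0,0},{0,1},{1,0},{9,9},{9,10},{10,9}};
	std::vector<RealVector> points(6, RealVector(2));
	for (std::size_t i = 0; i != 6; i++) {
		points[i](0) = raw[i][0];
		points[i](1) = raw[i][1];
	}
	UnlabeledData<RealVector> data = createUnlabeledDataFromRange(points);

	// cluster, then assign hard labels, exactly as the wrapper does
	Centroids centroids;
	kMeans(data, 2, centroids);
	HardClusteringModel<RealVector> model(&centroids);
	Data<unsigned> clusters = model(data);

	for (unsigned label : clusters.elements())
		std::cout << label << " ";
	std::cout << "\n";
}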
Example #2
void DiscreteLoss::defineBalancedCost(UnlabeledData<unsigned int> const& labels){
	std::size_t classes = numberOfClasses(labels);
	std::size_t ic = labels.numberOfElements();
	
	std::vector<unsigned int> freq(classes);
	BOOST_FOREACH(unsigned int label, labels.elements()){
		freq[label]++;
	}

	m_cost.resize(classes, classes);
	for (std::size_t i = 0; i != classes; i++){
		double c = (freq[i] == 0) ? 1.0 : ic / (double)(classes * freq[i]);
		for ( std::size_t j = 0; j != classes; j++) 
			m_cost(i, j) = c;
		m_cost(i, i) = 0.0;
	}
}
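To see what the balancing does, here is a small standalone check of the cost rule cost(i, j) = ic / (classes * freq[i]) for j != i; the class frequencies are invented for illustration:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
	// 100 samples split 70/20/10 across three classes
	std::vector<unsigned int> freq = {70, 20, 10};
	std::size_t classes = freq.size(), ic = 100;
	for (std::size_t i = 0; i != classes; i++)
		std::cout << "class " << i << ": off-diagonal cost "
		          << ic / (double)(classes * freq[i]) << "\n";
	// prints roughly 0.476, 1.667 and 3.333: misclassifying a rare
	// class costs proportionally more, and weighted by the class
	// frequencies the expected cost comes out to exactly 1
}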
Example #3
void NormalizeComponentsWhitening::train(ModelType& model, UnlabeledData<RealVector> const& input){
	std::size_t dc = dataDimension(input);
	SHARK_CHECK(input.numberOfElements() >= dc + 1, "[NormalizeComponentsWhitening::train] input needs to contain more points than there are input dimensions");
	SHARK_CHECK(m_targetVariance > 0.0, "[NormalizeComponentsWhitening::train] target variance must be positive");

	// dense model with bias having input and output dimension equal to data dimension
	model.setStructure(dc, dc, true); 

	RealVector mean;
	RealMatrix covariance;
	meanvar(input, mean, covariance);

	RealMatrix whiteningMatrix = createWhiteningMatrix(covariance);
	whiteningMatrix *= std::sqrt(m_targetVariance);

	RealVector offset = -prod(trans(whiteningMatrix),mean);

	model.setStructure(RealMatrix(trans(whiteningMatrix)), offset);
}
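A typical way to use this trainer, sketched after the usual Shark 3 normalization pattern; the header paths, the hypothetical whiten() helper, and the default target variance of 1 are assumptions:

#include <shark/Algorithms/Trainers/NormalizeComponentsWhitening.h>
#include <shark/Data/Dataset.h>
#include <shark/Models/LinearModel.h>

using namespace shark;

// hypothetical helper; data must hold more points than dimensions
Data<RealVector> whiten(UnlabeledData<RealVector> const& data) {
	NormalizeComponentsWhitening trainer; // target variance defaults to 1
	LinearModel<> whitener;               // affine model filled in by train()
	trainer.train(whitener, data);
	return whitener(data);                // whitened copy of the data
}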
Example #4
//' Simple KMeans Predict
//' 
//' @export
//'
// [[Rcpp::depends(BH)]]    
// [[Rcpp::export]]	
List SharkKMeansPredict (NumericMatrix X, NumericMatrix centroids) {
	
	// convert data
	UnlabeledData<RealVector> data = NumericMatrixToUnlabeledData (X);
	std::size_t elements = data.numberOfElements();
	
	// convert centroids
	Centroids c (NumericMatrixToDataRealVector (centroids));
	
	// cluster data we are given and convert to Rcpp object
	HardClusteringModel<RealVector> model (&c);
	Data<unsigned> clusters = model (data);
	NumericVector labels = LabelsToNumericVector (clusters);
	
	// return solution found
	return Rcpp::List::create(Rcpp::Named("labels") = labels);
}
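The two wrappers compose naturally: the centroid matrix returned by SharkKMeansTrain can be fed back into SharkKMeansPredict to label new data. A hypothetical round trip, assuming both functions live in the same translation unit; X, Xnew and k come from the caller:

List SharkKMeansRoundTrip(NumericMatrix X, NumericMatrix Xnew, int k) {
	List fit = SharkKMeansTrain(X, k);
	NumericMatrix centroids = Rcpp::as<NumericMatrix>(fit["centroids"]);
	return SharkKMeansPredict(Xnew, centroids); // labels for Xnew
}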
Example #5
File: PCA.cpp Project: EQ94/Shark
/// Set the input data, which is stored in the PCA object.
void PCA::setData(UnlabeledData<RealVector> const& inputs) {
	SHARK_CHECK(inputs.numberOfElements() >= 2, "[PCA::train] input needs to contain at least two points");
	m_l = inputs.numberOfElements(); // number of data points
	PCAAlgorithm algorithm = m_algorithm;
	m_n = dataDimension(inputs); 
	
	if(algorithm == AUTO)  {
		if(m_n > m_l) algorithm = SMALL_SAMPLE; // more attributes than data points
		else algorithm = STANDARD;
	}
	
	// decompose covariance matrix
	if(algorithm == STANDARD) { // standard case
		RealMatrix S(m_n,m_n);//covariance matrix
		meanvar(inputs,m_mean,S);
		m_eigenvalues.resize(m_n);
		m_eigenvectors.resize(m_n, m_n);
		eigensymm(S, m_eigenvectors, m_eigenvalues);
	} else {
		//let X0 be the design matrix having all inputs as rows;
		//we want to avoid forming it directly and use its batch representation in the dataset instead
		m_mean = shark::mean(inputs);
		RealMatrix S(m_l,m_l,0.0);//S = X0 X0^T
		//as every batch is only a fraction of all samples, we have to loop through all
		//combinations of the batches of the first and second argument and calculate a block of S;
		//so if X0^T = (B0^T, B1^T)
		//then S = ( B0 B0^T  B0 B1^T )
		//         ( B1 B0^T  B1 B1^T )
		std::size_t start1 = 0;
		for(std::size_t b1 = 0; b1 != inputs.numberOfBatches(); ++b1){
			std::size_t batchSize1 = inputs.batch(b1).size1();
			RealMatrix X1 = inputs.batch(b1)-repeat(m_mean,batchSize1);
			std::size_t start2 = 0;
			//calculate off-diagonal blocks
			//and the block X2 X1^T is the transpose of X1 X2^T and thus can be calculated for free.
			for(std::size_t b2 = 0; b2 != b1; ++b2){
				std::size_t batchSize2 = inputs.batch(b2).size1();
				RealMatrix X2 = inputs.batch(b2)-repeat(m_mean,batchSize2);
				RealSubMatrix X1X2T= subrange(S,start1,start1+batchSize1,start2,start2+batchSize2);
				RealSubMatrix X2X1T= subrange(S,start2,start2+batchSize2,start1,start1+batchSize1);
				noalias(X1X2T) = prod(X1,trans(X2));// X1 X2^T
				noalias(X2X1T) = trans(X1X2T);// X2 X1^T
				start2+=batchSize2;
			}
			//diagonal block
			RealSubMatrix X1X1T= subrange(S,start1,start1+batchSize1,start1,start1+batchSize1);
			symm_prod(X1,X1X1T);
			start1+=batchSize1;
		}
		S /= m_l;
		m_eigenvalues.resize(m_l);
		m_eigenvectors.resize(m_n,m_l);
		m_eigenvectors.clear();
		RealMatrix U(m_l, m_l);
		eigensymm(S, U, m_eigenvalues);
		// compute true eigenvectors
		//eigenv=X0^T U
		std::size_t batchStart  = 0;
		for(std::size_t b = 0; b != inputs.numberOfBatches(); ++b){
			std::size_t batchSize = inputs.batch(b).size1();
			std::size_t batchEnd = batchStart+batchSize;
			RealMatrix X = inputs.batch(b)-repeat(m_mean,batchSize);
			noalias(m_eigenvectors) += prod(trans(X),rows(U,batchStart,batchEnd));
			batchStart = batchEnd;
		}
		//normalize
		for(std::size_t i=0; i != m_l; i++)
			column(m_eigenvectors, i) /= norm_2(column(m_eigenvectors, i));
	}
}
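A minimal usage sketch following the common Shark 3 PCA pattern; the header path, the hypothetical reduceTo2D() helper and the two-component target are assumptions:

#include <shark/Algorithms/Trainers/PCA.h>
#include <shark/Data/Dataset.h>
#include <shark/Models/LinearModel.h>

using namespace shark;

// hypothetical helper: project data onto its first two principal components
Data<RealVector> reduceTo2D(UnlabeledData<RealVector> const& data) {
	PCA pca(data);           // the constructor runs setData() shown above
	LinearModel<> encoder;
	pca.encoder(encoder, 2); // affine map onto the top two eigenvectors
	return encoder(data);
}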