const BallBound<VecType>&
BallBound<VecType>::operator|=(const MatType& data)
{
  if (radius < 0)
  {
    center = data.col(0);
    radius = 0;
  }

  // Now iteratively add points.  There is probably a closed-form solution to
  // find the minimum bounding circle, and it is probably faster.
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    const double dist = metric::EuclideanDistance::Evaluate(center,
        (VecType) data.col(i));

    // See if the new point lies outside the bound.
    if (dist > radius)
    {
      // Move the center towards the new point and expand the radius just
      // enough to cover both the old ball and the new point.
      const arma::vec diff = data.col(i) - center;
      center += ((dist - radius) / (2 * dist)) * diff;
      radius = 0.5 * (dist + radius);
    }
  }

  return *this;
}
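To see the update rule in isolation, here is a minimal standalone Armadillo sketch of the same incremental bounding-ball growth; plain arma::vec and arma::mat stand in for VecType and MatType, and all names are illustrative.

#include <armadillo>
#include <iostream>

// Grow a ball (center, radius) to cover every column of 'data' using the
// same rule as operator|= above: if a point falls outside the ball, move
// the center along the line towards it and take the smallest radius that
// covers both the old ball and the point.
int main()
{
  arma::mat data(3, 100, arma::fill::randu);

  arma::vec center = data.col(0);
  double radius = 0.0;

  for (size_t i = 1; i < data.n_cols; ++i)
  {
    const double dist = arma::norm(data.col(i) - center, 2);
    if (dist > radius)
    {
      center += ((dist - radius) / (2 * dist)) * (data.col(i) - center);
      radius = 0.5 * (dist + radius);
    }
  }

  std::cout << "bounding radius: " << radius << std::endl;
  return 0;
}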
  inline static void Initialize(const MatType& V,
                                const size_t r,
                                arma::mat& W,
                                arma::mat& H)
  {
    const size_t n = V.n_rows;
    const size_t m = V.n_cols;

    double V_avg = 0;
    size_t count = 0;
    double min = DBL_MAX;
    for (typename MatType::const_row_col_iterator it = V.begin();
        it != V.end(); ++it)
    {
      if (*it != 0)
      {
        count++;
        V_avg += *it;
        if (*it < min)
          min = *it;
      }
    }
    V_avg = sqrt(((V_avg / (n * m)) - min) / r);

    // Initialize to random values.
    W.randu(n, r);
    H.randu(r, m);

    W = W + V_avg;
    H = H + V_avg;
  }
Example #3
void	ClassificationTree::print_train_log(const TreeNode::PtrSplitNodeBase split, const TrainingSet &train_set) const
{
	MatType				ltype	=	train_set.get_label_type();
	MatType				ftype	=	train_set.get_feature_type();
	int					rows	=	(int)ltype.total();
	cv::Mat_<double>	left_tmp;
	cv::Mat_<double>	right_tmp;
	TrainingSet			left_set(ftype, ltype);
	TrainingSet			right_set(ftype, ltype);

	split->operator()(train_set, left_set, right_set);

	left_set.compute_target_mean(left_tmp);
	right_set.compute_target_mean(right_tmp);

	cv::Mat_<double>	left_dist(rows, 1, (double*)left_tmp.data);
	cv::Mat_<double>	right_dist(rows, 1, (double*)right_tmp.data);

	printf("left dist\n");
	for (unsigned ii = 0; ii < left_dist.total(); ++ii) {
		printf("\tlabel%u:%f\n", ii, left_dist.at<double>(ii) / left_set.size());
	}

	printf("right dist\n");
	for (unsigned ii = 0; ii < right_dist.total(); ++ii) {
		printf("\tlabel%u:%f\n", ii, right_dist.at<double>(ii) / right_set.size());
	}
}
Example #4
inline std::string 
DimsString( const MatType& A, std::string label="Matrix" )
{ 
    std::ostringstream os;
    os << label << " ~ " << A.Height() << " x " << A.Width();
    return os.str();
}
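A minimal usage sketch; DemoMatrix below is a hypothetical stand-in for any Elemental-style matrix type that exposes Height() and Width().

#include <iostream>
#include <sstream>
#include <string>

// Minimal stand-in for an Elemental-style matrix, for demonstration only.
struct DemoMatrix
{
    int height, width;
    int Height() const { return height; }
    int Width() const { return width; }
};

inline std::string
DimsString( const DemoMatrix& A, std::string label="Matrix" )
{
    std::ostringstream os;
    os << label << " ~ " << A.Height() << " x " << A.Width();
    return os.str();
}

int main()
{
    DemoMatrix A{3, 4};
    std::cout << DimsString( A, "A" ) << std::endl; // prints "A ~ 3 x 4"
    return 0;
}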
Example #5
  inline static void Initialize(const MatType& V,
                                const size_t r,
                                arma::mat& W,
                                arma::mat& H)
  {
    const size_t n = V.n_rows;
    const size_t m = V.n_cols;

    double avgV = 0;
    size_t count = 0;
    double min = DBL_MAX;

    // Iterate over all elements in the matrix (for sparse matrices, this only
    // iterates over nonzeros).
    for (typename MatType::const_row_col_iterator it = V.begin();
        it != V.end(); ++it)
    {
      ++count;
      avgV += *it;
      // Track the minimum value.
      if (*it < min)
        min = *it;
    }

    avgV = sqrt(((avgV / (n * m)) - min) / r);

    // Initialize to random values.
    W.randu(n, r);
    H.randu(r, m);

    W = W + avgV;
    H = H + avgV;
  }
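The effect of this initialization is easiest to see standalone. The sketch below (plain Armadillo, illustrative names) seeds W and H with uniform noise shifted by sqrt((mean(V) - min(V)) / r), so that the product W * H starts near the scale of V.

#include <armadillo>
#include <cmath>
#include <iostream>

// Standalone sketch of the average-based initialization above.
int main()
{
  const size_t r = 5;
  arma::mat V(50, 40, arma::fill::randu);

  const double shift = std::sqrt(((arma::accu(V) / V.n_elem) - V.min()) / r);

  arma::mat W(V.n_rows, r, arma::fill::randu);
  arma::mat H(r, V.n_cols, arma::fill::randu);
  W += shift;
  H += shift;

  // Initial reconstruction error, as a sanity check.
  std::cout << "||V - W*H|| = " << arma::norm(V - W * H, "fro") << std::endl;
  return 0;
}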
const BallBound<MetricType, VecType>&
BallBound<MetricType, VecType>::operator|=(const MatType& data)
{
  if (radius < 0)
  {
    center = data.col(0);
    radius = 0;
  }

  // Now iteratively add points.
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    const ElemType dist = metric->Evaluate(center, (VecType) data.col(i));

    // See if the new point lies outside the bound.
    if (dist > radius)
    {
      // Move towards the new point and increase the radius just enough to
      // accommodate the new point.
      const VecType diff = data.col(i) - center;
      center += ((dist - radius) / (2 * dist)) * diff;
      radius = 0.5 * (dist + radius);
    }
  }

  return *this;
}
Example #7
 SubMatrix(MatType& m)
     : matrix(m),
       begin_row(0),
       end_row(m.numRows()),
       begin_column(0),
       end_column(m.numColumns())
 {
 }
size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
                                           const size_t emptyCluster,
                                           const arma::mat& oldCentroids,
                                           arma::mat& newCentroids,
                                           arma::Col<size_t>& clusterCounts,
                                           MetricType& metric,
                                           const size_t iteration)
{
  // If necessary, calculate the variances and assignments.
  if (iteration != this->iteration || assignments.n_elem != data.n_cols)
    Precalculate(data, oldCentroids, clusterCounts, metric);
  this->iteration = iteration;

  // Now find the cluster with maximum variance.
  arma::uword maxVarCluster;
  variances.max(maxVarCluster);

  // Now, inside this cluster, find the point which is furthest away.
  size_t furthestPoint = data.n_cols;
  double maxDistance = -DBL_MAX;
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    if (assignments[i] == maxVarCluster)
    {
      const double distance = std::pow(metric.Evaluate(data.col(i),
          newCentroids.col(maxVarCluster)), 2.0);

      if (distance > maxDistance)
      {
        maxDistance = distance;
        furthestPoint = i;
      }
    }
  }

  // Take that point and add it to the empty cluster.
  newCentroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
      double(clusterCounts[maxVarCluster] - 1));
  newCentroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] - 1.0)) *
      arma::vec(data.col(furthestPoint));
  clusterCounts[maxVarCluster]--;
  clusterCounts[emptyCluster]++;
  newCentroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
  assignments[furthestPoint] = emptyCluster;

  // Modify the variances, as necessary.
  variances[emptyCluster] = 0;
  // One has already been subtracted from clusterCounts[maxVarCluster].  If
  // that cluster is now empty, its variance is zero; guard against dividing
  // by zero.
  if (clusterCounts[maxVarCluster] == 0)
    variances[maxVarCluster] = 0;
  else
    variances[maxVarCluster] = (1.0 / clusterCounts[maxVarCluster]) *
        ((clusterCounts[maxVarCluster] + 1) * variances[maxVarCluster] -
        maxDistance);

  // Output some debugging information.
  Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
      emptyCluster << ".\n";

  return 1; // We only changed one point.
}
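The centroid update above removes one point from a running mean. A small self-contained check of that identity, assuming nothing beyond Armadillo:

#include <armadillo>

// Removing a point x from a cluster of c points turns the mean m into
// (c * m - x) / (c - 1), which is exactly m * (c / (c - 1)) - x / (c - 1).
int main()
{
  arma::mat cluster(3, 10, arma::fill::randu);
  const arma::vec m = arma::mean(cluster, 1);
  const double c = cluster.n_cols;

  // Remove the last point incrementally...
  const arma::vec x = cluster.col(9);
  const arma::vec mIncr = m * (c / (c - 1.0)) - x / (c - 1.0);

  // ...and recompute the mean from scratch; the two must agree.
  const arma::vec mDirect = arma::mean(cluster.cols(0, 8), 1);
  return arma::approx_equal(mIncr, mDirect, "absdiff", 1e-10) ? 0 : 1;
}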
Example #9
void Recipe::set_coefficients(int i, int j, const MatType &coef) {
    for (int in_chan = 0; in_chan < coef.rows(); ++in_chan)
    for (int out_chan = 0; out_chan < coef.cols(); ++out_chan)
    {
        int ac_map_i = in_chan * height + i;
        int ac_map_j = out_chan * width + j;
        ac(ac_map_i, ac_map_j) = coef(in_chan, out_chan);
    }
}
Example #10
size_t PerformSplit(MatType& data,
                    const size_t begin,
                    const size_t count,
                    const typename SplitType::SplitInfo& splitInfo,
                    std::vector<size_t>& oldFromNew)
{
  // This method modifies the input dataset.  We loop both from the left and
  // right sides of the points contained in this node.
  size_t left = begin;
  size_t right = begin + count - 1;

  // First half-iteration of the loop is out here because the termination
  // condition is in the middle.
  while ((left <= right) &&
         (SplitType::AssignToLeftNode(data.col(left), splitInfo)))
    left++;
  while ((left <= right) && (right > 0) &&
         (!SplitType::AssignToLeftNode(data.col(right), splitInfo)))
    right--;

  // Shortcut for when all points are on the right.
  if (left == right && right == 0)
    return left;

  while (left <= right)
  {
    // Swap columns.
    data.swap_cols(left, right);

    // Update the indices for what we changed.
    size_t t = oldFromNew[left];
    oldFromNew[left] = oldFromNew[right];
    oldFromNew[right] = t;

    // See how many points on the left are correct.  When they are correct,
    // increase the left counter accordingly.  When we encounter one that isn't
    // correct, stop.  We will switch it later.
    while ((left <= right) &&
        SplitType::AssignToLeftNode(data.col(left), splitInfo))
      left++;

    // Now see how many points on the right are correct.  When they are correct,
    // decrease the right counter accordingly.  When we encounter one that isn't
    // correct, stop.  We will switch it with the wrong point we found in the
    // previous loop.
    while ((left <= right) &&
        (!SplitType::AssignToLeftNode(data.col(right), splitInfo)))
      right--;
  }

  Log::Assert(left == right + 1);
  return left;
}
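The same two-pointer column partition as a self-contained sketch, with a hard-coded predicate (first coordinate below 0.5 goes left) in place of SplitType:

#include <armadillo>
#include <numeric>
#include <vector>

// Move every column that satisfies the predicate to the left of the matrix,
// maintaining the oldFromNew index mapping.
int main()
{
  arma::mat data(2, 20, arma::fill::randu);
  std::vector<size_t> oldFromNew(data.n_cols);
  std::iota(oldFromNew.begin(), oldFromNew.end(), 0);

  size_t left = 0, right = data.n_cols - 1;
  while (true)
  {
    // Bounds are checked before the element access, so neither loop can
    // read past the partition.
    while (left <= right && data(0, left) < 0.5) ++left;
    while (left < right && data(0, right) >= 0.5) --right;
    if (left >= right) break;

    data.swap_cols(left, right);
    std::swap(oldFromNew[left], oldFromNew[right]);
  }

  // 'left' is now the index of the first column assigned to the right side.
  return 0;
}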
size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
                                           const size_t emptyCluster,
                                           const MatType& centroids,
                                           arma::Col<size_t>& clusterCounts,
                                           arma::Col<size_t>& assignments)
{
  // First, we need to find the cluster with maximum variance (by which I mean
  // the sum of the covariance matrices).
  arma::vec variances;
  variances.zeros(clusterCounts.n_elem); // Start with 0.

  // Add the variance of each point's distance away from the cluster.  I think
  // this is the sensible thing to do.
  for (size_t i = 0; i < data.n_cols; i++)
  {
    variances[assignments[i]] += arma::as_scalar(
        arma::var(data.col(i) - centroids.col(assignments[i])));
  }

  // Now find the cluster with maximum variance.
  arma::uword maxVarCluster;
  variances.max(maxVarCluster);

  // Now, inside this cluster, find the point which is furthest away.
  size_t furthestPoint = data.n_cols;
  double maxDistance = -DBL_MAX;
  for (size_t i = 0; i < data.n_cols; i++)
  {
    if (assignments[i] == maxVarCluster)
    {
      double distance = arma::as_scalar(
          arma::var(data.col(i) - centroids.col(maxVarCluster)));

      if (distance > maxDistance)
      {
        maxDistance = distance;
        furthestPoint = i;
      }
    }
  }

  // Take that point and add it to the empty cluster.
  clusterCounts[maxVarCluster]--;
  clusterCounts[emptyCluster]++;
  assignments[furthestPoint] = emptyCluster;

  // Output some debugging information.
  Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
      emptyCluster << ".\n";

  return 1; // We only changed one point.
}
Example #12
  /** Dimensionality check during initialization */
  bool dimCheck(){

    if( Sx_.rows() != Sx_.cols() ){
      std::cerr << "Error: MatType must be a square matrix \n";
      return false;
    }
    if( Sx_.rows() != x_.size() ){
      std::cerr << "Error: VecType and MatType dimension mismatch \n";
      return false;
    }
    nDim_ = x_.size();
    return true;
  }
Example #13
void Perceptron<LearnPolicy, WeightInitializationPolicy, MatType>::Train(
    const MatType& data,
    const arma::Row<size_t>& labels,
    const arma::rowvec& instanceWeights)
{
  size_t j, i = 0;
  bool converged = false;
  size_t tempLabel;
  arma::uword maxIndexRow, maxIndexCol;
  arma::mat tempLabelMat;

  LearnPolicy LP;

  const bool hasWeights = (instanceWeights.n_elem > 0);

  while ((i < maxIterations) && (!converged))
  {
    // This outer loop is for each iteration, and we use the 'converged'
    // variable for noting whether or not convergence has been reached.
    i++;
    converged = true;

    // Now this inner loop is for going through the dataset in each iteration.
    for (j = 0; j < data.n_cols; j++)
    {
      // Multiply for each variable and check whether the current weight vector
      // correctly classifies this.
      tempLabelMat = weights.t() * data.col(j) + biases;

      tempLabelMat.max(maxIndexRow, maxIndexCol);

      // Check whether prediction is correct.
      if (maxIndexRow != labels(0, j))
      {
        // Due to incorrect prediction, convergence set to false.
        converged = false;
        tempLabel = labels(0, j);

        // Send maxIndexRow for knowing which weight to update, send j to know
        // the value of the vector to update it with.  Send tempLabel to know
        // the correct class.
        if (hasWeights)
          LP.UpdateWeights(data.col(j), weights, biases, maxIndexRow, tempLabel,
              instanceWeights(j));
        else
          LP.UpdateWeights(data.col(j), weights, biases, maxIndexRow,
              tempLabel);
      }
    }
  }
}
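The LearnPolicy::UpdateWeights() call is where policies differ. As a rough illustration only (a sketch of the classic multiclass perceptron rule, not necessarily the exact policy used above), one update looks like this:

#include <armadillo>

// Reward the correct class with the misclassified example and penalize the
// wrongly predicted class.
int main()
{
  arma::mat weights(5, 3, arma::fill::zeros); // dimensions x classes
  arma::vec biases(3, arma::fill::zeros);

  arma::vec x(5, arma::fill::randu);          // one training point
  const arma::uword correct = 1, predicted = 2;

  weights.col(correct) += x;
  biases(correct) += 1;
  weights.col(predicted) -= x;
  biases(predicted) -= 1;
  return 0;
}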
Example #14
typename std::enable_if<ApplyKernel, bool>::type
MeanShift<UseKernel, KernelType, MatType>::
CalculateCentroid(const MatType& data,
                  const std::vector<size_t>& neighbors,
                  const std::vector<double>& distances,
                  arma::colvec& centroid)
{
  double sumWeight = 0;
  for (size_t i = 0; i < neighbors.size(); ++i)
  {
    if (distances[i] > 0)
    {
      double dist = distances[i] / radius;
      double weight = kernel.Gradient(dist) / dist;
      sumWeight += weight;
      centroid += weight * data.unsafe_col(neighbors[i]);
    }
  }

  if (sumWeight != 0)
  {
    centroid /= sumWeight;
    return true;
  }
  return false;
}
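The weighting scheme is easier to read standalone. Below is a sketch using a Gaussian profile exp(-u^2/2) as the weight; the snippet above takes its weight from KernelType::Gradient, so the exact factor (and its sign convention) depends on the kernel, and the neighbor set would normally come from a range search rather than a distance test.

#include <armadillo>
#include <cmath>

// Each neighbor at distance d contributes with weight exp(-(d/h)^2 / 2),
// where h is the bandwidth ('radius' above).
int main()
{
  const double radius = 0.7; // bandwidth (illustrative value)
  arma::mat data(3, 50, arma::fill::randu);
  arma::vec query(3, arma::fill::randu);

  arma::vec centroid(3, arma::fill::zeros);
  double sumWeight = 0;
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    const double d = arma::norm(data.col(i) - query, 2);
    if (d > 0 && d <= radius)
    {
      const double u = d / radius;
      const double weight = std::exp(-0.5 * u * u);
      sumWeight += weight;
      centroid += weight * data.col(i);
    }
  }

  if (sumWeight > 0)
    centroid /= sumWeight; // the mean-shifted centroid
  return 0;
}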
Example #15
  inline static void Initialize(const MatType& V,
                                const size_t r,
                                arma::mat& W,
                                arma::mat& H)
  {
    const size_t n = V.n_rows;
    const size_t m = V.n_cols;

    if (columnsToAverage > m)
    {
      Log::Warn << "Number of random columns (columnsToAverage) is more than "
          << "the number of columns available in the V matrix; weird results "
          << "may ensue!" << std::endl;
    }

    W.zeros(n, r);

    // Initialize W matrix with random columns.
    for (size_t col = 0; col < r; col++)
    {
      for (size_t randCol = 0; randCol < columnsToAverage; randCol++)
      {
        // .col() does not work in this case, as of Armadillo 3.920.
        W.unsafe_col(col) += V.col(math::RandInt(0, m));
      }
    }

    // Now divide by the number of columns averaged.
    W /= columnsToAverage;

    // Initialize H to random values.
    H.randu(r, m);
  }
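Standalone, the "random Acol" idea looks like this (plain Armadillo; p plays the role of columnsToAverage):

#include <armadillo>

// Each column of W is the average of p randomly chosen columns of V, and H
// is uniform random noise.
int main()
{
  const size_t r = 5, p = 3;
  arma::mat V(50, 40, arma::fill::randu);

  arma::mat W(V.n_rows, r, arma::fill::zeros);
  for (size_t col = 0; col < r; ++col)
  {
    for (size_t k = 0; k < p; ++k)
    {
      const arma::uword c = arma::randi<arma::uvec>(
          1, arma::distr_param(0, (int) V.n_cols - 1))(0);
      W.col(col) += V.col(c);
    }
    W.col(col) /= p;
  }

  arma::mat H(r, V.n_cols, arma::fill::randu);
  return 0;
}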
void MaxVarianceNewCluster::Precalculate(const MatType& data,
                                         const arma::mat& oldCentroids,
                                         arma::Col<size_t>& clusterCounts,
                                         MetricType& metric)
{
  // We have to calculate the variances of each cluster and the assignments of
  // each point.  This is most easily done by iterating through the entire
  // dataset.
  variances.zeros(oldCentroids.n_cols);
  assignments.set_size(data.n_cols);

  // Add the variance of each point's distance away from the cluster.  I think
  // this is the sensible thing to do.
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    // Find the closest centroid to this point.
    double minDistance = std::numeric_limits<double>::infinity();
    size_t closestCluster = oldCentroids.n_cols; // Invalid value.

    for (size_t j = 0; j < oldCentroids.n_cols; j++)
    {
      const double distance = metric.Evaluate(data.col(i), oldCentroids.col(j));

      if (distance < minDistance)
      {
        minDistance = distance;
        closestCluster = j;
      }
    }

    assignments[i] = closestCluster;
    variances[closestCluster] += std::pow(metric.Evaluate(data.col(i),
        oldCentroids.col(closestCluster)), 2.0);
  }

  // Divide by the number of points in the cluster to produce the variance,
  // unless the cluster is empty or contains only one point, in which case we
  // set the variance to 0.
  for (size_t i = 0; i < clusterCounts.n_elem; ++i)
    if (clusterCounts[i] <= 1)
      variances[i] = 0;
    else
      variances[i] /= clusterCounts[i];
}
DecisionStump<MatType>::DecisionStump(const MatType& data,
                                      const arma::Row<size_t>& labels,
                                      const size_t classes,
                                      size_t inpBucketSize)
{
  numClass = classes;
  bucketSize = inpBucketSize;

  // If the class labels are not all identical, proceed with training.
  size_t bestAtt = 0;
  double entropy;
  const double rootEntropy = CalculateEntropy<size_t>(
      labels.subvec(0, labels.n_elem - 1));

  double gain, bestGain = 0.0;
  for (size_t i = 0; i < data.n_rows; i++)
  {
    // Go through each attribute of the data.
    if (IsDistinct<double>(data.row(i)))
    {
      // For each attribute with non-identical values, treat it as a potential
      // splitting attribute and calculate entropy if split on it.
      entropy = SetupSplitAttribute(data.row(i), labels);
      gain = rootEntropy - entropy;

      // We are maximizing the gain; SetupSplitAttribute() follows a
      // negated-entropy convention, so the best attribute is the one with
      // the smallest computed 'gain'.
      if (gain < bestGain)
      {
        bestAtt = i;
        bestGain = gain;
      }
    }
  }
  splitAttribute = bestAtt;

  // Once the splitting column/attribute has been decided, train on it.
  TrainOnAtt<double>(data.row(splitAttribute), labels);
}
void LogisticRegression<MatType>::Classify(const MatType& dataset,
                                           arma::mat& probabilities) const
{
  // Set correct size of output matrix.
  probabilities.set_size(2, dataset.n_cols);

  probabilities.row(1) = 1.0 / (1.0 + arma::exp(-parameters(0) - dataset.t() *
      parameters.subvec(1, parameters.n_elem - 1))).t();
  probabilities.row(0) = 1.0 - probabilities.row(1);
}
void LogisticRegression<MatType>::Predict(const MatType& predictors,
                                          arma::Row<size_t>& responses,
                                          const double decisionBoundary) const
{
  // Calculate the sigmoid function for each point.  The (1.0 -
  // decisionBoundary) term sets an offset so that the truncation in the
  // conversion to size_t yields 0 or 1 correctly.
  responses = arma::conv_to<arma::Row<size_t>>::from((1.0 /
      (1.0 + arma::exp(-parameters(0) - predictors.t() *
      parameters.subvec(1, parameters.n_elem - 1)))) +
      (1.0 - decisionBoundary));
}
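The thresholding trick is compact enough to verify in isolation: p >= decisionBoundary exactly when p + (1 - decisionBoundary) >= 1, so truncating to size_t yields the class label. A sketch:

#include <armadillo>

// p = sigmoid(b + x'w) is classified as 1 exactly when
// p + (1 - decisionBoundary) >= 1, so the truncating conversion to size_t
// implements the threshold without a branch.
int main()
{
  arma::vec parameters(4, arma::fill::randn); // parameters(0) is the bias
  arma::mat predictors(3, 10, arma::fill::randu);
  const double decisionBoundary = 0.5;

  const arma::rowvec p = 1.0 / (1.0 + arma::exp(-parameters(0) -
      (predictors.t() * parameters.subvec(1, 3)).t()));

  const arma::Row<size_t> responses =
      arma::conv_to<arma::Row<size_t>>::from(p + (1.0 - decisionBoundary));
  return 0;
}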
void UBTreeSplit<BoundType, MatType>::InitializeAddresses(const MatType& data)
{
  addresses.resize(data.n_cols);

  // Calculate all addresses.
  for (size_t i = 0; i < data.n_cols; i++)
  {
    addresses[i].first.zeros(data.n_rows);
    bound::addr::PointToAddress(addresses[i].first, data.col(i));
    addresses[i].second = i;
  }
}
 inline static void Cluster(const MatType& data,
                            const size_t clusters,
                            arma::mat& centroids)
 {
   centroids.set_size(data.n_rows, clusters);
   for (size_t i = 0; i < clusters; ++i)
   {
     // Randomly sample a point.
     const size_t index = math::RandInt(0, data.n_cols);
     centroids.col(i) = data.col(index);
   }
 }
void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
    MatType& data,
    std::vector<size_t>& oldFromNew,
    const size_t maxLeafSize,
    SplitType& splitter)
{
  // This should be a single function for Bound.
  // We need to expand the bounds of this node properly.
  bound |= data.cols(begin, begin + count - 1);

  // Calculate the furthest descendant distance.
  furthestDescendantDistance = 0.5 * bound.Diameter();

  // First, check if we need to split at all.
  if (count <= maxLeafSize)
    return; // We can't split this.

  // splitCol denotes the two partitions of the dataset after the split. The
  // points on its left go to the left child and the others go to the right
  // child.
  size_t splitCol;

  // Split the node. The elements of 'data' are reordered by the splitting
  // algorithm. This function call updates splitCol and oldFromNew.
  const bool split = splitter.SplitNode(bound, data, begin, count, splitCol,
      oldFromNew);

  // The node may not be always split. For instance, if all the points are the
  // same, we can't split them.
  if (!split)
    return;

  // Now that we know the split column, we will recursively split the children
  // by calling their constructors (which perform this splitting process).
  left = new BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>(
      data, begin, splitCol - begin, oldFromNew, splitter, this, maxLeafSize);
  right = new BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>(
      data, splitCol, begin + count - splitCol, oldFromNew, splitter, this,
      maxLeafSize);

  // Calculate parent distances for those two nodes.
  arma::vec centroid, leftCentroid, rightCentroid;
  Centroid(centroid);
  left->Centroid(leftCentroid);
  right->Centroid(rightCentroid);

  const double leftParentDistance = bound.Metric().Evaluate(centroid,
      leftCentroid);
  const double rightParentDistance = bound.Metric().Evaluate(centroid,
      rightCentroid);

  left->ParentDistance() = leftParentDistance;
  right->ParentDistance() = rightParentDistance;
}
void DecisionStump<MatType>::Train(const MatType& data,
                                   const arma::Row<size_t>& labels,
                                   const arma::rowvec& weights)
{
  // If the class labels are not all identical, proceed with training.
  size_t bestDim = 0;
  double entropy;
  const double rootEntropy = CalculateEntropy<UseWeights>(labels, weights);

  double gain, bestGain = 0.0;
  for (size_t i = 0; i < data.n_rows; i++)
  {
    // Go through each dimension of the data.
    if (IsDistinct(data.row(i)))
    {
      // For each dimension with non-identical values, treat it as a potential
      // splitting dimension and calculate entropy if split on it.
      entropy = SetupSplitDimension<UseWeights>(data.row(i), labels, weights);

      gain = rootEntropy - entropy;
      // We are maximizing the gain; SetupSplitDimension() follows a
      // negated-entropy convention, so the best dimension is the one with
      // the smallest computed 'gain'.
      if (gain < bestGain)
      {
        bestDim = i;
        bestGain = gain;
      }
    }
  }
  splitDimension = bestDim;

  // Once the splitting column/dimension has been decided, train on it.
  TrainOnDim(data.row(splitDimension), labels);
}
Example #24
typename std::enable_if<!ApplyKernel, bool>::type
MeanShift<UseKernel, KernelType, MatType>::
CalculateCentroid(const MatType& data,
                  const std::vector<size_t>& neighbors,
                  const std::vector<double>&, /*unused*/
                  arma::colvec& centroid)
{
  // Guard against an empty neighbor set to avoid dividing by zero.
  if (neighbors.empty())
    return false;

  for (size_t i = 0; i < neighbors.size(); ++i)
    centroid += data.unsafe_col(neighbors[i]);

  centroid /= neighbors.size();
  return true;
}
Example #25
void MeanShift<UseKernel, KernelType, MatType>::GenSeeds(
    const MatType& data,
    const double binSize,
    const int minFreq,
    MatType& seeds)
{
  typedef arma::colvec VecType;
  std::map<VecType, int, less<VecType> > allSeeds;
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    VecType binnedPoint = arma::floor(data.unsafe_col(i) / binSize);
    if (allSeeds.find(binnedPoint) == allSeeds.end())
      allSeeds[binnedPoint] = 1;
    else
      allSeeds[binnedPoint]++;
  }

  // Remove seeds with too few points.  First we count the number of seeds we
  // end up with, then we add them.
  std::map<VecType, int, less<VecType> >::iterator it;
  size_t count = 0;
  for (it = allSeeds.begin(); it != allSeeds.end(); ++it)
    if (it->second >= minFreq)
      ++count;

  seeds.set_size(data.n_rows, count);
  count = 0;
  for (it = allSeeds.begin(); it != allSeeds.end(); ++it)
  {
    if (it->second >= minFreq)
    {
      seeds.col(count) = it->first;
      ++count;
    }
  }

  seeds *= binSize;
}
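The binning idea in isolation: a sketch keyed on integer grid coordinates with a plain std::map (2-D here for simplicity, instead of the custom vector comparator above):

#include <armadillo>
#include <cmath>
#include <map>
#include <utility>

// Count points per grid cell of side binSize; cells with at least minFreq
// points become seeds (placed at cell * binSize, as in GenSeeds() above).
int main()
{
  const double binSize = 0.25;
  const int minFreq = 2;
  arma::mat data(2, 100, arma::fill::randu);

  std::map<std::pair<long, long>, int> bins;
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    const std::pair<long, long> cell(
        (long) std::floor(data(0, i) / binSize),
        (long) std::floor(data(1, i) / binSize));
    ++bins[cell];
  }

  size_t seedCount = 0;
  for (const auto& b : bins)
    if (b.second >= minFreq)
      ++seedCount;
  return 0;
}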
RegularizedSVDFunction<MatType>::RegularizedSVDFunction(const MatType& data,
                                                        const size_t rank,
                                                        const double lambda) :
    data(math::MakeAlias(const_cast<MatType&>(data), false)),
    rank(rank),
    lambda(lambda)
{
  // Number of users and items in the data.
  numUsers = max(data.row(0)) + 1;
  numItems = max(data.row(1)) + 1;

  // Initialize the parameters.
  initialPoint.randu(rank, numUsers + numItems);
}
Example #27
size_t MeanSplit<BoundType, MatType>::
    PerformSplit(MatType& data,
                 const size_t begin,
                 const size_t count,
                 const size_t splitDimension,
                 const double splitVal,
                 std::vector<size_t>& oldFromNew)
{
  // This method modifies the input dataset.  We loop both from the left and
  // right sides of the points contained in this node.  The points less than
  // splitVal should be on the left side of the matrix, and the points greater
  // than splitVal should be on the right side of the matrix.
  size_t left = begin;
  size_t right = begin + count - 1;

  // First half-iteration of the loop is out here because the termination
  // condition is in the middle.
  while ((left <= right) && (data(splitDimension, left) < splitVal))
    left++;
  while ((left <= right) && (right > 0) &&
      (data(splitDimension, right) >= splitVal))
    right--;

  while (left <= right)
  {
    // Swap columns.
    data.swap_cols(left, right);

    // Update the indices for what we changed.
    size_t t = oldFromNew[left];
    oldFromNew[left] = oldFromNew[right];
    oldFromNew[right] = t;

    // See how many points on the left are correct.  When they are correct,
    // increase the left counter accordingly.  When we encounter one that isn't
    // correct, stop.  We will switch it later.
    while ((left <= right) && (data(splitDimension, left) < splitVal))
      left++;

    // Now see how many points on the right are correct.  When they are correct,
    // decrease the right counter accordingly.  When we encounter one that isn't
    // correct, stop.  We will switch it with the wrong point we found in the
    // previous loop.
    while ((left <= right) && (data(splitDimension, right) >= splitVal))
      right--;
  }

  Log::Assert(left == right + 1);

  return left;
}
Example #28
void Perceptron<LearnPolicy, WeightInitializationPolicy, MatType>::Classify(
    const MatType& test,
    arma::Row<size_t>& predictedLabels)
{
  arma::vec tempLabelMat;
  arma::uword maxIndex = 0;

  // Make sure the output vector is correctly sized before writing into it.
  predictedLabels.set_size(1, test.n_cols);

  // Could probably be faster if done in batch.
  for (size_t i = 0; i < test.n_cols; i++)
  {
    tempLabelMat = weights.t() * test.col(i) + biases;
    tempLabelMat.max(maxIndex);
    predictedLabels(0, i) = maxIndex;
  }
}
void RefinedStart::Cluster(const MatType& data,
                           const size_t clusters,
                           arma::mat& centroids) const
{
  // This will hold the sampled datasets.
  const size_t numPoints = size_t(percentage * data.n_cols);
  MatType sampledData(data.n_rows, numPoints);
  // vector<bool> is packed so each bool is 1 bit.
  std::vector<bool> pointsUsed(data.n_cols, false);
  arma::mat sampledCentroids(data.n_rows, samplings * clusters);

  for (size_t i = 0; i < samplings; ++i)
  {
    // First, assemble the sampled dataset.
    size_t curSample = 0;
    while (curSample < numPoints)
    {
      // Pick a random point from the full dataset, in [0, data.n_cols).
      size_t sample = (size_t) math::RandInt(data.n_cols);

      if (!pointsUsed[sample])
      {
        // This point isn't used yet.  So we'll put it in our sample.
        pointsUsed[sample] = true;
        sampledData.col(curSample) = data.col(sample);
        ++curSample;
      }
    }

    // Now, using the sampled dataset, run k-means.  In the case of an empty
    // cluster, we re-initialize that cluster as the point furthest away from
    // the cluster with maximum variance.  This is not *exactly* what the paper
    // implements, but it is quite similar, and we'll call it "good enough".
    KMeans<> kmeans;
    kmeans.Cluster(sampledData, clusters, centroids);

    // Store the sampled centroids.
    sampledCentroids.cols(i * clusters, (i + 1) * clusters - 1) = centroids;

    pointsUsed.assign(data.n_cols, false);
  }

  // Now, we run k-means on the sampled centroids to get our final clusters.
  KMeans<> kmeans;
  kmeans.Cluster(sampledCentroids, clusters, centroids);
}
bool RPTreeMaxSplit<BoundType, MatType>::GetSplitVal(
    const MatType& data,
    const size_t begin,
    const size_t count,
    const arma::Col<ElemType>& direction,
    ElemType& splitVal)
{
  const size_t maxNumSamples = 100;
  const size_t numSamples = std::min(maxNumSamples, count);
  arma::uvec samples;

  // Get no more than numSamples distinct samples.
  math::ObtainDistinctSamples(begin, begin + count, numSamples, samples);

  arma::Col<ElemType> values(samples.n_elem);

  // Find the median of scalar products of the samples and the normal vector.
  for (size_t k = 0; k < samples.n_elem; k++)
    values[k] = arma::dot(data.col(samples[k]), direction);

  const ElemType maximum = arma::max(values);
  const ElemType minimum = arma::min(values);
  if (minimum == maximum)
    return false;

  splitVal = arma::median(values);

  // Add a random deviation to the median.
  // This algorithm differs from the method suggested in the random projection
  // tree paper, for two reasons:
  //   1. Evaluating the method proposed in the paper is time-consuming, since
  //      we must solve the furthest-pair problem.
  //   2. The proposed method does not appear to guarantee that a valid split
  //      value will be generated (i.e. it can produce a split value where there
  //      may be no points on the left or the right).
  splitVal += math::Random((minimum - splitVal) * 0.75,
      (maximum - splitVal) * 0.75);

  if (splitVal == maximum)
    splitVal = minimum;

  return true;
}
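The split-value rule condensed into a standalone sketch: project onto a random direction, take the median, then perturb it within 75% of the distance to either extreme.

#include <armadillo>

// Project the points onto a random unit direction, take the median
// projection, and add a bounded random deviation, as in GetSplitVal() above.
int main()
{
  arma::mat data(5, 200, arma::fill::randu);
  arma::vec direction(5, arma::fill::randn);
  direction /= arma::norm(direction, 2);

  const arma::vec values = data.t() * direction; // one projection per point
  double splitVal = arma::median(values);

  const double lo = (values.min() - splitVal) * 0.75;
  const double hi = (values.max() - splitVal) * 0.75;
  const arma::vec u(1, arma::fill::randu);
  splitVal += lo + (hi - lo) * u(0);
  return 0;
}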