Пример #1
// Builds a kd-tree over n points with d coordinates each.
//
// points holds the coordinates point-major: coordinate j of point i is points[i*d + j]. The
// pointer is kept in points_ rather than copied.
//
// All nodes are carved out of one slab, node_data_, with room for 2n-1 of them. Every node is
// followed by three d-vectors (sum, median, radius), hence the per-node size computed below.
// point_indices_ starts out as the identity permutation and is reordered by BuildNodes.
KmTree::KmTree(int n, int d, Scalar *points): n_(n), d_(d), points_(points) {
  // Initialize memory. Both allocations are checked before point_indices_ is written to, so an
  // out-of-memory condition trips the assertion instead of dereferencing a null pointer.
  int node_size = sizeof(Node) + d_ * 3 * sizeof(Scalar);
  node_data_ = (char*)malloc((2*n-1) * node_size);
  point_indices_ = (int*)malloc(n * sizeof(int));
  KM_ASSERT(node_data_ != 0 && point_indices_ != 0);
  for (int i = 0; i < n; i++)
    point_indices_[i] = i;

  // Build the tree. No bounding box of the whole point set is computed here: BuildNodes works
  // out the bounding box of every node it creates, the root included.
  char *temp_node_data = node_data_;
  top_node_ = BuildNodes(points, 0, n-1, &temp_node_data);
}
Пример #2
// See KMeans.h
//
// Runs `attempts` rounds of k-means over the n points in `points` (d coordinates each), seeding
// every round with KmTree::SeedKMeansPlusPlus. Returns min_cost as maintained by RunKMeansOnce
// across the rounds. ret_clusters and ret_assignment are handed straight to RunKMeansOnce, which
// presumably fills them with the best solution seen -- confirm against its definition.
Scalar KMeans::RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,
                         Scalar *ret_clusters, int *ret_assignment) {
  KM_ASSERT(k >= 1);

  // Create the tree and log
  LOG(false, "Running k-means++..." << std::endl);
  KmTree tree(n, d, points);
  LOG(false, "Done preprocessing..." << std::endl);

  // Initialization: scratch space for the k centers of the attempt in progress, plus the running
  // statistics that RunKMeansOnce updates through the pointers it is given.
  Scalar *clusters = (Scalar*)malloc(sizeof(Scalar)*k*d);
  KM_ASSERT(clusters != 0);
  Scalar min_cost = -1, max_cost = -1, total_cost = 0;
  double min_time = -1, max_time = -1, total_time = 0;

  // Run all the attempts
  for (int attempt = 0; attempt < attempts; attempt++) {
    double start_time = GetSeconds();

    // Choose clusters using k-means++ seeding
    tree.SeedKMeansPlusPlus(k, clusters);

    // Run k-means
    RunKMeansOnce(tree, n, k, d, points, clusters, &min_cost, &max_cost, &total_cost, start_time,
                  &min_time, &max_time, &total_time, ret_clusters, ret_assignment);
  }
  LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);

  // Clean up and return. The scratch centers belong to this function alone.
  free(clusters);
  return min_cost;
}
Пример #3
// Picks k starting centers by k-means++ seeding and writes them to centers (k*d_ scalars).
//
// The first center is a point chosen with KMeans_GetRandom(n_). Every further center is drawn
// with probability proportional to its entry in dist_sq. dist_sq starts out as each point's
// squared distance to the first center and is refreshed by SeedKmppUpdateAssignment after every
// pick (presumably to the squared distance to the nearest center chosen so far -- confirm
// against that function).
//
// dist_sq is indexed by position in point_indices_, not by original point number.
//
// Returns the total reported by the last SeedKmppUpdateAssignment call, or the summed squared
// distance to the single center when k == 1.
Scalar KmTree::SeedKMeansPlusPlus(int k, Scalar *centers) const {
  Scalar *dist_sq = (Scalar*)malloc(n_ * sizeof(Scalar));
  KM_ASSERT(dist_sq != 0);

  // Choose an initial center uniformly at random
  SeedKmppSetClusterIndex(top_node_, 0);
  int i = KMeans_GetRandom(n_);
  memcpy(centers, points_ + point_indices_[i]*d_, d_*sizeof(Scalar));
  Scalar total_cost = 0;
  for (int j = 0; j < n_; j++) {
    dist_sq[j] = KMeans_PointDistSq(points_ + point_indices_[j]*d_, centers, d_);
    total_cost += dist_sq[j];
  }

  // Repeatedly choose more centers
  for (int new_cluster = 1; new_cluster < k; new_cluster++) {
    // Draw a cutoff in [0, total_cost] and take the first point whose running sum of dist_sq
    // reaches it. Rounding can leave the full running sum just short of the cutoff; in that case
    // the scan ends with i == n_ and a fresh cutoff is drawn.
    while (1) {
      Scalar cutoff = (rand() / Scalar(RAND_MAX)) * total_cost;
      Scalar cur_cost = 0;
      for (i = 0; i < n_; i++) {
        cur_cost += dist_sq[i];
        if (cur_cost >= cutoff)
          break;
      }
      if (i < n_)
        break;
    }
    memcpy(centers + new_cluster*d_, points_ + point_indices_[i]*d_, d_*sizeof(Scalar));
    total_cost = SeedKmppUpdateAssignment(top_node_, new_cluster, centers, dist_sq);
  }

  // Clean up and return
  free(dist_sq);
  return total_cost;
}
Пример #4
// See KMeans.h
//
// Runs `attempts` rounds of k-means over the n points in `points` (d coordinates each). Every
// round starts from k distinct input points picked at random. Returns min_cost as maintained by
// RunKMeansOnce across the rounds. ret_clusters and ret_assignment are handed straight to
// RunKMeansOnce, which presumably fills them with the best solution seen -- confirm against its
// definition.
//
// If k > n, only n clusters are run. The unused tail of the scratch center buffer is filled with
// 0xff bytes, the same byte pattern KmTree::DoKMeansStep treats as an invalid center.
Scalar KMeans::RunKMeans(int n, int k, int d, Scalar *points, int attempts,
                 Scalar *ret_clusters, int *ret_assignment) {
  KM_ASSERT(k >= 1);

  // Create the tree and log
  LOG(false, "Running k-means..." << std::endl);
  KmTree tree(n, d, points);
  LOG(false, "Done preprocessing..." << std::endl);

  // Initialization: scratch centers for the attempt in progress, and the pool of point indices
  // that have not been picked as a center yet.
  Scalar *clusters = (Scalar*)malloc(sizeof(Scalar)*k*d);
  int *unused_clusters = (int*)malloc(sizeof(int)*n);
  KM_ASSERT(clusters != 0 && unused_clusters != 0);
  Scalar min_cost = -1, max_cost = -1, total_cost = 0;
  double min_time = -1, max_time = -1, total_time = 0;

  // Handle k > n. The tail starts at center n and spans the remaining (k - n) centers of d
  // scalars each, which is exactly what is left of the k*d scalars allocated above.
  if (k > n) {
    memset(clusters + n*d, 0xff, sizeof(Scalar) * (k - n) * d);
    k = n;
  }

  // Run all the attempts
  for (int attempt = 0; attempt < attempts; attempt++) {
    double start_time = GetSeconds();

    // Choose clusters uniformly at random, without replacement: draw a slot from the live prefix
    // of the pool, use it, then overwrite it with the last live entry and shrink the prefix.
    for (int i = 0; i < n; i++)
      unused_clusters[i] = i;
    int num_unused_clusters = n;
    for (int i = 0; i < k; i++) {
      int j = KMeans_GetRandom(num_unused_clusters);
      num_unused_clusters--;
      memcpy(clusters + i*d, points + unused_clusters[j]*d, d*sizeof(Scalar));
      unused_clusters[j] = unused_clusters[num_unused_clusters];
    }

    // Run k-means
    RunKMeansOnce(tree, n, k, d, points, clusters, &min_cost, &max_cost, &total_cost, start_time,
                  &min_time, &max_time, &total_time, ret_clusters, ret_assignment);
  }
  LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);

  // Clean up and return. Both buffers belong to this function alone.
  free(unused_clusters);
  free(clusters);
  return min_cost;
}
Пример #5
// Performs one k-means iteration over the whole tree.
//
//   k          - number of centers stored in `centers` (k*d_ scalars).
//   centers    - in: current centers; out: mean of the points assigned to each center. A center
//                that received no points is overwritten with 0xff bytes, which marks it invalid.
//   assignment - passed through to DoKMeansStepAtNode, which fills it per point when non-null.
//
// Centers that already consist solely of 0xff bytes on entry are left out of the candidate list.
// At least one center has to be valid, because DoKMeansStepAtNode reads candidates[0]
// unconditionally.
//
// Returns the value produced by DoKMeansStepAtNode for the root node.
Scalar KmTree::DoKMeansStep(int k, Scalar *centers, int *assignment) const {
  // Create an invalid center for comparison purposes. It comes from malloc directly so that it
  // can be released with free() together with the other buffers below.
  Scalar *bad_center = (Scalar*)malloc(d_ * sizeof(Scalar));
  KM_ASSERT(bad_center != 0);
  memset(bad_center, 0xff, d_ * sizeof(Scalar));

  // Allocate data: per-center coordinate sums and point counts (zeroed), plus the list of centers
  // that are still valid.
  Scalar *sums = (Scalar*)calloc(k * d_, sizeof(Scalar));
  int *counts = (int*)calloc(k, sizeof(int));
  int num_candidates = 0;
  int *candidates = (int*)malloc(k * sizeof(int));
  KM_ASSERT(sums != 0 && counts != 0 && candidates != 0);
  for (int i = 0; i < k; i++) {
    if (memcmp(centers + i*d_, bad_center, d_ * sizeof(Scalar)) != 0)
      candidates[num_candidates++] = i;
  }

  // Find nodes
  Scalar result = DoKMeansStepAtNode(top_node_, num_candidates, candidates, centers, sums,
                                     counts, assignment);

  // Set the new centers
  for (int i = 0; i < k; i++) {
    if (counts[i] > 0) {
      KMeans_PointScale(sums + i*d_, Scalar(1) / counts[i], d_);
      KMeans_PointCopy(centers + i*d_, sums + i*d_, d_);
    } else {
      memcpy(centers + i*d_, bad_center, d_ * sizeof(Scalar));
    }
  }

  // Cleanup memory
  free(candidates);
  free(counts);
  free(sums);
  free(bad_center);
  return result;
}
Пример #6
// A recursive version of DoKMeansStep. This determines which clusters all points rooted at node
// will be assigned to, and updates sums, counts and assignment (if not null) accordingly.
//
//   node       - subtree to process.
//   k          - number of entries in candidates; must be at least 1, since candidates[0] is read
//                unconditionally.
//   candidates - cluster indices that could still be the closest cluster for points in this
//                subtree.
//   centers    - all cluster centers, d_ scalars each, indexed by cluster index.
//   sums       - per-cluster coordinate sums; the node's sum is added to the chosen cluster.
//   counts     - per-cluster point counts; the node's point count is added to the chosen cluster.
//   assignment - if non-null, receives the chosen cluster index for every point in the node.
//
// Returns the summed results of the two child calls when the node was split further, otherwise
// GetNodeCost of this node against the chosen center.
Scalar KmTree::DoKMeansStepAtNode(const Node *node, int k, int *candidates, Scalar *centers,
                                  Scalar *sums, int *counts, int *assignment) const {
  // Determine which center the node center is closest to
  Scalar min_dist_sq = KMeans_PointDistSq(node->median, centers + candidates[0]*d_, d_);
  int closest_i = candidates[0];
  for (int i = 1; i < k; i++) {
    Scalar dist_sq = KMeans_PointDistSq(node->median, centers + candidates[i]*d_, d_);
    if (dist_sq < min_dist_sq) {
      min_dist_sq = dist_sq;
      closest_i = candidates[i];
    }
  }

  // If this is a non-leaf node, recurse if necessary
  if (node->lower_node != 0) {
    // Build the new list of candidates
    int new_k = 0;
    int *new_candidates = (int*)malloc(k * sizeof(int));
    KM_ASSERT(new_candidates != 0);
    for (int i = 0; i < k; i++) {
      if (!ShouldBePruned(node->median, node->radius, centers, closest_i, candidates[i]))
        new_candidates[new_k++] = candidates[i];
    }

    // Recurse if there's at least two
    if (new_k > 1) {
      Scalar result = DoKMeansStepAtNode(node->lower_node, new_k, new_candidates, centers,
                                         sums, counts, assignment) +
                      DoKMeansStepAtNode(node->upper_node, new_k, new_candidates, centers,
                                         sums, counts, assignment);
      free(new_candidates);
      return result;
    }

    // Fewer than two candidates survived, so the whole subtree goes to closest_i below. The
    // list is released on this path as well.
    free(new_candidates);
  }

  // Assigns all points within this node to a single center
  KMeans_PointAdd(sums + closest_i*d_, node->sum, d_);
  counts[closest_i] += node->num_points;
  if (assignment != 0) {
    for (int i = node->first_point_index; i < node->first_point_index + node->num_points; i++)
      assignment[point_indices_[i]] = closest_i;
  }
  return GetNodeCost(node, centers + closest_i*d_);
}
Пример #7
// Build a kd tree from the given set of points.
//
// Creates the node covering point_indices_[first_index .. last_index] (inclusive) and,
// recursively, everything below it.
//
//   points         - coordinates, point-major: coordinate j of point p is points[p*d_ + j].
//   first_index,
//   last_index     - inclusive range into point_indices_; the range is reordered in place so
//                    that each child covers a contiguous sub-range.
//   next_node_data - cursor into the preallocated node slab; advanced past the node and its three
//                    d_-vectors (sum, median, radius) by every call.
//
// For each node: median is the midpoint of the bounding box of its points, radius the half-extent
// of that box per dimension, and sum the coordinate-wise sum of its points. A node whose points
// all coincide becomes a leaf (both child pointers null) with opt_cost 0. For other nodes,
// opt_cost is GetNodeCost of both children against the mean of the node's points.
//
// Returns the node created.
KmTree::Node *KmTree::BuildNodes(Scalar *points, int first_index, int last_index,
                                 char **next_node_data) {
  // Allocate the node and the three vectors that trail it in the slab
  Node *node = (Node*)(*next_node_data);
  (*next_node_data) += sizeof(Node);
  node->sum = (Scalar*)(*next_node_data);
  (*next_node_data) += sizeof(Scalar) * d_;
  node->median = (Scalar*)(*next_node_data);
  (*next_node_data) += sizeof(Scalar) * d_;
  node->radius = (Scalar*)(*next_node_data);
  (*next_node_data) += sizeof(Scalar) * d_;

  // Fill in basic info
  node->num_points = (last_index - first_index + 1);
  node->first_point_index = first_index;

  // Calculate the bounding box: bound_p1 holds the per-dimension minimum, bound_p2 the maximum.
  // The scratch vectors come from malloc directly so they can be released with free() below.
  Scalar *first_point = points + point_indices_[first_index] * d_;
  Scalar *bound_p1 = (Scalar*)malloc(d_ * sizeof(Scalar));
  Scalar *bound_p2 = (Scalar*)malloc(d_ * sizeof(Scalar));
  KM_ASSERT(bound_p1 != 0 && bound_p2 != 0);
  KMeans_PointCopy(bound_p1, first_point, d_);
  KMeans_PointCopy(bound_p2, first_point, d_);
  for (int i = first_index+1; i <= last_index; i++) {
    for (int j = 0; j < d_; j++) {
      Scalar c = points[point_indices_[i]*d_ + j];
      if (bound_p1[j] > c) bound_p1[j] = c;
      if (bound_p2[j] < c) bound_p2[j] = c;
    }
  }

  // Calculate bounding box stats and delete the bounding box memory. split_d ends up as the
  // dimension in which the box is widest.
  Scalar max_radius = -1;
  int split_d = -1;
  for (int j = 0; j < d_; j++) {
    node->median[j] = (bound_p1[j] + bound_p2[j]) / 2;
    node->radius[j] = (bound_p2[j] - bound_p1[j]) / 2;
    if (node->radius[j] > max_radius) {
      max_radius = node->radius[j];
      split_d = j;
    }
  }
  free(bound_p1);
  free(bound_p2);

  // If the max spread is 0, make this a leaf node. All points in the range coincide, so their
  // sum is the first point scaled by the point count.
  if (max_radius == 0) {
    node->lower_node = node->upper_node = 0;
    KMeans_PointCopy(node->sum, first_point, d_);
    if (last_index != first_index)
      KMeans_PointScale(node->sum, Scalar(last_index - first_index + 1), d_);
    node->opt_cost = 0;
    return node;
  }

  // Partition the points around the midpoint in this dimension. The partitioning is done in-place
  // by iterating from left-to-right and right-to-left in the same way that partitioning is done
  // for quicksort. Points strictly below the midpoint go to the lower range, the rest to the
  // upper range; size1 counts the points confirmed to be in the lower range.
  Scalar split_pos = node->median[split_d];
  int i1 = first_index, i2 = last_index, size1 = 0;
  while (i1 <= i2) {
    bool is_i1_good = (points[point_indices_[i1]*d_ + split_d] < split_pos);
    bool is_i2_good = (points[point_indices_[i2]*d_ + split_d] >= split_pos);
    if (!is_i1_good && !is_i2_good) {
      // Both ends sit on the wrong side: swapping them puts both on the right side.
      int temp = point_indices_[i1];
      point_indices_[i1] = point_indices_[i2];
      point_indices_[i2] = temp;
      is_i1_good = is_i2_good = true;
    }
    if (is_i1_good) {
      i1++;
      size1++;
    }
    if (is_i2_good) {
      i2--;
    }
  }

  // Create the child nodes. Both ranges must be non-empty.
  KM_ASSERT(size1 >= 1 && size1 <= last_index - first_index);
  node->lower_node = BuildNodes(points, first_index, first_index + size1 - 1, next_node_data);
  node->upper_node = BuildNodes(points, first_index + size1, last_index, next_node_data);

  // Calculate the new sum and opt cost. center is the mean of all points in this node.
  KMeans_PointCopy(node->sum, node->lower_node->sum, d_);
  KMeans_PointAdd(node->sum, node->upper_node->sum, d_);
  Scalar *center = (Scalar*)malloc(d_ * sizeof(Scalar));
  KM_ASSERT(center != 0);
  KMeans_PointCopy(center, node->sum, d_);
  KMeans_PointScale(center, Scalar(1) / node->num_points, d_);
  node->opt_cost = GetNodeCost(node->lower_node, center) + GetNodeCost(node->upper_node, center);
  free(center);
  return node;
}