Example #1
0
// Seeds k centers in the style of k-means++: the first center is a point drawn
// via KMeans_GetRandom, and each later center is a point drawn with probability
// proportional to its current entry in the per-point squared-distance table.
//
//   k       - number of centers to produce
//   centers - output buffer; receives k rows of d_ Scalars each
//
// Returns the cost reported by the last SeedKmppUpdateAssignment call, or, when
// no further centers are drawn (k <= 1), the sum of the initial table entries.
Scalar KmTree::SeedKMeansPlusPlus(int k, Scalar *centers) const {
  const size_t point_bytes = d_*sizeof(Scalar);

  // Per-point cost table; slot p belongs to the point point_indices_[p].
  Scalar *sq_dist = (Scalar*)malloc(n_ * sizeof(Scalar));
  KM_ASSERT(sq_dist != 0);

  // First center: put every point in cluster 0 and copy one random point.
  SeedKmppSetClusterIndex(top_node_, 0);
  int pick = KMeans_GetRandom(n_);
  memcpy(centers, points_ + point_indices_[pick]*d_, point_bytes);

  // Fill the table relative to that first center and total it up.
  Scalar total_cost = 0;
  for (int p = 0; p < n_; p++) {
    sq_dist[p] = KMeans_PointDistSq(points_ + point_indices_[p]*d_, centers, d_);
    total_cost += sq_dist[p];
  }

  // Remaining centers: sample proportionally to the table entries.
  for (int cluster = 1; cluster < k; cluster++) {
    do {
      // Walk the running sum until it reaches a random fraction of the total.
      const Scalar cutoff = (rand() / Scalar(RAND_MAX)) * total_cost;
      Scalar running = 0;
      pick = 0;
      while (pick < n_) {
        running += sq_dist[pick];
        if (running >= cutoff)
          break;
        pick++;
      }
      // If the running sum never reached the cutoff, draw again.
    } while (pick >= n_);

    memcpy(centers + cluster*d_, points_ + point_indices_[pick]*d_, point_bytes);
    total_cost = SeedKmppUpdateAssignment(top_node_, cluster, centers, sq_dist);
  }

  free(sq_dist);
  return total_cost;
}
Example #2
0
// See KMeans.h
//
// Runs k-means `attempts` times, each time starting from k distinct points
// chosen at random (without replacement) as the initial centers, and hands
// each attempt to RunKMeansOnce together with the running min/max/total
// statistics and the caller's output buffers.
//
//   n, d           - number of points and number of Scalars per point
//   k              - requested number of clusters (must be >= 1); clamped to n
//   points         - n*d Scalars, one point per row
//   attempts       - number of independent runs
//   ret_clusters,
//   ret_assignment - passed through unchanged to RunKMeansOnce
//
// Returns min_cost as maintained by RunKMeansOnce (it stays at its initial
// value of -1 if no attempt is run).
Scalar KMeans::RunKMeans(int n, int k, int d, Scalar *points, int attempts,
                 Scalar *ret_clusters, int *ret_assignment) {
  KM_ASSERT(k >= 1);
  
  // Create the tree and log
  LOG(false, "Running k-means..." << std::endl);
  KmTree tree(n, d, points);
  LOG(false, "Done preprocessing..." << std::endl);

  // Initialization
  Scalar *clusters = (Scalar*)malloc(sizeof(Scalar)*k*d);
  int *unused_clusters = (int*)malloc(sizeof(int)*n);
  KM_ASSERT(clusters != 0 && unused_clusters != 0);
  Scalar min_cost = -1, max_cost = -1, total_cost = 0;
  double min_time = -1, max_time = -1, total_time = 0;
  
  // Handle k > n: only n distinct centers can be drawn, so fill the (k-n)
  // surplus rows of the scratch buffer with 0xFF bytes and shrink k.
  // The surplus starts at row n and spans (k-n)*d Scalars; the previous length
  // of (k-d) Scalars could run past the allocation or go negative.
  // NOTE(review): only the local scratch buffer is marked here; whether the
  // surplus rows of ret_clusters should be marked too depends on RunKMeansOnce.
  if (k > n) {
    memset(clusters + n*d, -1, sizeof(Scalar)*(k-n)*d);
    k = n;
  }

  // Run all the attempts
  for (int attempt = 0; attempt < attempts; attempt++) {
    double start_time = GetSeconds();

    // Choose clusters uniformly at random
    // (partial shuffle: each drawn index is overwritten by the last unused one,
    // so no point is picked twice within an attempt)
    for (int i = 0; i < n; i++)
      unused_clusters[i] = i;
    int num_unused_clusters = n;
    for (int i = 0; i < k; i++) {
      int j = KMeans_GetRandom(num_unused_clusters--);
      memcpy(clusters + i*d, points + unused_clusters[j]*d, d*sizeof(Scalar));
      unused_clusters[j] = unused_clusters[num_unused_clusters];
    }
    
    // Run k-means
    RunKMeansOnce(tree, n, k, d, points, clusters, &min_cost, &max_cost, &total_cost, start_time,
                  &min_time, &max_time, &total_time, ret_clusters, ret_assignment);
  }
  LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);

  // Clean up and return
  free(unused_clusters);
  free(clusters);
  return min_cost;
}