// Picks k seed centers from the tree's points: the first uniformly at random,
// each later one with probability proportional to its current entry in a
// per-point squared-distance table (k-means++ style weighting).
//
//   k       - number of centers to choose.
//   centers - output buffer; center c is written to centers[c*d_ .. c*d_ + d_ - 1].
//
// Returns the value last produced by SeedKmppUpdateAssignment, or, when no
// additional center is chosen (k <= 1), the sum of squared distances from
// every point to the first center.
Scalar KmTree::SeedKMeansPlusPlus(int k, Scalar *centers) const {
  // Per-point weight table used for the weighted draws below.
  Scalar *sq_dists = static_cast<Scalar*>(malloc(n_ * sizeof(Scalar)));
  KM_ASSERT(sq_dists != 0);

  // First center: a uniformly random point.
  SeedKmppSetClusterIndex(top_node_, 0);
  int pick = KMeans_GetRandom(n_);
  memcpy(centers, points_ + point_indices_[pick]*d_, d_*sizeof(Scalar));

  // Initialise every point's squared distance to that first center and
  // accumulate the overall cost.
  Scalar cost = 0;
  for (int p = 0; p < n_; p++) {
    sq_dists[p] = KMeans_PointDistSq(points_ + point_indices_[p]*d_, centers, d_);
    cost += sq_dists[p];
  }

  // Remaining centers: weighted draws.
  for (int cluster = 1; cluster < k; cluster++) {
    // Draw a threshold in [0, cost] and take the first point whose running
    // sum reaches it. If rounding lets the scan fall off the end, redraw.
    do {
      const Scalar threshold = (rand() / Scalar(RAND_MAX)) * cost;
      Scalar running = 0;
      pick = 0;
      while (pick < n_) {
        running += sq_dists[pick];
        if (running >= threshold)
          break;
        pick++;
      }
    } while (pick >= n_);

    memcpy(centers + cluster*d_, points_ + point_indices_[pick]*d_, d_*sizeof(Scalar));
    // NOTE(review): presumably refreshes sq_dists for the new center and
    // returns the new total cost -- confirm against SeedKmppUpdateAssignment.
    cost = SeedKmppUpdateAssignment(top_node_, cluster, centers, sq_dists);
  }

  // Release the scratch table and report the final cost.
  free(sq_dists);
  return cost;
}
// See KMeans.h Scalar KMeans::RunKMeans(int n, int k, int d, Scalar *points, int attempts, Scalar *ret_clusters, int *ret_assignment) { KM_ASSERT(k >= 1); // Create the tree and log LOG(false, "Running k-means..." << std::endl); KmTree tree(n, d, points); LOG(false, "Done preprocessing..." << std::endl); // Initialization Scalar *clusters = (Scalar*)malloc(sizeof(Scalar)*k*d); int *unused_clusters = (int*)malloc(sizeof(int)*n); KM_ASSERT(clusters != 0 && unused_clusters != 0); Scalar min_cost = -1, max_cost = -1, total_cost = 0; double min_time = -1, max_time = -1, total_time = 0; // Handle k > n if (k > n) { memset(clusters + n*d, -1, (k-d)*sizeof(Scalar)); k = n; } // Run all the attempts for (int attempt = 0; attempt < attempts; attempt++) { double start_time = GetSeconds(); // Choose clusters uniformly at random for (int i = 0; i < n; i++) unused_clusters[i] = i; int num_unused_clusters = n; for (int i = 0; i < k; i++) { int j = KMeans_GetRandom(num_unused_clusters--); memcpy(clusters + i*d, points + unused_clusters[j]*d, d*sizeof(Scalar)); unused_clusters[j] = unused_clusters[num_unused_clusters]; } // Run k-means RunKMeansOnce(tree, n, k, d, points, clusters, &min_cost, &max_cost, &total_cost, start_time, &min_time, &max_time, &total_time, ret_clusters, ret_assignment); } LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts); // Clean up and return free(unused_clusters); free(clusters); return min_cost; }