void cluster_document::kmedoids(int clusters, std::vector<cluster_document> &document_list) { int length = document_list.size(); std::vector<int> medoid_ids = random_subset(clusters, length); recompute_clusters(medoid_ids, document_list); int steps = KMEDOIDS_STEPS; double cost = compute_cost(medoid_ids, document_list); while (steps-- > 0) { //std::cerr << steps << std::endl; int best_document = -1; double best_cost = cost; for (auto &document: document_list) { int old_medoid = medoid_ids[document.cluster_]; if (old_medoid == document.id_) { continue; } medoid_ids[document.cluster_] = document.id_; double new_cost = compute_cost(medoid_ids, document_list); if (new_cost < best_cost) { cost = best_cost = new_cost; best_document = document.id_; } medoid_ids[document.cluster_] = old_medoid; } if (best_document == -1) { break; } auto &document = document_list[best_document]; medoid_ids[document.cluster_] = document.id_; recompute_clusters(medoid_ids, document_list); } }
void cluster_document::kmeans(int clusters, std::vector<cluster_document> &document_list) { int length = document_list.size(); std::vector<int> centroid_ids = random_subset(clusters, length); int steps = KMEANS_STEPS; while (steps--) { //std::cerr << "Steps left: " << steps << std::endl; //reassigning for (auto &document: document_list) { document.cluster_ = 0; for (int cluster = 0; cluster < clusters; ++cluster) { int id = centroid_ids[cluster]; int old_id = centroid_ids[document.cluster_]; if (document.distance(id) < document.distance(old_id)) { document.cluster_ = cluster; } } } //recomputing centroids std::vector<double> cluster_distance(length); for (int i = 0; i < length; ++i) { for (int j = i + 1; j < length; ++j) { if (document_list[i].cluster_ == document_list[j].cluster_) { cluster_distance[i] += document_list[i].distance(j); cluster_distance[j] += document_list[j].distance(i); } } } centroid_ids.assign(clusters, -1); for (auto &document: document_list) { if (centroid_ids[document.cluster_] == -1 || cluster_distance[centroid_ids[document.cluster_]] > cluster_distance[document.id_]) { centroid_ids[document.cluster_] = document.id_; } } } }
/**
 * Randomised self-test for random_subset(values, m).
 *
 * For every array size n in [0, 100] and every subset size m in [0, n], draws
 * 100 subsets of {0, ..., n-1} and asserts that each has exactly m elements,
 * that every element is < n, and that no element occurs twice. Prints one
 * progress line per array size.
 */
int main() {
    const size_t max_size = 100;
    const int trials = 100;

    for (size_t n = 0; n <= max_size; ++n) {
        for (size_t m = 0; m <= n; ++m) {
            // values = {0, 1, ..., n-1}
            std::vector< size_t > values(n);
            for (size_t j = 0; j < n; ++j) {
                values[j] = j;
            }

            for (int trial = 0; trial < trials; ++trial) {
                std::vector< size_t > subset = random_subset(values, m);
                assert(subset.size() == m);

                // Every drawn value must be in range and must not repeat.
                std::vector< bool > already_drawn(n, false);
                for (size_t k = 0; k < subset.size(); ++k) {
                    const size_t x = subset[k];
                    assert(x < n);
                    assert(!already_drawn[x]);
                    already_drawn[x] = true;
                }
            }
        }
        std::cout << "passed random tests for arrays of size " << n << std::endl;
    }
    return 0;
}
/**
 * Command-line driver for random_permutation() and random_subset().
 *
 * Options:
 *   -s SEED   seed handed to srand()
 *   -a p|s    algorithm to run: 'p' = permutation, 's' = subset (required)
 *   -h        print usage and exit with status 1
 *
 * Positional arguments:
 *   permutation: the integers to permute
 *   subset:      the subset size m, followed by the integers to draw from
 *
 * The result is printed on stdout as space-separated integers followed by a
 * newline (subset output is sorted ascending). Any usage error exits with
 * status 1.
 */
int main(int argc, char** argv) {
    int opt_char;
    enum algorithm_t {
        e_null = 0,
        e_permutation = 1,
        e_subset = 2,
    };
    algorithm_t algorithm = e_null;

    /**
     * Long options are not portable with getopt, so only short ones are used.
     */
    while ((opt_char = getopt(argc, argv, "hs:a:")) != -1) {
        switch (opt_char) {
        case 's': {
            int seed = static_cast<int>(strtol(optarg, nullptr, 10));
            srand(seed);
            break;
        }
        case 'a':
            // Only the first character of the argument selects the algorithm.
            switch (*optarg) {
            case 'p':
                algorithm = e_permutation;
                break;
            case 's':
                algorithm = e_subset;
                break;
            default:
                fprintf(stderr, "unsupported algorithm %s\n", optarg);
                exit(1);
            }
            break;
        case 'h':
            fprintf(stderr, "Usage: %s -s<SEED> integer-list\n", argv[0]);
            exit(1);
        case '?':
            exit(1);
        }
    }

    if (!algorithm) {
        fprintf(stderr, "algorithm(-a) not specified\n");
        exit(1);
    }

    switch (algorithm) {
    case e_null:
        // Unreachable: rejected above. Listed so every enumerator is handled.
        break;
    case e_permutation: {
        int len = argc - optind;
        int* a = new int[len];
        for (int i = 0; i < len; ++i)
            a[i] = static_cast<int>(strtol(argv[optind + i], nullptr, 10));
        random_permutation(len, a);
        for (int i = 0; i < len; ++i)
            fprintf(stdout, "%d ", a[i]);
        fprintf(stdout, "\n");
        delete[] a;  // allocated with new[], so it must be released with delete[]
        break;
    }
    case e_subset: {
        // The first positional argument is the subset size; it must exist
        // before argv[optind] is handed to strtol().
        if (optind >= argc) {
            fprintf(stderr, "subset size not specified\n");
            exit(1);
        }
        int m = static_cast<int>(strtol(argv[optind], nullptr, 10));
        int len = argc - optind - 1;
        // A subset cannot be negative-sized or larger than the input list.
        if (m < 0 || m > len) {
            fprintf(stderr, "subset size %d out of range [0, %d]\n", m, len);
            exit(1);
        }
        int* b = new int[m];
        int* a = new int[len];
        for (int i = 0; i < len; ++i)
            a[i] = static_cast<int>(strtol(argv[optind + 1 + i], nullptr, 10));
        random_subset(len, a, m, b);
        std::sort(b, b + m);
        for (int i = 0; i < m; ++i)
            fprintf(stdout, "%d ", b[i]);
        fprintf(stdout, "\n");
        delete[] a;
        delete[] b;
        break;
    }
    }
    return 0;
}