void KMeans::run() { bool move; bool some_point_is_moving = true; unsigned int num_iterations = 0; PointId pid; ClusterId cid, to_cluster; Distance d, min; // Initial partition of points initial_partition_points(); // Until not converge while (some_point_is_moving) { some_point_is_moving = false; compute_centroids(); // for each point for (pid=0; pid<n_; ++pid) { // distance from current cluster ClusterId curr_ID = points_to_clusters_[pid]; Point curr_centroid = centroids_.row(curr_ID); min = distance(curr_centroid, X_.getdatai(pid)); // for each centroid cid = 0; move = false; for(int k=0; k<k_;++k) { d = distance(centroids_.row(k), X_.getdatai(pid)); if (d < min) { min = d; move = true; to_cluster = cid; // remove from current cluster clusters_to_points_[points_to_clusters_[pid]].erase(pid); some_point_is_moving = true; } cid++; } // move towards a closer centroid if (move) { // insert points_to_clusters_[pid] = to_cluster; clusters_to_points_[to_cluster].insert(pid); } } num_iterations++; } std::cout<<"Num Iter: "<<num_iterations<<std::endl; }
/*
 * Clusters data.
 *
 * Sets the OpenMP thread count to NUM_THREADS, initializes one lock per
 * thread slot, then alternates populate() and compute_centroids() for as
 * long as again() returns non-zero. The locks are destroyed before
 * returning so that every omp_init_lock() is paired with an
 * omp_destroy_lock().
 */
static void kmeans(void)
{
	int i;

	omp_set_num_threads(NUM_THREADS);
	for (i = 0; i < NUM_THREADS; i++)
		omp_init_lock(&lock[i]);

	/* Cluster data. */
	do
	{
		populate();
		compute_centroids();
	} while (again());

	/*
	 * Release the locks created above.
	 * NOTE(review): assumes nothing uses lock[] after kmeans() returns --
	 * confirm against the rest of the file.
	 */
	for (i = 0; i < NUM_THREADS; i++)
		omp_destroy_lock(&lock[i]);
}
void Clusters::generate(RecTable & mods0, int nreclusters, int niterations, bool grahamSchmidt) { bool verbose = true; cout << "Generating clusters using " << mods0.size() << " users" << endl; int i; if(!mods0.size()) { cout << "Bad: mods table uninitialized" << endl; return; } if(verbose) { cout << "Mods table has " << mods0.size() << " users" << endl; cout << "generating clusters" << endl; } clear(); // default to every user in a random bin ModsTable::iterator it; for(it = mods0.begin(), i=0; it != mods0.end(); i++, it++) { if(it == mods0.end()) { cout << "Bad: mods table is smaller than the number of clusters" << endl; return; } clusters[i % nclusters].push_back((*it).first); } cout << "Number of clusters: " << clusters.size() << endl; // recompute them compute_centroids(); restart_this_sucker: if (grahamSchmidt) GrahamSchmidt(1); for(int iter = 0; iter < niterations; iter++) { time_t start_time = time(NULL); if(verbose) cout << "Iteration: " << iter << endl; // wipe the existing clusters clear(); if(verbose) cout << " --> Binning users" << endl; int i = 0; for(ModsTable::iterator user = mods0.begin(); user != mods0.end(); user++, i++) add((*user).first); if(verbose) cout << " --> Copying current centroids" << endl; // make a copy of the current centroids vector<User> oldcentroids; for(i = 0; i < nclusters; i++) { oldcentroids.push_back(User()); for(User::iterator article = centroids[i].begin(); article != centroids[i].end(); article++) { oldcentroids[i].add((*article).first, centroids[i][(*article).first]); } } // recompute them compute_centroids(); float quality=0, difference=0; int n_non_zero = 0; for(i = 0; i < nclusters; i++) { if(clusters[i].empty()) continue; n_non_zero++; float q = spread[i]; quality += q; // float d = centroids[i] | oldcentroids[i]; float d = dist(¢roids[i], &oldcentroids[i], true); difference += d; int n_per_user = 0; for(vector<int>::iterator it = clusters[i].begin(); it != clusters[i].end(); it++) { n_per_user += mods0[(*it)].size(); 
} if(verbose) cout << "\t( #" << setw(2) << i << ", Q:" << setprecision(3) << setw(7) << q << ", Delta:" << setprecision(3) << setw(7) << d << ", Users:" << setw(6) << clusters[i].size() << ", Arts:" << setw(6) << centroids[i].size() << ", Avg # art:" << setw(6) << float(n_per_user)/float(clusters[i].size()) << ")" << endl; } if(n_non_zero) { quality /= float(n_non_zero); difference /= float(n_non_zero); } if(verbose) cout << "Quality " << quality << ", Difference " << difference << ": Total time " << difftime(time(NULL), start_time) << "sec" << endl; if(difference > .98) break; } // dump the self-similarity matrix for the centroids cout << endl << "\x1b[31mCentroid Comparison Matrix\x1b[0m" << endl; for(i = 0; i < nclusters; i++) { printf("\t"); for(int j = 0; j < nclusters; j++) { printf("%6.3f", dist(¢roids[i], ¢roids[j], true)); if(j != nclusters -1) printf(","); } printf("\n"); } // dump the self-similarity matrix for the centroids cout << endl << "\x1b[31mCentroid Comparison Matrix (dot)\x1b[0m" << endl; for(i = 0; i < nclusters; i++) { printf("\t"); for(int j = 0; j < nclusters; j++) { printf("%8.3f", dot(¢roids[i], ¢roids[j])); if(j != nclusters -1) printf(","); } printf("\n"); } // and the corresponding number of entries per cluster cout << endl<< "\x1b[31m Article availability\x1b[0m" << endl; for(i = 0; i < nclusters; i++) { cout << "\tCluster #" << i << ": " << centroids[i].size() << ", Norm: " << norm(¢roids[i]) << endl; } if(nreclusters --) goto restart_this_sucker; }