Esempio n. 1
0
static void kmeans_center_multiple_restarts(
    unsigned nb_restarts, cluster_t nb_center,
    void (*center_init_f)(cluster_t, histogram_c &, dataset_t &, nbgen &),
    histogram_c &center, dataset_t &dataset, nbgen &rng) {
  std::vector<histogram_c> center_c(nb_restarts);
  for (unsigned i = 0; i < nb_restarts; ++i)
    center_init_f(nb_center, center_c[i], dataset, rng);

  unsigned nb_features = dataset[0].histogram.size();
  std::vector<double> cluster_dists(nb_restarts);
  for (unsigned r = 0; r < nb_restarts; ++r) {
    double sum = 0;
    unsigned count = 0;
    std::vector<double> distances(nb_center, 0);
    for (unsigned i = 0; i < nb_center; ++i) {
      for (unsigned j = 0; j < nb_center; ++j) {
        if (j == i)
          continue;
        double dist = l2_distance(center_c[r][i], center_c[r][j], nb_features);
        distances[i] += dist;
        ++count;
      }
      sum += distances[i];
    }
    cluster_dists[r] = sum / count;
    // printf("restart:%u -> %f\n", r, cluster_dists[r]);
  }
  size_t max_cluster = std::distance(
      cluster_dists.begin(),
      std::max_element(cluster_dists.begin(), cluster_dists.end()));
  // printf("min center index: %zu\n", max_cluster);
  center = center_c[max_cluster];
}
Esempio n. 2
0
    /**
     * Tunes the inflation power used by cluster_dists() so that the number of
     * clusters approaches floor(sqrt(dists.rows)).
     *
     * The clustering is first computed with vars.mcl_inflation_power. If it
     * yields too few clusters, the part of the power above 1 is repeatedly
     * scaled by (2 - 0.6) = 1.4; if it yields too many, that part is repeatedly
     * scaled by 0.6. At most vars.cluster_max_iterations attempts are made.
     *
     * - When growing, the search stops on an exact hit; on overshoot it falls
     *   back to the last clustering that did not exceed the target (the
     *   initial one if the very first attempt overshoots).
     * - When shrinking, the search stops as soon as the count is at or below
     *   the target.
     *
     * @param dists       Input matrix handed to cluster_dists(); its row count
     *                    determines the target number of clusters.
     * @param idxClusters Output: clusters produced by interpret_clusters() for
     *                    the selected clustering. NOTE(review): the first
     *                    interpret_clusters() call is made without clearing,
     *                    so callers presumably pass an empty vector -- confirm.
     * @param vars        Supplies mcl_inflation_power and
     *                    cluster_max_iterations, and is forwarded to
     *                    cluster_dists().
     */
    void find_optimal_clustering(Mat &dists, std::vector<idx_cluster_t> &idxClusters, const cluster_vars &vars) {

        const int optimalClusters = static_cast<int>(floor(sqrt(dists.rows)));

        const double r = vars.mcl_inflation_power;
        const double multStep = 0.6;
        const int iterations = vars.cluster_max_iterations;

        // Only the part of the inflation power above 1 is scaled, which keeps
        // the effective power (1 + r_dec) above 1 as long as r starts above 1.
        double r_dec = r - 1;

        Mat initial;
        cluster_dists(dists, initial, r, vars);
        interpret_clusters(initial, idxClusters);

        // No meaningful target (e.g. dists has no rows): keep the initial
        // clustering rather than comparing against a zero target.
        if(optimalClusters <= 0)
            return;

        int nbClusters = static_cast<int>(idxClusters.size());

        if(nbClusters < optimalClusters) {
            // want more clusters - larger r

            // Last clustering known not to exceed the target. Seeded with the
            // initial result so an overshoot on the first attempt has
            // something valid to fall back to.
            Mat prevmat;
            initial.copyTo(prevmat);

            for(int i = 0; i < iterations; i++) {
                r_dec *= (1 - multStep) + 1;
                const double r_used = 1 + r_dec;

                Mat mclmat;
                cluster_dists(dists, mclmat, r_used, vars);
                idxClusters.clear();
                interpret_clusters(mclmat, idxClusters);
                nbClusters = static_cast<int>(idxClusters.size());

                if(nbClusters > optimalClusters) {
                    // Overshot: discard this attempt's clusters and restore
                    // the previous clustering.
                    idxClusters.clear();
                    interpret_clusters(prevmat, idxClusters);
                    break;
                }
                if(nbClusters == optimalClusters) {
                    break;
                }
                mclmat.copyTo(prevmat);
            }
        }
        else if(nbClusters > optimalClusters) {
            // want fewer clusters - smaller r

            for(int i = 0; i < iterations; i++) {
                r_dec *= multStep;
                const double r_used = 1 + r_dec;

                Mat mclmat;
                cluster_dists(dists, mclmat, r_used, vars);
                idxClusters.clear();
                interpret_clusters(mclmat, idxClusters);
                nbClusters = static_cast<int>(idxClusters.size());

                if(nbClusters <= optimalClusters) {
                    break;
                }
            }
        }
    }