Exemple #1
0
void kmeans_1d_dp(const double *x, const size_t N, const double *y,
                  size_t Kmin, size_t Kmax,
                  int* cluster, double* centers,
                  double* withinss, int* size)
{
  // Optimal 1-D k-means clustering driver (dynamic programming), with
  // optional per-element weights.
  //
  // Input:
  //  x    -- an array of N double precision numbers, not necessarily sorted
  //  N    -- the number of elements in x (and in y, when y is given)
  //  y    -- optional weights, one per element of x; pass a null pointer for
  //          unweighted clustering. Weights that are all identical are
  //          treated exactly like the unweighted case.
  //  Kmin -- the minimum number of clusters expected
  //  Kmax -- the maximum number of clusters expected; it is capped at the
  //          number of unique values found in x
  // Output (caller-provided storage):
  //  cluster  -- N entries; cluster index of every element, reported in the
  //              ORIGINAL order of x
  //  centers, withinss, size -- per-cluster results. They are filled by
  //              backtrack()/backtrack_weighted() when x has more than one
  //              unique value; otherwise only entry 0 is written.
  //              NOTE(review): required capacity is presumably Kmax entries;
  //              confirm against the backtrack helpers.
  // NOTE: All vectors in this program are considered starting at position 0.
  //
  // When N == 0 the function returns immediately and writes nothing.

  // Nothing to cluster. Without this guard the single-cluster branch below
  // would read x[0] past the end of an empty array.
  if(N == 0) {
    return;
  }

  // order[i] is the position in x of the i-th smallest element.
  std::vector<size_t> order(N);
  for(size_t i=0; i<N; ++i) {
    order[i] = i;
  }

  // Sort the index of x in increasing order of x. A function object is used
  // rather than a lambda, matching the original's compatibility choice.
  struct CompareIndex {
    const double * m_x;
    explicit CompareIndex(const double * values) : m_x(values) {}
    bool operator() (size_t i, size_t j) const { return (m_x[i] < m_x[j]);}
  } compi(x);

  std::sort(order.begin(), order.end(), compi);

  std::vector<double> x_sorted(N);
  for(size_t i=0; i<N; ++i) {
    x_sorted[i] = x[order[i]];
  }

  // Check to see if unequal weights are provided: the weighted code path is
  // only taken when at least two weights differ.
  bool is_equally_weighted = true;
  if(y != nullptr) {
    for(size_t i=1; i<N; ++i) {
      if(y[i] != y[i-1]) {
        is_equally_weighted = false;
        break;
      }
    }
  }

  // Weights re-ordered to line up with x_sorted (left empty if unweighted).
  std::vector<double> y_sorted;
  if(! is_equally_weighted) {
    y_sorted.resize(N);
    for(size_t i=0; i<N; ++i) {
      y_sorted[i] = y[order[i]];
    }
  }

  const size_t nUnique = numberOfUnique(x_sorted.begin(), x_sorted.end());

  // There cannot be more clusters than distinct values.
  Kmax = nUnique < Kmax ? nUnique : Kmax;

  if(nUnique > 1) { // The case when not all elements are equal.

    // Dynamic programming tables, Kmax rows by N columns, filled by the
    // fill_*_dp_matrix helpers.
    std::vector< std::vector< double > > S( Kmax, std::vector<double>(N) );
    std::vector< std::vector< size_t > > J( Kmax, std::vector<size_t>(N) );

    size_t Kopt;

    // Fill in dynamic programming matrix
    if(is_equally_weighted) {

      fill_dp_matrix(x_sorted, S, J);

      // Choose an optimal number of levels between Kmin and Kmax
      Kopt = select_levels(x_sorted, J, Kmin, Kmax);

    } else {
      fill_weighted_dp_matrix(x_sorted, y_sorted, S, J);

      // Choose an optimal number of levels between Kmin and Kmax
      Kopt = select_levels_weighted(x_sorted, y_sorted, J, Kmin, Kmax);
    }

    if (Kopt < Kmax) {
      // Keep only the first Kopt rows of J before backtracking.
      J.erase(J.begin() + Kopt, J.end());
    }

    std::vector<int> cluster_sorted(N);

    // Backtrack to find the clusters beginning and ending indices
    if(is_equally_weighted) {
      backtrack(x_sorted, J, &cluster_sorted[0], centers, withinss, size);
    } else {
      backtrack_weighted(x_sorted, y_sorted, J, &cluster_sorted[0], centers, withinss, size);
    }

    for(size_t i = 0; i < N; ++i) {
      // Obtain clustering on data in the original order
      cluster[order[i]] = cluster_sorted[i];
    }

  } else {  // A single cluster that contains all elements

    for(size_t i=0; i<N; ++i) {
      cluster[i] = 0;
    }

    centers[0] = x[0];
    withinss[0] = 0.0;

    if(is_equally_weighted) {
      size[0] = static_cast<int>(N);
    } else {
      // This branch is only reached when the weights are NOT all equal, so
      // N * y[0] would not be the total weight; add the weights up instead.
      // NOTE(review): assumes backtrack_weighted() also reports the summed
      // weight per cluster -- confirm against that helper.
      double total_weight = 0.0;
      for(size_t i=0; i<N; ++i) {
        total_weight += y[i];
      }
      size[0] = static_cast<int>(total_weight);
    }
  }
}  //end of kmeans_1d_dp()
ClusterResult
kmeans_1d_dp(const std::vector<double> & x, size_t Kmin, size_t Kmax)
{
    // Optimal 1-D k-means clustering driver (dynamic programming) for a
    // 1-based input vector.
    //
    // Input:
    //  x -- a vector of numbers, not necessarily sorted
    //  Kmin -- the minimum number of clusters expected
    //  Kmax -- the maximum number of clusters expected; it is capped at the
    //          number of unique values found in x[1..N]
    // Returns:
    //  A ClusterResult whose vectors follow the same 1-based layout as x.
    //  If x holds no data (size 0 or 1), nClusters is 0, cluster has
    //  x.size() entries, and centers/withinss/size hold only slot 0.
    // NOTE: All vectors in this program are considered starting at position 1,
    //       position 0 is not used.

    ClusterResult result;

    // No data points. Without this guard x.size() - 1 wraps around for an
    // empty vector (size_t is unsigned) and the single-cluster branch reads
    // x[1] out of bounds for a vector of size 1.
    if(x.size() <= 1) {
        result.nClusters = 0;
        result.cluster = std::vector<size_t>(x.size(), 0);
        result.centers.resize(1);
        result.withinss.resize(1);
        result.size.resize(1);
        return result;
    }

    const size_t N = x.size() - 1;  // N: number of data points (slot 0 excluded)

    // Sorted copy of the data; slot 0 is left in place and never sorted.
    std::vector<double> x_sorted(x);
    std::sort(x_sorted.begin()+1, x_sorted.end());
    const size_t nUnique = numberOfUnique(x_sorted.begin()+1, x_sorted.end());

    // There cannot be more clusters than distinct values.
    Kmax = std::min(nUnique, Kmax);

    if(nUnique > 1) { // The case when not all elements are equal.

        // Dynamic programming tables with one extra row and column because
        // index 0 is unused in both dimensions.
        std::vector< std::vector< double > > D( (Kmax + 1), std::vector<double>(N + 1) );
        std::vector< std::vector< size_t > > B( (Kmax + 1), std::vector<size_t>(N + 1) );

        // Fill in dynamic programming matrix
        fill_dp_matrix(x_sorted, D, B);

        // Choose an optimal number of levels between Kmin and Kmax
        const size_t Kopt = select_levels(x_sorted, B, Kmin, Kmax);

        if (Kopt < Kmax) {
            // Keep only rows 0..Kopt of B before backtracking.
            B.erase(B.begin()+ Kopt + 1, B.end());
        }

        // Backtrack to find the clusters beginning and ending indices.
        // NOTE(review): backtrack() is expected to size result.cluster and
        // result.size, since both are indexed below -- confirm.
        backtrack(x_sorted, B, result);

        // Perform clustering on the original data: walk the clusters in
        // sorted order and assign x[i] to the first cluster whose largest
        // member is >= x[i].
        for(size_t i = 1; i < x.size(); ++i) {
            size_t indexLeft = 1;

            for (size_t k = 1; k < result.size.size(); ++k) {
                // Position in x_sorted of the last element of cluster k.
                const size_t indexRight = indexLeft + result.size[k] - 1;
                if ( x[i] <= x_sorted[indexRight] ) {
                    result.cluster[i] = k;
                    break;
                }
                indexLeft = indexRight + 1;
            }
        }

    } else {  // A single cluster that contains all elements

        result.nClusters = 1;

        // Every data point belongs to cluster 1.
        result.cluster = std::vector<size_t>(N + 1, 1);

        // Two slots each: the unused slot 0 plus cluster 1.
        result.centers.resize(2);
        result.withinss.resize(2);
        result.size.resize(2);

        result.centers[1] = x[1];
        result.withinss[1] = 0.0;
        result.size[1] = N;
    }
    return result;
}  //end of kmeans_1d_dp()