示例#1
0
int main()
{
  // LOAD DATA
  arma::mat X;

  // A) Toy Data
  //  char inputFile[] = "../data_files/toyclusters/toyclusters.dat";
  
  // B) X4.dat
  //char inputFile[] = "./X4.dat";
  
  // C) fisher data
  //char inputFile[] = "./fisher.dat";

  // D) MNIST data
  //char inputFile[] = "../data_files/MNIST/MNIST.dat";
  
  // E) Reduced MNIST (5000x400)
  //char inputFile[] = "../data_files/MNIST/MNISTreduced.dat";
  
  // F) Reduced MNIST (0 and 1) (1000x400)
  //char inputFile[] = "../data_files/MNIST/MNISTlittle.dat";

  // G) Girl.png (512x768, RGB, already unrolled)
  //char inputFile[] = "girl.dat";

  // H) Pool.png (383x512, RGB, already unrolled)
  //char inputFile[] = "pool.dat";

  // I) Cat.png (733x490, RGB, unrolled)
  //char inputFile[] = "cat.dat";
  
  // J) Airplane.png (512x512, RGB, unrolled)
  //char inputFile[] = "airplane.dat";

  // K) Monarch.png (512x768, RGB, unrolled)
  //char inputFile[] = "monarch.dat";

  // L) tulips.png (512x768 ,RGB, unrolled)
  //char inputFile[] = "tulips.dat";
  
  // M) demo.dat (2d data)
  char inputFile[] = "demo.dat";

  // INITIALIZE PARAMETERS
  X.load(inputFile);
  const arma::uword N = X.n_rows;
  const arma::uword D = X.n_cols;

  arma::umat ids(N,1);	// needed to shuffle indices later
  arma::umat shuffled_ids(N,1);
  for (arma::uword i = 0; i < N; ++i) {
    ids(i,0) = i;
  }
  
  arma::arma_rng::set_seed_random(); // set arma rng
  // int seed = time(NULL);	// set RNG seed to current time
  // srand(seed);

  arma::uword initial_K = 32;   // initial number of clusters
  arma::uword K = initial_K;
  
  arma::umat clusters(N,1); // contains cluster assignments for each data point
  for (arma::uword i = 0; i < N; ++i) {
    clusters(i, 0) = i%K;		// initialize as [0,1,...,K-1,0,1,...,K-1,0,...]
  }

  arma::umat cluster_sizes(N,1,arma::fill::zeros); // contains num data points in cluster k
  for (arma::uword i = 0; i < N; ++i) {
    cluster_sizes(clusters(i,0), 0) += 1;
  }

  arma::mat mu(N, D, arma::fill::zeros); // contains cluster mean parameters
  arma::mat filler(D,D,arma::fill::eye);
  std::vector<arma::mat> sigma(N,filler); // contains cluster covariance parameters

  if (K < N) {			// set parameters not belonging to any cluster to -999
    mu.rows(K,N-1).fill(-999);
    for (arma::uword k = K; k < N; ++k) {
      sigma[k].fill(-999);
    }
  }

  arma::umat uword_dummy(1,1);	// dummy 1x1 matrix;
  // for (arma::uword i = 0; i <N; ++i) {
  //   std::cout << sigma[i] << std::endl;
  // }
  // std::cout << X << std::endl
  // 	    << N << std::endl
  // 	    << D << std::endl
  // 	    << K << std::endl
  // 	    << clusters << std::endl
  // 	    << cluster_sizes << std::endl
  //	    << ids << std::endl;

  // INITIALIZE HYPER PARAMETERS
  // Dirichlet Process concentration parameter is alpha:
  double alpha = 1;
  // Dirichlet Process base distribution (i.e. prior) is 
  // H(mu,sigma) = NIW(mu,Sigma|m_0,k_0,S_0,nu_0) = N(mu|m_0,Sigma/k_0)IW(Sigma|S_0,nu_0)
  arma::mat perturbation(D,D,arma::fill::eye);
  perturbation *= 0.000001;
  //const arma::mat S_0 = arma::cov(X,X,1) + perturbation; // S_xbar / N
  const arma::mat S_0(D,D,arma::fill::eye);
  const double nu_0 = D + 2;
  const arma::mat m_0 = mean(X).t();
  const double k_0 = 0.01;
  // std::cout << "S_0" << S_0 << std::endl;
  // std::cout << S_0 << std::endl
  // 	    << nu_0 << std::endl
  // 	    << m_0 << std::endl
  // 	    << k_0 << std::endl;


  // INITIALIZE SAMPLING PARAMETERS
  arma::uword NUM_SWEEPS = 250;	// number of Gibbs iterations
  bool SAVE_CHAIN = false;	// save output of each Gibbs iteration?
  // arma::uword BURN_IN = NUM_SWEEPS - 10;
  // arma::uword CHAINSIZE = NUM_SWEEPS - BURN_IN;
  // std::vector<arma::uword> chain_K(CHAINSIZE, K); // Initialize chain variable to initial parameters for convinience
  // std::vector<arma::umat> chain_clusters(CHAINSIZE, clusters);
  // std::vector<arma::umat> chain_clusterSizes(CHAINSIZE, cluster_sizes);
  // std::vector<arma::mat> chain_mu(CHAINSIZE, mu);
  // std::vector<std::vector<arma::mat> > chain_sigma(CHAINSIZE, sigma);
  
  // for (arma::uword sweep = 0; sweep < CHAINSIZE; ++sweep) {
  //   std::cout << sweep << " K\n" << chain_K[sweep] << std::endl
  // 		<< sweep << " clusters\n" << chain_clusters[sweep] << std::endl
  // 		<< sweep << " sizes\n" << chain_clusterSizes[sweep] << std::endl
  // 		<< sweep << " mu\n" << chain_mu[sweep] << std::endl;
  //   for (arma::uword i = 0; i < N; ++i) { 
  // 	std::cout << sweep << " " << i << " sigma\n" << chain_sigma[sweep][i] << std::endl;
  //   }
  // }
  

  // START CHAIN
  std::cout << "Starting Algorithm with K = " << K << std::endl;
  for (arma::uword sweep = 0; sweep < NUM_SWEEPS; ++sweep) { 
    // shuffle indices
    shuffled_ids = shuffle(ids);
    // std::cout << shuffled_ids << std::endl;

    // SAMPLE CLUSTERS
    for (arma::uword j = 0; j < N; ++j){
      //      std::cout << "j = " << j << std::endl;
      arma::uword i = shuffled_ids(j);
      arma::mat x = X.row(i).t(); // current data point
      
      // Remove i's statistics and any empty clusters
      arma::uword c = clusters(i,0); // current cluster
      cluster_sizes(c,0) -= 1;
      //std::cout << "old c = " << c << std::endl;

      if (cluster_sizes(c,0) == 0) { // remove empty cluster
      	cluster_sizes(c,0) = cluster_sizes(K-1,0); // move entries for K onto position c
	mu.row(c) = mu.row(K-1);
      	sigma[c] = sigma[K-1];
      	uword_dummy(0,0) = c;
	arma::uvec idx = find(clusters == K - 1);
      	clusters.each_row(idx) = uword_dummy;
	cluster_sizes(K-1,0) = 0;
	mu.row(K-1).fill(-999);
	sigma[K-1].fill(-999);
	--K;
      }

      // quick test of logMvnPdf:
      // arma::mat m_(2,1);
      // arma::mat s_(2,2);
      // arma::mat t_(2,1);
      // m_ << 1 << arma::endr << 2;
      // s_ << 3 << -0.2 << arma::endr << -0.2 << 1;
      // t_ << -3 << arma::endr << -3;
      // double lpdf = logMvnPdf(t_, m_, s_);
      // std::cout << lpdf << std::endl; // śhould be -19.1034 (works)


      
      // Find categorical distribution over clusters (tested)
      arma::mat logP(K+1, 1, arma::fill::zeros);
      
      // quick test of logInvWishPdf
      // arma::mat si_(2,2), s_(2,2);
      // double nu_ = 4;
      // si_ << 1 << 0.5 << arma::endr << 0.5 << 4;
      // s_ << 3 << -0.2 << arma::endr << -0.2 << 1;
      // double lpdf = logInvWishPdf(si_, s_, nu_);
      // std::cout << lpdf << std::endl; // should be -7.4399 (it is)

      // quick test for logNormInvWishPdf
      // arma::mat si_(2,2), s_(2,2);
      // arma :: mat mu_(2,1), m_(2,1);
      // double k_ = 0.5, nu_ = 4;
      // si_ << 1 << 0.5 << arma::endr << 0.5 << 4;
      // s_ << 3 << -0.2 << arma::endr << -0.2 << 1;
      // mu_ << -3 << arma::endr << -3;
      // m_ << 1 << arma::endr << 2;
      // double lp = logNormInvWishPdf(mu_,si_,m_,k_,s_,nu_);
      // std::cout << lp << std::endl; // should equal -15.2318 (it is)
      
      // p(existing clusters) (tested)
      for (arma::uword k = 0; k < K; ++k) {
	arma::mat m_ = mu.row(k).t();
	arma::mat s_ = sigma[k];
	logP(k,0) = log(cluster_sizes(k,0)) - log(N-1+alpha) + logMvnPdf(x,m_,s_);
      }

      // p(new cluster): find partition function (tested)
      // arma::mat dummy_mu(D, 1, arma::fill::zeros);
      // arma::mat dummy_sigma(D, D, arma::fill::eye);
      // double logPrior, logLklihd, logPstr, logPartition; 
      // posterior hyperparameters (tested)
      arma::mat m_1(D,1), S_1(D,D);
      double k_1, nu_1;
      k_1 = k_0 + 1;
      nu_1 = nu_0 + 1;
      m_1 = (k_0*m_0 + x) / k_1;
      S_1 = S_0 + x * x.t() + k_0 * (m_0 * m_0.t()) - k_1 * (m_1 * m_1.t());
      // std::cout << k_1 << std::endl
      // 		<< nu_1 << std::endl
      // 		<< m_1 << std::endl
      // 		<< S_1 << std::endl;
      
      // // partition = likelihood*prior/posterior (tested)
      // // (perhaps a direct computation of the partition function would be better)
      // logPrior = logNormInvWishPdf(dummy_mu, dummy_sigma, m_0, k_0, S_0, nu_0);
      // logLklihd = logMvnPdf(x, dummy_mu, dummy_sigma);
      // logPstr = logNormInvWishPdf(dummy_mu, dummy_sigma, m_1, k_1, S_1, nu_1);
      // logPartition = logPrior + logLklihd - logPstr;      
      // std::cout << "log  Prior = " << logPrior << std::endl
      // 		<< "log Likelihood = " << logLklihd << std::endl
      // 		<< "log Posterior = " << logPstr << std::endl
      // 		<< "log Partition = " << logPartition << std::endl;
      
      // Computing partition directly
      double logS0,signS0,logS1,signS1;
      arma::log_det(logS0,signS0,S_0);
      arma::log_det(logS1,signS1,S_1);
      /*std::cout << "log(det(S_0)) = " << logS0 << std::endl
	<< "log(det(S_1)) = " << logS1 << std::endl;*/
      double term1 = 0.5*D*(log(k_0)-log(k_1));
      double term2 = -0.5*D*log(arma::datum::pi);
      double term3 = 0.5*(nu_0*logS0 - nu_1*logS1);
      double term4 = lgamma(0.5*nu_1);
      double term5 = -lgamma(0.5*(nu_1-D));
      double logPartition = term1+term2+term3+term4+term5;
      /*double logPartition = 0.5*D*(log(k_0)-log(k_1)-log(arma::datum::pi)) \
	/+0.5*(nu_0*logS0 - nu_1*logS1) + lgamma(0.5*nu_1) - lgamma(0.5*(nu_1-D));*/
      
      /*      std::cout << "term1 = " << term1 << std::endl
		<< "term2 = " << term2 << std::endl
		<< "term3 = " << term3 << std::endl
		<< "term4 = " << term4 << std::endl
		<< "term5 = " << term5 << std::endl;*/
      
      //std::cout << "logP = " << logPartition << std::endl;

      // p(new cluster): (tested)
      logP(K,0) = log(alpha) - log(N - 1 + alpha) + logPartition;
      //      std::cout << "logP(new cluster) = " << logP(K,0) << std::endl;
      //if(i == 49)
      //assert(false);
      // sample cluster for i
      
      

      arma::uword c_ = logCatRnd(logP);
      clusters(i,0) = c_;
      
      //if (j % 10 == 0){
      //std::cout << "New c = " << c_ << std::endl;
      //std::cout << "logP = \n" << logP << std::endl;
      //}
      // quick test for mvnRnd
      // arma::mat mu, si;
      // mu << 1 << arma::endr << 2;
      // si << 1 << 0.9 << arma::endr << 0.9 << 1;
      // arma::mat m = mvnRnd(mu, si);
      // std::cout << m << std::endl;

      // quick test for invWishRnd
      // double df = 4;
      // arma::mat si(2,2);
      // si << 1 << 1 << arma::endr << 1 << 1;
      // arma::mat S = invWishRnd(si,df);
      // std::cout << S << std::endl;

      if (c_ == K) {	// Sample parameters for any new-born clusters from posterior
	cluster_sizes(K, 0) = 1;
	arma::mat si_ = invWishRnd(S_1, nu_1);
	//arma::mat si_ = S_1;
	arma::mat mu_ = mvnRnd(m_1, si_/k_1);
	//arma::mat mu_ = m_1;
	mu.row(K) = mu_.t();
	sigma[K] = si_;
	K += 1;
      } else {
	cluster_sizes(c_,0) += 1;
      }
      // if (sweep == 0)
      // 	std::cout << " K = " << K << std::endl;
      // // if (j == N-1) {
      // // 	std::cout << logP << std::endl;
      // // 	std::cout << K << std::endl;
      // // 	assert(false);
      // // }
      // std::cout << "K = " << K << "\n" << std::endl;
    }
    
    // sample CLUSTER PARAMETERS FROM POSTERIOR
    for (arma::uword k = 0; k < K; ++k) {
      // std::cout << "k = " << k << std::endl;
      // cluster data
      arma::mat Xk = X.rows(find(clusters == k));
      arma::uword Nk = cluster_sizes(k,0);

      // posterior hyperparameters
      arma::mat m_Nk(D,1), S_Nk(D,D);
      double k_Nk, nu_Nk;
      
      arma::mat sum_k = sum(Xk,0).t();
      arma::mat cov_k(D, D, arma::fill::zeros);
      for (arma::uword l = 0; l < Nk; ++l) { 
	cov_k += Xk.row(l).t() * Xk.row(l);
      }
      
      k_Nk = k_0 + Nk;
      nu_Nk = nu_0 + Nk;
      m_Nk = (k_0 * m_0 + sum_k) / k_Nk;
      S_Nk = S_0 + cov_k + k_0 * (m_0 * m_0.t()) - k_Nk * (m_Nk * m_Nk.t());
      
      // sample fresh parameters
      arma::mat si_ = invWishRnd(S_Nk, nu_Nk);
      //arma::mat si_ = S_Nk;
      arma::mat mu_ = mvnRnd(m_Nk, si_/k_Nk);
      //arma::mat mu_ = m_Nk;
      mu.row(k) = mu_.t();
      sigma[k] = si_;
    }
    std::cout << "Iteration " << sweep + 1 << "/" << NUM_SWEEPS<< " done. K = " << K << std::endl;
        
    // // STORE CHAIN
    // if (SAVE_CHAIN) {
    //   if (sweep >= BURN_IN) { 
    // 	chain_K[sweep - BURN_IN] = K;
    // 	chain_clusters[sweep - BURN_IN] = clusters;
    // 	chain_clusterSizes[sweep - BURN_IN] = cluster_sizes;
    // 	chain_mu[sweep - BURN_IN] = mu;
    // 	chain_sigma[sweep - BURN_IN] = sigma;
    //   }
    // }
	
  }
  std::cout << "Final cluster sizes: " << std::endl
	    << cluster_sizes.rows(0, K-1) << std::endl;






  
  // WRITE OUPUT DATA TO FILE
  arma::mat MU = mu.rows(0,K-1);
  arma::mat SIGMA(D*K,D);
  for (arma::uword k = 0; k < K; ++k) { 
    SIGMA.rows(k*D,(k+1)*D-1) = sigma[k];
  }
  arma::umat IDX = clusters;

  // A) toycluster data
  // char MuFile[] = "../data_files/toyclusters/dpmMU.out";
  // char SigmaFile[] = "../data_files/toyclusters/dpmSIGMA.out";
  // char IdxFile[] = "../data_files/toyclusters/dpmIDX.out";

  // B) X4.dat
  char MuFile[] = "dpmMU.out";
  char SigmaFile[] = "dpmSIGMA.out";
  char IdxFile[] = "dpmIDX.out";
  std::ofstream KFile("K.out");
  
  MU.save(MuFile, arma::raw_ascii);
  SIGMA.save(SigmaFile, arma::raw_ascii);
  IDX.save(IdxFile, arma::raw_ascii);
  KFile << "K = " << K << std::endl;
  
  if (SAVE_CHAIN) {}
  //   std::ofstream chainKFile("chainK.out");
  //   std::ofstream chainClustersFile("chainClusters.out");
  //   std::ofstream chainClusterSizesFile("chainClusterSizes.out");
  //   std::ofstream chainMuFile("chainMu.out");
  //   std::ofstream chainSigmaFile("chainSigma.out");
    
  //   chainKFile << "Dirichlet Process Mixture Model.\nInput: " << inputFile << std::endl
  // 	       << "Number of iterations of Gibbs Sampler: " << NUM_SWEEPS << std::endl
  // 	       << "Burn-In: " << BURN_IN << std::endl
  // 	       << "Initial number of clusters: " << initial_K << std::endl
  // 	       << "Output: Number of cluster (K)\n" << std::endl; 
  //   chainClustersFile << "Dirichlet Process Mixture Model.\nInput: " << inputFile << std::endl
  // 		      << "Number of iterations of Gibbs Sampler: " << NUM_SWEEPS << std::endl
  // 		      << "Burn-In: " << BURN_IN << std::endl
  // 		      << "Initial number of clusters: " << initial_K << std::endl
  // 		      << "Output: Cluster identities (clusters)\n" << std::endl; 
  //   chainClusterSizesFile << "Dirichlet Process Mixture Model.\nInput: " << inputFile << std::endl
  // 			  << "Number of iterations of Gibbs Sampler: " << NUM_SWEEPS << std::endl
  // 			  << "Burn-In: " << BURN_IN << std::endl
  // 			  << "Initial number of clusters: " << initial_K << std::endl
  // 			  << "Output: Size of clusters (cluster_sizes)\n" << std::endl; 
  //   chainMuFile << "Dirichlet Process Mixture Model.\nInput: " << inputFile << std::endl
  // 		<< "Number of iterations of Gibbs Sampler: " << NUM_SWEEPS << std::endl
  // 		<< "Burn-In: " << BURN_IN << std::endl
  // 		<< "Initial number of clusters: " << initial_K << std::endl
  // 		<< "Output: Samples for cluster mean parameters (mu. Note: means stored in rows)\n" << std::endl; 
  //   chainSigmaFile << "Dirichlet Process Mixture Model.\nInput: " << inputFile << std::endl
  // 		   << "Number of iterations of Gibbs Sampler: " << NUM_SWEEPS << std::endl
  // 		   << "Burn-In " << BURN_IN << std::endl
  // 		   << "Initial number of clusters: " << initial_K << std::endl
  // 		   << "Output: Samples for cluster covariances (sigma)\n" << std::endl; 


  //   for (arma::uword sweep = 0; sweep < CHAINSIZE; ++sweep) {
  //     arma::uword K = chain_K[sweep];
  //     chainKFile << "Sweep #" << BURN_IN + sweep + 1 << "\n" << chain_K[sweep] << std::endl;
  //     chainClustersFile << "Sweep #" << BURN_IN + sweep + 1 << "\n" << chain_clusters[sweep]  << std::endl;
  //     chainClusterSizesFile << "Sweep #" << BURN_IN + sweep + 1 << "\n" << chain_clusterSizes[sweep].rows(0, K - 1) << std::endl;
  //     chainMuFile << "Sweep #" << BURN_IN + sweep + 1 << "\n" << chain_mu[sweep].rows(0, K - 1) << std::endl;
  //     chainSigmaFile << "Sweep #" << BURN_IN + sweep + 1<< "\n";
  //     for (arma::uword i = 0; i < K; ++i) { 
  // 	chainSigmaFile << chain_sigma[sweep][i] << std::endl;
  //     }
  //     chainSigmaFile << std::endl;
  //   }
  // }

  return 0;
}
void RunKMedoids(const leveldb::Slice& begin,
                 const leveldb::Slice& end,
                 int K, leveldb::DB* db,
                 leveldb::DB* work_db, int concurrency,
                 std::ostream& ivar_out, std::ostream& cent_out) {
  auto very_start = std::chrono::system_clock::now();
  auto key_centroids = uniform_init(begin, end, K);
  std::vector<GDELTMini> val_centroids(K);
  {
    auto it = iter(db);
    for (int i = 0; i < K; ++i) {
      it->Seek(key_centroids[i]);
      CHECK(it->Valid());
      read(it->value(), val_centroids[i]);
    }
  }

  std::cout << "Divying up range among threads... " << std::flush;
  auto parvec = get_par_ranges(uniform_init(begin, end, concurrency), db, end);
  std::cout << "DONE" << std::endl;

  int i = 0;
  bool centers_changed = true;
  vuad totals(K);
  vuai cluster_sizes(K);
  for (int i = 0; i < K; ++i) {
    cluster_sizes[i].reset(new std::atomic<int>);
    totals[i].reset(new std::atomic<double>);
  }

  while (centers_changed) {
    auto start = std::chrono::system_clock::now();
    assign_closest(parvec, totals, K, work_db, val_centroids, concurrency,
                   cluster_sizes);
    auto tot = std::accumulate(totals.begin(), totals.end(), 0.0,
                               [](double sum, typename vuad::value_type& d) {
                                 return sum + d->load();
                               });
    auto end = std::chrono::system_clock::now();
    std::cout << "Iteration " << ++i << " total intravariance " << tot;
    std::cout << "\n    Assigning medoids took " << secs(start, end)
              << "s" << std::endl;

    start = std::chrono::system_clock::now();
    for (auto& d : totals) {
      ivar_out << d->load() << " ";
    }
    ivar_out << std::endl;
    cent_out << key_centroids << std::endl;
    end = std::chrono::system_clock::now();
    std::cout << "    Saving medoids took " << secs(start, end)
              << " s" << std::endl;

    start = std::chrono::system_clock::now();
    centers_changed = update_medoids(concurrency, K, db, work_db,
                                     val_centroids, key_centroids,
                                     totals, cluster_sizes);
    end = std::chrono::system_clock::now();
    std::cout << "    Medoid update took " << secs(start, end)
              << " s" << std::endl;
    start = end;
  }
  auto very_end = std::chrono::system_clock::now();
  std::cout << "K-medoid clustering COMPLETE in "
            << secs(very_start, very_end) << " s" << std::endl;
}