/**
 * Runs up to maxIterations rounds of expectation-maximisation, starting from
 * `initial`, over the data points in X.
 *
 * Every round calls computeProbabilities() to fill the probability matrix and
 * the per-class sums, then updates standard deviations, means and weights of
 * each estimate in that order.
 *
 * @param initial        starting estimates; one entry per class
 * @param X              data points the estimates are fitted to
 * @param maxIterations  number of rounds to run (0 returns a copy of initial)
 * @param progress       optional progress sink; may be NULL
 * @return the estimates after the last round
 */
estimates_t EM(const estimates_t& initial, const dataPoints_t& X, unsigned int maxIterations, Progress *progress) 
{
    // Number of classes being fitted
    const unsigned int classes = initial.size();

    // Per-class probability sums, reset at the start of every round
    std::vector<double> sums(classes, 0.0);

    // Probability matrix: classes + 1 rows, each with one zero-initialised
    // slot per data point.
    prob_matrix_t prob;
    for (unsigned int row = 0; row < classes + 1; ++row)
    {
        prob.push_back(std::vector<double>(X.size()));
    }

    // Working copy of the estimates, refined in place round after round
    estimates_t current = initial;

    for (unsigned int iteration = 1; iteration <= maxIterations; ++iteration)
    {
        if (progress)
        {
            progress->updateProgress("EM running on difference image", 100*iteration/maxIterations, NORMAL);
        }

        // E-step
        std::fill(sums.begin(), sums.end(), 0.0);
        computeProbabilities(prob, sums, X, current);

        // M-step: walk estimates, sums and probability rows in lock-step.
        // The read-only "previous" iterator walks the same container as the
        // writable one, so both always refer to the same element.
        estimates_t::iterator target = current.begin();
        estimates_t::const_iterator previous = current.begin();
        std::vector<double>::const_iterator classSum = sums.begin();
        prob_matrix_t::const_iterator classProb = prob.begin();

        while (target != current.end())
        {
            updateStdDevs(*target, *classSum, *classProb, X, *previous);
            updateMeans(*target, *classSum, *classProb, X);
            updateWeights(*target, *classSum, X.size());

            ++target;
            ++classSum;
            ++classProb;
            ++previous;
        }
    }

    return current;
}
/**
 * Splits the haplotypes currently labelled `cluster` into two groups using
 * Lloyd's K-means, seeded from two randomly drawn members.
 *
 * Members assigned to the first centroid keep the label `cluster`; members
 * assigned to the second are relabelled `cluster + 2^depth`.  If the smaller
 * group ends up with fewer than `ncond` members (and K == 2), closestN() is
 * called with N/2 and every entry it returns is moved into the smaller group.
 *
 * NOTE(review): seeding and assignment index mu[0]/mu[1] and d[0]/d[1]
 * directly, so this routine presumably requires K >= 2 and only ever produces
 * two groups -- confirm against callers.
 * NOTE(review): when no haplotype carries `cluster`, N == 0 is passed straight
 * to putils::getRandom(); whether that is valid is not visible here.
 *
 * @param cluster label of the cluster to split
 * @param depth   exponent used to derive the second label (cluster + 2^depth)
 * @return the two group sizes as filled in by updateMeans() and adjusted by
 *         the regrouping step (first belongs to `cluster`, second to the new
 *         label)
 */
pair<int, int> haplotypeCluster::kmeans(unsigned int cluster,
                                        unsigned int depth) {
  int nchanged = 0;

  if (_DEBUG > 1)
    cout << "Clustering (K-means) K = " << K << ". NSNP = " << nsnp << endl;

  // Labels of the two children.  2^depth is an exact integer, so it is
  // computed with a shift instead of a floating-point pow() round trip.
  pair<unsigned int, unsigned int> cluster_id(cluster, cluster + (1u << depth));

  // Mask the haplotypes that belong to the cluster being split and count them.
  int N = 0;
  for (int i = 0; i < nhap; i++) {
    if (clustering[i] == cluster) {
      cluster_mask[i] = true;
      N++;
    } else
      cluster_mask[i] = false;
  }

  mu.assign(K, vector<float>(nsnp, -1));
  dlook.resize(K);
  pair<int, int> cluster_size;
  vector<float> d(K);
  float ss;

  // Seeding: draw one member at random, then keep the most distant (Hamming
  // distance over [cstart, cstop]) of 101 further random draws as the second
  // seed.
  //	kplusplus(K,tmp2,initial_centroids);
  int idx1 = getIdx(putils::getRandom(N));

  int idx2 = getIdx(putils::getRandom(N));
  float maxdist = g->vecH[idx1].hamming(g->vecH[idx2], cstart, cstop);

  for (int i = 0; i < 100; i++) {
    int tmp1 = getIdx(putils::getRandom(N));
    float tmp2 = g->vecH[idx1].hamming(g->vecH[tmp1], cstart, cstop);
    if (tmp2 > maxdist) {
      idx2 = tmp1;
      maxdist = tmp2;
    }
  }

  // The two seeds become the initial centroids.
  for (int j = 0; j < nsnp; j++) {
    mu[0][j] = (float)(g->vecH[idx1][cstart + j]);
    mu[1][j] = (float)(g->vecH[idx2][cstart + j]);
  }

  if (_DEBUG > 1) {
    printmu(mu);
    cout << endl;
  }
  int closest_cluster;

  // LLOYDS ALGORITHM - standard K-means routine
  for (int iteration = 0; iteration < niteration; iteration++) {
    nchanged = 0;
    // Reset the per-centroid lookup tables handed to euc(); -1.0 presumably
    // marks "not yet computed" -- TODO confirm in euc().
    for (int i = 0; i < K; i++)
      dlook[i].assign(nsnp / 8, vector<float>(256, -1.0));

    // Assignment step: each masked haplotype takes the label of the nearer of
    // the two centroids (ties go to the second).
    ss = 0.0;
    for (int i = 0; i < nhap; i++) {
      if (cluster_mask[i]) {
        for (int j = 0; j < K; j++)
          d[j] = euc(i, j, mu, dlook);
        if (d[0] < d[1]) {
          ss += d[0];
          closest_cluster = cluster_id.first;
        } else {
          ss += d[1];
          closest_cluster = cluster_id.second;
        }
        if (clustering[i] != closest_cluster)
          nchanged++;

        clustering[i] = closest_cluster;
      }
    }

    // Update step; also reports the current group sizes in cluster_size.
    updateMeans(cluster_id, cluster_size);
    if (_DEBUG > 1)
      cout << "LLOYDS K-MEANS ITERATION " << iteration
           << " Mean SS = " << ss / (float)N << "\t" << nchanged
           << " haps changed clusters." << endl;
    if (nchanged == 0)
      break;
  }

  //	if(_DEBUG>0) cout << "Lloyds Total SS = " << SS() << "\t"<<nchanged << "
  // haps changed clusters on last iteration" << endl;

  // minind is 0 when the first group is the smaller one (or both are equal),
  // 1 when the second group is smaller.
  int minind = cluster_size.first > cluster_size.second;
  int minsize = min(cluster_size.first, cluster_size.second);

  if (minsize < ncond && K == 2) { // not partitioning well. do a random split.
    vector<pair<int, float> > closest;
    if (_DEBUG > 0)
      cout << "WARNING: " << N << " did not partition well. "
           << cluster_size.first << " " << cluster_size.second
           << ". Regrouping." << endl;
    closestN(N / 2, minind, closest);

    // Move every haplotype returned by closestN() into the smaller group and
    // keep the size bookkeeping in step.
    const int nmoved = static_cast<int>(closest.size());
    for (int j = 0; j < nmoved; j++) {
      if (minind == 0) {
        clustering[closest[j].first] = cluster_id.first;
        cluster_size.first++;
        cluster_size.second--;
      } else {
        clustering[closest[j].first] = cluster_id.second;
        cluster_size.first--;
        cluster_size.second++;
      }
    }
  }

  return (cluster_size);
}
Example #3
0
/*
 * Initialises ptCluster with a hard-assignment k-means run over ptData.
 *
 * Steps:
 *   1. every data point is assigned to a uniformly random cluster and the
 *      cluster's counter in anW is incremented;
 *   2. until no point changes cluster or nMaxIter rounds have run, each point
 *      is moved to the cluster whose mean is nearest according to calcDist(),
 *      and updateMeans() is called after every round;
 *   3. the final assignment is written to aadZ as one 1.0 per row (all other
 *      entries 0.0) and performMStep() is called.
 *
 * When a move empties a cluster, one randomly chosen point from a cluster
 * with more than one member is moved into it.
 *
 * NOTE(review): anW is only incremented/decremented here, so it is presumably
 * zeroed by the caller and holds per-cluster member counts -- confirm.
 * NOTE(review): if no cluster yields a distance below DBL_MAX, nMinK stays
 * NOT_SET and is then used as an index into anW -- confirm this cannot occur.
 */
void initKMeans(gsl_rng *ptGSLRNG, t_Cluster *ptCluster, t_Data *ptData)
{
  int i = 0, k = 0, nN = ptData->nN, nK = ptCluster->nK, nD = ptData->nD;
  double **aadMu = ptCluster->aadMu, **aadX = ptData->aadX;
  int *anMaxZ = ptCluster->anMaxZ, *anW = ptCluster->anW, nChange = nN;
  int nIter = 0, nMaxIter = ptCluster->nMaxIter;

  /*very simple initialisation: assign each data point to a random cluster*/
  for(i = 0; i < nN; i++){
    int nIK = gsl_rng_uniform_int (ptGSLRNG, nK);

    anMaxZ[i] = nIK;
    anW[nIK]++;
  }

  updateMeans(ptCluster, ptData);

  while(nChange > 0 && nIter < nMaxIter){
    nChange = 0;
    /*reassign vectors*/
    for(i = 0; i < nN; i++){
      double dMinDist = DBL_MAX;
      int    nMinK = NOT_SET;

      /*find the nearest mean; the first of equally near means wins*/
      for(k = 0; k < nK; k++){
        double dDist = calcDist(aadX[i],aadMu[k],nD);

        if(dDist < dMinDist){
          nMinK = k;
          dMinDist = dDist;
        }
      }

      if(nMinK != anMaxZ[i]){
        int nCurr = anMaxZ[i];
        nChange++;
        anW[nCurr]--;
        anW[nMinK]++;
        anMaxZ[i] = nMinK;

        /*check for empty clusters*/
        if(anW[nCurr] == 0){
          int nDonorExists = 0, nScanK = 0;

          /* The sampling loop below only stops on a point whose cluster does
             not have exactly one member.  If every non-empty cluster is a
             singleton (possible when there are fewer points than clusters)
             it would never stop, so make sure a donor exists first. */
          for(nScanK = 0; nScanK < nK; nScanK++){
            if(anW[nScanK] > 1){
              nDonorExists = 1;
              break;
            }
          }

          if(nDonorExists){
            int nRandI = gsl_rng_uniform_int (ptGSLRNG, nN);
            int nKI = 0;

            /*select at random, skipping points that are alone in their cluster*/
            while(anW[anMaxZ[nRandI]] == 1){
              nRandI = gsl_rng_uniform_int (ptGSLRNG, nN);
            }

            /*move the chosen point into the emptied cluster*/
            nKI = anMaxZ[nRandI];
            anW[nKI]--;
            anW[nCurr] = 1;
            anMaxZ[nRandI] = nCurr;
          }
          /*otherwise no point can be spared and the cluster stays empty*/
        }
      }
    }
    //printf("%d %d\n",nIter,nChange);
    nIter++;
    updateMeans(ptCluster, ptData);
  }

  /*write the hard assignment into aadZ: 1.0 for the chosen cluster, else 0.0*/
  for(i = 0; i < nN; i++){
    for(k = 0; k < nK; k++){
      ptCluster->aadZ[i][k] = 0.0;
    }
    ptCluster->aadZ[i][anMaxZ[i]] = 1.0;
  }

  performMStep(ptCluster, ptData);
}