/**
 * Runs a fixed number of EM (expectation-maximisation) passes starting from
 * the supplied estimates.
 *
 * Each pass reports progress (when a listener is given), clears the per-class
 * sums, calls computeProbabilities() and then refreshes every class estimate
 * through updateStdDevs(), updateMeans() and updateWeights(), in that order.
 *
 * @param initial        Starting estimates, one entry per class; left untouched.
 * @param X              Data points handed to every helper.
 * @param maxIterations  Number of passes to run; with 0 a copy of @p initial
 *                       is returned unchanged.
 * @param progress       Optional progress listener; may be NULL.
 * @return The estimates as they stand after the last pass.
 */
estimates_t EM(const estimates_t& initial, const dataPoints_t& X, unsigned int maxIterations, Progress *progress)
{
    const unsigned int numClasses = initial.size();

    // Per-class probability sums; zeroed again at the start of every pass.
    std::vector<double> classSums(numClasses, 0.0);

    // Probability table: numClasses + 1 rows, each with one slot per data point.
    // NOTE(review): the extra row is presumably consumed by
    // computeProbabilities() - confirm against that helper.
    prob_matrix_t probTable;
    const unsigned int rowCount = numClasses + 1;
    for (unsigned int row = 0; row < rowCount; ++row) {
        probTable.push_back(std::vector<double>());
        probTable.back().resize(X.size());
    }

    // Working copy of the estimates, refined in place on every pass.
    estimates_t current = initial;

    unsigned int pass = 1;
    while (pass <= maxIterations) {
        if (progress) {
            progress->updateProgress("EM running on difference image", 100*pass/maxIterations, NORMAL);
        }

        // E-step: fresh sums, then the per-class / per-point probabilities.
        std::fill(classSums.begin(), classSums.end(), 0.0);
        computeProbabilities(probTable, classSums, X, current);

        // M-step: walk estimates, sums and probability rows in lock-step.
        // `previous` iterates the same container as `target`, so
        // updateStdDevs() is handed the entry before updateMeans() and
        // updateWeights() have touched it in this pass.
        estimates_t::iterator target = current.begin();
        estimates_t::const_iterator previous = current.begin();
        std::vector<double>::const_iterator classSum = classSums.begin();
        prob_matrix_t::const_iterator classProbs = probTable.begin();
        while (target != current.end()) {
            updateStdDevs(*target, *classSum, *classProbs, X, *previous);
            updateMeans(*target, *classSum, *classProbs, X);
            updateWeights(*target, *classSum, X.size());

            ++target;
            ++classSum;
            ++classProbs;
            ++previous;
        }

        ++pass;
    }

    return current;
}
/**
 * Splits the haplotypes currently labelled `cluster` into two child clusters
 * with a two-centroid Lloyd's k-means.
 *
 * Steps:
 *  1. Flag every haplotype whose label equals `cluster` in cluster_mask and
 *     count them (N).
 *  2. Seed two centroids: one random member, plus the member farthest from it
 *     (Hamming distance over [cstart, cstop]) among 101 random draws.
 *  3. Run at most `niteration` Lloyd iterations, relabelling members to
 *     `cluster` or `cluster + 2^depth`, and stop early once no label changes.
 *  4. If the smaller child has fewer than `ncond` members (and K == 2),
 *     relabel the entries returned by closestN(N / 2, ...) into that child.
 *
 * Only centroids 0 and 1 are ever read or seeded, so K must be at least 2.
 * NOTE(review): N == 0 is passed straight to putils::getRandom(); confirm
 * callers never split an empty cluster.
 *
 * @param cluster  Label of the cluster to split; also the first child's label.
 * @param depth    The second child's label is `cluster + 2^depth`.
 * @return Sizes of the (first, second) child as held in cluster_size, which is
 *         handed to updateMeans() (presumably filled there - confirm) and then
 *         adjusted by the regrouping step.
 */
pair<int, int> haplotypeCluster::kmeans(unsigned int cluster, unsigned int depth) {
  if (_DEBUG > 1)
    cout << "Clustering (K-means) K = " << K << ". NSNP = " << nsnp << endl;

  // Integer shift rather than floating-point pow(): the label is an integer,
  // so there is no need to round-trip through double.
  pair<unsigned int, unsigned int> cluster_id(cluster, cluster + (1u << depth));

  // Flag and count the members of the cluster being split.
  int N = 0;
  for (int i = 0; i < nhap; i++) {
    const bool member = (clustering[i] == cluster);
    cluster_mask[i] = member;
    if (member) N++;
  }

  mu.assign(K, vector<float>(nsnp, -1));
  dlook.resize(K);
  pair<int, int> cluster_size;
  vector<float> d(K);

  // Seeding: first centroid is a random member; the second is the farthest
  // (by Hamming distance) of the random members tried below.
  int idx1 = getIdx(putils::getRandom(N));
  int idx2 = getIdx(putils::getRandom(N));
  float maxdist = g->vecH[idx1].hamming(g->vecH[idx2], cstart, cstop);
  for (int i = 0; i < 100; i++) {
    int candidate = getIdx(putils::getRandom(N));
    float dist = g->vecH[idx1].hamming(g->vecH[candidate], cstart, cstop);
    if (dist > maxdist) {
      idx2 = candidate;
      maxdist = dist;
    }
  }

  for (int j = 0; j < nsnp; j++) {
    mu[0][j] = static_cast<float>(g->vecH[idx1][cstart + j]);
    mu[1][j] = static_cast<float>(g->vecH[idx2][cstart + j]);
  }

  if (_DEBUG > 1) {
    printmu(mu);
    cout << endl;
  }

  // LLOYDS ALGORITHM - standard K-means routine
  for (int iteration = 0; iteration < niteration; iteration++) {
    int nchanged = 0;

    // Reset the per-centroid lookup tables handed to euc() (nsnp / 8 tables
    // of 256 entries each, all -1) because the centroids moved.
    for (int i = 0; i < K; i++)
      dlook[i].assign(nsnp / 8, vector<float>(256, -1.0));

    float ss = 0.0;
    for (int i = 0; i < nhap; i++) {
      if (!cluster_mask[i]) continue;

      for (int j = 0; j < K; j++) d[j] = euc(i, j, mu, dlook);

      // Ties go to the second child, as d[0] must be strictly smaller.
      int closest_cluster;
      if (d[0] < d[1]) {
        ss += d[0];
        closest_cluster = cluster_id.first;
      } else {
        ss += d[1];
        closest_cluster = cluster_id.second;
      }

      if (clustering[i] != closest_cluster) nchanged++;
      clustering[i] = closest_cluster;
    }

    updateMeans(cluster_id, cluster_size);

    if (_DEBUG > 1)
      cout << "LLOYDS K-MEANS ITERATION " << iteration
           << " Mean SS = " << ss / static_cast<float>(N) << "\t" << nchanged
           << " haps changed clusters." << endl;

    if (nchanged == 0) break;  // converged
  }

  // Index (0 = first child, 1 = second child) and size of the smaller child.
  int minind = cluster_size.first > cluster_size.second;
  int minsize = min(cluster_size.first, cluster_size.second);

  if (minsize < ncond && K == 2) {
    // Not partitioning well: move the entries chosen by closestN() into the
    // smaller child.
    vector<pair<int, float> > closest;
    if (_DEBUG > 0)
      cout << "WARNING: " << N << " did not partition well. "
           << cluster_size.first << " " << cluster_size.second
           << ". Regrouping." << endl;

    closestN(N / 2, minind, closest);

    // NOTE(review): sizes are adjusted for every returned entry without
    // checking whether it already carried the target label - confirm that
    // closestN() only returns entries from the other child.
    for (size_t j = 0; j < closest.size(); j++) {
      if (minind == 0) {
        clustering[closest[j].first] = cluster_id.first;
        cluster_size.first++;
        cluster_size.second--;
      } else {
        clustering[closest[j].first] = cluster_id.second;
        cluster_size.first--;
        cluster_size.second++;
      }
    }
  }

  return (cluster_size);
}
void initKMeans(gsl_rng *ptGSLRNG, t_Cluster *ptCluster, t_Data *ptData) { /*very simple initialisation assign each data point to random cluster*/ int i = 0, k = 0, nN = ptData->nN, nK = ptCluster->nK, nD = ptData->nD; double **aadMu = ptCluster->aadMu, **aadX = ptData->aadX; int *anMaxZ = ptCluster->anMaxZ, *anW = ptCluster->anW, nChange = nN; int nIter = 0, nMaxIter = ptCluster->nMaxIter; for(i = 0; i < nN; i++){ int nIK = gsl_rng_uniform_int (ptGSLRNG, nK); ptCluster->anMaxZ[i] = nIK; anW[nIK]++; } updateMeans(ptCluster, ptData); while(nChange > 0 && nIter < nMaxIter){ nChange = 0; /*reassign vectors*/ for(i = 0; i < nN; i++){ double dMinDist = DBL_MAX; int nMinK = NOT_SET; for(k = 0; k < nK; k++){ double dDist = calcDist(aadX[i],aadMu[k],nD); if(dDist < dMinDist){ nMinK = k; dMinDist = dDist; } } if(nMinK != anMaxZ[i]){ int nCurr = anMaxZ[i]; nChange++; anW[nCurr]--; anW[nMinK]++; anMaxZ[i] = nMinK; /*check for empty clusters*/ if(anW[nCurr] == 0){ int nRandI = gsl_rng_uniform_int (ptGSLRNG, nN); int nKI = 0; /*select at random from non empty clusters*/ while(anW[anMaxZ[nRandI]] == 1){ nRandI = gsl_rng_uniform_int (ptGSLRNG, nN); } nKI = anMaxZ[nRandI]; anW[nKI]--; anW[nCurr] = 1; anMaxZ[nRandI] = nCurr; } } } //printf("%d %d\n",nIter,nChange); nIter++; updateMeans(ptCluster, ptData); } for(i = 0; i < nN; i++){ for(k = 0; k < nK; k++){ ptCluster->aadZ[i][k] = 0.0; } ptCluster->aadZ[i][anMaxZ[i]] = 1.0; } performMStep(ptCluster, ptData); return; }