/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as: CHk=Bk/(k−1)/Wk/(n−k) where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/ double CommunityTypeFinder::calcCHIndex(vector< vector< double> > dists){ try { double CH = 0.0; if (numPartitions < 2) { return CH; } map<int, int> clusterMap; //map sample to partition for (int j = 0; j < numSamples; j++) { double maxValue = -1e6; for (int i = 0; i < numPartitions; i++) { if (m->control_pressed) { return 0.0; } if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be clusterMap[j] = i; maxValue = zMatrix[i][j]; } } } //make countMatrix a relabund vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs] //get relative abundance for(int i=0;i<numSamples;i++){ if (m->control_pressed) { return 0; } int groupTotal = 0; relativeAbundance[i].assign(numOTUs, 0.0); for(int j=0;j<numOTUs;j++){ groupTotal += countMatrix[i][j]; } for(int j=0;j<numOTUs;j++){ relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal; } } //find centers vector<vector<double> > centers = calcCenters(dists, clusterMap, relativeAbundance); if (m->control_pressed) { return 0.0; } double allMeanDist = rMedoid(relativeAbundance, dists); if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); } for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus if (m->control_pressed) { return 0; } //x <- (x - centers[cl, ])^2 relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j])); } } double wgss = 0.0; for (int j = 0; j < numOTUs; j++) { for(int i=0;i<numSamples;i++){ if (m->control_pressed) { return 0.0; } wgss += relativeAbundance[i][j]; } } double bgss = allMeanDist - wgss; CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions)); return CH; } catch(exception& e){ m->errorOut(e, "CommunityTypeFinder", "calcCHIndex"); exit(1); } }
void qFinderDMM::kMeans(){ try { vector<vector<double> > relativeAbundance(numSamples); vector<vector<double> > alphaMatrix; alphaMatrix.resize(numPartitions); lambdaMatrix.resize(numPartitions); for(int i=0;i<numPartitions;i++){ alphaMatrix[i].assign(numOTUs, 0); lambdaMatrix[i].assign(numOTUs, 0); } //get relative abundance for(int i=0;i<numSamples;i++){ if (m->control_pressed) { return; } int groupTotal = 0; relativeAbundance[i].assign(numOTUs, 0.0); for(int j=0;j<numOTUs;j++){ groupTotal += countMatrix[i][j]; } for(int j=0;j<numOTUs;j++){ relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal; } } //randomly assign samples into partitions zMatrix.resize(numPartitions); for(int i=0;i<numPartitions;i++){ zMatrix[i].assign(numSamples, 0); } for(int i=0;i<numSamples;i++){ zMatrix[rand()%numPartitions][i] = 1; } double maxChange = 1; int maxIters = 1000; int iteration = 0; weights.assign(numPartitions, 0); while(maxChange > 1e-6 && iteration < maxIters){ if (m->control_pressed) { return; } //calcualte average relative abundance maxChange = 0.0000; for(int i=0;i<numPartitions;i++){ double normChange = 0.0; weights[i] = 0; for(int j=0;j<numSamples;j++){ weights[i] += (double)zMatrix[i][j]; } vector<double> averageRelativeAbundance(numOTUs, 0); for(int j=0;j<numOTUs;j++){ for(int k=0;k<numSamples;k++){ averageRelativeAbundance[j] += zMatrix[i][k] * relativeAbundance[k][j]; } } for(int j=0;j<numOTUs;j++){ averageRelativeAbundance[j] /= weights[i]; double difference = averageRelativeAbundance[j] - alphaMatrix[i][j]; normChange += difference * difference; alphaMatrix[i][j] = averageRelativeAbundance[j]; } normChange = sqrt(normChange); if(normChange > maxChange){ maxChange = normChange; } } //calcualte distance between each sample in partition adn the average relative abundance for(int i=0;i<numSamples;i++){ if (m->control_pressed) { return; } double normalizationFactor = 0; vector<double> totalDistToPartition(numPartitions, 0); for(int j=0;j<numPartitions;j++){ for(int k=0;k<numOTUs;k++){ double difference = alphaMatrix[j][k] - relativeAbundance[i][k]; totalDistToPartition[j] += difference * difference; } totalDistToPartition[j] = sqrt(totalDistToPartition[j]); normalizationFactor += exp(-50.0 * totalDistToPartition[j]); } for(int j=0;j<numPartitions;j++){ zMatrix[j][i] = exp(-50.0 * totalDistToPartition[j]) / normalizationFactor; } } iteration++; // cout << "K means: " << iteration << '\t' << maxChange << endl; } // cout << "Iter:-1"; for(int i=0;i<numPartitions;i++){ weights[i] = 0.0000; for(int j=0;j<numSamples;j++){ weights[i] += zMatrix[i][j]; } // printf("\tw_%d=%.3f", i, weights[i]); } // cout << endl; for(int i=0;i<numOTUs;i++){ if (m->control_pressed) { return; } for(int j=0;j<numPartitions;j++){ if(alphaMatrix[j][i] > 0){ lambdaMatrix[j][i] = log(alphaMatrix[j][i]); } else{ lambdaMatrix[j][i] = -10.0; } } } } catch(exception& e){ m->errorOut(e, "qFinderDMM", "kMeans"); exit(1); } }
int CommunityTypeFinder::findkMeans(){ try { error.resize(numPartitions); for (int i = 0; i < numPartitions; i++) { error[i].resize(numOTUs, 0.0); } vector<vector<double> > relativeAbundance(numSamples); vector<vector<double> > alphaMatrix; alphaMatrix.resize(numPartitions); lambdaMatrix.resize(numPartitions); for(int i=0;i<numPartitions;i++){ alphaMatrix[i].assign(numOTUs, 0); lambdaMatrix[i].assign(numOTUs, 0); } //get relative abundance for(int i=0;i<numSamples;i++){ if (m->control_pressed) { return 0; } int groupTotal = 0; relativeAbundance[i].assign(numOTUs, 0.0); for(int j=0;j<numOTUs;j++){ groupTotal += countMatrix[i][j]; } for(int j=0;j<numOTUs;j++){ relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal; } } //randomly assign samples into partitions zMatrix.resize(numPartitions); for(int i=0;i<numPartitions;i++){ zMatrix[i].assign(numSamples, 0); } //randomize samples vector<int> temp; for (int i = 0; i < numSamples; i++) { temp.push_back(i); } random_shuffle(temp.begin(), temp.end()); //assign each partition at least one random sample int numAssignedSamples = 0; for (int i = 0; i < numPartitions; i++) { zMatrix[i][temp[numAssignedSamples]] = 1; numAssignedSamples++; } //assign rest of samples to partitions int count = 0; for(int i=numAssignedSamples;i<numSamples;i++){ zMatrix[count%numPartitions][temp[i]] = 1; count++; } double maxChange = 1; int maxIters = 1000; int iteration = 0; weights.assign(numPartitions, 0); while(maxChange > 1e-6 && iteration < maxIters){ if (m->control_pressed) { return 0; } //calcualte average relative abundance maxChange = 0.0000; for(int i=0;i<numPartitions;i++){ double normChange = 0.0; weights[i] = 0; for(int j=0;j<numSamples;j++){ weights[i] += (double)zMatrix[i][j]; } vector<double> averageRelativeAbundance(numOTUs, 0); for(int j=0;j<numOTUs;j++){ for(int k=0;k<numSamples;k++){ averageRelativeAbundance[j] += zMatrix[i][k] * relativeAbundance[k][j]; } } for(int j=0;j<numOTUs;j++){ averageRelativeAbundance[j] /= weights[i]; double difference = averageRelativeAbundance[j] - alphaMatrix[i][j]; normChange += difference * difference; alphaMatrix[i][j] = averageRelativeAbundance[j]; } normChange = sqrt(normChange); if(normChange > maxChange){ maxChange = normChange; } } //calcualte distance between each sample in partition and the average relative abundance for(int i=0;i<numSamples;i++){ if (m->control_pressed) { return 0; } double normalizationFactor = 0; vector<double> totalDistToPartition(numPartitions, 0); for(int j=0;j<numPartitions;j++){ for(int k=0;k<numOTUs;k++){ double difference = alphaMatrix[j][k] - relativeAbundance[i][k]; totalDistToPartition[j] += difference * difference; } totalDistToPartition[j] = sqrt(totalDistToPartition[j]); normalizationFactor += exp(-50.0 * totalDistToPartition[j]); } for(int j=0;j<numPartitions;j++){ zMatrix[j][i] = exp(-50.0 * totalDistToPartition[j]) / normalizationFactor; } } iteration++; // cout << "K means: " << iteration << '\t' << maxChange << endl; } // cout << "Iter:-1"; for(int i=0;i<numPartitions;i++){ weights[i] = 0.0000; for(int j=0;j<numSamples;j++){ weights[i] += zMatrix[i][j]; } // printf("\tw_%d=%.3f", i, weights[i]); } // cout << endl; for(int i=0;i<numOTUs;i++){ if (m->control_pressed) { return 0; } for(int j=0;j<numPartitions;j++){ if(alphaMatrix[j][i] > 0){ lambdaMatrix[j][i] = log(alphaMatrix[j][i]); } else{ lambdaMatrix[j][i] = -10.0; } } } return 0; } catch(exception& e){ m->errorOut(e, "CommunityTypeFinder", "kMeans"); exit(1); } }