/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
 
 CH_k = [B_k / (k - 1)] / [W_k / (n - k)]
 
 where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
// Calinski-Harabasz index of the partitioning currently stored in zMatrix.
//
// dists: sample-by-sample distance matrix; this function only forwards it to
//        calcCenters() and rMedoid().
//
// Returns (bgss / (k - 1)) / (wgss / (n - k)) with k = numPartitions and n = numSamples, where
// wgss is the summed squared deviation of every sample's relative abundances from the center of
// its partition and bgss is the value returned by rMedoid() minus wgss.
// Returns 0.0 when there are fewer than two partitions, when numSamples <= numPartitions (the
// (n - k) term would be zero or negative, so the index is undefined), or when
// m->control_pressed is set.
double CommunityTypeFinder::calcCHIndex(vector< vector< double> > dists){
    try {
        double CH = 0.0;
        
        if (numPartitions < 2) { return CH; }
        
        //hard-assign each sample to the partition holding its largest zMatrix value
        map<int, int> clusterMap; //map sample to partition
        for (int sample = 0; sample < numSamples; sample++) {
            double maxValue = -1e6;
            for (int partition = 0; partition < numPartitions; partition++) {
                if (m->control_pressed) { return 0.0; }
                //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
                if (zMatrix[partition][sample] > maxValue) {
                    clusterMap[sample] = partition;
                    maxValue = zMatrix[partition][sample];
                }
            }
        }
        
        //make countMatrix a relabund
        vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs]
        for (int sample = 0; sample < numSamples; sample++) {
            if (m->control_pressed) {  return 0; }
            int groupTotal = 0;
            
            relativeAbundance[sample].assign(numOTUs, 0.0);
            
            for (int otu = 0; otu < numOTUs; otu++) {
                groupTotal += countMatrix[sample][otu];
            }
            
            //a sample without any counts keeps an all-zero profile; dividing by a zero total
            //would produce NaN values that poison the centers and the final index
            if (groupTotal == 0) { continue; }
            
            for (int otu = 0; otu < numOTUs; otu++) {
                relativeAbundance[sample][otu] = countMatrix[sample][otu] / (double)groupTotal;
            }
        }
        
        //find centers
        vector<vector<double> > centers = calcCenters(dists, clusterMap, relativeAbundance);
        
        if (m->control_pressed) { return 0.0; }
        
        //must be computed before relativeAbundance is overwritten with squared deviations below
        double allMeanDist = rMedoid(relativeAbundance, dists);
        
        if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); }
        
        //x <- (x - centers[cl, ])^2, done in place
        for (int sample = 0; sample < numSamples; sample++) {
            int partition = clusterMap[sample]; //one map lookup per sample instead of two per element
            for (int otu = 0; otu < numOTUs; otu++) {
                if (m->control_pressed) {  return 0; }
                double deviation = relativeAbundance[sample][otu] - centers[partition][otu];
                relativeAbundance[sample][otu] = deviation * deviation;
            }
        }
        
        //within-group sum of squares
        double wgss = 0.0;
        for (int otu = 0; otu < numOTUs; otu++) {
            for (int sample = 0; sample < numSamples; sample++) {
                if (m->control_pressed) { return 0.0; }
                wgss += relativeAbundance[sample][otu];
            }
        }
        
        double bgss = allMeanDist - wgss;
        
        //the index is undefined unless there are more samples than partitions
        if (numSamples <= numPartitions) { return CH; }
        
        CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions));
        
        return CH;
    }
    catch(exception& e){
        m->errorOut(e, "CommunityTypeFinder", "calcCHIndex");
        exit(1);
    }
}
// Example #2
// 0
void qFinderDMM::kMeans(){
    try {
        
        vector<vector<double> > relativeAbundance(numSamples);
        vector<vector<double> > alphaMatrix;
        
        alphaMatrix.resize(numPartitions);
        lambdaMatrix.resize(numPartitions);
        for(int i=0;i<numPartitions;i++){
            alphaMatrix[i].assign(numOTUs, 0);
            lambdaMatrix[i].assign(numOTUs, 0);
        }
        
        //get relative abundance
        for(int i=0;i<numSamples;i++){
            if (m->control_pressed) {  return; }
            int groupTotal = 0;
            
            relativeAbundance[i].assign(numOTUs, 0.0);
            
            for(int j=0;j<numOTUs;j++){
                groupTotal += countMatrix[i][j];
            }
            for(int j=0;j<numOTUs;j++){
                relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
            }
        }
        
        //randomly assign samples into partitions
        zMatrix.resize(numPartitions);
        for(int i=0;i<numPartitions;i++){
            zMatrix[i].assign(numSamples, 0);
        }
        
        for(int i=0;i<numSamples;i++){
            zMatrix[rand()%numPartitions][i] = 1;
        }
        
        double maxChange = 1;
        int maxIters = 1000;
        int iteration = 0;
        
        weights.assign(numPartitions, 0);
        
        while(maxChange > 1e-6 && iteration < maxIters){
            
            if (m->control_pressed) {  return; }
            //calcualte average relative abundance
            maxChange = 0.0000;
            for(int i=0;i<numPartitions;i++){
                
                double normChange = 0.0;
                
                weights[i] = 0;
                
                for(int j=0;j<numSamples;j++){
                    weights[i] += (double)zMatrix[i][j];
                }
                
                vector<double> averageRelativeAbundance(numOTUs, 0);
                for(int j=0;j<numOTUs;j++){
                    for(int k=0;k<numSamples;k++){
                        averageRelativeAbundance[j] += zMatrix[i][k] * relativeAbundance[k][j];
                    }
                }
                
                for(int j=0;j<numOTUs;j++){
                    averageRelativeAbundance[j] /= weights[i];
                    double difference = averageRelativeAbundance[j] - alphaMatrix[i][j];
                    normChange += difference * difference;
                    alphaMatrix[i][j] = averageRelativeAbundance[j];
                }
                
                normChange = sqrt(normChange);
                
                if(normChange > maxChange){ maxChange = normChange; }
            }
            
            
            //calcualte distance between each sample in partition adn the average relative abundance
            for(int i=0;i<numSamples;i++){
                if (m->control_pressed) {  return; }
                
                double normalizationFactor = 0;
                vector<double> totalDistToPartition(numPartitions, 0);
                
                for(int j=0;j<numPartitions;j++){
                    for(int k=0;k<numOTUs;k++){
                        double difference = alphaMatrix[j][k] - relativeAbundance[i][k];
                        totalDistToPartition[j] += difference * difference;
                    }
                    totalDistToPartition[j] = sqrt(totalDistToPartition[j]);
                    normalizationFactor += exp(-50.0 * totalDistToPartition[j]);
                }
                
                
                for(int j=0;j<numPartitions;j++){
                    zMatrix[j][i] = exp(-50.0 * totalDistToPartition[j]) / normalizationFactor;
                }
                
            }
            
            iteration++;
            //        cout << "K means: " << iteration << '\t' << maxChange << endl;
            
        }
        
        //    cout << "Iter:-1";
        for(int i=0;i<numPartitions;i++){
            weights[i] = 0.0000;
            
            for(int j=0;j<numSamples;j++){
                weights[i] += zMatrix[i][j];
            }
            //        printf("\tw_%d=%.3f", i, weights[i]);
        }
        //    cout << endl;
        
        
        for(int i=0;i<numOTUs;i++){
            if (m->control_pressed) {  return; }
            for(int j=0;j<numPartitions;j++){
                if(alphaMatrix[j][i] > 0){
                    lambdaMatrix[j][i] = log(alphaMatrix[j][i]);
                }
                else{
                    lambdaMatrix[j][i] = -10.0;
                }
            }
        }
    }
    catch(exception& e){
        m->errorOut(e, "qFinderDMM", "kMeans");
        exit(1);
    }
}
// Soft k-means over the per-sample relative abundances derived from countMatrix.
//
// On normal completion the following members have been written:
//   error[partition][otu]       - resized to numPartitions x numOTUs (new entries are 0.0)
//   zMatrix[partition][sample]  - membership of each sample in each partition
//   weights[partition]          - sum of the memberships of that partition
//   lambdaMatrix[partition][otu]- log of the partition centroid, or -10.0 where the centroid
//                                 value is not positive
// Iterates until the largest centroid movement drops to 1e-6 or below, or 1000 iterations have run.
// Always returns 0, including when it stops early because m->control_pressed is set.
int CommunityTypeFinder::findkMeans(){
    try {
        error.resize(numPartitions); for (int i = 0; i < numPartitions; i++) { error[i].resize(numOTUs, 0.0); }
        vector<vector<double> > relativeAbundance(numSamples);
        vector<vector<double> > alphaMatrix; //current centroid of every partition
        
        alphaMatrix.resize(numPartitions);
        lambdaMatrix.resize(numPartitions);
        for(int i=0;i<numPartitions;i++){
            alphaMatrix[i].assign(numOTUs, 0);
            lambdaMatrix[i].assign(numOTUs, 0);
        }
        
        //get relative abundance
        for(int i=0;i<numSamples;i++){
            if (m->control_pressed) {  return 0; }
            int groupTotal = 0;
            
            relativeAbundance[i].assign(numOTUs, 0.0);
            
            for(int j=0;j<numOTUs;j++){
                groupTotal += countMatrix[i][j];
            }
            
            //a sample without any counts keeps an all-zero profile instead of becoming NaN (0/0)
            if (groupTotal == 0) { continue; }
            
            for(int j=0;j<numOTUs;j++){
                relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
            }
        }
        
        //randomly assign samples into partitions
        zMatrix.resize(numPartitions);
        for(int i=0;i<numPartitions;i++){
            zMatrix[i].assign(numSamples, 0);
        }
        
        //randomize samples
        //NOTE(review): std::random_shuffle was removed in C++17; a replacement changes the
        //random stream, so it is left as is here.
        vector<int> temp;
        for (int i = 0; i < numSamples; i++) { temp.push_back(i); }
        random_shuffle(temp.begin(), temp.end());
        
        //assign each partition at least one random sample; stop when the samples run out so that
        //temp is never indexed past its end when numPartitions > numSamples
        int numAssignedSamples = 0;
        for (int i = 0; i < numPartitions && numAssignedSamples < numSamples; i++) {
            zMatrix[i][temp[numAssignedSamples]] = 1;
            numAssignedSamples++;
        }
        
        //assign rest of samples to partitions, round-robin
        int count = 0;
        for(int i=numAssignedSamples;i<numSamples;i++){
            zMatrix[count%numPartitions][temp[i]] = 1;
            count++;
        }
        
        double maxChange = 1;
        int maxIters = 1000;
        int iteration = 0;
        
        weights.assign(numPartitions, 0);
        
        while(maxChange > 1e-6 && iteration < maxIters){
            
            if (m->control_pressed) {  return 0; }
            
            //calculate average relative abundance (centroid) of every partition
            maxChange = 0.0000;
            for(int i=0;i<numPartitions;i++){
                
                weights[i] = 0;
                for(int j=0;j<numSamples;j++){
                    weights[i] += static_cast<double>(zMatrix[i][j]);
                }
                
                //a partition without members (possible when numPartitions > numSamples) has zero
                //weight; dividing by it would turn the centroid, and then every zMatrix entry,
                //into NaN. Keep the previous centroid so the partition can pick up members below.
                if (weights[i] == 0) { continue; }
                
                vector<double> averageRelativeAbundance(numOTUs, 0);
                for(int j=0;j<numOTUs;j++){
                    for(int k=0;k<numSamples;k++){
                        averageRelativeAbundance[j] += zMatrix[i][k] * relativeAbundance[k][j];
                    }
                }
                
                double normChange = 0.0;
                for(int j=0;j<numOTUs;j++){
                    averageRelativeAbundance[j] /= weights[i];
                    
                    double difference = averageRelativeAbundance[j] - alphaMatrix[i][j];
                    normChange += difference * difference;
                    alphaMatrix[i][j] = averageRelativeAbundance[j];
                }
                
                //Euclidean distance this centroid moved during the update
                normChange = sqrt(normChange);
                
                if(normChange > maxChange){ maxChange = normChange; }
            }
            
            
            //calculate distance between each sample and every centroid, then turn the distances
            //into memberships that sum to one per sample
            for(int i=0;i<numSamples;i++){
                if (m->control_pressed) {  return 0; }
                
                double normalizationFactor = 0;
                vector<double> partitionAffinity(numPartitions, 0);
                
                for(int j=0;j<numPartitions;j++){
                    double squaredDist = 0.0;
                    for(int k=0;k<numOTUs;k++){
                        double difference = alphaMatrix[j][k] - relativeAbundance[i][k];
                        squaredDist += difference * difference;
                    }
                    //larger distance -> smaller affinity; computed once and reused for the ratio
                    partitionAffinity[j] = exp(-50.0 * sqrt(squaredDist));
                    normalizationFactor += partitionAffinity[j];
                }
                
                for(int j=0;j<numPartitions;j++){
                    zMatrix[j][i] = partitionAffinity[j] / normalizationFactor;
                }
                
            }
            
            iteration++;
            
        }
        
        //final partition weights from the converged memberships
        for(int i=0;i<numPartitions;i++){
            weights[i] = 0.0000;
            
            for(int j=0;j<numSamples;j++){
                weights[i] += zMatrix[i][j];
            }
        }
        
        //store log centroids; log() is undefined for non-positive values, so those get -10.0
        for(int i=0;i<numOTUs;i++){
            if (m->control_pressed) {  return 0; }
            for(int j=0;j<numPartitions;j++){
                if(alphaMatrix[j][i] > 0){
                    lambdaMatrix[j][i] = log(alphaMatrix[j][i]);
                }
                else{
                    lambdaMatrix[j][i] = -10.0;
                }
            }
        }
        return 0;
    }
    catch(exception& e){
        //report this function's own name so the error points at the right place
        m->errorOut(e, "CommunityTypeFinder", "findkMeans");
        exit(1);
    }
}