/*
 * Perform k-means clustering on genes and write the results to three text
 * files whose names are derived from jobname:
 *
 *   <jobname>_K_G<nclusters>.kgg   cluster id assigned to every row
 *   <jobname>_K_G<nclusters>.dis   distance between every pair of clusters
 *   <jobname>_K_G<nclusters>.cen   centroid of every cluster
 *
 * Parameters:
 *   nrows, ncols  sizes handed to the clustering library; one cluster id is
 *                 produced per row and one weight (1.0) is used per column
 *   data          values to cluster; only passed through to the library
 *   nclusters     number of clusters requested
 *   npass         number of passes handed to kcluster()
 *   dist          distance code handed to kcluster()
 *   jobname       prefix of the output file names
 *
 * The centroid method is fixed to 'a'.  For method=='a', the centroid is
 * defined as the mean over all elements belonging to a cluster for each
 * dimension.  For method=='m', the centroid is defined as the median over
 * all elements belonging to a cluster for each dimension.
 *
 * The distances written to the .dis file are always requested from
 * clusterdistance() with the codes 'e' and 'a', independent of dist.
 *
 * Progress is reported on stdout.  Invalid arguments, allocation failures,
 * files that cannot be opened and out-of-range cluster ids are reported on
 * stderr; in those cases the function releases everything it acquired and
 * returns early (output files may then be empty or incomplete).
 */
void example_kmeans( int nrows, int ncols, double** data, int nclusters,
                     int npass, char dist, char* jobname)
{
  enum { OUT_KGG = 0, OUT_DIS = 1, OUT_CEN = 2, N_OUT = 3 };
  static const char* const ext[N_OUT] = { "kgg", "dis", "cen" };

  const int transpose = 0;
  const char method = 'a';

  int i, j, k;
  int ifound = 0;
  int namelen;
  double error = 0.0;
  double distance;
  int** index = NULL;
  int* count = NULL;
  double* weight = NULL;
  int* clusterid = NULL;
  double** cdata = NULL;
  char* filename = NULL;
  FILE* out[N_OUT] = { NULL, NULL, NULL };

  if (jobname == NULL || data == NULL ||
      nrows < 1 || ncols < 1 || nclusters < 1) {
    fprintf(stderr, "example_kmeans: invalid arguments\n");
    return;
  }

  /* Working storage.  Pointer tables are NULL-filled right after they are
     allocated so that the cleanup code can run after a partial failure. */
  weight = malloc((size_t)ncols * sizeof *weight);
  if (weight == NULL) goto nomem;
  clusterid = malloc((size_t)nrows * sizeof *clusterid);
  if (clusterid == NULL) goto nomem;
  cdata = malloc((size_t)nclusters * sizeof *cdata);
  if (cdata == NULL) goto nomem;
  for (i = 0; i < nclusters; i++) cdata[i] = NULL;
  for (i = 0; i < nclusters; i++) {
    cdata[i] = malloc((size_t)ncols * sizeof *cdata[i]);
    if (cdata[i] == NULL) goto nomem;
  }

  for (i = 0; i < ncols; i++) weight[i] = 1.0;

  /* All three extensions have the same length, so one buffer sized by
     snprintf() fits every file name regardless of nclusters or dist. */
  namelen = snprintf(NULL, 0, "%s_K_G%d.%s", jobname, nclusters, ext[0]);
  if (namelen < 0) {
    fprintf(stderr, "example_kmeans: cannot format output file name\n");
    goto cleanup;
  }
  filename = malloc((size_t)namelen + 1);
  if (filename == NULL) goto nomem;

  /* Open (and truncate) all output files before the clustering starts. */
  for (k = 0; k < N_OUT; k++) {
    snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.%s",
             jobname, nclusters, ext[k]);
    out[k] = fopen(filename, "w");
    if (out[k] == NULL) {
      perror(filename);
      goto cleanup;
    }
  }

  printf("======================== k-means clustering"
         " ========================\n");
  printf ("\n");
  printf ("----- doing %d passes... go stretch your legs...\n",npass);

  kcluster(nclusters,nrows,ncols,data,weight,transpose,npass,method,dist,
           clusterid, &error, &ifound);
  printf ("Solution found %d times; ", ifound);
  printf ("within-cluster sum of distances is %f\n", error);

  /* ---- cluster assignments ---- */
  printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
  for (i = 0; i < nrows; i++)
    fprintf (out[OUT_KGG], "%09d\t %d\n", i, clusterid[i]);
  fclose(out[OUT_KGG]);
  out[OUT_KGG] = NULL;

  /* ---- distances between clusters ---- */
  printf ("------- writing Distance between clusters to file:\t %s_K_G%d.dis \n",
          jobname, nclusters);
  fprintf (out[OUT_DIS],"------- Distance between clusters:\n");

  index = malloc((size_t)nclusters * sizeof *index);
  if (index == NULL) goto nomem;
  for (i = 0; i < nclusters; i++) index[i] = NULL;
  count = malloc((size_t)nclusters * sizeof *count);
  if (count == NULL) goto nomem;

  /* First sweep: number of rows in every cluster.  The ids are validated
     here because they are used as array subscripts below. */
  for (i = 0; i < nclusters; i++) count[i] = 0;
  for (i = 0; i < nrows; i++) {
    if (clusterid[i] < 0 || clusterid[i] >= nclusters) {
      fprintf(stderr,
              "example_kmeans: cluster id %d of row %d is outside 0..%d\n",
              clusterid[i], i, nclusters - 1);
      goto cleanup;
    }
    count[clusterid[i]]++;
  }
  for (i = 0; i < nclusters; i++) {
    index[i] = malloc((size_t)count[i] * sizeof *index[i]);
    /* malloc(0) may legitimately return NULL for an empty cluster. */
    if (index[i] == NULL && count[i] > 0) goto nomem;
  }
  /* Second sweep: reuse count[] as fill cursor; it ends up holding the
     cluster sizes again. */
  for (i = 0; i < nclusters; i++) count[i] = 0;
  for (i = 0; i < nrows; i++) {
    int id = clusterid[i];
    index[id][count[id]] = i;
    count[id]++;
  }

  for (i = 0; i < nclusters-1; i++) {
    for (j = 1+i; j < nclusters; j++) {
      distance = clusterdistance(nrows, ncols, data, weight,
                                 count[i], count[j], index[i], index[j],
                                 'e', 'a', 0);
      fprintf(out[OUT_DIS],"Distance between %d and %d: %7.3f\n",
              i, j, distance);
    }
  }
  fclose(out[OUT_DIS]);
  out[OUT_DIS] = NULL;

  /* ---- cluster centroids ---- */
  printf ("------- writing Cluster centroids to file:\t\t%s_K_G%d.cen\n",
          jobname, nclusters);
  fprintf (out[OUT_CEN],"------- Cluster centroids:\n");
  getclustercentroids(nclusters, nrows, ncols, data, clusterid, cdata, 0, 'a');
  fprintf(out[OUT_CEN]," coefficients:");
  for (i = 0; i < ncols; i++) fprintf(out[OUT_CEN],"\t%7d", i);
  fprintf(out[OUT_CEN],"\n");
  for (i = 0; i < nclusters; i++) {
    fprintf(out[OUT_CEN],"Cluster %2d:", i);
    for (j = 0; j < ncols; j++) fprintf(out[OUT_CEN],"\t%7.3f", cdata[i][j]);
    fprintf(out[OUT_CEN],"\n");
  }
  fclose(out[OUT_CEN]);
  out[OUT_CEN] = NULL;
  printf("Done...\n");

  /* Call function to calculate distance between each voxel and centroid.
     It needs what we already have: count (number of elements in each
     cluster), cdata (cluster centroids), clusterid and data. */
  getvoxlclusterdist(count, cdata, clusterid, data, jobname,
                     nclusters, nrows, ncols);
  goto cleanup;

nomem:
  fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
  for (k = 0; k < N_OUT; k++) {
    if (out[k] != NULL) fclose(out[k]);
  }
  if (index != NULL) {
    for (i = 0; i < nclusters; i++) free(index[i]);
    free(index);
  }
  free(count);
  if (cdata != NULL) {
    for (i = 0; i < nclusters; i++) free(cdata[i]);
    free(cdata);
  }
  free(clusterid);
  free(weight);
  free(filename);
  return;
}
/*
 * Perform k-means clustering on genes and write the results to three text
 * files whose names are derived from jobname:
 *
 *   <jobname>_K_G<nclusters>.kgg   cluster id assigned to every row
 *   <jobname>_K_G<nclusters>.dis   distance between every pair of clusters
 *   <jobname>_K_G<nclusters>.cen   centroid of every cluster
 *
 * Parameters:
 *   nrows, ncols  sizes handed to the clustering library; one cluster id is
 *                 produced per row and one weight (1.0) is used per column
 *   data          values to cluster; only passed through to the library
 *   nclusters     number of clusters requested
 *   npass         number of passes handed to kcluster()
 *   dist          distance code handed to kcluster()
 *   jobname       prefix of the output file names
 *   clusterid     caller-owned array of nrows ints that kcluster() fills
 *                 with the cluster id of every row; it is not freed here
 *
 * A mask of nrows x ncols ints, every entry set to 1, is built locally and
 * handed to the library together with the data.
 *
 * The centroid method is fixed to 'a'.  For method=='a', the centroid is
 * defined as the mean over all elements belonging to a cluster for each
 * dimension.  For method=='m', the centroid is defined as the median over
 * all elements belonging to a cluster for each dimension.
 *
 * The distances written to the .dis file are always requested from
 * clusterdistance() with the codes 'e' and 'a', independent of dist.
 *
 * Progress is reported on stdout.  Invalid arguments, allocation failures,
 * files that cannot be opened and out-of-range cluster ids are reported on
 * stderr; in those cases the function releases everything it acquired and
 * returns early (output files may then be empty or incomplete).
 */
void example_kmeans( int nrows, int ncols, double** data, int nclusters,
                     int npass, char dist, char* jobname, int *clusterid)
{
  enum { OUT_KGG = 0, OUT_DIS = 1, OUT_CEN = 2, N_OUT = 3 };
  static const char* const ext[N_OUT] = { "kgg", "dis", "cen" };

  const int transpose = 0;
  const char method = 'a';

  int i, j, k;
  int ifound = 0;
  int namelen;
  double error = 0.0;
  double distance;
  int** index = NULL;
  int* count = NULL;
  double* weight = NULL;
  double** cdata = NULL;
  int** cmask = NULL;
  int** mask = NULL;
  char* filename = NULL;
  FILE* out[N_OUT] = { NULL, NULL, NULL };

  if (jobname == NULL || data == NULL || clusterid == NULL ||
      nrows < 1 || ncols < 1 || nclusters < 1) {
    fprintf(stderr, "example_kmeans: invalid arguments\n");
    return;
  }

  /* Working storage.  Pointer tables are NULL-filled right after they are
     allocated so that the cleanup code can run after a partial failure. */
  weight = malloc((size_t)ncols * sizeof *weight);
  if (weight == NULL) goto nomem;
  cdata = malloc((size_t)nclusters * sizeof *cdata);
  if (cdata == NULL) goto nomem;
  for (i = 0; i < nclusters; i++) cdata[i] = NULL;
  cmask = malloc((size_t)nclusters * sizeof *cmask);
  if (cmask == NULL) goto nomem;
  for (i = 0; i < nclusters; i++) cmask[i] = NULL;
  for (i = 0; i < nclusters; i++) {
    cdata[i] = malloc((size_t)ncols * sizeof *cdata[i]);
    cmask[i] = malloc((size_t)ncols * sizeof *cmask[i]);
    if (cdata[i] == NULL || cmask[i] == NULL) goto nomem;
  }

  for (i = 0; i < ncols; i++) weight[i] = 1.0;

  /* Mask with every entry set to 1. */
  mask = malloc((size_t)nrows * sizeof *mask);
  if (mask == NULL) goto nomem;
  for (i = 0; i < nrows; i++) mask[i] = NULL;
  for (i = 0; i < nrows; i++) {
    mask[i] = malloc((size_t)ncols * sizeof *mask[i]);
    if (mask[i] == NULL) goto nomem;
    for (j = 0; j < ncols; j++) mask[i][j] = 1;
  }

  /* All three extensions have the same length, so one buffer sized by
     snprintf() fits every file name regardless of nclusters or dist. */
  namelen = snprintf(NULL, 0, "%s_K_G%d.%s", jobname, nclusters, ext[0]);
  if (namelen < 0) {
    fprintf(stderr, "example_kmeans: cannot format output file name\n");
    goto cleanup;
  }
  filename = malloc((size_t)namelen + 1);
  if (filename == NULL) goto nomem;

  /* Open (and truncate) all output files before the clustering starts. */
  for (k = 0; k < N_OUT; k++) {
    snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.%s",
             jobname, nclusters, ext[k]);
    out[k] = fopen(filename, "w");
    if (out[k] == NULL) {
      perror(filename);
      goto cleanup;
    }
  }

  printf("======================== k-means clustering"
         " ========================\n");
  printf ("\n");
  printf ("----- doing %d passes... go stretch your legs...\n",npass);

  /* ANDREJ: This function returns different answers each time it is
     executed.  Does the library provide for ways to initialize the random
     number generators used for the searching and initializations? */
  kcluster(nclusters, nrows, ncols, data, mask, weight, transpose, npass,
           method, dist, clusterid, &error, &ifound);
  printf ("Solution found %d times; ", ifound);
  printf ("within-cluster sum of distances is %f\n", error);

  /* ---- cluster assignments ---- */
  printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
  for (i = 0; i < nrows; i++)
    fprintf (out[OUT_KGG], "%09d\t %d\n", i, clusterid[i]);
  fclose(out[OUT_KGG]);
  out[OUT_KGG] = NULL;

  /* ---- distances between clusters ---- */
  printf ("------- writing Distance between clusters to file:\t "
          "%s_K_G%d.dis \n", jobname, nclusters);
  fprintf (out[OUT_DIS],"------- Distance between clusters:\n");

  index = malloc((size_t)nclusters * sizeof *index);
  if (index == NULL) goto nomem;
  for (i = 0; i < nclusters; i++) index[i] = NULL;
  count = malloc((size_t)nclusters * sizeof *count);
  if (count == NULL) goto nomem;

  /* First sweep: number of rows in every cluster.  The ids are validated
     here because they are used as array subscripts below. */
  for (i = 0; i < nclusters; i++) count[i] = 0;
  for (i = 0; i < nrows; i++) {
    if (clusterid[i] < 0 || clusterid[i] >= nclusters) {
      fprintf(stderr,
              "example_kmeans: cluster id %d of row %d is outside 0..%d\n",
              clusterid[i], i, nclusters - 1);
      goto cleanup;
    }
    count[clusterid[i]]++;
  }
  for (i = 0; i < nclusters; i++) {
    index[i] = malloc((size_t)count[i] * sizeof *index[i]);
    /* malloc(0) may legitimately return NULL for an empty cluster. */
    if (index[i] == NULL && count[i] > 0) goto nomem;
  }
  /* Second sweep: reuse count[] as fill cursor; it ends up holding the
     cluster sizes again. */
  for (i = 0; i < nclusters; i++) count[i] = 0;
  for (i = 0; i < nrows; i++) {
    int id = clusterid[i];
    index[id][count[id]] = i;
    count[id]++;
  }

  /* Every pair (i, j) with i < j.  For nclusters == 3 this writes the
     pairs 0-1, 0-2 and 1-2, in that order. */
  for (i = 0; i < nclusters-1; i++) {
    for (j = 1+i; j < nclusters; j++) {
      distance = clusterdistance(nrows, ncols, data, mask, weight,
                                 count[i], count[j], index[i], index[j],
                                 'e', 'a', 0);
      fprintf(out[OUT_DIS],"Distance between %d and %d: %7.3f\n",
              i, j, distance);
    }
  }
  fclose(out[OUT_DIS]);
  out[OUT_DIS] = NULL;

  /* ---- cluster centroids ---- */
  printf ("------- writing Cluster centroids to file:\t\t"
          "%s_K_G%d.cen\n",jobname, nclusters);
  fprintf (out[OUT_CEN],"------- Cluster centroids:\n");
  getclustercentroids(nclusters, nrows, ncols, data, mask, clusterid,
                      cdata, cmask, 0, 'a');
  fprintf(out[OUT_CEN]," coefficients:");
  for (i = 0; i < ncols; i++) fprintf(out[OUT_CEN],"\t%7d", i);
  fprintf(out[OUT_CEN],"\n");
  for (i = 0; i < nclusters; i++) {
    fprintf(out[OUT_CEN],"Cluster %2d:", i);
    for (j = 0; j < ncols; j++) fprintf(out[OUT_CEN],"\t%7.3f", cdata[i][j]);
    fprintf(out[OUT_CEN],"\n");
  }
  fclose(out[OUT_CEN]);
  out[OUT_CEN] = NULL;
  printf("Done...\n");
  goto cleanup;

nomem:
  fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
  for (k = 0; k < N_OUT; k++) {
    if (out[k] != NULL) fclose(out[k]);
  }
  if (index != NULL) {
    for (i = 0; i < nclusters; i++) free(index[i]);
    free(index);
  }
  free(count);
  if (cdata != NULL) {
    for (i = 0; i < nclusters; i++) free(cdata[i]);
    free(cdata);
  }
  if (cmask != NULL) {
    for (i = 0; i < nclusters; i++) free(cmask[i]);
    free(cmask);
  }
  if (mask != NULL) {
    for (i = 0; i < nrows; i++) free(mask[i]);
    free(mask);
  }
  free(weight);
  free(filename);
  return;
}
/*
 * Perform k-means clustering on genes and print the results to stdout.
 *
 * The data are clustered into three clusters twice: first with a single
 * pass, then with 1000 passes.  After each run the cluster assignment of
 * every gene is printed.  Using the assignment of the second run, the
 * distances between the three clusters and the cluster centroids are
 * printed as well.
 *
 * Parameters:
 *   nrows, ncols  sizes handed to the clustering library; one cluster id is
 *                 produced per row and one weight (1.0) is used per column
 *   data, mask    passed through unchanged to the library routines
 */
void example_kmeans(int nrows, int ncols, double** data, int** mask)
{
  const int nclusters = 3;
  const int transpose = 0;
  const char dist = 'e';
  const char method = 'a';
  int npass = 1;
  int ifound = 0;
  double error;
  double distance;
  int gene, col, c;
  int** index;
  int* count;
  double* weight = malloc(ncols * sizeof *weight);
  int* clusterid = malloc(nrows * sizeof *clusterid);
  double** cdata = malloc(nclusters * sizeof *cdata);
  int** cmask = malloc(nclusters * sizeof *cmask);

  for (c = 0; c < nclusters; c++) {
    cdata[c] = malloc(ncols * sizeof *cdata[c]);
    cmask[c] = malloc(ncols * sizeof *cmask[c]);
  }
  /* Every column counts equally. */
  for (col = 0; col < ncols; col++) {
    weight[col] = 1.0;
  }

  printf("======================== k-means clustering ====================\n");
  printf("\n");

  /* First run: a single pass. */
  printf("----- one pass of the EM algorithm (results may change)\n");
  kcluster(nclusters, nrows, ncols, data, mask, weight, transpose, npass,
           method, dist, clusterid, &error, &ifound);
  printf ("Solution found %d times; within-cluster sum of distances is %f\n", ifound, error);
  printf ("Cluster assignments:\n");
  for (gene = 0; gene < nrows; gene++) {
    printf ("Gene %d: cluster %d\n", gene, clusterid[gene]);
  }
  printf ("\n");

  /* Second run: many passes. */
  printf("----- 1000 passes of the EM algorithm (result should not change)\n");
  npass = 1000;
  kcluster(nclusters, nrows, ncols, data, mask, weight, transpose, npass,
           method, dist, clusterid, &error, &ifound);
  printf ("Solution found %d times; ", ifound);
  printf ("within-cluster sum of distances is %f\n", error);
  printf ("Cluster assignments:\n");
  for (gene = 0; gene < nrows; gene++) {
    printf ("Gene %d: cluster %d\n", gene, clusterid[gene]);
  }
  printf ("\n");

  printf ("------- Distance between clusters:\n");
  index = malloc(nclusters * sizeof *index);
  count = malloc(nclusters * sizeof *count);

  /* Count the members of each cluster, then size the per-cluster lists. */
  for (c = 0; c < nclusters; c++) {
    count[c] = 0;
  }
  for (gene = 0; gene < nrows; gene++) {
    count[clusterid[gene]] += 1;
  }
  for (c = 0; c < nclusters; c++) {
    index[c] = malloc(count[c] * sizeof *index[c]);
  }
  /* Restart the counters and use them as write positions while the gene
     numbers are appended to the list of their cluster. */
  for (c = 0; c < nclusters; c++) {
    count[c] = 0;
  }
  for (gene = 0; gene < nrows; gene++) {
    const int id = clusterid[gene];
    const int slot = count[id];
    index[id][slot] = gene;
    count[id] = slot + 1;
  }

  distance = clusterdistance(nrows, ncols, data, mask, weight,
                             count[0], count[1], index[0], index[1],
                             'e', 'a', 0);
  printf("Distance between 0 and 1: %7.3f\n", distance);
  distance = clusterdistance(nrows, ncols, data, mask, weight,
                             count[0], count[2], index[0], index[2],
                             'e', 'a', 0);
  printf("Distance between 0 and 2: %7.3f\n", distance);
  distance = clusterdistance(nrows, ncols, data, mask, weight,
                             count[1], count[2], index[1], index[2],
                             'e', 'a', 0);
  printf("Distance between 1 and 2: %7.3f\n", distance);
  printf ("\n");

  printf ("------- Cluster centroids:\n");
  getclustermean(nclusters, nrows, ncols, data, mask, clusterid,
                 cdata, cmask, 0);
  /* Header row with the column numbers, then one row per cluster. */
  printf(" Microarray:");
  for (col = 0; col < ncols; col++) {
    printf("\t%7d", col);
  }
  printf("\n");
  for (c = 0; c < nclusters; c++) {
    printf("Cluster %2d:", c);
    for (col = 0; col < ncols; col++) {
      printf("\t%7.3f", cdata[c][col]);
    }
    printf("\n");
  }
  printf("\n");

  /* Release everything allocated above. */
  for (c = 0; c < nclusters; c++) {
    free(index[c]);
  }
  free(index);
  free(count);
  for (c = 0; c < nclusters; c++) {
    free(cdata[c]);
    free(cmask[c]);
  }
  free(cdata);
  free(cmask);
  free(clusterid);
  free(weight);
  return;
}