Exemplo n.º 1
0
/*
 * Perform k-means clustering on genes and write the results to three text
 * files whose names are derived from jobname and nclusters:
 *
 *   <jobname>_K_G<nclusters>.kgg   cluster id assigned to every row
 *   <jobname>_K_G<nclusters>.dis   distance between every pair of clusters
 *   <jobname>_K_G<nclusters>.cen   centroid of every cluster
 *
 * Afterwards getvoxlclusterdist() is called with the cluster sizes, the
 * centroids and the cluster assignments.
 *
 *   nrows, ncols  dimensions handed to kcluster() together with data;
 *                 nrows cluster ids and ncols weights are allocated here
 *   data          input matrix, passed through to the library routines
 *   nclusters     number of clusters to form (must be >= 1)
 *   npass         number of passes requested from kcluster()
 *   dist          distance measure selector forwarded to kcluster()
 *   jobname       prefix of the output file names (must not be NULL)
 *
 * The distances between clusters are always computed with the 'e' distance
 * and the 'a' method, independently of dist.
 *
 * On invalid arguments, allocation failure, a file that cannot be opened or
 * a cluster id outside [0, nclusters) a diagnostic is written to stderr and
 * the function returns after releasing everything it acquired.
 */
void example_kmeans( int nrows, int ncols,
                     double** data,
                     int nclusters, int npass,
                     char dist, char* jobname)
{
   const int transpose = 0;
   /* For method=='a', the centroid is defined as the mean over all elements
      belonging to a cluster for each dimension.
      For method=='m', the centroid is defined as the median over all elements
      belonging to a cluster for each dimension.
   */
   const char method = 'a';

   int i, j;
   int ifound = 0;
   int namelen;
   double error = 0.0;
   double distance;
   int** index = NULL;
   int* count = NULL;
   double* weight = NULL;
   int* clusterid = NULL;
   double** cdata = NULL;
   char* filename = NULL;
   FILE *out1 = NULL;
   FILE *out2 = NULL;
   FILE *out3 = NULL;

   if (jobname == NULL || nrows < 1 || ncols < 1 || nclusters < 1)
   {
      fprintf(stderr, "example_kmeans: invalid arguments\n");
      return;
   }

   /* The pointer tables are zero-filled so that the cleanup code can free
      them safely even when only some of their entries were allocated. */
   weight = malloc((size_t)ncols * sizeof *weight);
   clusterid = malloc((size_t)nrows * sizeof *clusterid);
   cdata = calloc((size_t)nclusters, sizeof *cdata);
   index = calloc((size_t)nclusters, sizeof *index);
   count = calloc((size_t)nclusters, sizeof *count);
   if (!weight || !clusterid || !cdata || !index || !count) goto nomem;

   for (i = 0; i < nclusters; i++)
   {
      cdata[i] = malloc((size_t)ncols * sizeof *cdata[i]);
      if (!cdata[i]) goto nomem;
   }
   for (i = 0; i < ncols; i++) weight[i] = 1.0;

   /* All three extensions have the same length, so one buffer sized by a
      dry-run snprintf() is reused for every file name.  The digits of
      nclusters are always accounted for, whatever the value of dist. */
   namelen = snprintf(NULL, 0, "%s_K_G%d.ext", jobname, nclusters);
   if (namelen < 0)
   {
      fprintf(stderr, "example_kmeans: cannot build output file names\n");
      goto cleanup;
   }

   //avovk
   printf("a je u omari :) \n");
   filename = malloc((size_t)namelen + 1);
   if (!filename) goto nomem;

   snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.kgg", jobname, nclusters);
   out1 = fopen(filename, "w");
   if (!out1) { perror(filename); goto cleanup; }

   snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.dis", jobname, nclusters);
   out2 = fopen(filename, "w");
   if (!out2) { perror(filename); goto cleanup; }

   snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.cen", jobname, nclusters);
   out3 = fopen(filename, "w");
   if (!out3) { perror(filename); goto cleanup; }

   printf("======================== k-means clustering"
         " ========================\n");

   printf ("\n");
   printf ("----- doing %d passes... go stretch your legs...\n",npass);
   kcluster(nclusters,nrows,ncols,data,weight,transpose,npass,method,dist,
    clusterid, &error, &ifound);
   printf ("Solution found %d times; ", ifound);
   printf ("within-cluster sum of distances is %f\n", error);

   /* Count the members of every cluster.  The ids are used as array indices
      below, so refuse anything outside [0, nclusters). */
   for (i = 0; i < nrows; i++)
   {
      if (clusterid[i] < 0 || clusterid[i] >= nclusters)
      {
         fprintf(stderr,
                 "example_kmeans: cluster id %d of row %d is out of range\n",
                 clusterid[i], i);
         goto cleanup;
      }
      count[clusterid[i]]++;
   }

   printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
   for (i = 0; i < nrows; i++)
      fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
   fclose(out1); out1=NULL;

   printf ("------- writing Distance between clusters to file:\t %s_K_G%d.dis \n", jobname, nclusters);
   fprintf (out2,"------- Distance between clusters:\n");

   /* index[c] lists the rows that belong to cluster c. */
   for (i = 0; i < nclusters; i++)
   {
      /* Never request 0 bytes: malloc(0) may legitimately return NULL. */
      size_t members = count[i] > 0 ? (size_t)count[i] : 1;
      index[i] = malloc(members * sizeof *index[i]);
      if (!index[i]) goto nomem;
      count[i] = 0;   /* reused as the fill position in the next loop */
   }
   for (i = 0; i < nrows; i++)
   {
      int id = clusterid[i];
      index[id][count[id]] = i;
      count[id]++;
   }
   /* From here on count[c] is again the size of cluster c. */

   for (i = 0; i < nclusters-1; i++)
   {
      for (j = 1+i; j < nclusters; j++)
      {
         distance = clusterdistance(nrows, ncols, data, weight, count[i], count[j], index[i], index[j], 'e', 'a', 0);
         fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
      }
   }
   fclose(out2); out2=NULL;

   printf ("------- writing Cluster centroids to file:\t\t%s_K_G%d.cen\n",jobname, nclusters);
   fprintf (out3,"------- Cluster centroids:\n");
   getclustercentroids(nclusters, nrows, ncols, data, clusterid, cdata, 0, 'a');
   fprintf(out3,"   coefficients:");
   for (i = 0; i < ncols; i++) fprintf(out3,"\t%7d", i);
   fprintf(out3,"\n");
   for (i = 0; i < nclusters; i++)
   {
      fprintf(out3,"Cluster %2d:", i);
      for (j = 0; j < ncols; j++) fprintf(out3,"\t%7.3f", cdata[i][j]);
      fprintf(out3,"\n");
   }
   fclose(out3); out3=NULL;
   printf("Done...\n");

   /* call function to calculate distance between each voxel and centroid */
   /* we will need:
      count - number of elements in cluster as we already have it
      cdata - cluster centroids
      clusterid
      data */
   getvoxlclusterdist(count, cdata, clusterid, data, jobname,
                      nclusters, nrows, ncols);

   goto cleanup;

nomem:
   fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
   /* Streams are set to NULL once closed, so only the ones still open on an
      error path are closed here. */
   if (out1) fclose(out1);
   if (out2) fclose(out2);
   if (out3) fclose(out3);
   free(filename);

   if (index)
      for (i = 0; i < nclusters; i++) free(index[i]);
   free(index);
   free(count);

   if (cdata)
      for (i = 0; i < nclusters; i++) free(cdata[i]);
   free(cdata);

   free(clusterid);
   free(weight);

   return;
}
Exemplo n.º 2
0
/*
 * Perform k-means clustering on genes and write the results to three text
 * files whose names are derived from jobname and nclusters:
 *
 *   <jobname>_K_G<nclusters>.kgg   cluster id assigned to every row
 *   <jobname>_K_G<nclusters>.dis   distance between every pair of clusters
 *   <jobname>_K_G<nclusters>.cen   centroid of every cluster
 *
 *   nrows, ncols  dimensions handed to kcluster() together with data;
 *                 an nrows x ncols mask filled with 1 and ncols weights of
 *                 1.0 are built here
 *   data          input matrix, passed through to the library routines
 *   nclusters     number of clusters to form (must be >= 1)
 *   npass         number of passes requested from kcluster()
 *   dist          distance measure selector forwarded to kcluster()
 *   jobname       prefix of the output file names (must not be NULL)
 *   clusterid     caller-owned array with room for nrows ints; receives the
 *                 cluster id of every row and is NOT freed here
 *
 * The distances between clusters are always computed with the 'e' distance
 * and the 'a' method, independently of dist.
 *
 * On invalid arguments, allocation failure, a file that cannot be opened or
 * a cluster id outside [0, nclusters) a diagnostic is written to stderr and
 * the function returns after releasing everything it acquired.
 */
void example_kmeans( int nrows, int ncols,
                     double** data,
                     int nclusters, int npass,
                     char dist, char* jobname,
                     int *clusterid)
{
   const int transpose = 0;
   /* For method=='a', the centroid is defined as the mean over all elements
      belonging to a cluster for each dimension.
      For method=='m', the centroid is defined as the median over all elements
      belonging to a cluster for each dimension.
   */
   const char method = 'a';

   int i, j;
   int ifound = 0;
   int namelen;
   double error = 0.0;
   double distance;
   int** index = NULL;
   int* count = NULL;
   double* weight = NULL;
   double** cdata = NULL;
   int** cmask = NULL;
   int** mask = NULL;
   char* filename = NULL;
   FILE *out1 = NULL;
   FILE *out2 = NULL;
   FILE *out3 = NULL;

   if (jobname == NULL || clusterid == NULL ||
       nrows < 1 || ncols < 1 || nclusters < 1)
   {
      fprintf(stderr, "example_kmeans: invalid arguments\n");
      return;
   }

   /* The pointer tables are zero-filled so that the cleanup code can free
      them safely even when only some of their entries were allocated. */
   weight = malloc((size_t)ncols * sizeof *weight);
   cdata = calloc((size_t)nclusters, sizeof *cdata);
   cmask = calloc((size_t)nclusters, sizeof *cmask);
   mask = calloc((size_t)nrows, sizeof *mask);
   index = calloc((size_t)nclusters, sizeof *index);
   count = calloc((size_t)nclusters, sizeof *count);
   if (!weight || !cdata || !cmask || !mask || !index || !count) goto nomem;

   for (i = 0; i < nclusters; i++)
   {
      cdata[i] = malloc((size_t)ncols * sizeof *cdata[i]);
      cmask[i] = malloc((size_t)ncols * sizeof *cmask[i]);
      if (!cdata[i] || !cmask[i]) goto nomem;
   }
   for (i = 0; i < ncols; i++) weight[i] = 1.0;

   /* Every element of the mask is set to 1. */
   for (i = 0; i < nrows; i++)
   {
      mask[i] = malloc((size_t)ncols * sizeof *mask[i]);
      if (!mask[i]) goto nomem;
      for (j = 0; j < ncols; j++) mask[i][j] = 1;
   }

   /* All three extensions have the same length, so one buffer sized by a
      dry-run snprintf() is reused for every file name.  The digits of
      nclusters are always accounted for, whatever the value of dist. */
   namelen = snprintf(NULL, 0, "%s_K_G%d.ext", jobname, nclusters);
   if (namelen < 0)
   {
      fprintf(stderr, "example_kmeans: cannot build output file names\n");
      goto cleanup;
   }

   //avovk
   printf("a je u omari :) \n");
   filename = malloc((size_t)namelen + 1);
   if (!filename) goto nomem;

   snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.kgg", jobname, nclusters);
   out1 = fopen(filename, "w");
   if (!out1) { perror(filename); goto cleanup; }

   snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.dis", jobname, nclusters);
   out2 = fopen(filename, "w");
   if (!out2) { perror(filename); goto cleanup; }

   snprintf(filename, (size_t)namelen + 1, "%s_K_G%d.cen", jobname, nclusters);
   out3 = fopen(filename, "w");
   if (!out3) { perror(filename); goto cleanup; }

   printf("======================== k-means clustering"
         " ========================\n");

   printf ("\n");
   printf ("----- doing %d passes... go stretch your legs...\n",npass);
   /* ANDREJ: This function returns different answers each time it is
      executed. Does the library provide for ways to initialize the
      random number generators used for the searching and initializations? */
   kcluster(   nclusters,
               nrows,ncols,data,
               mask,weight,
               transpose,npass,
               method,dist,
               clusterid, &error, &ifound);
   printf ("Solution found %d times; ", ifound);
   printf ("within-cluster sum of distances is %f\n", error);

   /* Count the members of every cluster.  The ids are used as array indices
      below, so refuse anything outside [0, nclusters). */
   for (i = 0; i < nrows; i++)
   {
      if (clusterid[i] < 0 || clusterid[i] >= nclusters)
      {
         fprintf(stderr,
                 "example_kmeans: cluster id %d of row %d is out of range\n",
                 clusterid[i], i);
         goto cleanup;
      }
      count[clusterid[i]]++;
   }

   printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
   for (i = 0; i < nrows; i++)
      fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
   fclose(out1); out1=NULL;

   printf ("------- writing Distance between clusters to file:\t "
          "%s_K_G%d.dis \n", jobname, nclusters);
   fprintf (out2,"------- Distance between clusters:\n");

   /* index[c] lists the rows that belong to cluster c. */
   for (i = 0; i < nclusters; i++)
   {
      /* Never request 0 bytes: malloc(0) may legitimately return NULL. */
      size_t members = count[i] > 0 ? (size_t)count[i] : 1;
      index[i] = malloc(members * sizeof *index[i]);
      if (!index[i]) goto nomem;
      count[i] = 0;   /* reused as the fill position in the next loop */
   }
   for (i = 0; i < nrows; i++)
   {
      int id = clusterid[i];
      index[id][count[id]] = i;
      count[id]++;
   }
   /* From here on count[c] is again the size of cluster c. */

   /* Report every pair i < j, so any number of clusters is handled without
      touching entries beyond nclusters-1. */
   for (i = 0; i < nclusters-1; i++)
   {
      for (j = 1+i; j < nclusters; j++)
      {
         distance =
          clusterdistance(nrows, ncols, data, mask, weight, count[i], count[j],
                          index[i], index[j], 'e', 'a', 0);
         fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
      }
   }
   fclose(out2); out2=NULL;

   printf ("------- writing Cluster centroids to file:\t\t"
          "%s_K_G%d.cen\n",jobname, nclusters);
   fprintf (out3,"------- Cluster centroids:\n");
   getclustercentroids(nclusters, nrows, ncols, data, mask, clusterid,
                      cdata, cmask, 0, 'a');
   fprintf(out3,"   coefficients:");
   for (i = 0; i < ncols; i++) fprintf(out3,"\t%7d", i);
   fprintf(out3,"\n");
   for (i = 0; i < nclusters; i++)
   {
      fprintf(out3,"Cluster %2d:", i);
      for (j = 0; j < ncols; j++) fprintf(out3,"\t%7.3f", cdata[i][j]);
      fprintf(out3,"\n");
   }
   fclose(out3); out3=NULL;
   printf("Done...\n");

   goto cleanup;

nomem:
   fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
   /* Streams are set to NULL once closed, so only the ones still open on an
      error path are closed here. */
   if (out1) fclose(out1);
   if (out2) fclose(out2);
   if (out3) fclose(out3);
   free(filename);

   if (index)
      for (i = 0; i < nclusters; i++) free(index[i]);
   free(index);
   free(count);

   for (i = 0; i < nclusters; i++)
   {
      if (cdata) free(cdata[i]);
      if (cmask) free(cmask[i]);
   }
   if (mask)
      for (i = 0; i < nrows; i++) free(mask[i]);

   free(cdata);
   free(cmask);
   free(weight);
   free(mask);
   return;
}
Exemplo n.º 3
0
/*
 * Perform k-means clustering on genes and print the results to stdout.
 *
 * The rows are clustered into three clusters twice: first with a single
 * pass, then with 1000 passes.  The cluster assignment of every row is
 * printed after each run.  For the second run the distance between every
 * pair of clusters and the centroid of every cluster are printed as well.
 *
 *   nrows, ncols  dimensions handed to kcluster() together with data and
 *                 mask; nrows cluster ids and ncols weights of 1.0 are
 *                 allocated here
 *   data, mask    caller-owned input, passed through to the library routines
 *
 * On invalid dimensions, allocation failure or a cluster id outside
 * [0, nclusters) a diagnostic is written to stderr and the function returns
 * after releasing everything it allocated.
 */
void example_kmeans(int nrows, int ncols, double** data, int** mask)
{
  const int nclusters = 3;
  const int transpose = 0;
  const char dist = 'e';
  const char method = 'a';
  int i, j;
  int npass = 1;
  int ifound = 0;
  double error = 0.0;
  double distance;
  int** index = NULL;
  int* count = NULL;
  double* weight = NULL;
  int* clusterid = NULL;
  double** cdata = NULL;
  int** cmask = NULL;

  if (nrows < 1 || ncols < 1)
  { fprintf(stderr, "example_kmeans: invalid arguments\n");
    return;
  }

  /* The pointer tables are zero-filled so that the cleanup code can free
     them safely even when only some of their entries were allocated. */
  weight = malloc((size_t)ncols * sizeof *weight);
  clusterid = malloc((size_t)nrows * sizeof *clusterid);
  cdata = calloc((size_t)nclusters, sizeof *cdata);
  cmask = calloc((size_t)nclusters, sizeof *cmask);
  index = calloc((size_t)nclusters, sizeof *index);
  count = calloc((size_t)nclusters, sizeof *count);
  if (!weight || !clusterid || !cdata || !cmask || !index || !count)
    goto nomem;

  for (i = 0; i < nclusters; i++)
  { cdata[i] = malloc((size_t)ncols * sizeof *cdata[i]);
    cmask[i] = malloc((size_t)ncols * sizeof *cmask[i]);
    if (!cdata[i] || !cmask[i]) goto nomem;
  }
  for (i = 0; i < ncols; i++) weight[i] = 1.0;

  printf("======================== k-means clustering ====================\n");
  printf("\n");
  printf("----- one pass of the EM algorithm (results may change)\n");
  kcluster(nclusters,nrows,ncols,data,mask,weight,transpose,npass,method,dist,
    clusterid, &error, &ifound);
  printf ("Solution found %d times; within-cluster sum of distances is %f\n",
    ifound, error);
  printf ("Cluster assignments:\n");
  for (i = 0; i < nrows; i++)
    printf ("Gene %d: cluster %d\n", i, clusterid[i]);

  printf ("\n");
  printf("----- 1000 passes of the EM algorithm (result should not change)\n");
  npass = 1000;
  kcluster(nclusters,nrows,ncols,data,mask,weight,transpose,npass,method,dist,
    clusterid, &error, &ifound);
  printf ("Solution found %d times; ", ifound);
  printf ("within-cluster sum of distances is %f\n", error);
  printf ("Cluster assignments:\n");
  for (i = 0; i < nrows; i++)
    printf ("Gene %d: cluster %d\n", i, clusterid[i]);
  printf ("\n");
  printf ("------- Distance between clusters:\n");

  /* Count the members of every cluster.  The ids are used as array indices
     below, so refuse anything outside [0, nclusters). */
  for (i = 0; i < nrows; i++)
  { if (clusterid[i] < 0 || clusterid[i] >= nclusters)
    { fprintf(stderr,
              "example_kmeans: cluster id %d of row %d is out of range\n",
              clusterid[i], i);
      goto cleanup;
    }
    count[clusterid[i]]++;
  }

  /* index[c] lists the rows that belong to cluster c. */
  for (i = 0; i < nclusters; i++)
  { /* Never request 0 bytes: malloc(0) may legitimately return NULL. */
    size_t members = count[i] > 0 ? (size_t)count[i] : 1;
    index[i] = malloc(members * sizeof *index[i]);
    if (!index[i]) goto nomem;
    count[i] = 0;   /* reused as the fill position in the next loop */
  }
  for (i = 0; i < nrows; i++)
  { int id = clusterid[i];
    index[id][count[id]] = i;
    count[id]++;
  }
  /* From here on count[c] is again the size of cluster c. */

  /* Every pair i < j, i.e. (0,1), (0,2), (1,2) for three clusters. */
  for (i = 0; i < nclusters-1; i++)
  { for (j = i+1; j < nclusters; j++)
    { distance =
        clusterdistance(nrows, ncols, data, mask, weight, count[i], count[j],
                        index[i], index[j], 'e', 'a', 0);
      printf("Distance between %d and %d: %7.3f\n", i, j, distance);
    }
  }

  printf ("\n");
  printf ("------- Cluster centroids:\n");
  getclustermean(nclusters, nrows, ncols, data, mask, clusterid,
                 cdata, cmask, 0);
  printf("   Microarray:");
  for (i = 0; i < ncols; i++) printf("\t%7d", i);
  printf("\n");
  for (i = 0; i < nclusters; i++)
  { printf("Cluster %2d:", i);
    for (j = 0; j < ncols; j++) printf("\t%7.3f", cdata[i][j]);
    printf("\n");
  }
  printf("\n");

  goto cleanup;

nomem:
  fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
  if (index)
    for (i = 0; i < nclusters; i++) free(index[i]);
  free(index);
  free(count);

  for (i = 0; i < nclusters; i++)
  { if (cdata) free(cdata[i]);
    if (cmask) free(cmask[i]);
  }
  free(cdata);
  free(cmask);
  free(clusterid);
  free(weight);
  return;
}