/* Build a table of `rows` separately allocated one-element double rows (the
 * double** layout passed to kcluster/getclustercentroids below).
 * Returns NULL if any allocation fails, after releasing the rows already
 * obtained, so the caller never holds a half-initialised table. */
static double **oph_ccluster_kcluster_alloc_drows(size_t rows)
{
    double **table = (double **) calloc(rows, sizeof(double *));
    size_t r;

    if (!table) return NULL;
    for (r = 0; r < rows; r++) {
        table[r] = (double *) malloc(sizeof(double));
        if (!table[r]) {
            while (r > 0) free(table[--r]);
            free(table);
            return NULL;
        }
    }
    return table;
}

/* Same as oph_ccluster_kcluster_alloc_drows, for one-element int rows. */
static int **oph_ccluster_kcluster_alloc_irows(size_t rows)
{
    int **table = (int **) calloc(rows, sizeof(int *));
    size_t r;

    if (!table) return NULL;
    for (r = 0; r < rows; r++) {
        table[r] = (int *) malloc(sizeof(int));
        if (!table[r]) {
            while (r > 0) free(table[--r]);
            free(table);
            return NULL;
        }
    }
    return table;
}

/* Shared failure epilogue: empty result with the error flag raised. */
static char *oph_ccluster_kcluster_fail(unsigned long *length, char *is_null, char *error)
{
    *length=0;
    *is_null=0;
    *error=1;
    return NULL;
}

/*
 * UDF body: clusters the one-dimensional measure in args[2] into k groups
 * through kcluster().
 *
 * Arguments read by this function:
 *   args[0]   input type name: OPH_INT, OPH_SHORT, OPH_BYTE, OPH_LONG,
 *             OPH_FLOAT or OPH_DOUBLE
 *   args[2]   packed array of values of that type
 *   args[3]   number of clusters k (read as long long)
 *   args[4..] optional, in any order: an integer sets npass (>= 1); a string
 *             selects the method (KMEANS, KMEDIANS) or the output level
 *             (CENTROIDS, LABELS, ALL)
 * args[1] is not read here.
 * NOTE(review): assumes arg_count >= 4 and that args[3] is an INT_RESULT --
 * presumably enforced by the matching init function; confirm.
 * NOTE(review): keyword comparisons use strncasecmp bounded by the supplied
 * length, so a prefix of a keyword (or an empty string) also matches.
 *
 * Result, by level:
 *   CENTROIDS  k doubles, one centroid per cluster
 *   ALL        one double per input point: the centroid of its cluster
 *   LABELS     one int per input point: its cluster id
 *
 * Returns a pointer to the result buffer and stores its byte size in *length.
 * On failure returns NULL with *error set; a NULL measure yields SQL NULL
 * (*is_null set, no error).
 *
 * Per-statement state (buffers sized from the FIRST row) is created on the
 * first call and kept in initid->ptr / initid->extension; nothing is released
 * here -- presumably the deinit function does that (TODO confirm).
 */
char* oph_ccluster_kcluster(UDF_INIT *initid, UDF_ARGS *args, char *result, unsigned long *length, char *is_null, char *error)
{
    /* A SQL NULL argument arrives as a null pointer: there is nothing to
       cluster, and it must not be used to size the per-statement buffers. */
    if (!args->args[2]) {
        *length=0;
        *is_null=1;
        *error=0;
        return NULL;
    }

    if(!initid->ptr){
        *error=0;
        *is_null=0;

        initid->ptr=(char *)calloc(1,sizeof(oph_string));
        if(!initid->ptr){
            pmesg(1,  __FILE__, __LINE__, "Error allocating result\n");
            *length=0;
            *is_null=1;
            *error=1;
            return NULL;
        }
        initid->extension = calloc(1,sizeof(oph_ccluster_kcluster_extra));
        if(!initid->extension){
            pmesg(1,  __FILE__, __LINE__, "Error allocating extension\n");
            *length=0;
            *is_null=1;
            *error=1;
            return NULL;
        }

        oph_ccluster_kcluster_extra *extra = (oph_ccluster_kcluster_extra *) initid->extension;

        /* Defaults, overridden by the optional arguments parsed below. */
        extra->method = OPH_CCLUSTER_KCLUSTER_KMEANS;
        extra->level = OPH_CCLUSTER_KCLUSTER_ALL;
        extra->npass = 1;
        extra->type = OPH_DOUBLE;

        /* Number of clusters: reject NULL and non-positive values before
           they are used as allocation sizes. */
        if (!args->args[3]) {
            pmesg(1,  __FILE__, __LINE__, "Invalid number of clusters!\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }
        extra->k = (int) *((long long*) args->args[3]);
        if (extra->k < 1) {
            pmesg(1,  __FILE__, __LINE__, "Invalid number of clusters!\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }

        /* Input data type. */
        if (!args->args[0]) {
            pmesg(1,  __FILE__, __LINE__, "Invalid input data type!\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }
        if (!strncasecmp(args->args[0],"OPH_INT",args->lengths[0])) extra->type = OPH_INT;
        else if (!strncasecmp(args->args[0],"OPH_SHORT",args->lengths[0])) extra->type = OPH_SHORT;
        else if (!strncasecmp(args->args[0],"OPH_BYTE",args->lengths[0])) extra->type = OPH_BYTE;
        else if (!strncasecmp(args->args[0],"OPH_LONG",args->lengths[0])) extra->type = OPH_LONG;
        else if (!strncasecmp(args->args[0],"OPH_FLOAT",args->lengths[0])) extra->type = OPH_FLOAT;
        else if (!strncasecmp(args->args[0],"OPH_DOUBLE",args->lengths[0])) extra->type = OPH_DOUBLE;
        else {
            pmesg(1,  __FILE__, __LINE__, "Invalid input data type!\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }

        /* Optional arguments: integers set npass, strings set method/level. */
        int i;
        for (i = 4; i < args->arg_count; i++) {
            if (args->arg_type[i]!=INT_RESULT && args->arg_type[i]!=STRING_RESULT) {
                pmesg(1,  __FILE__, __LINE__, "wrong type for argument %d!\n",i);
                return oph_ccluster_kcluster_fail(length, is_null, error);
            }
            if (!args->args[i]) {
                /* SQL NULL option: nothing valid to read from it. */
                pmesg(1,  __FILE__, __LINE__, "invalid argument %d!\n",i);
                return oph_ccluster_kcluster_fail(length, is_null, error);
            }
            if (args->arg_type[i]==INT_RESULT) {
                extra->npass = (int) *((long long*) args->args[i]);
                if (extra->npass < 1) {
                    pmesg(1,  __FILE__, __LINE__, "npass must be >= 1!\n");
                    return oph_ccluster_kcluster_fail(length, is_null, error);
                }
                continue;
            }
            if (!strncasecmp(args->args[i],"KMEANS",args->lengths[i])) {
                extra->method = OPH_CCLUSTER_KCLUSTER_KMEANS;
            } else if (!strncasecmp(args->args[i],"KMEDIANS",args->lengths[i])) {
                extra->method = OPH_CCLUSTER_KCLUSTER_KMEDIANS;
            } else if (!strncasecmp(args->args[i],"CENTROIDS",args->lengths[i])) {
                extra->level = OPH_CCLUSTER_KCLUSTER_CENTROIDS;
            } else if (!strncasecmp(args->args[i],"LABELS",args->lengths[i])) {
                extra->level = OPH_CCLUSTER_KCLUSTER_LABELS;
            } else if (!strncasecmp(args->args[i],"ALL",args->lengths[i])) {
                extra->level = OPH_CCLUSTER_KCLUSTER_ALL;
            } else {
                pmesg(1,  __FILE__, __LINE__, "invalid argument %d!\n",i);
                return oph_ccluster_kcluster_fail(length, is_null, error);
            }
        }

        /* The number of points is fixed by the first row's byte length. */
        extra->npoints = args->lengths[2] / core_sizeof(extra->type);

        /* Working buffers: one value and one "present" flag per point. */
        extra->data = oph_ccluster_kcluster_alloc_drows((size_t) extra->npoints);
        if (!extra->data) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }
        extra->mask = oph_ccluster_kcluster_alloc_irows((size_t) extra->npoints);
        if (!extra->mask) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }
        extra->clusterid = (int *) malloc(sizeof(int) * extra->npoints);
        if (!extra->clusterid) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }

        oph_stringPtr output = (oph_stringPtr) initid->ptr;

        /* CENTROIDS and ALL both need centroid buffers and emit doubles;
           LABELS emits the integer cluster ids directly. */
        int wants_centroids = (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS
                               || extra->level==OPH_CCLUSTER_KCLUSTER_ALL);
        unsigned long len;

        if (wants_centroids) {
            extra->cdata = oph_ccluster_kcluster_alloc_drows((size_t) extra->k);
            if (!extra->cdata) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                return oph_ccluster_kcluster_fail(length, is_null, error);
            }
            extra->cmask = oph_ccluster_kcluster_alloc_irows((size_t) extra->k);
            if (!extra->cmask) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                return oph_ccluster_kcluster_fail(length, is_null, error);
            }
            len = (unsigned long ) strlen("OPH_DOUBLE");
            core_set_type(output, "OPH_DOUBLE" ,&len);
        } else {
            len = (unsigned long ) strlen("OPH_INT");
            core_set_type(output, "OPH_INT" ,&len);
        }

        if(core_set_elemsize(output)){
            pmesg(1,  __FILE__, __LINE__, "Error on setting result elements size\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }

        output->length = (unsigned long *) malloc(sizeof(unsigned long));
        if (!output->length) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating length\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }
        /* CENTROIDS returns k elements; ALL and LABELS one per point. */
        if (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS) {
            *(output->length) = (unsigned long) extra->k * output->elemsize;
        } else {
            *(output->length) = (unsigned long) extra->npoints * output->elemsize;
        }

        output->content = (char *) malloc(*(output->length));
        if (!output->content) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating output\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }

        if(core_set_numelem(output)){
            pmesg(1,  __FILE__, __LINE__, "Error on counting result elements\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }
    }

    if (*error)
    {
        *length=0;
        *is_null=0;
        *error=1;
        return NULL;
    }
    if (*is_null)
    {
        *length=0;
        *is_null=1;
        *error=0;
        return NULL;
    }

    oph_stringPtr output = (oph_stringPtr) initid->ptr;
    oph_ccluster_kcluster_extra *extra = (oph_ccluster_kcluster_extra *) initid->extension;

    /* The buffers hold npoints elements (sized from the first row); a shorter
       row would be read past its end by the conversion loops below. */
    if ((unsigned long) (args->lengths[2] / core_sizeof(extra->type)) < (unsigned long) extra->npoints) {
        pmesg(1,  __FILE__, __LINE__, "Input measure is shorter than the first one!\n");
        return oph_ccluster_kcluster_fail(length, is_null, error);
    }

    double weight[1] = {1.0};
    /* kcluster method code: 'a' for KMEANS, 'm' otherwise (KMEDIANS). */
    char method = (extra->method==OPH_CCLUSTER_KCLUSTER_KMEANS)?'a':'m';
    double err = 0.0;   /* output of kcluster, not used afterwards */
    int ifound = 0;     /* output of kcluster, checked below */
    int n;

    /* Widen the input to double; NaN floating-point values are flagged as
       missing through the mask, integer inputs are always present. */
    switch (extra->type) {
        case OPH_INT:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((int *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_SHORT:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((short *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_BYTE:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((char *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_LONG:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((long long *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_FLOAT:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((float *)(args->args[2]))[n];
                extra->mask[n][0] = isnan(extra->data[n][0]) ? 0 : 1;
            }
            break;
        case OPH_DOUBLE:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = ((double *)(args->args[2]))[n];
                extra->mask[n][0] = isnan(extra->data[n][0]) ? 0 : 1;
            }
            break;
        default:
            pmesg(1,  __FILE__, __LINE__, "Invalid input type\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
    }

    kcluster(extra->k,extra->npoints,1,extra->data,extra->mask,weight,0,extra->npass,method,'e',extra->clusterid,&err,&ifound);

    if(ifound==0){
        pmesg(1,  __FILE__, __LINE__, "k > number of points!\n");
        return oph_ccluster_kcluster_fail(length, is_null, error);
    }
    if(ifound==-1){
        pmesg(1,  __FILE__, __LINE__, "Error allocating memory for clustering\n");
        return oph_ccluster_kcluster_fail(length, is_null, error);
    }

    if (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS || extra->level==OPH_CCLUSTER_KCLUSTER_ALL) {
        if (!getclustercentroids(extra->k,extra->npoints,1,extra->data,extra->mask,extra->clusterid,extra->cdata,extra->cmask,0,method)) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory for centroids retrieval\n");
            return oph_ccluster_kcluster_fail(length, is_null, error);
        }
        if (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS) {
            /* One output value per cluster. */
            for (n = 0; n < extra->k; n++) {
                if (extra->cmask[n][0]==0) {
                    pmesg(1,  __FILE__, __LINE__, "All cluster members are missing for cluster %d\n",n);
                    return oph_ccluster_kcluster_fail(length, is_null, error);
                }
                ((double *)(output->content))[n] = extra->cdata[n][0];
            }
        } else {
            /* One output value per point: the centroid of its cluster. */
            for (n = 0; n < extra->npoints; n++) {
                int cid = extra->clusterid[n];
                if (extra->cmask[cid][0]==0) {
                    pmesg(1,  __FILE__, __LINE__, "All cluster members are missing for cluster %d\n",cid);
                    return oph_ccluster_kcluster_fail(length, is_null, error);
                }
                ((double *)(output->content))[n] = extra->cdata[cid][0];
            }
        }
    } else {
        /* LABELS: the cluster ids are the result. */
        memcpy(output->content,extra->clusterid,*(output->length));
    }

    *length=*(output->length);
    return output->content;
}
/* Example #2 */
/* 0 */
/*
 * Perform k-means clustering on genes and write three report files.
 *
 * Runs kcluster() on the nrows x ncols matrix `data` (all elements marked
 * present, all weights 1.0, centroid method 'a' = mean) and writes:
 *   <jobname>_K_G<nclusters>.kgg  cluster id assigned to every row
 *   <jobname>_K_G<nclusters>.dis  distance between every pair of clusters
 *   <jobname>_K_G<nclusters>.cen  centroid coordinates of every cluster
 *
 * nrows, ncols  dimensions of `data`
 * data          input matrix, indexed data[row][col]
 * nclusters     number of clusters to form
 * npass         number of passes handed to kcluster()
 * dist          distance-measure code handed to kcluster()
 * jobname       prefix of the output file names
 * clusterid     caller-provided array of nrows ints, receives the cluster
 *               id of every row
 *
 * Progress is printed on stdout; failures are reported on stderr and the
 * function returns after releasing everything it allocated.
 */
void example_kmeans( int nrows, int ncols, 
                     double** data, 
                     int nclusters, int npass, 
                     char dist, char* jobname,
                     int *clusterid)
{
   const int transpose = 0;
   /* For method=='a', the centroid is defined as the mean over all elements
      belonging to a cluster for each dimension.
      For method=='m', the centroid is defined as the median over all elements
      belonging to a cluster for each dimension. */
   const char method = 'a';
   int i, j;
   int ifound = 0;
   int namelen;
   double error = 0.0;
   double distance;
   int **index = NULL;
   int *count = NULL;
   double *weight = NULL;
   double **cdata = NULL;
   int **cmask = NULL;
   int **mask = NULL;
   char *filename = NULL;
   char *filename2 = NULL;
   char *filename3 = NULL;
   FILE *out1 = NULL;
   FILE *out2 = NULL;
   FILE *out3 = NULL;

   if (nrows < 1 || ncols < 1 || nclusters < 1 || !jobname) {
      fprintf(stderr, "example_kmeans: invalid arguments\n");
      return;
   }

   /* Pointer tables are zero-filled so the cleanup code can free them even
      when only part of the rows were allocated. */
   weight = malloc(ncols*sizeof(double));
   cdata = calloc(nclusters, sizeof(double*));
   cmask = calloc(nclusters, sizeof(int*));
   mask = calloc(nrows, sizeof(int*));
   if (!weight || !cdata || !cmask || !mask) goto nomem;

   for (i = 0; i < nclusters; i++) {
      cdata[i] = malloc(ncols*sizeof(double));
      cmask[i] = malloc(ncols*sizeof(int));
      if (!cdata[i] || !cmask[i]) goto nomem;
   }
   for (i = 0; i < ncols; i++) weight[i] = 1.0;

   /* Every element of the input is treated as present. */
   for (i = 0; i < nrows; i++) {
      mask[i] = malloc(ncols*sizeof(int));
      if (!mask[i]) goto nomem;
      for (j = 0; j < ncols; j++) mask[i][j] = 1;
   }

   /* The three names differ only in their 3-letter extension, so one
      measurement sizes all of them (digits of nclusters included). */
   namelen = snprintf(NULL, 0, "%s_K_G%d.kgg", jobname, nclusters);
   if (namelen < 0) {
      fprintf(stderr, "example_kmeans: cannot build output file names\n");
      goto cleanup;
   }

   //avovk 
   printf("a je u omari :) \n");
   filename = malloc((size_t)namelen + 1);
   filename2 = malloc((size_t)namelen + 1);
   filename3 = malloc((size_t)namelen + 1);
   if (!filename || !filename2 || !filename3) goto nomem;

   snprintf (filename, (size_t)namelen + 1, "%s_K_G%d.kgg", jobname, nclusters);
   out1 = fopen( filename, "w" );
   snprintf (filename2, (size_t)namelen + 1, "%s_K_G%d.dis", jobname, nclusters);
   out2 = fopen( filename2, "w" );
   snprintf (filename3, (size_t)namelen + 1, "%s_K_G%d.cen", jobname, nclusters);
   out3 = fopen( filename3, "w" );
   if (!out1 || !out2 || !out3) {
      perror("example_kmeans: cannot open output file");
      goto cleanup;
   }

   printf("======================== k-means clustering"
         " ========================\n");

   printf ("\n");
   printf ("----- doing %d passes... go stretch your legs...\n",npass);
   /* ANDREJ: This function returns different answers each time it is 
      executed. Does the library provide for ways to initialize the
      random number generators used for the searching and initializations? */
   kcluster(   nclusters,
               nrows,ncols,data,
               mask,weight,
               transpose,npass,
               method,dist,
               clusterid, &error, &ifound);
   printf ("Solution found %d times; ", ifound);
   printf ("within-cluster sum of distances is %f\n", error);

   /* NOTE(review): ifound <= 0 is taken to mean kcluster failed and did not
      fill clusterid (0: more clusters than rows, -1: out of memory) --
      confirm against the library version in use. */
   if (ifound < 1) {
      fprintf(stderr, "example_kmeans: clustering failed, nothing written\n");
      goto cleanup;
   }

   printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
   for (i = 0; i < nrows; i++)
      fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
   fclose(out1); out1=NULL;

   printf ("------- writing Distance between clusters to file:\t "
          "%s_K_G%d.dis \n", jobname, nclusters);
   fprintf (out2,"------- Distance between clusters:\n");

   /* index[c] lists the rows assigned to cluster c, count[c] its size. */
   index = calloc(nclusters, sizeof(int*));
   count = calloc(nclusters, sizeof(int));
   if (!index || !count) goto nomem;
   for (i = 0; i < nrows; i++) count[clusterid[i]]++;
   for (i = 0; i < nclusters; i++) {
      /* at least one slot so an empty cluster still gets a valid pointer */
      index[i] = malloc((count[i] > 0 ? count[i] : 1)*sizeof(int));
      if (!index[i]) goto nomem;
      count[i] = 0;
   }
   for (i = 0; i < nrows; i++) {
      int id = clusterid[i];
      index[id][count[id]] = i;
      count[id]++;
   }

   /* Every unordered pair of clusters, whatever nclusters is. */
   for (i = 0; i < nclusters-1; i++) {
      for (j = i+1; j < nclusters; j++) {
         distance =
          clusterdistance(nrows, ncols, data, mask, weight, count[i], count[j],
                          index[i], index[j], 'e', 'a', 0);
         fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
      }
   }
   fclose(out2); out2=NULL;

   printf ("------- writing Cluster centroids to file:\t\t"
          "%s_K_G%d.cen\n",jobname, nclusters);
   fprintf (out3,"------- Cluster centroids:\n");
   getclustercentroids(nclusters, nrows, ncols, data, mask, clusterid,
                      cdata, cmask, 0, 'a');
   fprintf(out3,"   coefficients:");
   for(i=0; i<ncols; i++) fprintf(out3,"\t%7d", i);
   fprintf(out3,"\n");
   for (i = 0; i < nclusters; i++){ 
      fprintf(out3,"Cluster %2d:", i);
      for (j = 0; j < ncols; j++) fprintf(out3,"\t%7.3f", cdata[i][j]);
      fprintf(out3,"\n");
   }
   fclose(out3); out3=NULL;
   printf("Done...\n");
   goto cleanup;

nomem:
   fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
   if (out1) fclose(out1);
   if (out2) fclose(out2);
   if (out3) fclose(out3);
   free(filename);
   free(filename2);
   free(filename3);

   if (index) for (i = 0; i < nclusters; i++) free(index[i]);
   if (cdata) for (i = 0; i < nclusters; i++) free(cdata[i]);
   if (cmask) for (i = 0; i < nclusters; i++) free(cmask[i]);
   if (mask) for (i = 0; i < nrows; i++) free(mask[i]);

   free(index);
   free(count);
   free(cdata);
   free(cmask);
   free(weight);
   free(mask);
   return;
}
/* Example #3 */
/* 0 */
/* @api private
 *
 * Ruby binding for kcluster(): call-seq (size, data, options = nil).
 *
 *   size     number of clusters; must be > 0 and <= number of data rows
 *   data     Array of equally long Arrays of numbers
 *   options  optional; read through get_*_option with the keys "mask",
 *            "transpose", "iterations", "method", "metric", "seed", plus
 *            the :weights entry fetched directly from the hash
 *
 * Returns a Hash with the keys :cluster (cluster id per clustered element),
 * :centroid (Array of centroid coordinate Arrays), :error and :repeated
 * (the `error` and `ifound` outputs of kcluster).
 *
 * Raises ArgumentError on malformed input and NoMemoryError when a buffer
 * cannot be allocated.
 * NOTE(review): a non-numeric element inside data/mask/weights still raises
 * from NUM2DBL/NUM2INT while the C buffers are live, leaking them.
 */
VALUE rb_do_kcluster(int argc, VALUE *argv, VALUE self) {
    VALUE size, data, mask, weights, options;
    VALUE result = Qnil;
    int i, j;

    rb_scan_args(argc, argv, "21", &size, &data, &options);

    if (TYPE(data) != T_ARRAY)
        rb_raise(rb_eArgError, "data should be an array of arrays");

    if (NIL_P(size))
        rb_raise(rb_eArgError, "size should be > 0 and <= data size");

    int nsets = NUM2INT(rb_Integer(size));
    int nrows = RARRAY_LEN(data);

    /* nsets >= 1 also guarantees data has at least one row below. */
    if (nsets < 1 || nsets > nrows)
        rb_raise(rb_eArgError, "size should be > 0 and <= data size");

    mask = get_value_option(options, "mask", Qnil);

    if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
        rb_raise(rb_eArgError, "mask should be an array of arrays");

    int transpose = get_bool_option(options, "transpose", 0);
    int npass     = get_int_option(options, "iterations", DEFAULT_ITERATIONS);

    // a = average (mean), m = median
    int method    = get_int_option(options, "method", 'a');

    // e = euclidian,
    // b = city-block distance
    // c = correlation
    // a = absolute value of the correlation
    // u = uncentered correlation
    // x = absolute uncentered correlation
    // s = spearman's rank correlation
    // k = kendall's tau
    int dist      = get_int_option(options, "metric", 'e');

    // initial assignment
    int assign    = get_int_option(options, "seed",    0);

    /* Validate every shape before any C allocation so that a malformed
       argument raises without leaking and without touching a non-Array. */
    if (TYPE(rb_ary_entry(data, 0)) != T_ARRAY)
        rb_raise(rb_eArgError, "data should be an array of arrays");

    int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
    if (ncols < 1)
        rb_raise(rb_eArgError, "data rows should not be empty");

    for (i = 0; i < nrows; i++) {
        VALUE row = rb_ary_entry(data, i);
        if (TYPE(row) != T_ARRAY || RARRAY_LEN(row) < ncols)
            rb_raise(rb_eArgError, "data should be an array of arrays");
        if (!NIL_P(mask)) {
            VALUE mrow = rb_ary_entry(mask, i);
            if (TYPE(mrow) != T_ARRAY || RARRAY_LEN(mrow) < ncols)
                rb_raise(rb_eArgError, "mask should be an array of arrays");
        }
    }

    /* Without transpose the rows are clustered and there is one weight per
       column; with transpose the roles are swapped, so the weight vector
       must have one entry per row. */
    int dimx = nrows, nweights = ncols, cdimx = nsets, cdimy = ncols;
    if (transpose) {
        dimx     = ncols;
        nweights = nrows;
        cdimx    = nrows;
        cdimy    = nsets;
    }

    weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
    if (!NIL_P(weights) && (TYPE(weights) != T_ARRAY || RARRAY_LEN(weights) < nweights))
        rb_raise(rb_eArgError, "weights should be an array with one entry per dimension");

    /* Pointer tables are zero-filled so the release code below is safe
       even when only part of the rows could be allocated. */
    double **cdata          = (double**)calloc(nrows, sizeof(double*));
    int    **cmask          = (int   **)calloc(nrows, sizeof(int   *));
    double *cweights        = (double *)malloc(sizeof(double )*nweights);
    int    *ccluster        = (int    *)malloc(sizeof(int    )*dimx);
    double **ccentroid      = (double**)calloc(cdimx, sizeof(double*));
    int    **ccentroid_mask = (int   **)calloc(cdimx, sizeof(int   *));

    int ok = cdata && cmask && cweights && ccluster && ccentroid && ccentroid_mask;

    for (i = 0; ok && i < nrows; i++) {
        cdata[i] = (double*)malloc(sizeof(double)*ncols);
        cmask[i] = (int   *)malloc(sizeof(int   )*ncols);
        ok = cdata[i] && cmask[i];
    }
    for (i = 0; ok && i < cdimx; i++) {
        ccentroid[i]      = (double*)malloc(sizeof(double)*cdimy);
        ccentroid_mask[i] = (int   *)malloc(sizeof(int   )*cdimy);
        ok = ccentroid[i] && ccentroid_mask[i];
    }

    int    ifound = 0;
    double error  = 0.0;

    if (ok) {
        for (i = 0; i < nrows; i++) {
            for (j = 0; j < ncols; j++) {
                cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
                cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
            }
        }
        for (i = 0; i < nweights; i++) {
            cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
        }

        kcluster(nsets,
            nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound, assign);

        /* NOTE(review): ifound <= 0 is taken to mean kcluster gave up
           without filling ccluster (0: too many clusters, negative: out of
           memory) -- confirm against the bundled library. */
        if (ifound > 0) {
            getclustercentroids(nsets,
                nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);

            VALUE cluster  = rb_ary_new();
            VALUE centroid = rb_ary_new();
            result = rb_hash_new();

            for (i = 0; i < dimx; i++)
                rb_ary_push(cluster, INT2NUM(ccluster[i]));

            for (i = 0; i < cdimx; i++) {
                VALUE point = rb_ary_new();
                for (j = 0; j < cdimy; j++)
                    rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
                rb_ary_push(centroid, point);
            }

            rb_hash_aset(result, ID2SYM(rb_intern("cluster")),   cluster);
            rb_hash_aset(result, ID2SYM(rb_intern("centroid")),  centroid);
            rb_hash_aset(result, ID2SYM(rb_intern("error")),     DBL2NUM(error));
            rb_hash_aset(result, ID2SYM(rb_intern("repeated")),  INT2NUM(ifound));
        }
    }

    /* Release the C buffers before raising: rb_raise does not return. */
    if (cdata)          for (i = 0; i < nrows; i++) free(cdata[i]);
    if (cmask)          for (i = 0; i < nrows; i++) free(cmask[i]);
    if (ccentroid)      for (i = 0; i < cdimx; i++) free(ccentroid[i]);
    if (ccentroid_mask) for (i = 0; i < cdimx; i++) free(ccentroid_mask[i]);

    free(cdata);
    free(cmask);
    free(ccentroid);
    free(ccentroid_mask);
    free(cweights);
    free(ccluster);

    if (!ok || ifound < 0)
        rb_raise(rb_eNoMemError, "failed to allocate memory for clustering");
    if (ifound == 0)
        rb_raise(rb_eArgError, "size should be > 0 and <= number of elements to cluster");

    return result;
}
/* Example #4 */
/* 0 */
/*
 * Perform k-means clustering on genes and write three report files.
 *
 * Runs kcluster() on the nrows x ncols matrix `data` (all weights 1.0,
 * centroid method 'a' = mean) and writes:
 *   <jobname>_K_G<nclusters>.kgg  cluster id assigned to every row
 *   <jobname>_K_G<nclusters>.dis  distance between every pair of clusters
 *   <jobname>_K_G<nclusters>.cen  centroid coordinates of every cluster
 * and finally hands the cluster sizes, centroids and assignments to
 * getvoxlclusterdist().
 *
 * nrows, ncols  dimensions of `data`
 * data          input matrix, indexed data[row][col]
 * nclusters     number of clusters to form
 * npass         number of passes handed to kcluster()
 * dist          distance-measure code handed to kcluster()
 * jobname       prefix of the output file names
 *
 * Progress is printed on stdout; failures are reported on stderr and the
 * function returns after releasing everything it allocated.
 */
void example_kmeans( int nrows, int ncols, 
                     double** data, 
                     int nclusters, int npass, 
                     char dist, char* jobname)
{
   const int transpose = 0;
   /* For method=='a', the centroid is defined as the mean over all elements
      belonging to a cluster for each dimension.
      For method=='m', the centroid is defined as the median over all elements
      belonging to a cluster for each dimension. */
   const char method = 'a';
   int i, j;
   int ifound = 0;
   int namelen;
   double error = 0.0;
   double distance;
   int **index = NULL;
   int *count = NULL;
   double *weight = NULL;
   int *clusterid = NULL;
   double **cdata = NULL;
   char *filename = NULL;
   char *filename2 = NULL;
   char *filename3 = NULL;
   FILE *out1 = NULL;
   FILE *out2 = NULL;
   FILE *out3 = NULL;

   if (nrows < 1 || ncols < 1 || nclusters < 1 || !jobname) {
      fprintf(stderr, "example_kmeans: invalid arguments\n");
      return;
   }

   /* cdata is zero-filled so the cleanup code can free it even when only
      part of its rows were allocated. */
   weight = malloc(ncols*sizeof(double));
   clusterid = malloc(nrows*sizeof(int));
   cdata = calloc(nclusters, sizeof(double*));
   if (!weight || !clusterid || !cdata) goto nomem;

   for (i = 0; i < nclusters; i++) {
      cdata[i] = malloc(ncols*sizeof(double));
      if (!cdata[i]) goto nomem;
   }
   for (i = 0; i < ncols; i++) weight[i] = 1.0;

   /* The three names differ only in their 3-letter extension, so one
      measurement sizes all of them (digits of nclusters included). */
   namelen = snprintf(NULL, 0, "%s_K_G%d.kgg", jobname, nclusters);
   if (namelen < 0) {
      fprintf(stderr, "example_kmeans: cannot build output file names\n");
      goto cleanup;
   }

   //avovk 
   printf("a je u omari :) \n");
   filename = malloc((size_t)namelen + 1);
   filename2 = malloc((size_t)namelen + 1);
   filename3 = malloc((size_t)namelen + 1);
   if (!filename || !filename2 || !filename3) goto nomem;

   snprintf (filename, (size_t)namelen + 1, "%s_K_G%d.kgg", jobname, nclusters);
   out1 = fopen( filename, "w" );
   snprintf (filename2, (size_t)namelen + 1, "%s_K_G%d.dis", jobname, nclusters);
   out2 = fopen( filename2, "w" );
   snprintf (filename3, (size_t)namelen + 1, "%s_K_G%d.cen", jobname, nclusters);
   out3 = fopen( filename3, "w" );
   if (!out1 || !out2 || !out3) {
      perror("example_kmeans: cannot open output file");
      goto cleanup;
   }

   printf("======================== k-means clustering"
         " ========================\n");

   printf ("\n");
   printf ("----- doing %d passes... go stretch your legs...\n",npass);
   kcluster(nclusters,nrows,ncols,data,weight,transpose,npass,method,dist, 
    clusterid, &error, &ifound);
   printf ("Solution found %d times; ", ifound);
   printf ("within-cluster sum of distances is %f\n", error);

   /* NOTE(review): ifound <= 0 is taken to mean kcluster failed and did not
      fill clusterid -- confirm against the library version in use. */
   if (ifound < 1) {
      fprintf(stderr, "example_kmeans: clustering failed, nothing written\n");
      goto cleanup;
   }

   printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
   for (i = 0; i < nrows; i++)
      fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
   fclose(out1); out1=NULL;

   printf ("------- writing Distance between clusters to file:\t %s_K_G%d.dis \n", jobname, nclusters);
   fprintf (out2,"------- Distance between clusters:\n");

   /* index[c] lists the rows assigned to cluster c, count[c] its size. */
   index = calloc(nclusters, sizeof(int*));
   count = calloc(nclusters, sizeof(int));
   if (!index || !count) goto nomem;
   for (i = 0; i < nrows; i++) count[clusterid[i]]++;
   for (i = 0; i < nclusters; i++) {
      /* at least one slot so an empty cluster still gets a valid pointer */
      index[i] = malloc((count[i] > 0 ? count[i] : 1)*sizeof(int));
      if (!index[i]) goto nomem;
      count[i] = 0;
   }
   for (i = 0; i < nrows; i++) {
      int id = clusterid[i];
      index[id][count[id]] = i;
      count[id]++;
   }

   for (i = 0; i < nclusters-1; i++) {
      for (j = 1+i; j < nclusters; j++) {
         distance = clusterdistance(nrows, ncols, data, weight, count[i], count[j], index[i], index[j], 'e', 'a', 0); 
         fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
      }
   }
   fclose(out2); out2=NULL;

   printf ("------- writing Cluster centroids to file:\t\t%s_K_G%d.cen\n",jobname, nclusters);
   fprintf (out3,"------- Cluster centroids:\n");
   getclustercentroids(nclusters, nrows, ncols, data, clusterid, cdata, 0, 'a');
   fprintf(out3,"   coefficients:");
   for(i=0; i<ncols; i++) fprintf(out3,"\t%7d", i);
   fprintf(out3,"\n");
   for (i = 0; i < nclusters; i++){ 
      fprintf(out3,"Cluster %2d:", i);
      for (j = 0; j < ncols; j++) fprintf(out3,"\t%7.3f", cdata[i][j]);
      fprintf(out3,"\n");
   }
   fclose(out3); out3=NULL;
   printf("Done...\n");

   /* call function to calculate distance between each voxel and centroid */
   /* we will need: 
      count - number of elements in cluster as we already have it
      cdata - cluster centroids
      clusterid
      data */
   getvoxlclusterdist(count, cdata, clusterid, data, jobname, 
                      nclusters, nrows, ncols);
   goto cleanup;

nomem:
   fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
   if (out1) fclose(out1);
   if (out2) fclose(out2);
   if (out3) fclose(out3);
   free(filename);
   free(filename2);
   free(filename3);

   if (index) for (i = 0; i < nclusters; i++) free(index[i]);
   if (cdata) for (i = 0; i < nclusters; i++) free(cdata[i]);

   free(index);
   free(count);
   free(cdata);
   free(clusterid);
   free(weight);
   return;
}