/*
 * Cluster with kcluster and publish the resulting assignment.
 *
 * k, nTrials, method and dist are forwarded unchanged to kcluster together
 * with the _rows/_columns/_data/_mask/_geneweight variables defined outside
 * this function; the seventh kcluster argument is fixed to 1. The assignment
 * that kcluster writes into NodeMap is then passed to
 * SetClusterIndex('a', k, NodeMap).
 *
 * Returns the ifound value reported by kcluster when SetClusterIndex yields
 * a non-zero result, and -1 otherwise.
 */
int ArrayKCluster(int k, int nTrials, char method, char dist, int* NodeMap)
{
    double error;
    int ifound = 0;

    kcluster(k, _rows, _columns, _data, _mask, _geneweight, 1,
             nTrials, method, dist, NodeMap, &error, &ifound);

    if (!SetClusterIndex('a', k, NodeMap))
        return -1;

    return ifound;
}
char* oph_ccluster_kcluster(UDF_INIT *initid, UDF_ARGS *args, char *result, unsigned long *length, char *is_null, char *error) { if(!initid->ptr){ *error=0; *is_null=0; initid->ptr=(char *)calloc(1,sizeof(oph_string)); if(!initid->ptr){ pmesg(1, __FILE__, __LINE__, "Error allocating result\n"); *length=0; *is_null=1; *error=1; return NULL; } initid->extension = calloc(1,sizeof(oph_ccluster_kcluster_extra)); if(!initid->extension){ pmesg(1, __FILE__, __LINE__, "Error allocating extension\n"); *length=0; *is_null=1; *error=1; return NULL; } oph_ccluster_kcluster_extra *extra = (oph_ccluster_kcluster_extra *) initid->extension; extra->k = (int) *((long long*) args->args[3]); // set cluster number extra->method = OPH_CCLUSTER_KCLUSTER_KMEANS; // default method extra->level = OPH_CCLUSTER_KCLUSTER_ALL; // default level extra->npass = 1; // default npass extra->type = OPH_DOUBLE; // default input type if (!strncasecmp(args->args[0],"OPH_INT",args->lengths[0])) extra->type = OPH_INT; else if (!strncasecmp(args->args[0],"OPH_SHORT",args->lengths[0])) extra->type = OPH_SHORT; else if (!strncasecmp(args->args[0],"OPH_BYTE",args->lengths[0])) extra->type = OPH_BYTE; else if (!strncasecmp(args->args[0],"OPH_LONG",args->lengths[0])) extra->type = OPH_LONG; else if (!strncasecmp(args->args[0],"OPH_FLOAT",args->lengths[0])) extra->type = OPH_FLOAT; else if (!strncasecmp(args->args[0],"OPH_DOUBLE",args->lengths[0])) extra->type = OPH_DOUBLE; else { pmesg(1, __FILE__, __LINE__, "Invalid input data type!\n"); *length=0; *is_null=0; *error=1; return NULL; } int i; for (i = 4; i < args->arg_count; i++) { if (args->arg_type[i]==INT_RESULT) { // npass extra->npass = (int) *((long long*) args->args[i]); if (extra->npass < 1) { pmesg(1, __FILE__, __LINE__, "npass must be >= 1!\n"); *length=0; *is_null=0; *error=1; return NULL; } } else if (args->arg_type[i]==STRING_RESULT) { if (!strncasecmp(args->args[i],"KMEANS",args->lengths[i])) { extra->method = OPH_CCLUSTER_KCLUSTER_KMEANS; } else 
if (!strncasecmp(args->args[i],"KMEDIANS",args->lengths[i])) { extra->method = OPH_CCLUSTER_KCLUSTER_KMEDIANS; } else if (!strncasecmp(args->args[i],"CENTROIDS",args->lengths[i])) { extra->level = OPH_CCLUSTER_KCLUSTER_CENTROIDS; } else if (!strncasecmp(args->args[i],"LABELS",args->lengths[i])) { extra->level = OPH_CCLUSTER_KCLUSTER_LABELS; } else if (!strncasecmp(args->args[i],"ALL",args->lengths[i])) { extra->level = OPH_CCLUSTER_KCLUSTER_ALL; } else { pmesg(1, __FILE__, __LINE__, "invalid argument %d!\n",i); *length=0; *is_null=0; *error=1; return NULL; } } else { pmesg(1, __FILE__, __LINE__, "wrong type for argument %d!\n",i); *length=0; *is_null=0; *error=1; return NULL; } } extra->npoints = args->lengths[2] / core_sizeof(extra->type); extra->data = (double **) malloc(sizeof(double *) * extra->npoints); if (!extra->data) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } for (i = 0; i < extra->npoints; i++) { extra->data[i] = (double *) malloc(sizeof(double)); if (!extra->data[i]) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } } extra->mask = (int **) malloc(sizeof(int *) * extra->npoints); if (!extra->mask) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } for (i = 0; i < extra->npoints; i++) { extra->mask[i] = (int *) malloc(sizeof(int)); if (!extra->mask[i]) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } } extra->clusterid = (int *) malloc(sizeof(int) * extra->npoints); if (!extra->clusterid) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } oph_stringPtr output = (oph_stringPtr) initid->ptr; if (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS) { extra->cdata = (double **) malloc(sizeof(double *) * extra->k); if (!extra->cdata) { pmesg(1, __FILE__, __LINE__, "Error 
allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } for (i = 0; i < extra->k; i++) { extra->cdata[i] = (double *) malloc(sizeof(double)); if (!extra->cdata[i]) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } } extra->cmask = (int **) malloc(sizeof(int *) * extra->k); if (!extra->cmask) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } for (i = 0; i < extra->k; i++) { extra->cmask[i] = (int *) malloc(sizeof(int)); if (!extra->cmask[i]) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } } unsigned long len = (unsigned long ) strlen("OPH_DOUBLE"); core_set_type(output, "OPH_DOUBLE" ,&len); if(core_set_elemsize(output)){ pmesg(1, __FILE__, __LINE__, "Error on setting result elements size\n"); *length=0; *is_null=0; *error=1; return NULL; } output->length = (unsigned long *) malloc(sizeof(unsigned long)); if (!output->length) { pmesg(1, __FILE__, __LINE__, "Error allocating length\n"); *length=0; *is_null=0; *error=1; return NULL; } *(output->length) = (unsigned long) extra->k * output->elemsize; output->content = (char *) malloc(*(output->length)); if (!output->content) { pmesg(1, __FILE__, __LINE__, "Error allocating output\n"); *length=0; *is_null=0; *error=1; return NULL; } if(core_set_numelem(output)){ pmesg(1, __FILE__, __LINE__, "Error on counting result elements\n"); *length=0; *is_null=0; *error=1; return NULL; } } else if (extra->level==OPH_CCLUSTER_KCLUSTER_ALL) { extra->cdata = (double **) malloc(sizeof(double *) * extra->k); if (!extra->cdata) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } for (i = 0; i < extra->k; i++) { extra->cdata[i] = (double *) malloc(sizeof(double)); if (!extra->cdata[i]) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } 
} extra->cmask = (int **) malloc(sizeof(int *) * extra->k); if (!extra->cmask) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } for (i = 0; i < extra->k; i++) { extra->cmask[i] = (int *) malloc(sizeof(int)); if (!extra->cmask[i]) { pmesg(1, __FILE__, __LINE__, "Error allocating memory\n"); *length=0; *is_null=0; *error=1; return NULL; } } unsigned long len = (unsigned long ) strlen("OPH_DOUBLE"); core_set_type(output, "OPH_DOUBLE" ,&len); if(core_set_elemsize(output)){ pmesg(1, __FILE__, __LINE__, "Error on setting result elements size\n"); *length=0; *is_null=0; *error=1; return NULL; } output->length = (unsigned long *) malloc(sizeof(unsigned long)); if (!output->length) { pmesg(1, __FILE__, __LINE__, "Error allocating length\n"); *length=0; *is_null=0; *error=1; return NULL; } *(output->length) = (unsigned long) extra->npoints * output->elemsize; output->content = (char *) malloc(*(output->length)); if (!output->content) { pmesg(1, __FILE__, __LINE__, "Error allocating output\n"); *length=0; *is_null=0; *error=1; return NULL; } if(core_set_numelem(output)){ pmesg(1, __FILE__, __LINE__, "Error on counting result elements\n"); *length=0; *is_null=0; *error=1; return NULL; } } else { unsigned long len = (unsigned long ) strlen("OPH_INT"); core_set_type(output, "OPH_INT" ,&len); if(core_set_elemsize(output)){ pmesg(1, __FILE__, __LINE__, "Error on setting result elements size\n"); *length=0; *is_null=0; *error=1; return NULL; } output->length = (unsigned long *) malloc(sizeof(unsigned long)); if (!output->length) { pmesg(1, __FILE__, __LINE__, "Error allocating length\n"); *length=0; *is_null=0; *error=1; return NULL; } *(output->length) = (unsigned long) extra->npoints * output->elemsize; output->content = (char *) malloc(*(output->length)); if (!output->content) { pmesg(1, __FILE__, __LINE__, "Error allocating output\n"); *length=0; *is_null=0; *error=1; return NULL; } if(core_set_numelem(output)){ 
pmesg(1, __FILE__, __LINE__, "Error on counting result elements\n"); *length=0; *is_null=0; *error=1; return NULL; } } } if (*error) { *length=0; *is_null=0; *error=1; return NULL; } if (*is_null) { *length=0; *is_null=1; *error=0; return NULL; } oph_stringPtr output = (oph_stringPtr) initid->ptr; oph_ccluster_kcluster_extra *extra = (oph_ccluster_kcluster_extra *) initid->extension; double weight[1] = {1.0}; char method = (extra->method==OPH_CCLUSTER_KCLUSTER_KMEANS)?'a':'m'; double err; int ifound; int n; switch (extra->type) { case OPH_INT: for (n = 0; n < extra->npoints; n++) { extra->data[n][0] = (double) ((int *)(args->args[2]))[n]; extra->mask[n][0] = 1; } break; case OPH_SHORT: for (n = 0; n < extra->npoints; n++) { extra->data[n][0] = (double) ((short *)(args->args[2]))[n]; extra->mask[n][0] = 1; } break; case OPH_BYTE: for (n = 0; n < extra->npoints; n++) { extra->data[n][0] = (double) ((char *)(args->args[2]))[n]; extra->mask[n][0] = 1; } break; case OPH_LONG: for (n = 0; n < extra->npoints; n++) { extra->data[n][0] = (double) ((long long *)(args->args[2]))[n]; extra->mask[n][0] = 1; } break; case OPH_FLOAT: for (n = 0; n < extra->npoints; n++) { extra->data[n][0] = (double) ((float *)(args->args[2]))[n]; if (isnan(extra->data[n][0])) { extra->mask[n][0] = 0; } else { extra->mask[n][0] = 1; } } break; case OPH_DOUBLE: for (n = 0; n < extra->npoints; n++) { extra->data[n][0] = ((double *)(args->args[2]))[n]; if (isnan(extra->data[n][0])) { extra->mask[n][0] = 0; } else { extra->mask[n][0] = 1; } } break; default: pmesg(1, __FILE__, __LINE__, "Invalid input type\n"); *length=0; *is_null=0; *error=1; return NULL; } kcluster(extra->k,extra->npoints,1,extra->data,extra->mask,weight,0,extra->npass,method,'e',extra->clusterid,&err,&ifound); if(ifound==0){ pmesg(1, __FILE__, __LINE__, "k > number of points!\n"); *length=0; *is_null=0; *error=1; return NULL; } if(ifound==-1){ pmesg(1, __FILE__, __LINE__, "Error allocating memory for clustering\n"); *length=0; 
*is_null=0; *error=1; return NULL; } if (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS) { if (!getclustercentroids(extra->k,extra->npoints,1,extra->data,extra->mask,extra->clusterid,extra->cdata,extra->cmask,0,method)) { pmesg(1, __FILE__, __LINE__, "Error allocating memory for centroids retrieval\n"); *length=0; *is_null=0; *error=1; return NULL; } for (n = 0; n < extra->k; n++) { if (extra->cmask[n][0]==0) { pmesg(1, __FILE__, __LINE__, "All cluster members are missing for cluster %d\n",n); *length=0; *is_null=0; *error=1; return NULL; } else { ((double *)(output->content))[n] = extra->cdata[n][0]; } } } else if (extra->level==OPH_CCLUSTER_KCLUSTER_ALL) { if (!getclustercentroids(extra->k,extra->npoints,1,extra->data,extra->mask,extra->clusterid,extra->cdata,extra->cmask,0,method)) { pmesg(1, __FILE__, __LINE__, "Error allocating memory for centroids retrieval\n"); *length=0; *is_null=0; *error=1; return NULL; } for (n = 0; n < extra->npoints; n++) { if (extra->cmask[extra->clusterid[n]][0]==0) { pmesg(1, __FILE__, __LINE__, "All cluster members are missing for cluster %d\n",extra->clusterid[n]); *length=0; *is_null=0; *error=1; return NULL; } else { ((double *)(output->content))[n] = extra->cdata[extra->clusterid[n]][0]; } } } else { memcpy(output->content,extra->clusterid,*(output->length)); } *length=*(output->length); return output->content; }
/* Allocates a zero-filled rows x cols matrix of doubles (at least one
 * element per allocation, so empty dimensions do not look like failures).
 * Returns NULL if any allocation fails. */
static double **rb_do_kcluster_new_dmatrix(int rows, int cols)
{
    size_t width = cols > 0 ? (size_t)cols : 1;
    double **m = calloc(rows > 0 ? (size_t)rows : 1, sizeof *m);
    int i;

    if (!m)
        return NULL;
    for (i = 0; i < rows; i++) {
        m[i] = calloc(width, sizeof **m);
        if (!m[i]) {
            while (i > 0)
                free(m[--i]);
            free(m);
            return NULL;
        }
    }
    return m;
}

/* Integer counterpart of rb_do_kcluster_new_dmatrix. */
static int **rb_do_kcluster_new_imatrix(int rows, int cols)
{
    size_t width = cols > 0 ? (size_t)cols : 1;
    int **m = calloc(rows > 0 ? (size_t)rows : 1, sizeof *m);
    int i;

    if (!m)
        return NULL;
    for (i = 0; i < rows; i++) {
        m[i] = calloc(width, sizeof **m);
        if (!m[i]) {
            while (i > 0)
                free(m[--i]);
            free(m);
            return NULL;
        }
    }
    return m;
}

/* Releases a matrix created by rb_do_kcluster_new_dmatrix; accepts NULL. */
static void rb_do_kcluster_free_dmatrix(double **m, int rows)
{
    int i;

    if (!m)
        return;
    for (i = 0; i < rows; i++)
        free(m[i]);
    free(m);
}

/* Releases a matrix created by rb_do_kcluster_new_imatrix; accepts NULL. */
static void rb_do_kcluster_free_imatrix(int **m, int rows)
{
    int i;

    if (!m)
        return;
    for (i = 0; i < rows; i++)
        free(m[i]);
    free(m);
}

/* @api private */
/*
 * Ruby entry point for k-means / k-medians clustering.
 *
 * Positional arguments: size (number of clusters), data (array of arrays of
 * numbers) and an optional options hash. Options read here: "mask",
 * "transpose", "iterations", "method", "metric", "seed" and :weights.
 *
 * Returns a hash with the keys :cluster (assignment per clustered element),
 * :centroid (array of centroid rows), :error (the error value reported by
 * kcluster) and :repeated (kcluster's ifound).
 *
 * Raises ArgumentError when data/mask/weights do not have the expected
 * array shape or when size is not within 1..data.size, and NoMemoryError
 * when the working buffers cannot be allocated.
 *
 * When kcluster reports failure (ifound <= 0) the centroid computation is
 * skipped and the zero-initialised buffers are returned, so callers can
 * still inspect :repeated without the extension reading uninitialised
 * memory.
 *
 * NOTE(review): the numeric conversions in the fill loops can raise a Ruby
 * exception, in which case the C buffers are leaked (as they were before
 * this change); fixing that would need an rb_ensure wrapper.
 */
VALUE rb_do_kcluster(int argc, VALUE *argv, VALUE self) {
    VALUE size, data, mask, weights, options;
    int i, j;

    rb_scan_args(argc, argv, "21", &size, &data, &options);

    if (TYPE(data) != T_ARRAY)
        rb_raise(rb_eArgError, "data should be an array of arrays");

    if (NIL_P(size))
        rb_raise(rb_eArgError, "size should be > 0 and <= data size");

    int nsets = NUM2INT(rb_Integer(size));
    if (nsets < 1 || nsets > RARRAY_LEN(data))
        rb_raise(rb_eArgError, "size should be > 0 and <= data size");

    mask = get_value_option(options, "mask", Qnil);

    if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
        rb_raise(rb_eArgError, "mask should be an array of arrays");

    int transpose = get_bool_option(options, "transpose", 0);
    int npass     = get_int_option(options, "iterations", DEFAULT_ITERATIONS);

    // a = average (mean), m = median
    int method    = get_int_option(options, "method", 'a');

    // e = euclidian,
    // b = city-block distance
    // c = correlation
    // a = absolute value of the correlation
    // u = uncentered correlation
    // x = absolute uncentered correlation
    // s = spearman's rank correlation
    // k = kendall's tau
    int dist      = get_int_option(options, "metric", 'e');

    // initial assignment
    int assign    = get_int_option(options, "seed", 0);

    weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
    if (!NIL_P(weights) && TYPE(weights) != T_ARRAY)
        rb_raise(rb_eArgError, "weights should be an array");

    /* Validate every row before anything is allocated: rb_ary_entry and
     * RARRAY_LEN must only be applied to real arrays. nrows >= 1 is
     * guaranteed by the size check above. */
    int nrows = (int)RARRAY_LEN(data);
    for (i = 0; i < nrows; i++) {
        if (TYPE(rb_ary_entry(data, i)) != T_ARRAY)
            rb_raise(rb_eArgError, "data should be an array of arrays");
        if (!NIL_P(mask) && TYPE(rb_ary_entry(mask, i)) != T_ARRAY)
            rb_raise(rb_eArgError, "mask should be an array of arrays");
    }
    int ncols = (int)RARRAY_LEN(rb_ary_entry(data, 0));

    /* dimx: number of clustered elements; cdimx x cdimy: centroid shape. */
    int dimx = nrows, cdimx = nsets, cdimy = ncols;
    if (transpose) {
        dimx  = ncols;
        cdimx = nrows;
        cdimy = nsets;
    }

    double **cdata          = rb_do_kcluster_new_dmatrix(nrows, ncols);
    int    **cmask          = rb_do_kcluster_new_imatrix(nrows, ncols);
    double  *cweights       = calloc(ncols > 0 ? (size_t)ncols : 1, sizeof *cweights);
    int     *ccluster       = calloc(dimx > 0 ? (size_t)dimx : 1, sizeof *ccluster);
    double **ccentroid      = rb_do_kcluster_new_dmatrix(cdimx, cdimy);
    int    **ccentroid_mask = rb_do_kcluster_new_imatrix(cdimx, cdimy);

    if (!cdata || !cmask || !cweights || !ccluster || !ccentroid || !ccentroid_mask) {
        rb_do_kcluster_free_dmatrix(cdata, nrows);
        rb_do_kcluster_free_imatrix(cmask, nrows);
        rb_do_kcluster_free_dmatrix(ccentroid, cdimx);
        rb_do_kcluster_free_imatrix(ccentroid_mask, cdimx);
        free(cweights);
        free(ccluster);
        rb_raise(rb_eNoMemError, "failed to allocate clustering buffers");
    }

    for (i = 0; i < nrows; i++) {
        for (j = 0; j < ncols; j++) {
            cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
            cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
        }
    }

    for (i = 0; i < ncols; i++) {
        cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
    }

    int    ifound = 0;
    double error  = 0.0;

    kcluster(nsets,
        nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound, assign);

    /* Centroids are only meaningful when an assignment was produced. */
    if (ifound > 0)
        getclustercentroids(nsets, nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);

    VALUE result   = rb_hash_new();
    VALUE cluster  = rb_ary_new();
    VALUE centroid = rb_ary_new();

    for (i = 0; i < dimx; i++)
        rb_ary_push(cluster, INT2NUM(ccluster[i]));

    for (i = 0; i < cdimx; i++) {
        VALUE point = rb_ary_new();
        for (j = 0; j < cdimy; j++)
            rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
        rb_ary_push(centroid, point);
    }

    rb_hash_aset(result, ID2SYM(rb_intern("cluster")),  cluster);
    rb_hash_aset(result, ID2SYM(rb_intern("centroid")), centroid);
    rb_hash_aset(result, ID2SYM(rb_intern("error")),    DBL2NUM(error));
    rb_hash_aset(result, ID2SYM(rb_intern("repeated")), INT2NUM(ifound));

    rb_do_kcluster_free_dmatrix(cdata, nrows);
    rb_do_kcluster_free_imatrix(cmask, nrows);
    rb_do_kcluster_free_dmatrix(ccentroid, cdimx);
    rb_do_kcluster_free_imatrix(ccentroid_mask, cdimx);
    free(cweights);
    free(ccluster);

    return result;
}
bool BagOfFeatures::buildKClustering(int numClusters, int pass, char method, char dist) { if(dictionary != NULL) cvReleaseMat(&dictionary); cout << "Initializing the data..." << endl; int i, j; int k = 0, l = 0, m = 0; int size; int totalImages = 0; double* error = new double [numFeatures]; int ifound; int *clusterID = new int [numFeatures]; double ** featureData = new double* [numFeatures]; // Allocate mask and set it all to 1 (assume no missing data) int ** mask = new int* [numFeatures]; for(i = 0; i < numFeatures; i++) { mask[i] = new int [descrSize]; featureData[i] = new double [descrSize]; for(j = 0; j < descrSize; j++) mask[i][j] = 1; } // Set the weights equal, all 1 double* weight = new double [descrSize]; for(i = 0; i < descrSize; i++) weight[i] = 1.0; // For each class for(m = 0; m < numClasses; m++) { totalImages = data[m].getTrainSize(); // For each image in that class... for(l = 0; l < totalImages; l++) { size = trainObject[m].featureSet[l].size; // for each feature in that image... for(i = 0; i < size; i++) { // Copy the descriptor into the data array for(j = 0; j < descrSize; j++) { featureData[k][j] = (double)trainObject[m].featureSet[l].descriptors[i][j]; //cout << featureData[k][j] << " "; } //cout << endl; k++; } } } cout << "Clustering data..." << endl; kcluster(numClusters, numFeatures, descrSize, featureData, mask, weight, 0, pass, method, dist, clusterID, error, &ifound); cout << "Computing cluster centers and building dictionary..." 
<< endl; int* indexCount = new int [numClusters]; int index; float *ptrCenter; dictionary = cvCreateMat(numClusters, descrSize, CV_32FC1); cvSetZero(dictionary); for(i = 0; i < numClusters; i++) indexCount[i] = 0; // Figure out how many clusters per index for(i = 0; i < numFeatures; i++) { index = clusterID[i]; indexCount[index]++; ptrCenter = (float *)(dictionary->data.ptr + index * dictionary->step); for(j = 0; j < descrSize; j++) { ptrCenter[j] += (float)featureData[i][j]; } } for(i = 0; i < numClusters; i++) { ptrCenter = (float *)(dictionary->data.ptr + i * dictionary->step); //cout << i << " \t\t\t" << indexCount[i] << endl << endl; float t = indexCount[i]; for(j = 0; j < descrSize; j++) { ptrCenter[j] /= (float)indexCount[i]; } } // Release all memory for(i = 0; i < numFeatures; i++) { delete [] mask[i]; delete [] featureData[i]; } delete [] featureData; delete [] mask; delete [] weight; delete [] indexCount; delete [] error; delete [] clusterID; // Make sure that the tree was allocated return true; }
// calculate average comembership matrix in "resampNum" times of resampling void calcAveComemMatrix(int nclusters, int nrow, int ncol, double** data, int** mask, double weight[], int npass, int resampNum, double subSampPercnt, short** aveComemMatrix) { int subSampNum=int(nrow*subSampPercnt); int i,j,k; double** subData; double** subCdata; int** subCmask; int** subMask; int* subClusterid=(int*)malloc((size_t)subSampNum*sizeof(int)); int* allClusterid=(int*)malloc((size_t)nrow*sizeof(int)); long* originalIndex=(long*)malloc((size_t)nrow*sizeof(int)); double error; int ifound; subData = (double**)malloc((size_t)subSampNum*sizeof(double*)); for(i=0; i<subSampNum; i++) subData[i] = (double*)malloc((size_t)ncol*sizeof(double)); subCdata = (double**)malloc((size_t)nclusters*sizeof(double*)); for(i=0; i<nclusters; i++) subCdata[i] = (double*)malloc((size_t)ncol*sizeof(double)); subMask= (int**)malloc((size_t)subSampNum*sizeof(int*)); for(i=0; i<subSampNum; i++) subMask[i] = (int*)malloc((size_t)ncol*sizeof(int)); subCmask= (int**)malloc((size_t)nclusters*sizeof(int*)); for(i=0; i<nclusters; i++) subCmask[i] = (int*)malloc((size_t)ncol*sizeof(int)); for(i=0; i<nrow; i++) for(j=0; j<nrow; j++) aveComemMatrix[i][j]=0; for(i=0; i<nrow; i++) originalIndex[i]=i; for(k=0; k<resampNum; k++) { //printf("the %dth resampling!\n", k); genprm(originalIndex, nrow); for(i=0; i<subSampNum; i++) { for(j=0; j<ncol; j++) { subData[i][j]=data[originalIndex[i]][j]; subMask[i][j]=mask[originalIndex[i]][j]; } } if(nclusters<subSampNum) kcluster(nclusters, subSampNum, ncol, subData, subMask, weight, 0, npass, 'a', 'e', subClusterid, subCdata, subCmask, &error, &ifound); else {printf("Error: nclusters larger than subSampNum!!"); scanf("%d", &i);} for(i=0; i<nrow; i++) allClusterid[i]=which_minDist(ncol, data, subCdata, mask, subCmask, weight, i, nclusters); for(i=0; i<nrow; i++) { for(j=0; j<nrow; j++) { if(allClusterid[i]==allClusterid[j]) aveComemMatrix[i][j] += 1; } } } //free memory for(i=0; 
i<subSampNum; i++) free(subData[i]); for(i=0; i<subSampNum; i++) free(subMask[i]); for(i=0; i<nclusters; i++) free(subCdata[i]); for(i=0; i<nclusters; i++) free(subCmask[i]); free(subData); free(subCdata); free(subMask); free(subCmask); free(subClusterid); free(allClusterid); free(originalIndex); }
/* Opens "<jobname>_K_G<nclusters>.<ext>" for writing. The name buffer is
 * sized from the formatted length, so it cannot overflow whatever the
 * arguments are, and it is released before returning.
 * Returns NULL if the name cannot be built or the file cannot be opened. */
static FILE *kmeans_open_report(const char *jobname, int nclusters, const char *ext)
{
    FILE *fp;
    char *name;
    int len = snprintf(NULL, 0, "%s_K_G%d.%s", jobname, nclusters, ext);

    if (len < 0)
        return NULL;
    name = malloc((size_t)len + 1);
    if (!name)
        return NULL;
    snprintf(name, (size_t)len + 1, "%s_K_G%d.%s", jobname, nclusters, ext);
    fp = fopen(name, "w");
    free(name);
    return fp;
}

/*
 * Perform k-means clustering on genes (the rows of `data`) and write three
 * report files named "<jobname>_K_G<nclusters>.<ext>":
 *   .kgg  cluster assignment of every row
 *   .dis  distance between every pair of clusters
 *   .cen  cluster centroids
 *
 *   nrows, ncols  dimensions of data
 *   data          nrows x ncols input values; all of them are treated as
 *                 present (the mask built here is all ones)
 *   nclusters     number of clusters
 *   npass         number of passes forwarded to kcluster
 *   dist          distance metric forwarded to kcluster
 *   jobname       prefix of the output file names
 *   clusterid     caller-provided array of nrows ints that receives the
 *                 assignment
 *
 * For method=='a', the centroid is defined as the mean over all elements
 * belonging to a cluster for each dimension. For method=='m', the centroid
 * is defined as the median over all elements belonging to a cluster for each
 * dimension. This function always uses 'a'.
 *
 * Problems (invalid sizes, allocation or file failures, kcluster reporting
 * no solution) are reported on stderr and the function returns after
 * releasing everything it acquired.
 *
 * Fixes with respect to the previous version: the file-name buffers could
 * overflow when dist was 0 and were never freed; fopen/malloc results were
 * not checked; cluster distances were hard-coded for clusters 0..2 and read
 * out of bounds for nclusters < 3; the assignment was used even when
 * kcluster found no solution.
 */
void example_kmeans( int nrows, int ncols, double** data, int nclusters, int npass, char dist, char* jobname, int *clusterid)
{
    const int transpose = 0;
    const char method = 'a';
    const char *failure = NULL;
    int i, j;
    int ifound = 0;
    double error = 0.0;
    double distance;
    double *weight = NULL;
    double **cdata = NULL;
    int **cmask = NULL;
    int **mask = NULL;
    int **index = NULL;
    int *count = NULL;
    FILE *out1 = NULL;
    FILE *out2 = NULL;
    FILE *out3 = NULL;

    if (nrows < 1 || ncols < 1 || nclusters < 1) {
        fprintf(stderr, "example_kmeans: nrows, ncols and nclusters must be positive\n");
        return;
    }

    /* Row tables are zero-filled so the cleanup code can free partial state. */
    weight = malloc((size_t)ncols * sizeof *weight);
    cdata = calloc((size_t)nclusters, sizeof *cdata);
    cmask = calloc((size_t)nclusters, sizeof *cmask);
    mask = calloc((size_t)nrows, sizeof *mask);
    if (!weight || !cdata || !cmask || !mask) {
        failure = "out of memory";
        goto cleanup;
    }
    for (i = 0; i < nclusters; i++) {
        cdata[i] = malloc((size_t)ncols * sizeof **cdata);
        cmask[i] = malloc((size_t)ncols * sizeof **cmask);
        if (!cdata[i] || !cmask[i]) {
            failure = "out of memory";
            goto cleanup;
        }
    }
    for (i = 0; i < ncols; i++)
        weight[i] = 1.0;
    for (i = 0; i < nrows; i++) {
        mask[i] = malloc((size_t)ncols * sizeof **mask);
        if (!mask[i]) {
            failure = "out of memory";
            goto cleanup;
        }
        for (j = 0; j < ncols; j++)
            mask[i][j] = 1;
    }

    out1 = kmeans_open_report(jobname, nclusters, "kgg");
    out2 = kmeans_open_report(jobname, nclusters, "dis");
    out3 = kmeans_open_report(jobname, nclusters, "cen");
    if (!out1 || !out2 || !out3) {
        failure = "cannot open the output files";
        goto cleanup;
    }

    printf("======================== k-means clustering"
           " ========================\n");
    printf ("\n");
    printf ("----- doing %d passes... go stretch your legs...\n",npass);

    /* ANDREJ: This function returns different answers each time it is
       executed. Does the library provide for ways to initialize the random
       number generators used for the searching and initializations? */
    kcluster(nclusters, nrows, ncols, data, mask, weight,
             transpose, npass, method, dist,
             clusterid, &error, &ifound);
    if (ifound <= 0) {
        /* clusterid holds no valid assignment in this case. */
        failure = "kcluster found no solution (more clusters than rows, or out of memory)";
        goto cleanup;
    }

    printf ("Solution found %d times; ", ifound);
    printf ("within-cluster sum of distances is %f\n", error);

    printf ("------- writing Cluster assignments to file:\t\t"
            " %s_K_G%d.kgg\n",jobname, nclusters);
    for (i = 0; i < nrows; i++)
        fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
    fclose(out1);
    out1 = NULL;

    printf ("------- writing Distance between clusters to file:\t "
            "%s_K_G%d.dis \n", jobname, nclusters);
    fprintf (out2,"------- Distance between clusters:\n");

    /* Build, for every cluster, the list of rows that belong to it. */
    index = calloc((size_t)nclusters, sizeof *index);
    count = calloc((size_t)nclusters, sizeof *count);
    if (!index || !count) {
        failure = "out of memory";
        goto cleanup;
    }
    for (i = 0; i < nrows; i++)
        count[clusterid[i]]++;
    for (i = 0; i < nclusters; i++) {
        index[i] = malloc((size_t)(count[i] > 0 ? count[i] : 1) * sizeof **index);
        if (!index[i]) {
            failure = "out of memory";
            goto cleanup;
        }
        count[i] = 0;
    }
    for (i = 0; i < nrows; i++) {
        int id = clusterid[i];
        index[id][count[id]] = i;
        count[id]++;
    }

    /* Every unordered pair of clusters, in the order (0,1), (0,2), (1,2)... */
    for (i = 0; i < nclusters - 1; i++) {
        for (j = i + 1; j < nclusters; j++) {
            distance = clusterdistance(nrows, ncols, data, mask, weight,
                                       count[i], count[j], index[i], index[j],
                                       'e', 'a', 0);
            fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
        }
    }
    fclose(out2);
    out2 = NULL;

    printf ("------- writing Cluster centroids to file:\t\t"
            "%s_K_G%d.cen\n",jobname, nclusters);
    fprintf (out3,"------- Cluster centroids:\n");
    getclustercentroids(nclusters, nrows, ncols, data, mask, clusterid,
                        cdata, cmask, 0, 'a');
    fprintf(out3," coefficients:");
    for (i = 0; i < ncols; i++)
        fprintf(out3,"\t%7d", i);
    fprintf(out3,"\n");
    for (i = 0; i < nclusters; i++) {
        fprintf(out3,"Cluster %2d:", i);
        for (j = 0; j < ncols; j++)
            fprintf(out3,"\t%7.3f", cdata[i][j]);
        fprintf(out3,"\n");
    }
    fclose(out3);
    out3 = NULL;

    printf("Done...\n");

cleanup:
    if (failure)
        fprintf(stderr, "example_kmeans: %s\n", failure);
    if (out1) fclose(out1);
    if (out2) fclose(out2);
    if (out3) fclose(out3);
    if (index) {
        for (i = 0; i < nclusters; i++)
            free(index[i]);
    }
    free(index);
    free(count);
    if (cdata) {
        for (i = 0; i < nclusters; i++)
            free(cdata[i]);
    }
    if (cmask) {
        for (i = 0; i < nclusters; i++)
            free(cmask[i]);
    }
    if (mask) {
        for (i = 0; i < nrows; i++)
            free(mask[i]);
    }
    free(cdata);
    free(cmask);
    free(weight);
    free(mask);
    return;
}
/* Creates (truncating) the output file "<jobname>_K_G<nclusters>.<ext>".
 * The file name is formatted into a buffer of exactly the required size and
 * freed again before returning. Returns NULL on any failure. */
static FILE *kmeans_open_output(const char *jobname, int nclusters, const char *ext)
{
    int needed = snprintf(NULL, 0, "%s_K_G%d.%s", jobname, nclusters, ext);
    char *path;
    FILE *stream;

    if (needed < 0)
        return NULL;
    path = malloc((size_t)needed + 1);
    if (path == NULL)
        return NULL;
    snprintf(path, (size_t)needed + 1, "%s_K_G%d.%s", jobname, nclusters, ext);
    stream = fopen(path, "w");
    free(path);
    return stream;
}

/*
 * Perform k-means clustering on genes (the rows of `data`), write three
 * report files named "<jobname>_K_G<nclusters>.<ext>" and finally call
 * getvoxlclusterdist with the cluster sizes, centroids and assignment:
 *   .kgg  cluster assignment of every row
 *   .dis  distance between every pair of clusters
 *   .cen  cluster centroids
 *
 *   nrows, ncols  dimensions of data
 *   data          nrows x ncols input values
 *   nclusters     number of clusters
 *   npass         number of passes forwarded to kcluster
 *   dist          distance metric forwarded to kcluster
 *   jobname       prefix of the output file names
 *
 * For method=='a', the centroid is defined as the mean over all elements
 * belonging to a cluster for each dimension. For method=='m', the centroid
 * is defined as the median over all elements belonging to a cluster for each
 * dimension. This function always uses 'a'.
 *
 * Problems (invalid sizes, allocation or file failures, kcluster reporting
 * no solution) are reported on stderr and the function returns after
 * releasing everything it acquired.
 *
 * Fixes with respect to the previous version: the file-name buffers could
 * overflow when dist was 0 and were never freed; fopen/malloc results were
 * not checked; the assignment was used even when kcluster found no solution.
 */
void example_kmeans( int nrows, int ncols, double** data, int nclusters, int npass, char dist, char* jobname)
{
    const int transpose = 0;
    const char method = 'a';
    const char *problem = NULL;
    int i, j;
    int ifound = 0;
    double error = 0.0;
    double distance;
    double *weight = NULL;
    int *clusterid = NULL;
    double **cdata = NULL;
    int **index = NULL;
    int *count = NULL;
    FILE *out1 = NULL;
    FILE *out2 = NULL;
    FILE *out3 = NULL;

    if (nrows < 1 || ncols < 1 || nclusters < 1) {
        fprintf(stderr, "example_kmeans: nrows, ncols and nclusters must be positive\n");
        return;
    }

    weight = malloc((size_t)ncols * sizeof *weight);
    clusterid = malloc((size_t)nrows * sizeof *clusterid);
    /* zero-filled so the cleanup code can free a partially built table */
    cdata = calloc((size_t)nclusters, sizeof *cdata);
    if (!weight || !clusterid || !cdata) {
        problem = "out of memory";
        goto done;
    }
    for (i = 0; i < nclusters; i++) {
        cdata[i] = malloc((size_t)ncols * sizeof **cdata);
        if (!cdata[i]) {
            problem = "out of memory";
            goto done;
        }
    }
    for (i = 0; i < ncols; i++)
        weight[i] = 1.0;

    out1 = kmeans_open_output(jobname, nclusters, "kgg");
    out2 = kmeans_open_output(jobname, nclusters, "dis");
    out3 = kmeans_open_output(jobname, nclusters, "cen");
    if (!out1 || !out2 || !out3) {
        problem = "cannot open the output files";
        goto done;
    }

    printf("======================== k-means clustering"
           " ========================\n");
    printf ("\n");
    printf ("----- doing %d passes... go stretch your legs...\n",npass);

    kcluster(nclusters,nrows,ncols,data,weight,transpose,npass,method,dist,
             clusterid, &error, &ifound);
    if (ifound <= 0) {
        /* clusterid holds no valid assignment in this case. */
        problem = "kcluster found no solution (more clusters than rows, or out of memory)";
        goto done;
    }

    printf ("Solution found %d times; ", ifound);
    printf ("within-cluster sum of distances is %f\n", error);

    printf ("------- writing Cluster assignments to file:\t\t"
            " %s_K_G%d.kgg\n",jobname, nclusters);
    for (i = 0; i < nrows; i++)
        fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
    fclose(out1);
    out1 = NULL;

    printf ("------- writing Distance between clusters to file:\t %s_K_G%d.dis \n", jobname, nclusters);
    fprintf (out2,"------- Distance between clusters:\n");

    /* Build, for every cluster, the list of rows that belong to it. */
    index = calloc((size_t)nclusters, sizeof *index);
    count = calloc((size_t)nclusters, sizeof *count);
    if (!index || !count) {
        problem = "out of memory";
        goto done;
    }
    for (i = 0; i < nrows; i++)
        count[clusterid[i]]++;
    for (i = 0; i < nclusters; i++) {
        index[i] = malloc((size_t)(count[i] > 0 ? count[i] : 1) * sizeof **index);
        if (!index[i]) {
            problem = "out of memory";
            goto done;
        }
        count[i] = 0;
    }
    for (i = 0; i < nrows; i++) {
        int id = clusterid[i];
        index[id][count[id]] = i;
        count[id]++;
    }

    for (i = 0; i < nclusters-1; i++) {
        for (j = 1+i; j < nclusters; j++) {
            distance = clusterdistance(nrows, ncols, data, weight,
                                       count[i], count[j], index[i], index[j],
                                       'e', 'a', 0);
            fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
        }
    }
    fclose(out2);
    out2 = NULL;

    printf ("------- writing Cluster centroids to file:\t\t%s_K_G%d.cen\n",jobname, nclusters);
    fprintf (out3,"------- Cluster centroids:\n");
    getclustercentroids(nclusters, nrows, ncols, data, clusterid, cdata, 0, 'a');
    fprintf(out3," coefficients:");
    for (i = 0; i < ncols; i++)
        fprintf(out3,"\t%7d", i);
    fprintf(out3,"\n");
    for (i = 0; i < nclusters; i++) {
        fprintf(out3,"Cluster %2d:", i);
        for (j = 0; j < ncols; j++)
            fprintf(out3,"\t%7.3f", cdata[i][j]);
        fprintf(out3,"\n");
    }
    fclose(out3);
    out3 = NULL;

    printf("Done...\n");

    /* Calculate the distance between each voxel and its centroid; this
       needs the cluster sizes (count), the centroids (cdata), the
       assignment (clusterid) and the data itself. */
    getvoxlclusterdist(count, cdata, clusterid, data, jobname,
                       nclusters, nrows, ncols);

done:
    if (problem)
        fprintf(stderr, "example_kmeans: %s\n", problem);
    if (out1) fclose(out1);
    if (out2) fclose(out2);
    if (out3) fclose(out3);
    if (index) {
        for (i = 0; i < nclusters; i++)
            free(index[i]);
    }
    free(index);
    free(count);
    if (cdata) {
        for (i = 0; i < nclusters; i++)
            free(cdata[i]);
    }
    free(cdata);
    free(clusterid);
    free(weight);
    return;
}
/* Prints the "Cluster assignments:" listing (one line per gene) followed by
 * an empty line. */
static void example_kmeans_show_assignments(const int *labels, int ngenes)
{
    int gene;

    printf ("Cluster assignments:\n");
    for (gene = 0; gene < ngenes; gene++)
        printf ("Gene %d: cluster %d\n", gene, labels[gene]);
    printf ("\n");
}

/*
 * Perform k-means clustering on genes and print the results to stdout.
 *
 * The rows of `data` (nrows x ncols, with `mask` passed alongside to every
 * library call) are clustered into three clusters, first with a single pass
 * and then with 1000 passes, using method 'a' and distance 'e' with all
 * weights set to 1. After each run the number of times the solution was
 * found, the within-cluster sum of distances and the assignment of every
 * gene are printed. The assignment of the second run is then used to print
 * the pairwise distances between clusters 0, 1 and 2 and the cluster means
 * returned by getclustermean.
 */
void example_kmeans(int nrows, int ncols, double** data, int** mask)
{
    const int nclusters = 3;
    const int transpose = 0;
    const char dist = 'e';
    const char method = 'a';
    int npass;
    int ifound = 0;
    int c, g, col;
    double error;
    double distance;
    int** members;
    int* sizes;
    double* weight = malloc(ncols*sizeof(double));
    int* clusterid = malloc(nrows*sizeof(int));
    double** cdata = malloc(nclusters*sizeof(double*));
    int** cmask = malloc(nclusters*sizeof(int*));

    for (c = 0; c < nclusters; c++)
    {
        cdata[c] = malloc(ncols*sizeof(double));
        cmask[c] = malloc(ncols*sizeof(int));
    }
    for (col = 0; col < ncols; col++)
        weight[col] = 1.0;

    printf("======================== k-means clustering ====================\n");
    printf("\n");

    /* First run: a single pass. */
    printf("----- one pass of the EM algorithm (results may change)\n");
    npass = 1;
    kcluster(nclusters,nrows,ncols,data,mask,weight,transpose,npass,method,dist,
             clusterid, &error, &ifound);
    printf ("Solution found %d times; within-cluster sum of distances is %f\n",
            ifound, error);
    example_kmeans_show_assignments(clusterid, nrows);

    /* Second run: many passes. */
    printf("----- 1000 passes of the EM algorithm (result should not change)\n");
    npass = 1000;
    kcluster(nclusters,nrows,ncols,data,mask,weight,transpose,npass,method,dist,
             clusterid, &error, &ifound);
    printf ("Solution found %d times; ", ifound);
    printf ("within-cluster sum of distances is %f\n", error);
    example_kmeans_show_assignments(clusterid, nrows);

    printf ("------- Distance between clusters:\n");

    /* Collect the gene indices of each cluster: count the members, size the
       lists accordingly, then fill them while counting again. */
    members = malloc(nclusters*sizeof(int*));
    sizes = malloc(nclusters*sizeof(int));
    for (c = 0; c < nclusters; c++)
        sizes[c] = 0;
    for (g = 0; g < nrows; g++)
        sizes[clusterid[g]]++;
    for (c = 0; c < nclusters; c++)
        members[c] = malloc(sizes[c]*sizeof(int));
    for (c = 0; c < nclusters; c++)
        sizes[c] = 0;
    for (g = 0; g < nrows; g++)
    {
        const int owner = clusterid[g];
        members[owner][sizes[owner]] = g;
        sizes[owner]++;
    }

    distance = clusterdistance(nrows, ncols, data, mask, weight,
                               sizes[0], sizes[1], members[0], members[1],
                               'e', 'a', 0);
    printf("Distance between 0 and 1: %7.3f\n", distance);
    distance = clusterdistance(nrows, ncols, data, mask, weight,
                               sizes[0], sizes[2], members[0], members[2],
                               'e', 'a', 0);
    printf("Distance between 0 and 2: %7.3f\n", distance);
    distance = clusterdistance(nrows, ncols, data, mask, weight,
                               sizes[1], sizes[2], members[1], members[2],
                               'e', 'a', 0);
    printf("Distance between 1 and 2: %7.3f\n", distance);

    printf ("\n");
    printf ("------- Cluster centroids:\n");
    getclustermean(nclusters, nrows, ncols, data, mask, clusterid, cdata, cmask, 0);
    printf(" Microarray:");
    for (col = 0; col < ncols; col++)
        printf("\t%7d", col);
    printf("\n");
    for (c = 0; c < nclusters; c++)
    {
        printf("Cluster %2d:", c);
        for (col = 0; col < ncols; col++)
            printf("\t%7.3f", cdata[c][col]);
        printf("\n");
    }
    printf("\n");

    /* Release everything allocated above. */
    for (c = 0; c < nclusters; c++)
        free(members[c]);
    free(members);
    free(sizes);
    for (c = 0; c < nclusters; c++)
    {
        free(cdata[c]);
        free(cmask[c]);
    }
    free(cdata);
    free(cmask);
    free(clusterid);
    free(weight);
    return;
}