Exemplo n.º 1
0
/*
 * Cluster the module-level data set (_rows x _columns, with _data, _mask and
 * _geneweight) into k groups via kcluster(), writing the assignment of each
 * clustered item into NodeMap, then hand that assignment to
 * SetClusterIndex('a', ...).
 *
 * The literal 1 passed as kcluster's seventh argument sits in the position
 * the other kcluster callers use for the transpose flag -- presumably this
 * clusters columns rather than rows; confirm against the library header.
 *
 * Returns the "times found" counter reported by kcluster when
 * SetClusterIndex succeeds (returns non-zero), and -1 otherwise.
 */
int ArrayKCluster(int k, int nTrials, char method, char dist, int* NodeMap)
{
  double within_sum;      /* filled in by kcluster; not used afterwards */
  int times_found = 0;

  kcluster(k, _rows, _columns, _data, _mask, _geneweight, 1, nTrials,
           method, dist, NodeMap, &within_sum, &times_found);

  return SetClusterIndex('a', k, NodeMap) ? times_found : -1;
}
/*
 * Row function of the oph_ccluster_kcluster UDF (the UDF_INIT/UDF_ARGS
 * signature suggests the MySQL UDF interface -- confirm against the build).
 *
 * Arguments as read by this function:
 *   args[0]   name of the input element type ("OPH_INT", "OPH_SHORT",
 *             "OPH_BYTE", "OPH_LONG", "OPH_FLOAT" or "OPH_DOUBLE")
 *   args[1]   not read here
 *   args[2]   packed binary array of input values (one value per point)
 *   args[3]   number of clusters k (read as long long)
 *   args[4..] optional, in any order: an integer sets npass (must be >= 1);
 *             a string selects the method ("KMEANS"/"KMEDIANS") or the
 *             output level ("CENTROIDS"/"LABELS"/"ALL")
 *
 * On the first call (initid->ptr == NULL) the function parses the options and
 * allocates all working buffers, storing them in initid->ptr (the output
 * oph_string) and initid->extension (oph_ccluster_kcluster_extra). Nothing
 * allocated here is released in this function; presumably the matching
 * deinit function frees it -- verify.
 *
 * On every call the input values are converted to double, clustered in one
 * dimension with kcluster() using distance 'e', and the result is written to
 * output->content:
 *   CENTROIDS  k doubles, one centroid value per cluster
 *   ALL        npoints doubles, each point replaced by its cluster's centroid
 *   otherwise  npoints ints, the cluster id of each point
 *
 * Returns output->content with *length set to its size in bytes. On failure
 * returns NULL with *length = 0 and *error = 1.
 */
char* oph_ccluster_kcluster(UDF_INIT *initid, UDF_ARGS *args, char *result, unsigned long *length, char *is_null, char *error)
{
    // One-time setup: runs only while no output descriptor has been stored yet.
    if(!initid->ptr){
        *error=0;
        *is_null=0;

        initid->ptr=(char *)calloc(1,sizeof(oph_string));
        if(!initid->ptr){
            pmesg(1,  __FILE__, __LINE__, "Error allocating result\n");
            *length=0;
            *is_null=1;
            *error=1;
            return NULL;
        }
        initid->extension = calloc(1,sizeof(oph_ccluster_kcluster_extra));
        if(!initid->extension){
            pmesg(1,  __FILE__, __LINE__, "Error allocating extension\n");
            *length=0;
            *is_null=1;
            *error=1;
            return NULL;
        }

        oph_ccluster_kcluster_extra *extra = (oph_ccluster_kcluster_extra *) initid->extension;

        extra->k = (int) *((long long*) args->args[3]); // set cluster number
        extra->method = OPH_CCLUSTER_KCLUSTER_KMEANS; // default method
        extra->level = OPH_CCLUSTER_KCLUSTER_ALL; // default level
        extra->npass = 1; // default npass
        extra->type = OPH_DOUBLE; // default input type

	// Map the type name in args[0] to the internal type code.
	// NOTE(review): the comparison length is args->lengths[0], so a shorter
	// argument that is a prefix of a type name also matches -- confirm intended.
	if (!strncasecmp(args->args[0],"OPH_INT",args->lengths[0])) extra->type = OPH_INT;
	else if (!strncasecmp(args->args[0],"OPH_SHORT",args->lengths[0])) extra->type = OPH_SHORT;
	else if (!strncasecmp(args->args[0],"OPH_BYTE",args->lengths[0])) extra->type = OPH_BYTE;
	else if (!strncasecmp(args->args[0],"OPH_LONG",args->lengths[0])) extra->type = OPH_LONG;
	else if (!strncasecmp(args->args[0],"OPH_FLOAT",args->lengths[0])) extra->type = OPH_FLOAT;
	else if (!strncasecmp(args->args[0],"OPH_DOUBLE",args->lengths[0])) extra->type = OPH_DOUBLE;
	else {
		pmesg(1,  __FILE__, __LINE__, "Invalid input data type!\n");
		*length=0;
		*is_null=0;
		*error=1;
		return NULL;
	}

        // Optional arguments: integers set npass, strings set method or level.
        int i;
        for (i = 4; i < args->arg_count; i++) {
                if (args->arg_type[i]==INT_RESULT) { // npass
                    extra->npass = (int) *((long long*) args->args[i]);
                    if (extra->npass < 1) {
                        pmesg(1,  __FILE__, __LINE__, "npass must be >= 1!\n");
                        *length=0;
                        *is_null=0;
                        *error=1;
                        return NULL;
                    }
                } else if (args->arg_type[i]==STRING_RESULT) {
                    if (!strncasecmp(args->args[i],"KMEANS",args->lengths[i])) {
                        extra->method = OPH_CCLUSTER_KCLUSTER_KMEANS;
                    } else if (!strncasecmp(args->args[i],"KMEDIANS",args->lengths[i])) {
                        extra->method = OPH_CCLUSTER_KCLUSTER_KMEDIANS;
                    } else if (!strncasecmp(args->args[i],"CENTROIDS",args->lengths[i])) {
                        extra->level = OPH_CCLUSTER_KCLUSTER_CENTROIDS;
                    } else if (!strncasecmp(args->args[i],"LABELS",args->lengths[i])) {
                        extra->level = OPH_CCLUSTER_KCLUSTER_LABELS;
                    } else if (!strncasecmp(args->args[i],"ALL",args->lengths[i])) {
                        extra->level = OPH_CCLUSTER_KCLUSTER_ALL;
                    } else {
                        pmesg(1,  __FILE__, __LINE__, "invalid argument %d!\n",i);
                        *length=0;
                        *is_null=0;
                        *error=1;
                        return NULL;
                    }
                } else {
                    pmesg(1,  __FILE__, __LINE__, "wrong type for argument %d!\n",i);
                    *length=0;
                    *is_null=0;
                    *error=1;
                    return NULL;
                }
        }

        // Number of input points = byte length of args[2] / size of one element.
        extra->npoints = args->lengths[2] / core_sizeof(extra->type);

        // data and mask are npoints x 1 matrices (one malloc'd cell per row),
        // matching the double** / int** layout passed to kcluster below.
        extra->data = (double **) malloc(sizeof(double *) * extra->npoints);
        if (!extra->data) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
            *length=0;
            *is_null=0;
            *error=1;
            return NULL;
        }
        for (i = 0; i < extra->npoints; i++) {
            extra->data[i] = (double *) malloc(sizeof(double));
            if (!extra->data[i]) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
        }

        extra->mask = (int **) malloc(sizeof(int *) * extra->npoints);
        if (!extra->mask) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
            *length=0;
            *is_null=0;
            *error=1;
            return NULL;
        }
        for (i = 0; i < extra->npoints; i++) {
            extra->mask[i] = (int *) malloc(sizeof(int));
            if (!extra->mask[i]) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
        }

        // One cluster id per input point, filled by kcluster on each call.
        extra->clusterid = (int *) malloc(sizeof(int) * extra->npoints);
        if (!extra->clusterid) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
            *length=0;
            *is_null=0;
            *error=1;
            return NULL;
        }

        oph_stringPtr output = (oph_stringPtr) initid->ptr;

        // Level-specific buffers. CENTROIDS and ALL both need the k x 1
        // centroid matrices (cdata/cmask); they differ only in output size.
        if (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS) {
            extra->cdata = (double **) malloc(sizeof(double *) * extra->k);
            if (!extra->cdata) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
            for (i = 0; i < extra->k; i++) {
                extra->cdata[i] = (double *) malloc(sizeof(double));
                if (!extra->cdata[i]) {
                    pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                    *length=0;
                    *is_null=0;
                    *error=1;
                    return NULL;
                }
            }

            extra->cmask = (int **) malloc(sizeof(int *) * extra->k);
            if (!extra->cmask) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
            for (i = 0; i < extra->k; i++) {
                extra->cmask[i] = (int *) malloc(sizeof(int));
                if (!extra->cmask[i]) {
                    pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                    *length=0;
                    *is_null=0;
                    *error=1;
                    return NULL;
                }
            }

            // Output is an OPH_DOUBLE array holding k centroid values.
            unsigned long len = (unsigned long ) strlen("OPH_DOUBLE");
            core_set_type(output, "OPH_DOUBLE" ,&len);
            if(core_set_elemsize(output)){
                pmesg(1,  __FILE__, __LINE__, "Error on setting result elements size\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }

            output->length = (unsigned long *) malloc(sizeof(unsigned long));
            if (!output->length) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating length\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
            *(output->length) = (unsigned long) extra->k * output->elemsize;

            output->content = (char *) malloc(*(output->length));
            if (!output->content) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating output\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }

            if(core_set_numelem(output)){
                pmesg(1,  __FILE__, __LINE__, "Error on counting result elements\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
        } else if (extra->level==OPH_CCLUSTER_KCLUSTER_ALL) {
            extra->cdata = (double **) malloc(sizeof(double *) * extra->k);
            if (!extra->cdata) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
            for (i = 0; i < extra->k; i++) {
                extra->cdata[i] = (double *) malloc(sizeof(double));
                if (!extra->cdata[i]) {
                    pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                    *length=0;
                    *is_null=0;
                    *error=1;
                    return NULL;
                }
            }

            extra->cmask = (int **) malloc(sizeof(int *) * extra->k);
            if (!extra->cmask) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
            for (i = 0; i < extra->k; i++) {
                extra->cmask[i] = (int *) malloc(sizeof(int));
                if (!extra->cmask[i]) {
                    pmesg(1,  __FILE__, __LINE__, "Error allocating memory\n");
                    *length=0;
                    *is_null=0;
                    *error=1;
                    return NULL;
                }
            }

            // Output is an OPH_DOUBLE array with one value per input point.
            unsigned long len = (unsigned long ) strlen("OPH_DOUBLE");
            core_set_type(output, "OPH_DOUBLE" ,&len);
            if(core_set_elemsize(output)){
                pmesg(1,  __FILE__, __LINE__, "Error on setting result elements size\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }

            output->length = (unsigned long *) malloc(sizeof(unsigned long));
            if (!output->length) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating length\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
            *(output->length) = (unsigned long) extra->npoints * output->elemsize;

            output->content = (char *) malloc(*(output->length));
            if (!output->content) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating output\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }

            if(core_set_numelem(output)){
                pmesg(1,  __FILE__, __LINE__, "Error on counting result elements\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
        } else {
            // Remaining level (LABELS): output is an OPH_INT array with one
            // cluster id per input point; no centroid buffers are needed.
            unsigned long len = (unsigned long ) strlen("OPH_INT");
            core_set_type(output, "OPH_INT" ,&len);
            if(core_set_elemsize(output)){
                pmesg(1,  __FILE__, __LINE__, "Error on setting result elements size\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }

            output->length = (unsigned long *) malloc(sizeof(unsigned long));
            if (!output->length) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating length\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
            *(output->length) = (unsigned long) extra->npoints * output->elemsize;

            output->content = (char *) malloc(*(output->length));
            if (!output->content) {
                pmesg(1,  __FILE__, __LINE__, "Error allocating output\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }

            if(core_set_numelem(output)){
                pmesg(1,  __FILE__, __LINE__, "Error on counting result elements\n");
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            }
        }
    }

    // Honour error / NULL flags that are already set on entry.
    if (*error)
    {
        *length=0;
        *is_null=0;
        *error=1;
        return NULL;
    }
    if (*is_null)
    {
        *length=0;
        *is_null=1;
        *error=0;
        return NULL;
    }

    oph_stringPtr output = (oph_stringPtr) initid->ptr;
    oph_ccluster_kcluster_extra *extra = (oph_ccluster_kcluster_extra *) initid->extension;

    // Single column, so a single unit weight.
    double weight[1] = {1.0};
    // 'a' for KMEANS, 'm' for anything else (KMEDIANS).
    char method = (extra->method==OPH_CCLUSTER_KCLUSTER_KMEANS)?'a':'m';
    double err;
    int ifound;
    int n;

    // Convert args[2] to double, one value per row. Integer types are always
    // marked present; float/double NaN values are masked out as missing.
    switch (extra->type) {
        case OPH_INT:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((int *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_SHORT:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((short *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_BYTE:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((char *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_LONG:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((long long *)(args->args[2]))[n];
                extra->mask[n][0] = 1;
            }
            break;
        case OPH_FLOAT:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = (double) ((float *)(args->args[2]))[n];
                if (isnan(extra->data[n][0])) {
                    extra->mask[n][0] = 0;
                } else {
                    extra->mask[n][0] = 1;
                }
            }
            break;
        case OPH_DOUBLE:
            for (n = 0; n < extra->npoints; n++) {
                extra->data[n][0] = ((double *)(args->args[2]))[n];
                if (isnan(extra->data[n][0])) {
                    extra->mask[n][0] = 0;
                } else {
                    extra->mask[n][0] = 1;
                }
            }
            break;
        default:
            pmesg(1,  __FILE__, __LINE__, "Invalid input type\n");
            *length=0;
            *is_null=0;
            *error=1;
            return NULL;
    }

    // Cluster npoints rows x 1 column, no transpose, distance 'e'.
    kcluster(extra->k,extra->npoints,1,extra->data,extra->mask,weight,0,extra->npass,method,'e',extra->clusterid,&err,&ifound);

    // ifound is treated as a status here: 0 and -1 are failures.
    if(ifound==0){
        pmesg(1,  __FILE__, __LINE__, "k > number of points!\n");
        *length=0;
        *is_null=0;
        *error=1;
        return NULL;
    }
    if(ifound==-1){
        pmesg(1,  __FILE__, __LINE__, "Error allocating memory for clustering\n");
        *length=0;
        *is_null=0;
        *error=1;
        return NULL;
    }

    if (extra->level==OPH_CCLUSTER_KCLUSTER_CENTROIDS) {
        // Emit one centroid value per cluster.
        if (!getclustercentroids(extra->k,extra->npoints,1,extra->data,extra->mask,extra->clusterid,extra->cdata,extra->cmask,0,method)) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory for centroids retrieval\n");
            *length=0;
            *is_null=0;
            *error=1;
            return NULL;
        }
        for (n = 0; n < extra->k; n++) {
            if (extra->cmask[n][0]==0) {
                pmesg(1,  __FILE__, __LINE__, "All cluster members are missing for cluster %d\n",n);
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            } else {
                ((double *)(output->content))[n] = extra->cdata[n][0];
            }
        }
    } else if (extra->level==OPH_CCLUSTER_KCLUSTER_ALL) {
        // Emit, for every input point, the centroid value of its cluster.
        if (!getclustercentroids(extra->k,extra->npoints,1,extra->data,extra->mask,extra->clusterid,extra->cdata,extra->cmask,0,method)) {
            pmesg(1,  __FILE__, __LINE__, "Error allocating memory for centroids retrieval\n");
            *length=0;
            *is_null=0;
            *error=1;
            return NULL;
        }
        for (n = 0; n < extra->npoints; n++) {
            if (extra->cmask[extra->clusterid[n]][0]==0) {
                pmesg(1,  __FILE__, __LINE__, "All cluster members are missing for cluster %d\n",extra->clusterid[n]);
                *length=0;
                *is_null=0;
                *error=1;
                return NULL;
            } else {
                ((double *)(output->content))[n] = extra->cdata[extra->clusterid[n]][0];
            }
        }
    } else {
        // Labels only: copy the cluster ids straight into the output buffer.
        memcpy(output->content,extra->clusterid,*(output->length));
    }

    *length=*(output->length);
    return output->content;
}
Exemplo n.º 3
0
/* @api private
 *
 * Ruby binding around kcluster() / getclustercentroids().
 *
 * Ruby arguments (parsed with rb_scan_args "21"):
 *   size     number of clusters; must satisfy 1 <= size <= data.length
 *   data     array of row arrays (numeric values)
 *   options  optional hash. Keys read here: "mask" (array of arrays),
 *            "transpose", "iterations", "method", "metric", "seed"
 *            (through the get_*_option helpers) and :weights (array).
 *
 * Returns a hash with the keys :cluster (cluster id per clustered item),
 * :centroid (array of centroid rows), :error (the error value reported by
 * kcluster) and :repeated (kcluster's ifound counter).
 *
 * Raises ArgumentError on invalid size/data/mask, and RuntimeError when
 * kcluster reports failure (ifound <= 0), in which case the cluster and
 * centroid buffers were never filled in.
 *
 * NOTE(review): the numeric conversions inside the copy loops (NUM2DBL,
 * NUM2INT) can raise; when they do, the C buffers allocated so far are
 * leaked. Fixing that needs rb_protect/rb_ensure and is left as is.
 */
VALUE rb_do_kcluster(int argc, VALUE *argv, VALUE self) {
    VALUE size, data, mask, weights, options;
    rb_scan_args(argc, argv, "21", &size, &data, &options);

    if (TYPE(data) != T_ARRAY)
        rb_raise(rb_eArgError, "data should be an array of arrays");

    /* Enforce both bounds promised by the error message; a nil size is
     * mapped to 0 so that it is rejected by the same check. */
    int nsets = NIL_P(size) ? 0 : NUM2INT(rb_Integer(size));
    if (nsets < 1 || nsets > RARRAY_LEN(data))
        rb_raise(rb_eArgError, "size should be > 0 and <= data size");

    /* data is non-empty here (nsets >= 1 and nsets <= length). The first row
     * defines the column count, so it must really be an array. */
    VALUE first_row = rb_ary_entry(data, 0);
    if (TYPE(first_row) != T_ARRAY)
        rb_raise(rb_eArgError, "data should be an array of arrays");

    mask = get_value_option(options, "mask", Qnil);

    if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
        rb_raise(rb_eArgError, "mask should be an array of arrays");

    int transpose = get_bool_option(options, "transpose", 0);
    int npass     = get_int_option(options, "iterations", DEFAULT_ITERATIONS);

    // a = arithmetic mean, m = median
    int method    = get_int_option(options, "method", 'a');

    // e = euclidian,
    // b = city-block distance
    // c = correlation
    // a = absolute value of the correlation
    // u = uncentered correlation
    // x = absolute uncentered correlation
    // s = spearman's rank correlation
    // k = kendall's tau
    int dist      = get_int_option(options, "metric", 'e');

    // initial assignment
    int assign    = get_int_option(options, "seed",    0);

    int i,j;
    int nrows = RARRAY_LEN(data);
    int ncols = RARRAY_LEN(first_row);

    double **cdata          = (double**)malloc(sizeof(double*)*nrows);
    int    **cmask          = (int   **)malloc(sizeof(int   *)*nrows);
    double *cweights        = (double *)malloc(sizeof(double )*ncols);

    double **ccentroid;
    int *ccluster, **ccentroid_mask, dimx = nrows, dimy = ncols, cdimx = nsets, cdimy = ncols;

    /* Copy the Ruby data (and mask, defaulting to "all present") into C matrices. */
    for (i = 0; i < nrows; i++) {
        cdata[i]          = (double*)malloc(sizeof(double)*ncols);
        cmask[i]          = (int   *)malloc(sizeof(int   )*ncols);
        for (j = 0; j < ncols; j++) {
            cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
            cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
        }
    }

    /* Column weights default to 1.0 when no :weights option is given. */
    weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
    for (i = 0; i < ncols; i++) {
        cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
    }

    /* When transposed, columns are the clustered items and the centroid
     * matrix is nrows x nsets instead of nsets x ncols. */
    if (transpose) {
        dimx  = ncols;
        dimy  = nrows;
        cdimx = nrows;
        cdimy = nsets;
    }

    ccluster       = (int    *)malloc(sizeof(int    )*dimx);
    ccentroid      = (double**)malloc(sizeof(double*)*cdimx);
    ccentroid_mask = (int   **)malloc(sizeof(int   *)*cdimx);

    for (i = 0; i < cdimx; i++) {
      ccentroid[i]      = (double*)malloc(sizeof(double)*cdimy);
      ccentroid_mask[i] = (int   *)malloc(sizeof(int   )*cdimy);
    }

    int    ifound = 0;
    double error  = 0.0;

    kcluster(nsets,
        nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound, assign);

    /* kcluster signals failure through ifound <= 0 (more clusters than items,
     * or allocation failure); ccluster is then not filled in, so neither the
     * centroids nor the result may be built from it. */
    int clustered = ifound > 0;
    VALUE result  = Qnil;

    if (clustered) {
        getclustercentroids(nsets,
            nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);

        result         = rb_hash_new();
        VALUE cluster  = rb_ary_new();
        VALUE centroid = rb_ary_new();

        for (i = 0; i < dimx; i++)
            rb_ary_push(cluster, INT2NUM(ccluster[i]));

        for (i = 0; i < cdimx; i++) {
            VALUE point = rb_ary_new();
            for (j = 0; j < cdimy; j++)
                rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
            rb_ary_push(centroid, point);
        }

        rb_hash_aset(result, ID2SYM(rb_intern("cluster")),   cluster);
        rb_hash_aset(result, ID2SYM(rb_intern("centroid")),  centroid);
        rb_hash_aset(result, ID2SYM(rb_intern("error")),     DBL2NUM(error));
        rb_hash_aset(result, ID2SYM(rb_intern("repeated")),  INT2NUM(ifound));
    }

    /* Release every C buffer before returning or raising. */
    for (i = 0; i < nrows; i++) {
        free(cdata[i]);
        free(cmask[i]);
    }

    for (i = 0; i < cdimx; i++) {
        free(ccentroid[i]);
        free(ccentroid_mask[i]);
    }

    free(cdata);
    free(cmask);
    free(ccentroid);
    free(ccentroid_mask);
    free(cweights);
    free(ccluster);

    if (!clustered)
        rb_raise(rb_eRuntimeError,
                 "clustering failed (more clusters than items to cluster, or out of memory)");

    return result;
}
// Build the visual-word dictionary by k-clustering all training descriptors.
//
// Copies every training descriptor (numFeatures rows of descrSize values)
// into a dense matrix, runs kcluster() with unit weights and an all-ones
// mask, and stores the mean descriptor of each cluster as one row of the
// numClusters x descrSize CV_32FC1 matrix `dictionary`.
//
// numClusters  number of clusters / dictionary rows
// pass         number of kcluster passes
// method       centroid method character forwarded to kcluster
// dist         distance character forwarded to kcluster
//
// Returns true when the dictionary was built. Returns false when kcluster
// reports failure (ifound <= 0); the cluster ids are then not valid and the
// dictionary is left released (NULL).
bool BagOfFeatures::buildKClustering(int numClusters, int pass, char method, char dist)
{
    if(dictionary != NULL)
        cvReleaseMat(&dictionary);

    cout << "Initializing the data..." << endl;

    int i, j;
    int k = 0, l = 0, m = 0;
    int size;
    int totalImages = 0;
    double error = 0.0;   // kcluster writes a single value here
    int ifound = 0;
    int *clusterID = new int [numFeatures];
    double ** featureData = new double* [numFeatures];
    // Allocate mask and set it all to 1 (assume no missing data)
    int ** mask = new int* [numFeatures];
    for(i = 0; i < numFeatures; i++)
    {
        mask[i] = new int [descrSize];
        // Value-initialised so rows stay defined even if fewer than
        // numFeatures descriptors are copied below.
        featureData[i] = new double [descrSize]();
        for(j = 0; j < descrSize; j++)
            mask[i][j] = 1;
    }

    // Set the weights equal, all 1
    double* weight = new double [descrSize];
    for(i = 0; i < descrSize; i++)
        weight[i] = 1.0;

    // For each class
    for(m = 0; m < numClasses; m++)
    {
        totalImages = data[m].getTrainSize();
        // For each image in that class...
        for(l = 0; l < totalImages; l++)
        {
            size = trainObject[m].featureSet[l].size;
            // for each feature in that image; never write past the
            // numFeatures rows that were allocated
            for(i = 0; i < size && k < numFeatures; i++)
            {
                // Copy the descriptor into the data array
                for(j = 0; j < descrSize; j++)
                    featureData[k][j] = (double)trainObject[m].featureSet[l].descriptors[i][j];
                k++;
            }
        }
    }

    cout << "Clustering data..." << endl;

    kcluster(numClusters, numFeatures, descrSize, featureData,
                mask, weight, 0, pass, method, dist,
                clusterID, &error, &ifound);

    // ifound <= 0 means kcluster did not produce an assignment, so clusterID
    // must not be used as an index.
    const bool clustered = (ifound > 0);

    if(clustered)
    {
        cout << "Computing cluster centers and building dictionary..." << endl;

        int* indexCount = new int [numClusters];
        int index;
        float *ptrCenter;

        dictionary = cvCreateMat(numClusters, descrSize, CV_32FC1);
        cvSetZero(dictionary);

        for(i = 0; i < numClusters; i++)
            indexCount[i] = 0;

        // Sum the descriptors of each cluster and count its members
        for(i = 0; i < numFeatures; i++)
        {
            index = clusterID[i];
            indexCount[index]++;
            ptrCenter = (float *)(dictionary->data.ptr + index * dictionary->step);
            for(j = 0; j < descrSize; j++)
                ptrCenter[j] += (float)featureData[i][j];
        }

        // Turn the sums into means; an empty cluster keeps its zero row
        // instead of becoming 0/0
        for(i = 0; i < numClusters; i++)
        {
            if(indexCount[i] == 0)
                continue;
            ptrCenter = (float *)(dictionary->data.ptr + i * dictionary->step);
            for(j = 0; j < descrSize; j++)
                ptrCenter[j] /= (float)indexCount[i];
        }

        delete [] indexCount;
    }
    else
    {
        cout << "Clustering failed, dictionary not built." << endl;
    }

    // Release all memory
    for(i = 0; i < numFeatures; i++)
    {
        delete [] mask[i];
        delete [] featureData[i];
    }
    delete [] featureData;
    delete [] mask;
    delete [] weight;
    delete [] clusterID;

    return clustered;
}
Exemplo n.º 5
0
// Accumulate the co-membership matrix over "resampNum" rounds of resampling.
//
// Each round permutes the row indices with genprm(), clusters the first
// subSampNum = int(nrow * subSampPercnt) rows with kcluster() (method 'a',
// distance 'e'), assigns every one of the nrow rows to its nearest centroid
// via which_minDist(), and adds 1 to aveComemMatrix[i][j] for every pair of
// rows that ended up in the same cluster.
//
// aveComemMatrix must be an nrow x nrow matrix; it is zeroed first. The
// entries are raw counts in [0, resampNum] -- no division by resampNum is
// done here.
// NOTE(review): entries are short, so resampNum above SHRT_MAX would
// overflow -- confirm callers stay below that.
//
// If nclusters >= subSampNum the sub-sample cannot be clustered: the error
// message is printed once (followed by the original blocking scanf pause)
// and the matrix is left all zero, instead of assigning rows against
// centroids that were never computed.
void calcAveComemMatrix(int nclusters, int nrow, int ncol, double** data, int** mask, 
	double weight[], int npass, int resampNum, double subSampPercnt, short** aveComemMatrix)
{
	int subSampNum=int(nrow*subSampPercnt);
	int i,j,k;
	double** subData;
	double** subCdata;
	int** subCmask;
	int** subMask;
	int* subClusterid=(int*)malloc((size_t)subSampNum*sizeof(int)); 
	int* allClusterid=(int*)malloc((size_t)nrow*sizeof(int)); 
	// Elements are long, so size by the element type (sizeof(int) under-allocated
	// on platforms where long is wider than int).
	long* originalIndex=(long*)malloc((size_t)nrow*sizeof(*originalIndex)); 
	double error; int ifound;

	subData = (double**)malloc((size_t)subSampNum*sizeof(double*)); 
	for(i=0; i<subSampNum; i++)
		subData[i] = (double*)malloc((size_t)ncol*sizeof(double)); 
	subCdata = (double**)malloc((size_t)nclusters*sizeof(double*)); 
	for(i=0; i<nclusters; i++)
		subCdata[i] = (double*)malloc((size_t)ncol*sizeof(double)); 
	subMask= (int**)malloc((size_t)subSampNum*sizeof(int*)); 
	for(i=0; i<subSampNum; i++)
		subMask[i] = (int*)malloc((size_t)ncol*sizeof(int)); 
	subCmask= (int**)malloc((size_t)nclusters*sizeof(int*)); 
	for(i=0; i<nclusters; i++)
		subCmask[i] = (int*)malloc((size_t)ncol*sizeof(int)); 

	for(i=0; i<nrow; i++)
		for(j=0; j<nrow; j++)
			aveComemMatrix[i][j]=0;
	for(i=0; i<nrow; i++)
		originalIndex[i]=i;

	// The condition does not change between rounds, so test it once.
	if(nclusters<subSampNum)
	{
		for(k=0; k<resampNum; k++)
		{
			// Draw the sub-sample: permute the indices, take the first subSampNum.
			genprm(originalIndex, nrow);
			for(i=0; i<subSampNum; i++)
			{
				for(j=0; j<ncol; j++)
				{
					subData[i][j]=data[originalIndex[i]][j];
					subMask[i][j]=mask[originalIndex[i]][j];
				}
			}

			kcluster(nclusters, subSampNum, ncol, subData, subMask, weight, 0, npass, 'a', 'e', subClusterid, subCdata, subCmask, &error, &ifound);

			// Assign every row (sampled or not) to its nearest centroid.
			for(i=0; i<nrow; i++)
				allClusterid[i]=which_minDist(ncol, data, subCdata, mask, subCmask, weight, i, nclusters);

			for(i=0; i<nrow; i++)
			{
				for(j=0; j<nrow; j++)
				{
					if(allClusterid[i]==allClusterid[j])
						aveComemMatrix[i][j] += 1;
				}
			}
		}
	}
	else
	{
		printf("Error: nclusters larger than subSampNum!!");
		scanf("%d", &i);
	}

	//free memory
	for(i=0; i<subSampNum; i++) free(subData[i]);
	for(i=0; i<subSampNum; i++) free(subMask[i]);
	for(i=0; i<nclusters; i++) free(subCdata[i]);
	for(i=0; i<nclusters; i++) free(subCmask[i]);
	free(subData);
	free(subCdata);
	free(subMask);
	free(subCmask);
	free(subClusterid);
	free(allClusterid);
	free(originalIndex);
}
Exemplo n.º 6
0
/*
 * Perform k-means clustering on the rows of data and write three reports.
 *
 *   nrows, ncols  dimensions of data
 *   data          nrows x ncols matrix; every value is treated as present
 *   nclusters     number of clusters
 *   npass         number of kcluster passes
 *   dist          distance character forwarded to kcluster
 *   jobname       prefix for the output file names
 *   clusterid     caller-provided array of nrows ints, receives the cluster
 *                 id of every row
 *
 * Files written (N = nclusters):
 *   <jobname>_K_GN.kgg  cluster assignment per row
 *   <jobname>_K_GN.dis  distance between every pair of clusters
 *   <jobname>_K_GN.cen  cluster centroids
 *
 * The centroid method is fixed to 'a'. For method=='a', the centroid is
 * defined as the mean over all elements belonging to a cluster for each
 * dimension; for method=='m' it would be the median.
 *
 * On invalid sizes, allocation failure, a file that cannot be opened, or a
 * failed clustering run (ifound <= 0) a message is printed and the function
 * returns after releasing everything it acquired.
 */
void example_kmeans( int nrows, int ncols, 
                     double** data, 
                     int nclusters, int npass, 
                     char dist, char* jobname,
                     int *clusterid)
{ 
   const int transpose = 0;
   const char method = 'a';

   int i, j;
   int n;
   int ifound = 0;
   double error = 0.0;
   double distance;
   int** index = NULL;
   int* count = NULL;
   double* weight = NULL;
   double** cdata = NULL;
   int** cmask = NULL;
   int** mask = NULL;
   char* filename = NULL;
   char* filename2 = NULL;
   char* filename3 = NULL;
   FILE *out1=NULL;
   FILE *out2=NULL;
   FILE *out3=NULL;

   if (nrows < 1 || ncols < 1 || nclusters < 1)
   { fprintf(stderr, "example_kmeans: nrows, ncols and nclusters must be >= 1\n");
     return;
   }

   /* Pointer tables are calloc'd so that cleanup can free partial results. */
   weight = malloc((size_t)ncols*sizeof *weight);
   cdata = calloc((size_t)nclusters, sizeof *cdata);
   cmask = calloc((size_t)nclusters, sizeof *cmask);
   mask = calloc((size_t)nrows, sizeof *mask);
   if (!weight || !cdata || !cmask || !mask) goto oom;

   for (i = 0; i < nclusters; i++)
   { cdata[i] = malloc((size_t)ncols*sizeof *cdata[i]);
     cmask[i] = malloc((size_t)ncols*sizeof *cmask[i]);
     if (!cdata[i] || !cmask[i]) goto oom;
   }
   for (i = 0; i < ncols; i++) weight[i] = 1.0;

   /* No missing data: every mask entry is 1. */
   for (i = 0; i < nrows; i++)
   { mask[i] = malloc((size_t)ncols*sizeof *mask[i]);
     if (!mask[i]) goto oom;
     for (j = 0; j < ncols; j++) mask[i][j] = 1;
   }

   //avovk 
   printf("a je u omari :) \n");

   /* All three names have the same length (3-character extension); let
      snprintf measure it so the digits of nclusters are always counted. */
   n = snprintf(NULL, 0, "%s_K_G%d.kgg", jobname, nclusters);
   if (n < 0)
   { fprintf(stderr, "example_kmeans: cannot format output file name\n");
     goto cleanup;
   }
   n += 1;
   filename = malloc((size_t)n);
   filename2 = malloc((size_t)n);
   filename3 = malloc((size_t)n);
   if (!filename || !filename2 || !filename3) goto oom;

   snprintf (filename, (size_t)n, "%s_K_G%d.kgg", jobname, nclusters);
   out1 = fopen( filename, "w" );
   snprintf (filename2, (size_t)n, "%s_K_G%d.dis", jobname, nclusters);
   out2 = fopen( filename2, "w" );
   snprintf (filename3, (size_t)n, "%s_K_G%d.cen", jobname, nclusters);
   out3 = fopen( filename3, "w" );
   if (!out1 || !out2 || !out3)
   { fprintf(stderr, "example_kmeans: cannot open output files for job %s\n", jobname);
     goto cleanup;
   }

   printf("======================== k-means clustering"
         " ========================\n");

   printf ("\n");
   printf ("----- doing %d passes... go stretch your legs...\n",npass);
   /* ANDREJ: This function returns different answers each time it is 
      executed. Does the library provide for ways to initialize the
      random number generators used for the searching and initializations? */
   kcluster(   nclusters,
               nrows,ncols,data,
               mask,weight,
               transpose,npass,
               method,dist,
               clusterid, &error, &ifound);
   printf ("Solution found %d times; ", ifound);
   printf ("within-cluster sum of distances is %f\n", error);

   /* ifound <= 0 means kcluster produced no assignment; clusterid must not
      be used as an index in that case. */
   if (ifound <= 0)
   { fprintf(stderr, "example_kmeans: clustering failed (ifound = %d)\n", ifound);
     goto cleanup;
   }

   printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
   for (i = 0; i < nrows; i++)
    fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
   fclose(out1); out1=NULL;

   printf ("------- writing Distance between clusters to file:\t "
          "%s_K_G%d.dis \n", jobname, nclusters);
   fprintf (out2,"------- Distance between clusters:\n");

   /* Build, for every cluster, the list of row indices that belong to it. */
   index = calloc((size_t)nclusters, sizeof *index);
   count = calloc((size_t)nclusters, sizeof *count);
   if (!index || !count) goto oom;
   for (i = 0; i < nrows; i++) count[clusterid[i]]++;
   for (i = 0; i < nclusters; i++)
   { index[i] = malloc((size_t)count[i]*sizeof *index[i]);
     if (!index[i] && count[i] > 0) goto oom;
   }
   for (i = 0; i < nclusters; i++) count[i] = 0;
   for (i = 0; i < nrows; i++){ 
      int id = clusterid[i];
      index[id][count[id]] = i;
      count[id]++;
   }  

   /* Every unordered pair of clusters, in the order (0,1), (0,2), (1,2), ...
      so any number of clusters is handled. */
   for (i = 0; i < nclusters; i++)
   { for (j = i + 1; j < nclusters; j++)
     { distance =
        clusterdistance(nrows, ncols, data, mask, weight, count[i], count[j],
                        index[i], index[j], 'e', 'a', 0); 
       fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
     }
   }
   fclose(out2); out2=NULL;

   printf ("------- writing Cluster centroids to file:\t\t"
          "%s_K_G%d.cen\n",jobname, nclusters);
   fprintf (out3,"------- Cluster centroids:\n");
   getclustercentroids(nclusters, nrows, ncols, data, mask, clusterid,
                      cdata, cmask, 0, 'a');
   fprintf(out3,"   coefficients:");
   for(i=0; i<ncols; i++) fprintf(out3,"\t%7d", i);
   fprintf(out3,"\n");
   for (i = 0; i < nclusters; i++){ 
      fprintf(out3,"Cluster %2d:", i);
      for (j = 0; j < ncols; j++) fprintf(out3,"\t%7.3f", cdata[i][j]);
      fprintf(out3,"\n");
   }
   fclose(out3); out3=NULL;
   printf("Done...\n");
   goto cleanup;

oom:
   fprintf(stderr, "example_kmeans: out of memory\n");

cleanup:
   if (out1) fclose(out1);
   if (out2) fclose(out2);
   if (out3) fclose(out3);
   free(filename);
   free(filename2);
   free(filename3);

   if (index)
   { for (i = 0; i < nclusters; i++) free(index[i]);
   }
   free(index);
   free(count);

   if (cdata)
   { for (i = 0; i < nclusters; i++) free(cdata[i]);
   }
   if (cmask)
   { for (i = 0; i < nclusters; i++) free(cmask[i]);
   }
   if (mask)
   { for (i = 0; i < nrows; i++) free(mask[i]);
   }
   free(cdata);
   free(cmask);
   free(weight);
   free(mask);
   return;
}
Exemplo n.º 7
0
/* Open the output file "<jobname>_K_G<nclusters><ext>" for writing.

   The name is measured with snprintf before it is allocated, so the buffer
   always fits regardless of how many digits (or a sign) nclusters needs.
   The temporary name buffer is released before returning.

   Returns the open stream, or NULL after reporting the problem on stderr
   when the name cannot be built or the file cannot be opened. */
static FILE *kmeans_open_output(const char *jobname, int nclusters,
                                const char *ext)
{
   FILE *fp;
   char *filename;
   int len = snprintf(NULL, 0, "%s_K_G%d%s", jobname, nclusters, ext);

   if (len < 0) {
      fprintf(stderr, "example_kmeans: cannot format output file name\n");
      return NULL;
   }
   filename = malloc((size_t)len + 1);
   if (filename == NULL) {
      fprintf(stderr, "example_kmeans: out of memory\n");
      return NULL;
   }
   snprintf(filename, (size_t)len + 1, "%s_K_G%d%s", jobname, nclusters, ext);

   fp = fopen(filename, "w");
   if (fp == NULL)
      fprintf(stderr, "example_kmeans: cannot open %s for writing\n",
              filename);
   free(filename);
   return fp;
}

/* Perform k-means clustering on the rows of data and write the results to
   three files named after jobname:

     <jobname>_K_G<nclusters>.kgg  one line per row: row number, cluster id
     <jobname>_K_G<nclusters>.dis  distance between every pair of clusters
     <jobname>_K_G<nclusters>.cen  centroid of every cluster, one per line

   Finally getvoxlclusterdist() is called with the cluster sizes, centroids
   and assignments.

   nrows, ncols  dimensions of data (data[row][column])
   data          input matrix; only read here, ownership stays with the caller
   nclusters     number of clusters requested from kcluster
   npass         number of passes forwarded to kcluster
   dist          distance selector forwarded to kcluster
   jobname       prefix of the output file names

   All columns get weight 1.0.  On invalid arguments, allocation failure, a
   file that cannot be opened, or a cluster id outside [0, nclusters) the
   function reports on stderr, releases everything it acquired and returns
   without calling getvoxlclusterdist(). */
void example_kmeans( int nrows, int ncols, 
                     double** data, 
                     int nclusters, int npass, 
                     char dist, char* jobname)
{ 
   int i, j;
   const int transpose = 0;
   const char method = 'a';
   /* For method=='a', the centroid is defined as the mean over all elements
     belonging to a cluster for each dimension.
     For method=='m', the centroid is defined as the median over all elements
     belonging to a cluster for each dimension.
   */
   int ifound = 0;
   double error = 0.0;
   double distance;
   const char* failure = NULL;   /* message reported at cleanup, if any */
   int** index = NULL;           /* index[c]: row numbers in cluster c */
   int* count = NULL;            /* count[c]: number of rows in cluster c */
   double* weight = NULL;
   int* clusterid = NULL;
   double** cdata = NULL;
   FILE *out1 = NULL;
   FILE *out2 = NULL;
   FILE *out3 = NULL;

   if (jobname == NULL || nrows <= 0 || ncols <= 0 || nclusters <= 0) {
      fprintf(stderr, "example_kmeans: invalid arguments\n");
      return;
   }

   weight = malloc((size_t)ncols * sizeof *weight);
   clusterid = malloc((size_t)nrows * sizeof *clusterid);
   /* calloc so that a partially built table can be freed safely */
   cdata = calloc((size_t)nclusters, sizeof *cdata);
   if (weight == NULL || clusterid == NULL || cdata == NULL) {
      failure = "out of memory";
      goto cleanup;
   }
   for (i = 0; i < nclusters; i++) {
      cdata[i] = malloc((size_t)ncols * sizeof *cdata[i]);
      if (cdata[i] == NULL) {
         failure = "out of memory";
         goto cleanup;
      }
   }
   for (i = 0; i < ncols; i++) weight[i] = 1.0;

   //avovk 
   printf("a je u omari :) \n");
   out1 = kmeans_open_output(jobname, nclusters, ".kgg");
   out2 = kmeans_open_output(jobname, nclusters, ".dis");
   out3 = kmeans_open_output(jobname, nclusters, ".cen");
   /* the helper has already reported which file failed */
   if (out1 == NULL || out2 == NULL || out3 == NULL) goto cleanup;

   printf("======================== k-means clustering"
         " ========================\n");

   printf ("\n");
   printf ("----- doing %d passes... go stretch your legs...\n",npass);
   kcluster(nclusters,nrows,ncols,data,weight,transpose,npass,method,dist, 
    clusterid, &error, &ifound);
   printf ("Solution found %d times; ", ifound);
   printf ("within-cluster sum of distances is %f\n", error);

   /* clusterid[] is used below as an array index; refuse to continue if
      kcluster left anything outside the valid range. */
   for (i = 0; i < nrows; i++) {
      if (clusterid[i] < 0 || clusterid[i] >= nclusters) {
         failure = "cluster id outside [0, nclusters)";
         goto cleanup;
      }
   }

   printf ("------- writing Cluster assignments to file:\t\t"
          " %s_K_G%d.kgg\n",jobname, nclusters);
   for (i = 0; i < nrows; i++)
     fprintf (out1, "%09d\t %d\n", i, clusterid[i]);
   fclose(out1); out1=NULL;

   printf ("------- writing Distance between clusters to file:\t %s_K_G%d.dis \n", jobname, nclusters);
   fprintf (out2,"------- Distance between clusters:\n");
   index = calloc((size_t)nclusters, sizeof *index);
   count = calloc((size_t)nclusters, sizeof *count);
   if (index == NULL || count == NULL) {
      failure = "out of memory";
      goto cleanup;
   }
   for (i = 0; i < nrows; i++) count[clusterid[i]]++;
   for (i = 0; i < nclusters; i++) {
      /* at least one slot, so NULL always means failure (malloc(0) may
         legitimately return NULL for an empty cluster) */
      size_t slots = count[i] > 0 ? (size_t)count[i] : 1;
      index[i] = malloc(slots * sizeof *index[i]);
      if (index[i] == NULL) {
         failure = "out of memory";
         goto cleanup;
      }
   }
   /* second pass: reuse count[] as the fill cursor; it ends up holding the
      cluster sizes again */
   for (i = 0; i < nclusters; i++) count[i] = 0;
   for (i = 0; i < nrows; i++) {
      int id = clusterid[i];
      index[id][count[id]] = i;
      count[id]++;
   }

   for (i = 0; i < nclusters-1; i++) {
      for (j = 1+i; j < nclusters; j++) {
         distance = clusterdistance(nrows, ncols, data, weight, count[i], count[j], index[i], index[j], 'e', 'a', 0); 
         fprintf(out2,"Distance between %d and %d: %7.3f\n", i, j, distance);
      }
   }
   fclose(out2); out2=NULL;

   printf ("------- writing Cluster centroids to file:\t\t%s_K_G%d.cen\n",jobname, nclusters);
   fprintf (out3,"------- Cluster centroids:\n");
   getclustercentroids(nclusters, nrows, ncols, data, clusterid, cdata, 0, 'a');
   fprintf(out3,"   coefficients:");
   for(i=0; i<ncols; i++) fprintf(out3,"\t%7d", i);
   fprintf(out3,"\n");
   for (i = 0; i < nclusters; i++){ 
      fprintf(out3,"Cluster %2d:", i);
      for (j = 0; j < ncols; j++) fprintf(out3,"\t%7.3f", cdata[i][j]);
      fprintf(out3,"\n");
   }
   fclose(out3); out3=NULL;
   printf("Done...\n");

   /* call function to calculate distance between each voxel and centroid */
   /* we will need: 
      count - number of elements in cluster as we already have it
      cdata - cluster centroids
      clusterid
      data */
   getvoxlclusterdist(count, cdata, clusterid, data, jobname, 
                      nclusters, nrows, ncols);

cleanup:
   if (failure != NULL) fprintf(stderr, "example_kmeans: %s\n", failure);

   /* streams are reset to NULL once closed, so only leftovers are closed */
   if (out1 != NULL) fclose(out1);
   if (out2 != NULL) fclose(out2);
   if (out3 != NULL) fclose(out3);

   if (index != NULL) {
      for (i = 0; i < nclusters; i++) free(index[i]);
      free(index);
   }
   free(count);

   if (cdata != NULL) {
      for (i = 0; i < nclusters; i++) free(cdata[i]);
      free(cdata);
   }

   free(clusterid);
   free(weight);

   return;
}
Exemplo n.º 8
0
/* Demonstrate k-means clustering on the rows ("genes") of data.

   The rows are clustered into three groups twice with kcluster(): first with
   a single pass, then with 1000 passes.  After each run the number of times
   the solution was found, the within-cluster sum of distances and the
   cluster assigned to every row are printed to stdout.  Using the second
   run's assignments, the three pairwise cluster distances (clusterdistance)
   and the cluster centroids (getclustermean) are printed as well.

   nrows, ncols  dimensions of data and mask
   data, mask    forwarded unchanged to the clustering routines
   All column weights are set to 1.0; every buffer allocated here is freed
   before returning. */
void example_kmeans(int nrows, int ncols, double** data, int** mask)
{
  const int nclusters = 3;
  const int transpose = 0;
  const char dist = 'e';
  const char method = 'a';
  int npass = 1;
  int ifound = 0;
  int row, col, c;
  double error;
  double distance;
  int** members;   /* members[c]: row numbers belonging to cluster c */
  int* sizes;      /* sizes[c]: number of rows in cluster c */
  double* weight = malloc(ncols * sizeof *weight);
  int* clusterid = malloc(nrows * sizeof *clusterid);
  double** cdata = malloc(nclusters * sizeof *cdata);
  int** cmask = malloc(nclusters * sizeof *cmask);

  for (c = 0; c < nclusters; c++) {
    cdata[c] = malloc(ncols * sizeof *cdata[c]);
    cmask[c] = malloc(ncols * sizeof *cmask[c]);
  }
  for (col = 0; col < ncols; col++) {
    weight[col] = 1.0;
  }

  printf("======================== k-means clustering ====================\n");
  printf("\n");

  /* First run: a single pass. */
  printf("----- one pass of the EM algorithm (results may change)\n");
  kcluster(nclusters, nrows, ncols, data, mask, weight, transpose, npass,
           method, dist, clusterid, &error, &ifound);
  printf ("Solution found %d times; within-cluster sum of distances is %f\n",
    ifound, error);
  printf ("Cluster assignments:\n");
  for (row = 0; row < nrows; row++) {
    printf ("Gene %d: cluster %d\n", row, clusterid[row]);
  }

  /* Second run: many passes. */
  printf ("\n");
  printf("----- 1000 passes of the EM algorithm (result should not change)\n");
  npass = 1000;
  kcluster(nclusters, nrows, ncols, data, mask, weight, transpose, npass,
           method, dist, clusterid, &error, &ifound);
  printf ("Solution found %d times; ", ifound);
  printf ("within-cluster sum of distances is %f\n", error);
  printf ("Cluster assignments:\n");
  for (row = 0; row < nrows; row++) {
    printf ("Gene %d: cluster %d\n", row, clusterid[row]);
  }
  printf ("\n");

  /* Group the row numbers by cluster: count, allocate, then fill.  The
     sizes array doubles as the fill cursor and ends up holding the sizes. */
  printf ("------- Distance between clusters:\n");
  members = malloc(nclusters * sizeof *members);
  sizes = malloc(nclusters * sizeof *sizes);
  for (c = 0; c < nclusters; c++) {
    sizes[c] = 0;
  }
  for (row = 0; row < nrows; row++) {
    sizes[clusterid[row]] += 1;
  }
  for (c = 0; c < nclusters; c++) {
    members[c] = malloc(sizes[c] * sizeof(int));
  }
  for (c = 0; c < nclusters; c++) {
    sizes[c] = 0;
  }
  for (row = 0; row < nrows; row++) {
    const int owner = clusterid[row];
    members[owner][sizes[owner]++] = row;
  }

  distance = clusterdistance(nrows, ncols, data, mask, weight,
                             sizes[0], sizes[1], members[0], members[1],
                             'e', 'a', 0);
  printf("Distance between 0 and 1: %7.3f\n", distance);
  distance = clusterdistance(nrows, ncols, data, mask, weight,
                             sizes[0], sizes[2], members[0], members[2],
                             'e', 'a', 0);
  printf("Distance between 0 and 2: %7.3f\n", distance);
  distance = clusterdistance(nrows, ncols, data, mask, weight,
                             sizes[1], sizes[2], members[1], members[2],
                             'e', 'a', 0);
  printf("Distance between 1 and 2: %7.3f\n", distance);

  /* Centroid table: one header row, then one row per cluster. */
  printf ("\n");
  printf ("------- Cluster centroids:\n");
  getclustermean(nclusters, nrows, ncols, data, mask, clusterid,
                 cdata, cmask, 0);
  printf("   Microarray:");
  for (col = 0; col < ncols; col++) {
    printf("\t%7d", col);
  }
  printf("\n");
  for (c = 0; c < nclusters; c++) {
    printf("Cluster %2d:", c);
    for (col = 0; col < ncols; col++) {
      printf("\t%7.3f", cdata[c][col]);
    }
    printf("\n");
  }
  printf("\n");

  /* Release everything allocated above. */
  for (c = 0; c < nclusters; c++) {
    free(members[c]);
  }
  free(members);
  free(sizes);

  for (c = 0; c < nclusters; c++) {
    free(cdata[c]);
    free(cmask[c]);
  }
  free(cdata);
  free(cmask);
  free(clusterid);
  free(weight);
  return;
}