gmm_t * gmm_learn (int di, int ni, int ki, int niter, const float * v, int nt, int seed, int nredo, int flags) { long d=di,k=ki,n=ni; int iter, iter_tot = 0; double old_key, key = 666; niter = (niter == 0 ? 10000 : niter); /* the GMM parameters */ float * p = fvec_new_0 (n * k); /* p(ci|x) for all i */ gmm_t * g = gmm_new (d, k); /* initialize the GMM: k-means + variance estimation */ int * nassign = ivec_new (n); /* not useful -> to be removed when debugged */ float * dis = fvec_new (n); kmeans (d, n, k, niter, v, nt, seed, nredo, g->mu, dis, NULL, nassign); fflush (stderr); fprintf (stderr, "assign = "); ivec_print (nassign, k); fprintf (stderr, "\n"); free (nassign); /* initialization of the GMM parameters assuming a diagonal matrix */ fvec_set (g->w, k, 1.0 / k); double sig = fvec_sum (dis, n) / n; printf ("sigma at initialization = %.3f\n", sig); fvec_set (g->sigma, k * d, sig); free (dis); /* start the EM algorithm */ fprintf (stdout, "<><><><> GMM <><><><><>\n"); if(flags & GMM_FLAGS_PURE_KMEANS) niter=0; for (iter = 1 ; iter <= niter ; iter++) { gmm_compute_p_thread (n, v, g, p, flags, nt); fflush(stdout); gmm_handle_empty(n, v, g, p); gmm_compute_params (n, v, p, g, flags, nt); fflush(stdout); iter_tot++; /* convergence reached -> leave */ old_key = key; key = fvec_sum (g->mu, k * d); printf ("keys %5d: %.6f -> %.6f\n", iter, old_key, key); fflush(stdout); if (key == old_key) break; } fprintf (stderr, "\n"); free(p); return g; }
/*
 * Build an adaptive hierarchical clustering tree over a dataset.
 *
 * The tree is stored flat in `ahct` as an array of Cluster records. The
 * root (holding the ids 0..n-1 of every point) is appended first; clusters
 * are then processed in array order. For the cluster at position iclu the
 * adaptive branch factor is
 *
 *     bfi = min(bf, round(npts / rho))
 *
 * - bfi < 2 : the cluster is marked ClusterType_Leaf and left untouched.
 * - otherwise: its points are gathered from ds->data, split into bfi groups
 *   with kmeans(), and the cluster becomes ClusterType_Inner: it receives a
 *   private copy of the bfi centroids in `cents`, and `children` is filled
 *   with the positions in `ahct` of the newly appended child clusters.
 *
 * Because children are appended at the end of `ahct`, the loop terminates
 * once every appended cluster has been visited.
 *
 *   ahct  destination array of Cluster elements (must be initialised by the
 *         caller; this function only appends to it)
 *   bf    maximum branch factor, > 0
 *   rho   divisor used to derive the branch factor from the cluster size, > 0
 *   ds    dataset; ds->n points of ds->d values each, point i starting at
 *         ds->data + i * ds->d (assumed to be float data -- confirm against
 *         the fDataSet declaration)
 *
 * If the scratch array of per-child id lists cannot be allocated, an error
 * is written to stderr and the function returns without touching `ahct`.
 */
void ahc_clustering(DyArray *ahct, int bf, int rho, const fDataSet *ds){
    ASSERTINFO(ahct == NULL || bf <= 0 || rho <= 0 || ds == NULL, "IPP");

    int n = ds->n;
    int d = ds->d;

    Cluster cur;            /* working copy of the cluster being split   */
    Cluster node;           /* scratch record for the root / each child  */
    Cluster *pclu = NULL;   /* address of the current cluster inside ahct */
    int iclu, bfi, ni, ori_id;

    /* scratch buffers sized for the largest possible split (bf children) */
    int *nassign = ivec_new_set(bf, 0);
    int *assign = NULL;
    float *cent = fvec_new(d*bf);
    float *mem_points = NULL;
    DyArray *member = (DyArray*)malloc(sizeof(*member) * (size_t)bf);

    if(member == NULL){
        fprintf(stderr, "ahc_clustering: out of memory\n");
        FREE(nassign);
        FREE(cent);
        return;
    }

    /* initialize the first cluster (root) and add it to the ahc tree */
    Cluster_init(&node, n);
    for(int i = 0; i < n; i++){
        node.idx[i] = i;
    }
    node.type = ClusterType_Root;
    DyArray_add(ahct, (void*)&node, 1);

    /* begin the loop of adaptive hierarchical clustering */
    iclu = 0;
    while(iclu < ahct->count){
        /* figure out the adaptive branch factor of the iclu-th cluster */
        pclu = (Cluster*)DyArray_get(ahct, iclu, 1);
        ni = pclu->npts;
        bfi = i_min(bf, (int)round(ni / (float)rho));

        if(bfi < 2){
            /* too small to split: this is a leaf cluster, just mark it */
            pclu->type = ClusterType_Leaf;
        }else{
            printf("----------------- cluster %d, bfi-%d:\n", iclu, bfi);

            /*
             * inner cluster: divide it.
             * Work on a local copy, because appending children to ahct
             * below may reallocate its storage and invalidate pclu.
             */
            cur = *pclu;

            /* gather this cluster's points from the original dataset;
             * memcpy counts bytes, so one point is d * sizeof(float) */
            mem_points = fvec_new(ni * d);
            for(int i = 0; i < ni; i++){
                memcpy(mem_points + (size_t)i * d,
                       ds->data + (size_t)cur.idx[i] * d,
                       sizeof(*mem_points) * (size_t)d);
            }

            /* divide this cluster; the return value is not needed here */
            assign = ivec_new(ni);
            kmeans(d, ni, bfi, CLUSTERING_NITER, mem_points,
                   CLUSTERING_NTHREAD | KMEANS_QUIET | KMEANS_INIT_BERKELEY,
                   CLUSTERING_SEED, CLUSTERING_NREDO,
                   cent, NULL, assign, nassign);

            /* prepare space for the members' ids of each child */
            for(int i = 0; i < bfi; i++){
                DyArray_init(&member[i], sizeof(int), nassign[i]);
            }

            /* distribute the original point ids among the children */
            for(int i = 0; i < ni; i++){
                ori_id = cur.idx[i];
                DyArray_add(&member[assign[i]], (void*)&ori_id, 1);
            }

            /* fill in type, centroids and children of this cluster */
            cur.type = ClusterType_Inner;
            cur.cents = fvec_new(d * bfi);
            memcpy(cur.cents, cent, sizeof(float)*d*bfi);
            DyArray_init(&cur.children, sizeof(int), bfi);

            for(int i = 0; i < bfi; i++){
                Cluster_init(&node, nassign[i]);
                if(nassign[i] > 0){
                    memcpy(node.idx, (int*)member[i].elem, sizeof(int)*nassign[i]);
                }
                /* the child will land at the current end of ahct */
                DyArray_add(&cur.children, (void*)&ahct->count, 1);
                DyArray_add(ahct, (void*)&node, 1);
            }

            /* the elements of ahct may have moved while it grew, so fetch
             * the up-to-date address before writing the result back */
            pclu = (Cluster*)DyArray_get(ahct, iclu, 1);
            *pclu = cur;

            /* report */
            ivec_print(nassign, bfi);
            ivec_print((int*)cur.children.elem, cur.children.count);

            /* release per-split scratch data */
            FREE(mem_points);
            FREE(assign);
            for(int i = 0; i < bfi; i++){
                DyArray_unset(&member[i]);
            }
        }

        /* move to next cluster */
        iclu++;
    }

    FREE(nassign);
    FREE(cent);
    FREE(member);
}