/*
 * Create the state of an online PCA over vectors of dimension d.
 *
 * The sample counter starts at 0. The mean (d floats) and the covariance
 * (d*d floats) are obtained from fvec_new_0; the eigenvector (d*d floats)
 * and eigenvalue (d floats) buffers are obtained from fvec_new.
 *
 * Returns the malloc'ed structure.
 */
pca_online_t * pca_online_new (int d)
{
  /* widen before multiplying so the element count is computed as a long */
  const long dim2 = d * (long) d;

  pca_online_t * state = (pca_online_t *) malloc (sizeof (*state));

  state->d = d;
  state->n = 0;

  state->mu = fvec_new_0 (d);
  state->cov = fvec_new_0 (dim2);

  state->eigvec = fvec_new (dim2);
  state->eigval = fvec_new (d);

  return state;
}
float *fmat_new_covariance (int d, int n, const float *v, float *avg, int assume_centered) { long i, j; float *cov = fvec_new_0 (d * d); if(!assume_centered) { float *sums = avg ? avg : fvec_new(d); fvec_0(sums,d); for (i = 0; i < n; i++) for (j = 0; j < d; j++) sums[j] += v[i * d + j]; for (i = 0; i < d; i++) for (j = 0; j < d; j++) cov[i + j * d] = sums[i] * sums[j]; if(avg) for(i=0;i<d;i++) avg[i]/=n; else free (sums); } FINTEGER di=d,ni=n; if(0) { float alpha = 1.0 / n, beta = -1.0 / (n * n); sgemm_ ("N", "T", &di, &di, &ni, &alpha, v, &di, v, &di, &beta, cov, &di); } else if(1) { /* transpose input matrix */ float *vt=fvec_new(n*d); for(i=0;i<d;i++) for(j=0;j<n;j++) vt[i*n+j]=v[j*d+i]; float alpha = 1.0 / n, beta = -1.0 / (n * n); sgemm_ ("T", "N", &di, &di, &ni, &alpha, vt, &ni, vt, &ni, &beta, cov, &di); free(vt); } else { float alpha = 1.0 / n, beta = -1.0 / (n * n); ssyrk_("L","N", &di, &ni, &alpha,(float*)v,&di,&beta,cov,&di); /* copy lower triangle to upper */ for(i=0;i<d;i++) for(j=i+1;j<d;j++) cov[i+j*d]=cov[j+i*d]; } return cov; }
/*
 * Expand a sparse vector into a newly allocated dense vector of length n.
 *
 *   idx, v   nz (index, value) pairs
 *   nz       number of pairs
 *   n        length of the dense output
 *
 * Pairs whose index falls outside [0, n) are ignored ("bad bins"): negative
 * indices were already skipped, and indices >= n are now skipped as well
 * instead of writing past the end of the buffer. When an index appears
 * several times, the last value wins.
 *
 * Returns the buffer obtained from fvec_new_0 (n).
 */
float * spfvec_to_fvec (int * idx, float * v, int nz, int n)
{
  float * ret = fvec_new_0 (n);

  for (int i = 0 ; i < nz ; i++) {
    const int bin = idx[i];
    if (bin >= 0 && bin < n)     /* ignore bad bins */
      ret[bin] = v[i];
  }

  return ret;
}
void gmm_handle_empty(int n, const float *v, gmm_t *g, float *p) { long d=g->d, k=g->k; long nz=fvec_count_occurrences(p,k*n,0); printf("nb of 0 probabilities: %ld / (%ld*%d) = %.1f %%\n", nz,k,n,nz*100.0/(k*n)); int i,j; float *w=fvec_new_0(k); for (i = 0 ; i < n ; i++) for (j = 0 ; j < k ; j++) w[j]+=p[j+i*k]; int bigprime=1000003; for (j = 0 ; j < k ; j++) if(w[j]==0) { printf("center %d is empty....",j); fflush(stdout); int j2; j2=j; for(i=0; i<k; i++) { j2=(j2+bigprime)%k; if(w[j2]>0) break; } assert(i<k || !"could not find centroid to split, veeeery bad input data"); /* dimension to split: that with highest variance */ int split_dim = fvec_arg_max (g->sigma + d * j2, d); /* transfer half(?) of the points from j2 -> j */ int nt=0,nnz=0; for(i=0; i<n; i++) if(p[j2+i*k]>0) { nnz++; if(v[i*d+split_dim]<g->mu[j2*d+split_dim]) { p[j+i*k]=p[j2+i*k]; p[j2+i*k]=0; nt++; } } printf("split %d at dim %d (variance %g, transferred %d/%d pts)\n", j2,split_dim,g->sigma[d*j2+split_dim],nt,nnz); w[j2]=-1; /* avoid further splits */ } free(w); }
/*
 * Allocate an nrow x ncol float matrix through fvec_new_0.
 * The element count is formed as a long so the product is not done in int.
 */
float *fmat_new_0 (int nrow, int ncol)
{
  const long count = (long) nrow * ncol;
  return fvec_new_0 (count);
}
void HBPlus::inner_lb_distance_OnePerPoint(const fDataSet *ds) { int i, j, nci, otheri; float dis = 0; float *xcenter = fvec_new(d); float *ocenter = fvec_new(d); float *x = fvec_new(d); // distance between each centroid pair float *centroid_dis_map = fvec_new_0(ncenter*ncenter); innerLB = (DoubleIndex **)malloc(sizeof(DoubleIndex*)*ncenter); for(i = 0; i < ncenter; i++){ innerLB[i] = NULL; } /// prepare distances between each two centroids for(i = 0; i < ncenter; i++) { memcpy(xcenter, centroid+i*d, sizeof(float)*d); for(j = 0; j <= i; j++) { memcpy(ocenter, centroid+j*d, sizeof(float)*d); dis = odistance(xcenter, ocenter, d); centroid_dis_map[i*ncenter+j] = dis; if(i != j) { centroid_dis_map[j*ncenter+i] = dis; } } } // initialize the storing space for inner distance of each member point for(nci = 0; nci < ncenter; nci++) { /// cnt_member_points int cnt_member = member[nci].size(); innerLB[nci] = (DoubleIndex*)malloc(sizeof(DoubleIndex) * cnt_member); for(i = 0; i < cnt_member; i++) { innerLB[nci][i].id = -1; innerLB[nci][i].val = FLOAT_MAX; } } for(nci = 0; nci < ncenter; nci++) { /* in each centroid */ memcpy(xcenter, centroid+nci*d, sizeof(float)*d); // the current centroid int cnt_member = member[nci].size(); // cnt member points /* for each member points */ for(i = 0; i < cnt_member; i++){ memcpy(x, ds->data+member[nci][i]*d, sizeof(float)*d); /* for each other centroid */ for(otheri = 0; otheri < ncenter; otheri++) { if(otheri != nci) { memcpy(ocenter, centroid+otheri*d, sizeof(float)*d); dis = (odistance_square(x, ocenter, d) - odistance_square(x, xcenter, d)) / (2*centroid_dis_map[nci*ncenter+otheri]); if(f_bigger(innerLB[nci][i].val, dis)) {// update using smaller distance innerLB[nci][i].val = dis; innerLB[nci][i].id = member[nci][i]; // id is the data point } } } } // sort member data points along the innerLB distance in the nci-th cluster DI_MergeSort(innerLB[nci], 0, cnt_member-1); } free(centroid_dis_map); centroid_dis_map = NULL; free(ocenter); 
ocenter = NULL; free(xcenter); xcenter = NULL; free(x); x = NULL; }
/*
 * Learn a mixture with ki components from ni vectors of dimension di.
 *
 *   di, ni, ki   dimension, number of vectors, number of components
 *   niter        iteration cap, used for both kmeans and EM; 0 means 10000
 *   v            input vectors (ni * di floats)
 *   nt           forwarded to kmeans and to the E and M steps
 *   seed, nredo  forwarded to kmeans
 *   flags        GMM_FLAGS_*; with GMM_FLAGS_PURE_KMEANS no EM iteration runs
 *
 * The means come from kmeans, the weights start at 1/k and every variance
 * starts at the average of the values kmeans returns in its `dis` output.
 * EM then iterates until the sum of all mean coordinates is exactly the same
 * on two consecutive iterations, or the cap is reached.
 *
 * Returns the mixture created by gmm_new.
 */
gmm_t * gmm_learn (int di, int ni, int ki, int niter, const float * v, int nt, int seed, int nredo, int flags)
{
  long dim = di, ncomp = ki, npt = ni;

  if (niter == 0)
    niter = 10000;

  /* the GMM parameters */
  float * post = fvec_new_0 (npt * ncomp);   /* p(ci|x) for all i */
  gmm_t * g = gmm_new (dim, ncomp);

  /* initialize the GMM: k-means + variance estimation */
  int * nassign = ivec_new (npt);  /* not useful -> to be removed when debugged */
  float * dis = fvec_new (npt);
  kmeans (dim, npt, ncomp, niter, v, nt, seed, nredo, g->mu, dis, NULL, nassign);

  fflush (stderr);
  fprintf (stderr, "assign = ");
  ivec_print (nassign, ncomp);
  fprintf (stderr, "\n");
  free (nassign);

  /* initialization of the GMM parameters assuming a diagonal matrix */
  fvec_set (g->w, ncomp, 1.0 / ncomp);
  double sig = fvec_sum (dis, npt) / npt;
  printf ("sigma at initialization = %.3f\n", sig);
  fvec_set (g->sigma, ncomp * dim, sig);
  free (dis);

  /* start the EM algorithm */
  printf ("<><><><> GMM <><><><><>\n");

  if (flags & GMM_FLAGS_PURE_KMEANS)
    niter = 0;

  double key = 666;
  int iter = 1;
  while (iter <= niter) {
    /* E step, empty-component repair, M step */
    gmm_compute_p_thread (npt, v, g, post, flags, nt);
    fflush (stdout);

    gmm_handle_empty (npt, v, g, post);

    gmm_compute_params (npt, v, post, g, flags, nt);
    fflush (stdout);

    /* convergence reached -> leave */
    const double old_key = key;
    key = fvec_sum (g->mu, ncomp * dim);

    printf ("keys %5d: %.6f -> %.6f\n", iter, old_key, key);
    fflush (stdout);

    if (key == old_key)
      break;

    iter++;
  }
  fprintf (stderr, "\n");

  free (post);

  return g;
}
float ANC::neighbor_cluster_estimation(const fDataSet *ds, int nth) { /// check for necessary data: centroids, basedata ASSERTINFO(ds == NULL || centroid == NULL || ds->data == NULL, "IPP"); /// prepare for necessary variables neighbor.resize(ncenter); int i, iclu = -1, ineighbor = -1; int K = ncenter; int n = ds->n; int *tmp_assign = ivec_new_set(n * g, -1); float *tmp_dis = fvec_new_0(n * g); int *neighbor_flag = ivec_new_set(ncenter*ncenter, 0); /// find k-nn among all centroids for each base vector: query=basedata, dataset=centroids, k=2 for neighbor cluster knn_full_thread ( 2, // euclidean distance n, K, d, g, // g-nn centroid, ds->data, NULL, tmp_assign, tmp_dis, nth); // extract neighbor clusters for each cluster for(i = 0; i < n; i++) { iclu = tmp_assign[i*g]; // current cluster = current point's 1-NN for (int ig = 1; ig < g; ig++){ ineighbor = tmp_assign[i*g+ig]; // current neighbor cluster = current point's g-thNN if(0 == neighbor_flag[iclu*ncenter+ineighbor]){ neighbor[iclu].push_back(ineighbor); neighbor_flag[iclu*ncenter+ineighbor] = 1; } } } puts("end neighbor"); // check number of neighbor cluster for(i = 0; i < K; i++){ ASSERTINFO(neighbor[i].size() == 0, "warning: there is a cluster who has no neighbors"); } puts(">>> finished neighbor cluster registration"); ///### display neighbor cluster count puts(">>> neighbor cluster"); int sum_neighbor = 0; for(i = 0; i < K; i++){ // printf("\n%d - %d\t", i, neighbor[i].size()); sum_neighbor += neighbor[i].size(); } float avg_neighbor = sum_neighbor / (float)K; /* if(K <= 10){ for(i = 0; i < K; i++){ printf("\n%d - %d\t", i, neighbor[i].size()); for(ineighbor = 0; ineighbor < neighbor[i].size(); ineighbor++){ printf("%d ", neighbor[i][ineighbor]); } } }*/ /// disallocate space FREE(tmp_assign); FREE(tmp_dis); return avg_neighbor; }
void vlad_compute(int k, int d, const float *centroids, int n, const float *v,int flags, float *desc) { int i,j,l,n_quantile,i0,i1,ai,a,ma,ni; int *perm ; float un , diff; float *tab,*u,*avg,*sum,*mom2,*dists; int *hist,*assign; if(flags<11 || flags>=13) { assign=ivec_new(n); nn(n,k,d,centroids,v,assign,NULL,NULL); if(flags==6 || flags==7) { n_quantile = flags==6 ? 3 : 1; fvec_0(desc,k*d*n_quantile); perm = ivec_new(n); tab = fvec_new(n); ivec_sort_index(assign,n,perm); i0=0; for(i=0;i<k;i++) { i1=i0; while(i1<n && assign[perm[i1]]==i) { i1++; } if(i1==i0) continue; for(j=0;j<d;j++) { for(l=i0;l<i1;l++) { tab[l-i0]=v[perm[l]*d+j]; } ni=i1-i0; fvec_sort(tab,ni); for(l=0;l<n_quantile;l++) { desc[(i*d+j)*n_quantile+l]=(tab[(l*ni+ni/2)/n_quantile]-centroids[i*d+j])*ni; } } i0=i1; } free(perm); free(tab); } else if(flags==5) { fvec_0(desc,k*d); for(i=0;i<n;i++) { for(j=0;j<d;j++) { desc[assign[i]*d+j]+=v[i*d+j]; } } } else if(flags==8 || flags==9) { fvec_0(desc,k*d); u = fvec_new(d); for(i=0;i<n;i++) { fvec_cpy(u,v+i*d,d); fvec_sub(u,centroids+assign[i]*d,d); un=(float)sqrt(fvec_norm2sqr(u,d)); if(un==0) continue; if(flags==8) { fvec_div_by(u,d,un); } else if(flags==9) { fvec_div_by(u,d,sqrt(un)); } fvec_add(desc+assign[i]*d,u,d); } free(u); } else if(flags==10) { fvec_0(desc,k*d); for(i=0;i<n;i++) { for(j=0;j<d;j++) { desc[assign[i]*d+j]+=v[i*d+j]; } } for(i=0;i<k;i++) { fvec_normalize(desc+i*d,d,2.0); } } else if(flags==13) { fvec_0(desc,k*d); for(i=0;i<n;i++) { for(j=0;j<d;j++) { desc[assign[i]*d+j]+=(float)sqr(v[i*d+j]-centroids[assign[i]*d+j]); } } } else if(flags==14) { avg = fvec_new_0(k*d); for(i=0;i<n;i++) { for(j=0;j<d;j++) { avg[assign[i]*d+j]+=v[i*d+j]-centroids[assign[i]*d+j]; } } hist=ivec_new_histogram(k,assign,n); for(i=0;i<k;i++) { if(hist[i]>0) { for(j=0;j<d;j++) { avg[i*d+j]/=hist[i]; } } } free(hist); fvec_0(desc,k*d); for(i=0;i<n;i++) { for(j=0;j<d;j++) { desc[assign[i]*d+j]+=(float)(sqr(v[i*d+j]-centroids[assign[i]*d+j]-avg[assign[i]*d+j])); } } 
fvec_sqrt(desc,k*d); free(avg); } else if(flags==15) { fvec_0(desc,k*d*2); sum = desc; for(i=0;i<n;i++) { for(j=0;j<d;j++) { sum[assign[i]*d+j]+=v[i*d+j]-centroids[assign[i]*d+j]; } } hist = ivec_new_histogram(k,assign,n); mom2 = desc+k*d; for(i=0;i<n;i++) { ai=assign[i]; for(j=0;j<d;j++) { mom2[ai*d+j]+=(float)(sqr(v[i*d+j]-centroids[ai*d+j]-sum[ai*d+j]/hist[ai])); } } fvec_sqrt(mom2,k*d); free(hist); } else if(flags==17) { fvec_0(desc,k*d*2); for(i=0;i<n;i++) { for(j=0;j<d;j++) { diff=v[i*d+j]-centroids[assign[i]*d+j]; if(diff>0) { desc[assign[i]*d+j]+=diff; } else { desc[assign[i]*d+j+k*d]-=diff; } } } } else { fvec_0(desc,k*d); for(i=0;i<n;i++) { for(j=0;j<d;j++) { desc[assign[i]*d+j]+=v[i*d+j]-centroids[assign[i]*d+j]; } } if(flags==1) { hist=ivec_new_histogram(k,assign,n); /* printf("unbalance factor=%g\n",ivec_unbalanced_factor(hist,k)); */ for(i=0;i<k;i++) { for(j=0;j<d;j++) { desc[i*d+j]/=hist[i]; } } free(hist); } if(flags==2) { for(i=0;i<k;i++) { fvec_normalize(desc+i*d,d,2.0); } } if(flags==3 || flags==4) { assert(!"not implemented"); } if(flags==16) { hist=ivec_new_histogram(k,assign,n); for(i=0;i<k;i++) { if(hist[i]>0) { fvec_norm(desc+i*d,d,2); fvec_mul_by(desc+i*d,d,sqrt(hist[i])); } } free(hist); } } free(assign); } else if(flags==11 || flags==12) { ma=flags==11 ? 4 : 2; assign=ivec_new(n*ma); dists=knn(n,k,d,ma,centroids,v,assign,NULL,NULL); fvec_0(desc,k*d); for(i=0;i<n;i++) { for(j=0;j<d;j++) { for(a=0;a<ma;a++) { desc[assign[ma*i+a]*d+j]+=v[i*d+j]-centroids[assign[ma*i+a]*d+j]; } } } free(dists); free(assign); } }
void gmm_fisher_spatial(int N, int K, int D, const float *Q, const float *sgmm, const float *ll, float *sdesc) { float *Q_sum = fvec_new_0(K); { long k, n; for(n = 0; n < N; n++) for(k = 0; k < K; k++) Q_sum[k] += Q[n * K + k]; for(k = 0; k < K; k++) Q_sum[k] /= N; } float *Q_ll, *Q_ll_2; { /* prepare a matrix containing both ll and ll**2 */ float *ll_ll2 = fvec_new(D * 2 * N); fvec_cpy(ll_ll2, ll, D * N); float *ll2 = ll_ll2 + D * N; long i; for(i = 0; i < D * N; i++) ll2[i] = ll[i] * ll[i]; /* compute Q.T * ll_ll2 */ FINTEGER mi = K, ni = 2 * D, ki = N; float one_over_N = 1.0 / N, zero = 0; Q_ll = fvec_new(K * 2 * D); Q_ll_2 = Q_ll + K * D; sgemm_("N", "N", &mi, &ni, &ki, &one_over_N, Q, &mi, ll_ll2, &ki, &zero, Q_ll, &mi); free(ll_ll2); } { const float *mm = sgmm; float *d_mm = sdesc; long k, d; for(d = 0; d < D; d++) for(k = 0; k < K; k++) d_mm[d + k * D] = Q_ll[K * d + k] - Q_sum[k] * mm[d]; float *d_S = sdesc + K * D; const float *S = sgmm + D; for(d = 0; d < D; d++) { float dfact = S[d] - mm[d] * mm[d]; for(k = 0; k < K; k++) d_S[d + k * D] = -Q_ll_2[K * d + k] + 2 * Q_ll[K * d + k] * mm[d] + Q_sum[k] * dfact; } } free(Q_ll); free(Q_sum); }
void gmm_fisher_from_posteriors(int n, const float *v, const gmm_t * g, int flags, const float *p, float *dp_dlambda) { long d=g->d, k=g->k; long i,j,l; long ii=0; float * vp = NULL; /* v*p */ float * sum_pj = NULL; /* sum of p's for a given j */ #define P(j,i) p[(i)*k+(j)] #define V(l,i) v[(i)*d+(l)] #define MU(l,j) g->mu[(j)*d+(l)] #define SIGMA(l,j) g->sigma[(j)*d+(l)] #define VP(l,j) vp[(j)*d+(l)] if(flags & GMM_FLAGS_W) { float *accus = fvec_new_0(k); for(i=0;i<n;i++) for(j=1;j<k;j++) accus[j] += P(j,i)/g->w[j] - P(0,i)/g->w[0]; for(j=1;j<k;j++) { double accu=accus[j]; /* normalization */ double f=n*(1/g->w[j]+1/g->w[0]); dp_dlambda[ii++]=accu/sqrt(f); } free(accus); } if(flags & GMM_FLAGS_MU) { float *dp_dmu=dp_dlambda+ii; #define DP_DMU(l,j) dp_dmu[(j)*d+(l)] if(0) { /* simple and slow */ for(j=0;j<k;j++) { for(l=0;l<d;l++) { double accu=0; for(i=0;i<n;i++) accu += P(j,i) * (V(l,i)-MU(l,j)) / SIGMA(l,j); DP_DMU(l,j)=accu; } } } else { /* complicated and fast */ /* precompute tables that may be useful for sigma too */ vp = fvec_new(k * d); fmat_mul_tr(v,p,d,k,n,vp); sum_pj = fvec_new_0(k); for(i=0;i<n;i++) for(j=0;j<k;j++) sum_pj[j] += P(j,i); for(j=0;j<k;j++) { for(l=0;l<d;l++) DP_DMU(l,j) = (VP(l,j) - MU(l,j) * sum_pj[j]) / SIGMA(l,j); } } /* normalization */ if(!(flags & GMM_FLAGS_NO_NORM)) { for(j=0;j<k;j++) for(l=0;l<d;l++) { float nf = sqrt(n*g->w[j]/SIGMA(l,j)); if(nf > 0) DP_DMU(l,j) /= nf; } } #undef DP_DMU ii+=d*k; } if(flags & (GMM_FLAGS_SIGMA | GMM_FLAGS_1SIGMA)) { if(flags & GMM_FLAGS_1SIGMA) { /* fast not implemented for 1 sigma */ for(j=0;j<k;j++) { double accu2=0; for(l=0;l<d;l++) { double accu=0; for(i=0;i<n;i++) accu += P(j,i) * (sqr(V(l,i)-MU(l,j)) / SIGMA(l,j) - 1) / sqrt(SIGMA(l,j)); if(flags & GMM_FLAGS_SIGMA) { double f=flags & GMM_FLAGS_NO_NORM ? 1.0 : 2*n*g->w[j]/SIGMA(l,j); dp_dlambda[ii++]=accu/sqrt(f); } accu2+=accu; } if(flags & GMM_FLAGS_1SIGMA) { double f=flags & GMM_FLAGS_NO_NORM ? 
1.0 : 2*d*n*g->w[j]/SIGMA(0,j); dp_dlambda[ii++]=accu2/sqrt(f); } } } else { /* fast and complicated */ assert(flags & GMM_FLAGS_SIGMA); float *dp_dsigma = dp_dlambda + ii; if(!vp) { vp = fvec_new(k * d); fmat_mul_tr(v,p,d,k,n,vp); } if(!sum_pj) { sum_pj = fvec_new(k); for(j=0;j<k;j++) { double sum=0; for(i=0;i<n;i++) sum += P(j,i); sum_pj[j] = sum; } } float *v2 = fvec_new(n * d); for(i = n*d-1 ; i >= 0; i--) v2[i] = v[i] * v[i]; float *v2p = fvec_new(k * d); fmat_mul_tr(v2,p,d,k,n,v2p); free(v2); #define V2P(l,j) v2p[(j)*d+(l)] #define DP_DSIGMA(i,j) dp_dsigma[(i)+(j)*d] for(j=0;j<k;j++) { for(l=0;l<d;l++) { double accu; accu = V2P(l, j); accu += VP(l, j) * (- 2 * MU(l,j)); accu += sum_pj[j] * (sqr(MU(l,j)) - SIGMA(l,j)); /* normalization */ double f; if(flags & GMM_FLAGS_NO_NORM) { f = pow(SIGMA(l,j), -1.5); } else { f = 1 / (SIGMA(l,j) * sqrt(2*n*g->w[j])); } DP_DSIGMA(l,j) = accu * f; } } free(v2p); #undef DP_DSIGMA #undef V2P ii += d * k; } } assert(ii==gmm_fisher_sizeof(g,flags)); #undef P #undef V #undef MU #undef SIGMA free(sum_pj); free(vp); }
void Clustering::neighbor_cluster_estimation(const fDataSet *ds, int nth) { /// check for necessary data: centroids, basedata ASSERTINFO(ds == NULL || centroid == NULL || ds->data == NULL, "IPP"); /// prepare for necessary variables neighbor.resize(ncenter); int i, iclu = -1, ineighbor = -1; int K = ncenter; int n = ds->n; int d = ds->d; int *tmp_assign = ivec_new_set(n * 2, -1); float *tmp_dis = fvec_new_0(n * 2); int *neighbor_flag = ivec_new_set(ncenter*ncenter, 0); /// find k-nn among all centroids for each base vector: query=basedata, dataset=centroids, k=2 for neighbor cluster knn_full_thread ( 2, // euclidean distance n, K, d, 2, // 2-nn centroid, ds->data, NULL, tmp_assign, tmp_dis, nth); // extract neighbor clusters for each cluster for(i = 0; i < n; i++) { iclu = tmp_assign[i*2]; // current cluster = current point's 1-NN ineighbor = tmp_assign[i*2+1]; // current neighbor cluster = current point's 2-NN if(0 == neighbor_flag[iclu*ncenter+ineighbor]){ neighbor[iclu].push_back(ineighbor); neighbor_flag[iclu*ncenter+ineighbor] = 1; } } puts(">>> finished neighbor cluster registration"); ///### display neighbor cluster count puts(">>> neighbor cluster"); int sum_neighbor = 0; for(i = 0; i < K; i++){ printf("\n%d - %d\t", i, neighbor[i].size()); sum_neighbor += neighbor[i].size(); } printf("\naveragely %lf neighbors\n", sum_neighbor / (float)K); /* if(K <= 500){ for(i = 0; i < K; i++){ printf("\n%d - %d\t", i, neighbor[i].size()); int ineighbor; for(ineighbor = 0; ineighbor < neighbor[i].size(); ineighbor++){ printf("%d ", neighbor[i][ineighbor]); } } } */ printf("\naveragely %lf neighbors\n", sum_neighbor / (float)K); /// disallocate space FREE(tmp_assign); FREE(tmp_dis); }