static double merge_sumapprox_Tdt(int k) { double x[ddN.DT]; int d; for (d=0; d<ddN.DT; d++) { double xp = 0, xm = 0; int nn, tt, TdT; nn = ddS.Ndt[d][k]; if ( nn<=1 ) { x[d] = 0; continue; } tt = ddS.Tdt[d][k]; TdT = getTdT(d); if ( tt<nn ) xp = (ddP.bpar + ddP.apar*TdT) * S_V(ddC.SX,nn,tt+1) * alphabasetopicprob(k); if ( tt>1) { ddS.TDt[k]--; ddS.TDT--; xm = 1.0/(ddP.bpar + ddP.apar*(TdT-1)) / S_V(ddC.SX,nn,tt) / alphabasetopicprob(k); ddS.TDt[k]++; ddS.TDT++; } if ( xm>xp ) x[d] = xm; else x[d] = xp; } // ????????? }
double S_UV(stable_t *sp, unsigned n, unsigned m) { double SV; if ( m==1 ) return -INFINITY; if ( m==n+1 ) return 1; SV = S_V(sp,n,m); return (n - m*sp->a)*SV + 1.0; }
double S_U(stable_t *sp, unsigned n, unsigned m) { if ( m==1 ) return n - sp->a; if ( m<=1 ) yaps_quit("Bad constraints in S_U(%s,%u,%u)\n", sp->tag, n, m); assert(m>1); return n - m*sp->a + 1/S_V(sp,n,m); }
int main(int argc, char* argv[]) { int c; int n, t; stable_t *S; /* * default values for args */ while ( (c=getopt(argc, argv,"ha:N:T:"))>=0 ) { switch ( c ) { case 'a': if ( !optarg || sscanf(optarg,"%f",&apar)!=1 ) yaps_quit("Need a valid 'a' argument\n"); break; case 'N': if ( !optarg || sscanf(optarg,"%d",&N)!=1 ) yaps_quit("Need a valid 'N' argument\n"); break; case 'T': if ( !optarg || sscanf(optarg,"%d",&T)!=1 ) yaps_quit("Need a valid 'T' argument\n"); break; case 'h': usage(); exit(0); } } if ( T>N ) T = N; if ( !(S=S_make(N+1,T,2*N,2*T,apar,S_UVTABLE|S_STABLE|S_VERBOSE)) ) yaps_quit("S_make failed\n"); S_report(S, stdout); #if 1 /* * list various values */ printf("S(%d,%d) = %10.6lg, V=n/a U=%lg\n", N, 1, S_S(S,N,1), S_U(S,N,1)); for (t=2; t<=T; t++) printf("S(%d,%d) = %10.6lg, V=%lg U=%lg\n", N, t, S_S(S,N,t), S_V(S,N,t), S_U(S,N,t)); printf("\nS(%d,%d) = %10.6lg, V=n/a U=%lg\n", N+10, 1, S_S(S,N+10,1), S_U(S,N+10,1)); for (t=2; t<=2*T+10; t++) printf("S(%d,%d) = %10.6lg, V=%lg U=%lg\n", N+10, t, S_S(S,N+10,t), S_V(S,N+10,t), S_U(S,N+10,t)); for (n=T+1; n<=2*N; n++) printf("S(%d,%d..) = %10.6lg %10.6lg\n", n, T, S_S(S,n,T), S_S(S,n,T+1)); for (n=2; n<8; n++ ) { printf("S(%d,%d) = %10.6lg ", n, 1, S_S(S,n,1)); for (t=2; t<=n; t++) printf(" %10.6lg", S_S(S,n,t)); printf("\n"); } for (n=2; n<8; n++ ) { printf("V(%d,%d) = %10.6lg", n, 2, S_V(S,n,2)); for (t=3; t<=n; t++) printf(" %10.6lg", S_V(S,n,t)); printf("\n"); } for (n=2; n<8; n++ ) { printf("U(%d,%d) = %10.6lg", n, 2, S_U(S,n,2)); for (t=3; t<=n; t++) printf(" %10.6lg", S_U(S,n,t)); printf("\n"); } #else { /* * Sample a partition of size T of N by Chinese Rest. distribution * start by sampling the last entry from (1,2,...,N-T+1); * see p(m | CRD, apr, N, T) on page 4 of "doc/alpha.pdf" */ double *prob = malloc(sizeof(*prob)*N); double probtot = 1.0; prob[1] = 1.0; for (t=2; t<=N-T+1; t++) probtot += prob[t] = (N-t+1)*(t-apar) / S_U(S,N-t,T-1)/(t-1) * prob[t-1]; for (t=1; t<=N-T+1; t++) printf("p(m=%d) = %lg\n", t, prob[t]/probtot ); } #endif S_report(S, stdout); S_free(S); return 0; }
int main(int argc, char* argv[]) { int i, j, c, iter, ITER=200; unsigned long int seed=0; int bcycle = 0; float bstart = 0; int acycle = 0; float astart = 0; int burnin = 0; stable_t *ST = NULL; int useN = DIM*2; MAXN = 1; MAXT = MAXSTAB; /* * default values for args */ while ( (c=getopt(argc, argv,"a:b:B:C:I:hH:I:N:P:S:s:T:v"))>=0 ) { switch ( c ) { case 'h': usage(burnin?burnin:ITER/2, ITER, useN); exit(0); case 'b': if ( !optarg || sscanf(optarg,"%f,%f",&bpar, &bstart)<1 ) yaps_quit("Need a valid 'b' argument\n"); break; case 'a': if ( !optarg || sscanf(optarg,"%f,%f",&apar,&astart)<1 ) yaps_quit("Need a valid 'a' argument\n"); break; case 'H': if ( !optarg || sscanf(optarg,"%d",&bcycle)!=1 ) yaps_quit("Need a valid 'G' argument\n"); break; case 'T': if ( !optarg || sscanf(optarg,"%d",&MAXT)!=1 ) yaps_quit("Need a valid 'T' argument\n"); break; case 'I': if ( !optarg || sscanf(optarg,"%d",&acycle)!=1 ) yaps_quit("Need a valid 'H' argument\n"); break; case 'N': if ( !optarg || sscanf(optarg,"%d",&useN)!=1 ) yaps_quit("Need a valid 'N' argument\n"); break; case 'C': if ( !optarg || sscanf(optarg,"%d",&ITER)!=1 ) yaps_quit("Need a valid 'C' argument\n"); break; case 'B': if ( !optarg || sscanf(optarg,"%d",&burnin)!=1 ) yaps_quit("Need a valid 'B' argument\n"); break; case 's': if ( !optarg || sscanf(optarg,"%lu",&seed)!=1 ) yaps_quit("Need a valid 's' argument\n"); break; case 'v': verbose++; break; #ifdef S_USE_THREADS case 'P': if ( !optarg || sscanf(optarg,"%u",&threads)!=1 ) yaps_quit("Need a valid 'P' argument\n"); break; #endif default: yaps_message("Bad command line argument\n\n"); usage(burnin?burnin:ITER/2, ITER, useN); exit(0); } } if ( useN>=MAXDATA ) yaps_quit("N too large\n"); if ( burnin==0 ) burnin = ITER/2; else if ( burnin>=ITER-1 ) yaps_quit("Burnin %d too large for cycles %d\n", burnin, ITER); yaps_message("Configuration details\n"); yaps_message("=====================\n"); /* * set random number generator */ if ( seed ) { rng_seed(rng,seed); } else { rng_time(rng,&seed); } yaps_message("Setting seed for data = %lu\n", seed); if ( acycle && apar==0 ) apar = 0.5; yaps_message("Setting a=%f, b=%f, N=%d, D=%d\n", apar, bpar, useN, NUMMN); yaps_message(" burnin=%d,", burnin); yaps_message(" cycles=%d\n", ITER); /* * fix pointers */ for (j=0; j<NUMMN; j++) { n[j] = &n_data[j*DIM]; t[j] = &t_data[j*DIM]; tave[j] = &tave_data[j*DIM]; } /* * initialise everything */ for (j=0; j<NUMMN; j++) { N[j] = useN; T[j] = 0; Tave[j] = 0; for (i=0; i<DIM; i++) { n[j][i] = 0; t[j][i] = 0; tave[j][i] = 0; } } /* * fix base distribution, uniform */ { for (i=0; i<DIM; i++) { H[i] = 1.0/DIM; } } /* * create data using a CRP to get initialisation for n[] */ c = 0; for (j=0; j<NUMMN; j++) { int cc; i = sampleH(); data[c++] = i; // first entry always adds a table n[j][i]++; t[j][i]++; T[j]++; for (cc=1; cc<N[j]; cc++) { float val = (cc+bpar)*rng_unit(rng); val -= T[j]*apar+bpar; if ( val<=0 ) { // new table i = sampleH(); t[j][i]++; T[j]++; } else { for (i=0; i<DIM; i++) { val -= n[j][i] - t[j][i]*apar; if ( val<0 ) break; } } assert(i<DIM); n[j][i]++; data[c++] = i; } } binit = bpar; /* * record maximum entries in data * do this where possible so that one can get the table * sizes right * */ MAXN = n[0][0]+1; MAXT = 1; for (j=0; j<NUMMN; j++) { for (i=0; i<DIM; i++) { if ( MAXN<=n[j][i] ) MAXN = n[j][i]+1; if ( MAXT<t[j][i] ) MAXT = t[j][i]*1.1+1; } } if ( MAXT>MAXN ) MAXT = MAXN; yaps_message("Making S for N=%d M=%d a=%lf\n", MAXN,MAXT,apar); ST = S_make(MAXN, MAXT, MAXN, MAXTAB, apar, S_STABLE | S_UVTABLE); if ( ST==NULL ) yaps_quit("Making S failed!\n"); S_report(ST,stdout); /* * the seed only sets the data/sample, * the seed for the simulation/Gibbs is always random */ rng_free(rng); rng_time(rng,&seed); //yaps_message("Resetting seed = %lu\n", seed); /* * report on initial data statistics */ yaps_message("\nData sampled\n"); yaps_message("============\n"); for (j=0; j<NUMMN; j++) { yaps_message("n[%d] =", j); for (i=0; i<DIM; i++) yaps_message(" %d", n[j][i]); yaps_message(" = %d\n", N[j]); yaps_message("t[%d] =",j); for (i=0; i<DIM; i++) yaps_message(" %d", t[j][i]); yaps_message(" = %d\n", T[j]); } /* * set the hyperparameters used in Gibbs, * can be different to data */ if ( bstart==0 ) bstart = bpar; if ( astart==0 ) astart = apar; // initialise latent stats and reporting info for (j=0; j<NUMMN; j++) { T[j] = 0; Tave[j] = 0; } tcnt = 0; bave = 0; bcnt = 0; aave = 0; acnt = 0; bpar = bstart; if ( verbose && bcycle!=0 ) yaps_message("Starting with initial b=%f\n", bpar); apar = astart; if ( verbose && acycle!=0 ) yaps_message("Starting with initial a=%f\n", apar); for (j=0; j<NUMMN; j++) { for (i=0; i<DIM; i++) { tave[j][i] = 0; t[j][i] = 0; if ( n[j][i]>0 ) { /* * initialise to a single table */ t[j][i] = 1; T[j]++; } } } for ( iter=0; iter<ITER; iter++) { /* * sampling with table indicators */ c = 0; for (j=0; j<NUMMN; j++) { int cc; for (cc=0; cc<N[j]; cc++) { float one; i = data[c++]; assert(n[j][i]); if ( n[j][i]==1 ) // this indicator must always be 1, no sampling continue; // sample whether it contributes to a table if ( t[j][i]>1 && (n[j][i]-1)*rng_unit(rng)<(t[j][i]-1) ) { t[j][i]--; T[j]--; } assert(t[j][i]<n[j][i]); // sample new table indicator one = H[i] * (bpar + T[j]*apar) * (t[j][i]) / (n[j][i]-t[j][i]+1) * S_V(ST, n[j][i],t[j][i]+1); if ( rng_unit(rng) < one/(one+1.0) ) { t[j][i]++; T[j]++; } } } /* * one major cycle of Gibbs sampler finished */ if ( verbose>1 ) { for (j=0; j<NUMMN; j++) { for (i=0; i<DIM; i++) printf(" %d", t[j][i]); printf(" = %d\n", T[j]); } } /* * sample & record b */ if ( bcycle!=0 && iter%bcycle==0 ) { // Gibbs on bpar (concentration par) too if ( bcycle<0 ) { int bc = -bcycle; for (bc-- ; bc>0; bc--) bpar = sampleb(bpar, 1, PB_shape, PB_scale, N, T, apar, rng, 1, 1); } bpar = sampleb(bpar, 1, PB_shape, PB_scale, N, T, apar, rng, 1, 1); if ( iter>=burnin ) { bave += bpar; bcnt ++; } } /* * sample & record a */ if ( acycle!=0 && iter%acycle==0 ) { int dimI[NUMMN]; double dimb[NUMMN]; for (j=0; j<NUMMN; j++) { dimI[j] = DIM; dimb[j] = bpar; } // Gibbs on apar (discount par) too if ( acycle<0 ) { int bc = -acycle; for (bc-- ; bc>0; bc--) apar = samplea(apar, NUMMN, dimI, T, n, t, NULL, dimb, rng, 1, 1); } apar = samplea(apar, NUMMN, dimI, T, n, t, NULL, dimb, rng, 1, 1); if ( iter>=burnin ) { aave += apar; acnt ++; } if ( verbose>1 ) yaps_message("Extending S for a=%lf\n", apar); if ( S_remake(ST,apar) ) yaps_message("Extending S failed\n"); } /* * full statistics collection */ if ( iter>=burnin ) { for (j=0; j<NUMMN; j++) { for (i=0; i<DIM; i++) { tave[j][i] += t[j][i]; } Tave[j] += T[j]; } tcnt ++; } } /* * report for this experiment */ yaps_message("\nEstimates\n"); yaps_message("=========\n"); for (j=0; j<NUMMN; j++) { yaps_message("t[%d] = ", j); for (i=0; i<DIM; i++) yaps_message(" %.2f", tave[j][i]/tcnt); yaps_message("\nT[%d]=%.2f\n", j, Tave[j]/tcnt); } if ( bcycle!=0 && bcnt>0 ) yaps_message("\nb=%.2f", bave/bcnt); if ( acycle!=0 && acnt>0 ) yaps_message("\na=%.3f", aave/acnt); yaps_message("\n"); S_free(ST); rng_free(rng); return 0; }
static void merge_opt_Tdt(int k1, int k2, merge_alpha_t *M) { int d; struct heap_s up; struct heap_s down; /* * sorting on moves, * stores doc index for */ uint32_t *Tdt_up; uint32_t *Tdt_down; /* * change from incr/decr this docs Tdt */ float *score_up; float *score_down; Tdt_up = u32vec(ddN.DT); Tdt_down = u32vec(ddN.DT); score_up = fvec(ddN.DT); score_down = fvec(ddN.DT); if ( !score_down || !score_up || !Tdt_up || !Tdt_down ) yap_quit("Out of memory in likemerge()\n"); /* * initialise sort */ for (d=0; d<ddN.DT; d++) { assert(M->Tdt[d]<=M->Ndt[d]); /* don't change for some docs */ Tdt_up[d] = d; Tdt_down[d] = d; if ( M->Tdt[d]<M->Ndt[d] ) score_up[d] = (ddP.bpar + ddP.apar*M->TdT[d]) * S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1); else score_up[d] = 0; if ( M->Tdt[d]>1 ) score_down[d] = 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d]) /(ddP.bpar + ddP.apar*(M->TdT[d]-1)); else score_down[d] = 0; assert((M->Tdt[d]>1)||score_down[d]==0); assert((M->Tdt[d]<M->Ndt[d])||score_up[d]==0); assert(M->Tdt[d]<=M->Ndt[d]); } assert(M->TDt>0); /* * use a heap, so only top of heap is least */ heap_init(&up, Tdt_up, ddN.DT, fveccmp, (void *)score_up); heap_init(&down, Tdt_down, ddN.DT, fveccmp, (void *)score_down); while ( 1 ) { float upv; float downv; upv = merge_alphabasetopicprob(M->TDTm+M->TDt, M->TDt, k1) *score_up[heap_front(&up)]; if ( M->TDt>1 ) downv = score_down[heap_front(&down)] / merge_alphabasetopicprob(M->TDTm+M->TDt-1, M->TDt-1, k1); else downv = 0.0; if ( downv>upv && downv>1.0 ){ // decrement this d = heap_front(&down); M->TdT[d]--; M->Tdt[d]--; assert(M->Tdt[d]>0); M->TDt--; heap_pop(&down); heap_remove(&up,d); } else if ( downv<upv && upv>1.0 ){ // increment this d = heap_front(&up); M->TdT[d]++; M->Tdt[d]++; assert(M->Tdt[d]<=M->Ndt[d]); M->TDt++; heap_pop(&up); heap_remove(&down,d); } else { // none are better so quit break; } if ( M->Tdt[d]<M->Ndt[d] ) score_up[d] = (ddP.bpar + ddP.apar*M->TdT[d]) * S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1); else score_up[d] = 0; if ( M->Tdt[d]>1 ) score_down[d] = 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d]) /(ddP.bpar + ddP.apar*(M->TdT[d]-1)); else score_down[d] = 0; assert(M->Tdt[d]>1||score_down[d]==0); assert(M->Tdt[d]<M->Ndt[d] ||score_up[d]==0); assert(M->Tdt[d]<=M->Ndt[d]); /* * now adjust the two heaps for new vals for [d] */ heap_push(&down,d); heap_push(&up,d); } free(score_up); free(score_down); heap_free(&up); heap_free(&down); }