コード例 #1
0
ファイル: merge.c プロジェクト: ecashin/topic-models
static double merge_sumapprox_Tdt(int k) {
  double x[ddN.DT];
  int d;
  for (d=0; d<ddN.DT; d++) {
    double xp = 0, xm = 0;
    int nn, tt, TdT;
    nn = ddS.Ndt[d][k];
    if ( nn<=1 ) {
      x[d] = 0;
      continue;
    }
    tt = ddS.Tdt[d][k];
    TdT = getTdT(d);
    if ( tt<nn )
      xp = (ddP.bpar + ddP.apar*TdT) * S_V(ddC.SX,nn,tt+1)
	* alphabasetopicprob(k);
    if ( tt>1) {
      ddS.TDt[k]--; ddS.TDT--;
      xm = 1.0/(ddP.bpar + ddP.apar*(TdT-1)) / S_V(ddC.SX,nn,tt)
	/ alphabasetopicprob(k);
      ddS.TDt[k]++; ddS.TDT++;
    }
    if ( xm>xp )
      x[d] = xm;
    else 
      x[d] = xp;
  }
  // ?????????
}
コード例 #2
0
ファイル: stable.c プロジェクト: gbennett71/libstb
double S_UV(stable_t *sp, unsigned n, unsigned m) {
  double SV;
  if ( m==1 )
    return -INFINITY;
  if ( m==n+1 )
   return 1;
 SV = S_V(sp,n,m);
 return (n - m*sp->a)*SV + 1.0;
}
コード例 #3
0
ファイル: stable.c プロジェクト: gbennett71/libstb
double S_U(stable_t *sp, unsigned n, unsigned m) {
  if ( m==1 )
    return n - sp->a;
  if ( m<=1 ) 
    yaps_quit("Bad constraints in S_U(%s,%u,%u)\n",
      sp->tag, n, m);
  assert(m>1);
  return n - m*sp->a + 1/S_V(sp,n,m);
}
コード例 #4
0
ファイル: list.c プロジェクト: lazycrazyowl/libstb
int main(int argc, char* argv[])
{
  int c;
  int n, t;
  stable_t *S;

  /*
   *  default values for args
   */
  while ( (c=getopt(argc, argv,"ha:N:T:"))>=0 ) {
    switch ( c ) {
    case 'a':
      if ( !optarg || sscanf(optarg,"%f",&apar)!=1 )
	yaps_quit("Need a valid 'a' argument\n");
      break;
    case 'N':
      if ( !optarg || sscanf(optarg,"%d",&N)!=1 )
	yaps_quit("Need a valid 'N' argument\n");
      break;
    case 'T':
      if ( !optarg || sscanf(optarg,"%d",&T)!=1 )
	yaps_quit("Need a valid 'T' argument\n");
      break;
    case 'h':
      usage();
      exit(0);
    }
  }
  if ( T>N ) T = N;
      
  if ( !(S=S_make(N+1,T,2*N,2*T,apar,S_UVTABLE|S_STABLE|S_VERBOSE)) )
    yaps_quit("S_make failed\n");
  
  S_report(S, stdout);

#if 1
  /*
   *   list various values
   */
  printf("S(%d,%d) = %10.6lg, V=n/a U=%lg\n", N, 1, S_S(S,N,1), S_U(S,N,1));
  for (t=2; t<=T; t++) 
    printf("S(%d,%d) = %10.6lg, V=%lg U=%lg\n", N, t, 
	   S_S(S,N,t), S_V(S,N,t), S_U(S,N,t));

  printf("\nS(%d,%d) = %10.6lg, V=n/a U=%lg\n", 
	 N+10, 1, S_S(S,N+10,1), S_U(S,N+10,1));
  for (t=2; t<=2*T+10; t++) 
    printf("S(%d,%d) = %10.6lg, V=%lg U=%lg\n", N+10, t, 
	   S_S(S,N+10,t), S_V(S,N+10,t), S_U(S,N+10,t));
  for (n=T+1; n<=2*N; n++) 
    printf("S(%d,%d..) = %10.6lg %10.6lg\n", n, T, S_S(S,n,T), S_S(S,n,T+1));
  
  for (n=2; n<8; n++ ) {
    printf("S(%d,%d) = %10.6lg ", n, 1, S_S(S,n,1));
    for (t=2; t<=n; t++)
      printf(" %10.6lg", S_S(S,n,t));
    printf("\n");
  }
  for (n=2; n<8; n++ ) {
    printf("V(%d,%d) = %10.6lg", n, 2, S_V(S,n,2));
    for (t=3; t<=n; t++)
      printf(" %10.6lg", S_V(S,n,t));
    printf("\n");
  }
  for (n=2; n<8; n++ ) {
    printf("U(%d,%d) = %10.6lg", n, 2, S_U(S,n,2));
    for (t=3; t<=n; t++)
      printf(" %10.6lg", S_U(S,n,t));
    printf("\n");
  }
#else
  {
    /*
     *     Sample a partition of size T of N by Chinese Rest. distribution
     *     start by sampling the last entry from (1,2,...,N-T+1);
     *     see p(m | CRD, apr, N, T) on page 4 of "doc/alpha.pdf"
     */
    double *prob = malloc(sizeof(*prob)*N);
    double probtot = 1.0;
    prob[1] = 1.0;
    for (t=2; t<=N-T+1; t++) 
      probtot += prob[t] = (N-t+1)*(t-apar) / S_U(S,N-t,T-1)/(t-1) * prob[t-1]; 
    for (t=1; t<=N-T+1; t++) 
      printf("p(m=%d) = %lg\n", t, prob[t]/probtot );
  }
#endif

  S_report(S, stdout);
  S_free(S);
  return 0; 
}
コード例 #5
0
ファイル: demo.c プロジェクト: wbuntine/libstb
int main(int argc, char* argv[])
{
  int i, j, c, iter, ITER=200;
  unsigned long int seed=0;
  int bcycle = 0;
  float bstart = 0;
  int acycle = 0;
  float astart = 0;
  int burnin = 0;
  stable_t *ST = NULL;
  int useN = DIM*2;

  MAXN = 1;
  MAXT = MAXSTAB;
  /*
   *  default values for args
   */
  while ( (c=getopt(argc, argv,"a:b:B:C:I:hH:I:N:P:S:s:T:v"))>=0 ) {
    switch ( c ) {
    case 'h':
      usage(burnin?burnin:ITER/2, ITER, useN);
      exit(0);
    case 'b':
      if ( !optarg || sscanf(optarg,"%f,%f",&bpar, &bstart)<1 )
	yaps_quit("Need a valid 'b' argument\n");
      break;
    case 'a':
      if ( !optarg || sscanf(optarg,"%f,%f",&apar,&astart)<1 )
	yaps_quit("Need a valid 'a' argument\n");
      break;
    case 'H':
      if ( !optarg || sscanf(optarg,"%d",&bcycle)!=1 )
	yaps_quit("Need a valid 'G' argument\n");
      break;
    case 'T':
      if ( !optarg || sscanf(optarg,"%d",&MAXT)!=1 )
	yaps_quit("Need a valid 'T' argument\n");
      break;
     case 'I':
      if ( !optarg || sscanf(optarg,"%d",&acycle)!=1 )
	yaps_quit("Need a valid 'H' argument\n");
      break;
    case 'N':
      if ( !optarg || sscanf(optarg,"%d",&useN)!=1 )
	yaps_quit("Need a valid 'N' argument\n");
      break;
    case 'C':
      if ( !optarg || sscanf(optarg,"%d",&ITER)!=1 )
	yaps_quit("Need a valid 'C' argument\n");
       break;
    case 'B':
      if ( !optarg || sscanf(optarg,"%d",&burnin)!=1 )
	yaps_quit("Need a valid 'B' argument\n");
      break;
    case 's':
      if ( !optarg || sscanf(optarg,"%lu",&seed)!=1 )
	yaps_quit("Need a valid 's' argument\n");
      break;
    case 'v':
      verbose++;
      break;
#ifdef S_USE_THREADS
      case 'P':
      if ( !optarg || sscanf(optarg,"%u",&threads)!=1 )
	yaps_quit("Need a valid 'P' argument\n");
      break;
#endif
    default:
      yaps_message("Bad command line argument\n\n");
      usage(burnin?burnin:ITER/2, ITER, useN);
      exit(0);
    }
  }
  if ( useN>=MAXDATA ) 
    yaps_quit("N too large\n");
  
  if ( burnin==0 )
    burnin = ITER/2;
  else
    if ( burnin>=ITER-1 )
      yaps_quit("Burnin %d too large for cycles %d\n", burnin, ITER);

  yaps_message("Configuration details\n");
  yaps_message("=====================\n");
  /*
   *   set random number generator
   */
  if ( seed ) {
    rng_seed(rng,seed);
  } else {
    rng_time(rng,&seed);
  }
  yaps_message("Setting seed for data = %lu\n", seed);
    
  if ( acycle && apar==0 )
    apar = 0.5;
 
  yaps_message("Setting a=%f, b=%f, N=%d, D=%d\n", 
	       apar, bpar, useN, NUMMN);
  yaps_message("             burnin=%d,", burnin);
  yaps_message(" cycles=%d\n", ITER);
  
  /*
   *    fix pointers
   */
  for (j=0; j<NUMMN; j++) {
    n[j] = &n_data[j*DIM];
    t[j] = &t_data[j*DIM];
    tave[j] = &tave_data[j*DIM];
  }

  /*
   *    initialise everything
   */
  for (j=0; j<NUMMN; j++) {
    N[j] = useN;
    T[j] = 0;
    Tave[j] = 0;
    for (i=0; i<DIM; i++) {
      n[j][i] = 0;
      t[j][i] = 0;
      tave[j][i] = 0;
    }
  }

  /*
   *  fix base distribution, uniform
   */ 
  {
    for (i=0; i<DIM; i++) {
      H[i] = 1.0/DIM;
    }
  } 
  
  /*
   *     create data using a CRP to get initialisation for n[]
   */
  c = 0;
  for (j=0; j<NUMMN; j++) {
    int cc;
    i = sampleH();
    data[c++] = i;
    //  first entry always adds a table
    n[j][i]++;  
    t[j][i]++;
    T[j]++;
    for (cc=1; cc<N[j]; cc++) {
      float val = (cc+bpar)*rng_unit(rng);
      val -=  T[j]*apar+bpar;
      if ( val<=0 ) {
	//  new table
	i = sampleH();
	t[j][i]++;
	T[j]++;
      } else {
	for (i=0; i<DIM; i++) {
	  val -= n[j][i] - t[j][i]*apar;
	  if ( val<0 )
	    break;
	}
      }
      assert(i<DIM);
      n[j][i]++;  
      data[c++] = i;
    }
  }
  
  binit = bpar;
  
  /*
   *   record maximum entries in data
   *   do this where possible so that one can get the table
   *   sizes right
   *
   */
  MAXN = n[0][0]+1;
  MAXT = 1;
  for (j=0; j<NUMMN; j++) {
    for (i=0; i<DIM; i++) {
      if ( MAXN<=n[j][i] ) MAXN = n[j][i]+1;
      if ( MAXT<t[j][i] ) MAXT = t[j][i]*1.1+1;
    }
  }
  if ( MAXT>MAXN )
    MAXT = MAXN;
  
  yaps_message("Making S for N=%d M=%d a=%lf\n", MAXN,MAXT,apar);
  ST = S_make(MAXN, MAXT, MAXN, MAXTAB,
	      apar, S_STABLE | S_UVTABLE);
  if ( ST==NULL )
    yaps_quit("Making S failed!\n");
  S_report(ST,stdout);

  /*
   *    the seed only sets the data/sample,
   *    the seed for the simulation/Gibbs is always random
   */
  rng_free(rng);
  rng_time(rng,&seed);
  //yaps_message("Resetting seed = %lu\n", seed);
  
  /*
   *   report on initial data statistics
   */
  yaps_message("\nData sampled\n");
  yaps_message("============\n");
  for (j=0; j<NUMMN; j++) {
    yaps_message("n[%d] =", j);
    for (i=0; i<DIM; i++)
      yaps_message(" %d", n[j][i]);
    yaps_message(" = %d\n", N[j]);
    yaps_message("t[%d] =",j);
    for (i=0; i<DIM; i++)
      yaps_message(" %d", t[j][i]);
    yaps_message(" = %d\n", T[j]);
  }

  /*
   *   set the hyperparameters used in Gibbs,
   *   can be different to data
   */
  if ( bstart==0 ) 
    bstart = bpar;
  if ( astart==0 ) 
    astart = apar;

  //  initialise latent stats and reporting info
  for (j=0; j<NUMMN; j++) {
    T[j] = 0;
    Tave[j] = 0;
  }
  tcnt = 0;
  bave = 0;
  bcnt = 0;
  aave = 0;
  acnt = 0;

  bpar = bstart;
  if ( verbose && bcycle!=0 )
    yaps_message("Starting with initial b=%f\n", bpar);
  
  apar = astart;
  if ( verbose && acycle!=0 )
    yaps_message("Starting with initial a=%f\n", apar);
  
  for (j=0; j<NUMMN; j++) {
    for (i=0; i<DIM; i++) {
      tave[j][i] = 0;
      t[j][i] = 0;
      if ( n[j][i]>0 ) {
	/*
	 *  initialise to a single table
	 */
	t[j][i] = 1;
	T[j]++;
      }
    }
  }

  for ( iter=0; iter<ITER; iter++) {
    /*
     *   sampling with table indicators
     */
    c = 0;
    for (j=0; j<NUMMN; j++) {
      int cc;
      for (cc=0; cc<N[j]; cc++) {
	float one;
	i = data[c++];
	assert(n[j][i]);
	if ( n[j][i]==1 )
	  //    this indicator must always be 1, no sampling
	  continue;
	//   sample whether it contributes to a table
	if ( t[j][i]>1 &&
	     (n[j][i]-1)*rng_unit(rng)<(t[j][i]-1) ) {
	  t[j][i]--;  
	  T[j]--;
	} 
	assert(t[j][i]<n[j][i]);
	//  sample new table indicator
	one = H[i] * (bpar + T[j]*apar) * (t[j][i]) / (n[j][i]-t[j][i]+1)
	  * S_V(ST, n[j][i],t[j][i]+1);
	if ( rng_unit(rng) < one/(one+1.0) ) {
	  t[j][i]++;
	  T[j]++;
	} 
      }
    }
    
    /*
     *   one major cycle of Gibbs sampler finished
     */
    if ( verbose>1 ) {
      for (j=0; j<NUMMN; j++) {
	for (i=0; i<DIM; i++)
	  printf(" %d", t[j][i]);
	printf(" = %d\n", T[j]);
      }
    }
    /*
     *     sample & record b 
     */
    if ( bcycle!=0 && iter%bcycle==0 ) {
      //   Gibbs on bpar (concentration par) too
      if ( bcycle<0 ) {
	  int bc = -bcycle;
	  for (bc-- ; bc>0; bc--)
	    bpar = sampleb(bpar, 1, PB_shape, PB_scale, N, T, 
			   apar, rng, 1, 1);
      }
      bpar = sampleb(bpar, 1, PB_shape, PB_scale, N, T, 
		     apar, rng, 1, 1);
      if ( iter>=burnin ) {
	bave += bpar;
	bcnt ++;
      }
    }
    /*
     *     sample & record a
     */
    if ( acycle!=0 && iter%acycle==0 ) {
      int dimI[NUMMN];
      double dimb[NUMMN];
      for (j=0; j<NUMMN; j++)  {
	dimI[j] = DIM;
	dimb[j] = bpar;
      }
      //   Gibbs on apar (discount par) too
      if ( acycle<0 ) {
	int bc = -acycle;
	for (bc-- ; bc>0; bc--)
	  apar = samplea(apar, NUMMN, dimI, T, n, t, NULL, dimb, rng, 1, 1);
      }
      apar = samplea(apar, NUMMN, dimI, T, n, t, NULL, dimb, rng, 1, 1);
      if ( iter>=burnin ) {
	aave += apar;
	acnt ++;
      }
      if ( verbose>1 )
	yaps_message("Extending S for a=%lf\n", apar);
      if ( S_remake(ST,apar) )
	yaps_message("Extending S failed\n");
    }
    /*
     *    full statistics collection
     */
    if ( iter>=burnin ) {
      for (j=0; j<NUMMN; j++) {
	for (i=0; i<DIM; i++) {
	  tave[j][i] += t[j][i];
	}
	Tave[j] += T[j];
      }
      tcnt ++;
    }
  }	  
  
  /*
   *     report for this experiment
   */
  yaps_message("\nEstimates\n");
  yaps_message("=========\n");
  for (j=0; j<NUMMN; j++) {
    yaps_message("t[%d] = ", j);
    for (i=0; i<DIM; i++)
      yaps_message(" %.2f", tave[j][i]/tcnt);
    yaps_message("\nT[%d]=%.2f\n", j, Tave[j]/tcnt);
  }
  if ( bcycle!=0 && bcnt>0 ) 
    yaps_message("\nb=%.2f", bave/bcnt);
  if ( acycle!=0 && acnt>0 ) 
    yaps_message("\na=%.3f", aave/acnt);
  yaps_message("\n");
  
  S_free(ST);
  rng_free(rng);
  return 0;
}
コード例 #6
0
ファイル: merge.c プロジェクト: ecashin/topic-models
static void merge_opt_Tdt(int k1, int k2, merge_alpha_t *M) {
int d;
  struct heap_s up;
  struct heap_s down;
  /*
   *    sorting on moves,
   *    stores doc index for 
   */
  uint32_t *Tdt_up;
  uint32_t *Tdt_down;
  /*
   *     change from incr/decr this docs Tdt
   */
  float *score_up;
  float *score_down;

  Tdt_up = u32vec(ddN.DT);
  Tdt_down = u32vec(ddN.DT);
  score_up = fvec(ddN.DT);
  score_down = fvec(ddN.DT);
  if ( !score_down || !score_up || !Tdt_up || !Tdt_down )
    yap_quit("Out of memory in likemerge()\n");
  /*
   *  initialise sort
   */
  for (d=0; d<ddN.DT; d++) {
    assert(M->Tdt[d]<=M->Ndt[d]);
    /*   don't change for some docs */
    Tdt_up[d] = d;
    Tdt_down[d] = d;
    if ( M->Tdt[d]<M->Ndt[d] )
      score_up[d] = (ddP.bpar + ddP.apar*M->TdT[d]) 
	* S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1);
    else 
      score_up[d] = 0;
    if ( M->Tdt[d]>1 )
      score_down[d] = 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d])
	/(ddP.bpar + ddP.apar*(M->TdT[d]-1));
    else
      score_down[d] = 0;    
    assert((M->Tdt[d]>1)||score_down[d]==0);
    assert((M->Tdt[d]<M->Ndt[d])||score_up[d]==0);
    assert(M->Tdt[d]<=M->Ndt[d]);
  }
  assert(M->TDt>0);

  /*  
   *  use a heap, so only top of heap is least 
   */
  heap_init(&up, Tdt_up, ddN.DT, fveccmp, (void *)score_up);
  heap_init(&down, Tdt_down, ddN.DT, fveccmp, (void *)score_down);

  while ( 1 ) {
    float upv;
    float downv;
    upv = merge_alphabasetopicprob(M->TDTm+M->TDt, M->TDt, k1)
      *score_up[heap_front(&up)];
    if ( M->TDt>1 )
      downv = score_down[heap_front(&down)] 
        / merge_alphabasetopicprob(M->TDTm+M->TDt-1, M->TDt-1, k1);
    else
      downv = 0.0;
    if ( downv>upv && downv>1.0 ){
      //  decrement this
      d = heap_front(&down); 
      M->TdT[d]--;
      M->Tdt[d]--;
      assert(M->Tdt[d]>0);
      M->TDt--;
      heap_pop(&down);
      heap_remove(&up,d);
    } else if ( downv<upv && upv>1.0 ){
      //  increment this
      d = heap_front(&up);
      M->TdT[d]++;
      M->Tdt[d]++;
      assert(M->Tdt[d]<=M->Ndt[d]);
      M->TDt++;
      heap_pop(&up);
      heap_remove(&down,d);
    } else {
      //  none are better so quit
      break;
    }
    if ( M->Tdt[d]<M->Ndt[d] )
      score_up[d] = (ddP.bpar + ddP.apar*M->TdT[d]) 
	* S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1);
    else 
      score_up[d] = 0;
    if ( M->Tdt[d]>1 )
      score_down[d] = 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d])
	/(ddP.bpar + ddP.apar*(M->TdT[d]-1));
    else
      score_down[d] = 0;
    assert(M->Tdt[d]>1||score_down[d]==0);
    assert(M->Tdt[d]<M->Ndt[d] ||score_up[d]==0);
    assert(M->Tdt[d]<=M->Ndt[d]);
    /*
     *  now adjust the two heaps for new vals for [d]
     */
    heap_push(&down,d);
    heap_push(&up,d);
  }  
  free(score_up);
  free(score_down);
  heap_free(&up);
  heap_free(&down);
}