Exemplo n.º 1
0
static double adkterms(double mya, void *mydata) {
  double val = 0;
  uint16_t **docstats = (uint16_t **)mydata;
#ifdef A_DEBUG
  float save_a = ddC.SD->a;
  double like;
#endif
  ddP.ad = mya;
  cache_update("ad");
  val = dmi_likelihood_aterms(&ddM, docstats,
                              pctl_gammaprior, ddP.ad, ddP.bdk, ddC.SD);
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval adkterms(%lf) = %lf", mya, val);
  like = likelihood_bdk();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like,
                val-last_val, like-last_like);
  } else
    yap_message("\n");
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 2
0
static double aterms_burst(double mya, void *mydata) {
  double b[ddM.T];
  double val = 0;
  uint16_t **docstats = (uint16_t **)mydata;
#ifdef A_DEBUG
  float save_a = ddC.a_burst->a;
  double like;
#endif
  int t;
  for (t=0; t<ddM.T; t++)
    b[t] = ddP.b_burst;
  cache_update("ab");
  val = dmi_likelihood_aterms(&ddM, docstats,
			      pctl_gammaprior, mya, b, ddC.a_burst);
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval adkterms(%lf) = %lf", mya, val);
  like = dmi_likelihood(&ddM,pctl_gammaprior,mya,b,ddC.a_burst);
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like, 
		val-last_val, like-last_like);
  } else
    yap_message("\n");
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 3
0
static double aterms(double mya, void *mydata) {
  int i, t;
  double val = 0;
  double la = log(mya);
#ifdef A_DEBUG
  float save_a = ddC.SX->a;
  double like;
#endif
  S_remake(ddC.SX, mya);
  for (i=0; i<ddN.DT; i++) {
    uint32_t Td_ = 0;
    for (t=0; t<ddN.T; t++) {
      Td_ += ddS.Tdt[i][t];
      if ( ddS.Ndt[i][t]>1 ) {
	val += S_S(ddC.SX,ddS.Ndt[i][t],ddS.Tdt[i][t]);
      }
    }
    val += Td_*la + lgamma(ddP.bpar/mya+Td_) - lgamma(ddP.bpar/mya);
  }  
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval aterms(%lf) = %lf (S had %f)", mya, val, save_a);
  ddP.apar = mya;
  cache_update("a");
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like, 
		val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 4
0
static double aterms_theta(double mya, void *mydata) {
  int i, t;
  double val = 0;
#ifdef A_DEBUG
  float save_a = ddC.a_theta->a;
  double like;
#endif
  S_remake(ddC.a_theta, mya);
  for (i=0; i<ddN.DT; i++) {
    for (t=0; t<ddN.T; t++) {
      if ( ddS.n_dt[i][t]>1 ) {
	val += S_S(ddC.a_theta,ddS.n_dt[i][t],ddS.c_dt[i][t]);
      }
    }
    val += poch(ddP.b_theta, mya, ddS.C_dT[i]);
  }  
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval aterms_theta(%lf) = %lf (S had %f)", mya, val, save_a);
  ddP.a_theta = mya;
  cache_update("at");
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like, 
		val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 5
0
static double aterms_phi0(double mya, void *mydata) {
  int v;
  double val = 0;
#ifdef A_DEBUG
  float save_a = ddC.a_phi0->a;
  double like;
#endif
  S_remake(ddC.a_phi0, mya);
  val += poch(ddP.b_phi0, mya, ddS.S_0_nz);
  for (v=0; v<ddN.W; v++) {
    if ( ddS.S_0vT[v]>1 )
      val += S_S(ddC.a_phi0, ddS.S_0vT[v], 1);
  }
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval aterms_phi0(%lf) = %lf (S had %f)", mya, val, save_a);
  ddP.a_phi0 = mya;
  cache_update("ap0");
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like, 
		val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 6
0
static double aterms_mu(double mya, void *mydata) {
  int e, t;
  double val = 0;
#ifdef A_DEBUG
  float save_a = ddC.a_mu->a;
  double like;
#endif
  S_remake(ddC.a_mu, mya);
  for (e=0; e<ddN.E; e++) {
    for (t=0; t<ddN.T; t++) {
      if ( ddS.cp_et[e][t]==0 )
	continue;
      if (e==ddN.E-1) 
	val += S_S(ddC.a_mu, ddS.C_eDt[e][t], ddS.cp_et[e][t]); 
      else
	val += 
	  S_S(ddC.a_mu, ddS.C_eDt[e][t] + ddS.cp_et[e+1][t], ddS.cp_et[e][t]);
    }
    val += poch(ddP.b_mu[e], mya, ddS.Cp_e[e]);
  }  
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval aterms_mu(%lf) = %lf (S had %f)", mya, val, save_a);
  ddP.a_mu = mya;
  cache_update("am");
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like, 
		val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 7
0
void hca_displayclass(char *resstem) {
  int i, k;
  double ent = 0;
  uint32_t **TbyC;

  /*
   *  write topic by class confusion matrix
   */
  TbyC = classbytopic(resstem);

  /*
   *  now report entropies
   */
  yap_message("Class entropies by topic: ");
  for (k=0; k<ddN.T; k++) {
    double me = 0;
    double tot = 0;
    for (i=0; i<ddN.C; i++) 
      tot += TbyC[k][i];
    for (i=0; i<ddN.C; i++) {
      double p;
      if ( TbyC[k][i]>0 ) {
	p = ((double)TbyC[k][i])/tot;
	me -= p * log(p) * M_LOG2E;
      }
    }
    ent += me;
    yap_message(" %.3lf", me);
  }
  yap_message(" -> %.3lf\n", ent/ddN.T);
  free(TbyC[0]); free(TbyC);
}
Exemplo n.º 8
0
static double awterms(double myaw, void *mydata) {
  int i, t;
  double val = 0;
  double law = log(myaw);
#ifdef A_DEBUG
  float save_a = ddC.SY->a;
  double like;
#endif
  S_remake(ddC.SY, myaw);
  for (t=0; t<ddN.T; t++) {
    uint32_t Tw_ = 0;
    for (i=0; i<ddN.W; i++) {
      Tw_ += ddS.Twt[i][t];
      if ( ddS.Nwt[i][t]>1 ) {
        val += S_S(ddC.SY,ddS.Nwt[i][t],ddS.Twt[i][t]);
      }
    }
    val += Tw_*law + lgamma(ddP_bwpar(t)/myaw+Tw_) - lgamma(ddP_bwpar(t)/myaw);
  }
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval awterms(%lf) = %lf (S had %f)", myaw, val, save_a);
  ddP.awpar = myaw;
  cache_update("aw");
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like,
                val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 9
0
void sample_a(double *mya) {
#ifdef A_DEBUG
  last_val = 0;
  last_like = 0;
#endif
  if ( verbose>1 )
    yap_message("sample_a (pre):  a=%lf, lp=%lf\n",
		*mya, likelihood());
  myarms(PYP_DISC_MIN, PYP_DISC_MAX, &aterms, NULL, mya, "a");
  cache_update("a");
  if ( verbose>1 )
    yap_message("sample_a (post):  a=%lf, lp=%lf\n",
		*mya, likelihood());
}
Exemplo n.º 10
0
void *sampling_p(void *pargs)
{
  int i;
  float *p = fvec(ddN.T * 4);
  D_MiSi_t dD;
  D_pargs_p *par =(D_pargs_p *) pargs;
  clock_t t1 = clock();
  
  if ( PCTL_BURSTY() )
    misi_init(&ddM,&dD);
  /*
   *  sampling
   */
  par->thislp = 0;
  par->thisNd = 0;
  while ( (i=atomic_incr(*par->doc)-1)<ddN.DT ) {
    if ( PCTL_BURSTY() )
      misi_build(&dD,i,0);
    par->thislp += gibbs_lda(GibbsNone, i, ddD.N_dT[i], p, &dD);
    par->thisNd += ddD.N_dT[i];
    if ( par->dots>0 && i>0 && (i%par->dots==0) ) 
      yap_message(".");
    if ( PCTL_BURSTY() )
      misi_unbuild(&dD,i,0);
  }
  free(p);
  if ( PCTL_BURSTY() )
    misi_free(&dD);
  par->tot_time = (double)(clock() - t1) / CLOCKS_PER_SEC;
  return NULL;
}
Exemplo n.º 11
0
/*
 *   post. prob of word given topic used in sampling
 */
double wordfact(int j, int t, float *tip) {
  if ( ddP.phi!=NULL ) {
    assert(ddP.phi);
    return ddP.phi[t][j];
  } 
  if ( ddP.PYbeta ) {
    double p;
    if ( ddS.Twt[j][t]==0 ) {
#ifndef NDEBUG
      if ( ddS.Nwt[j][t]>0 ) {
	yap_message("ddS.Nwt[%d][%d]==%d\n", j, t, ddS.Nwt[j][t]);
	assert(ddS.Nwt[j][t]==0);
      }
#endif
      p = ((double)ddP.bwpar+ddP.awpar*ddS.TWt[t]) * betabasewordprob(j);
      *tip = 1.0;
    } else {
      double uone, uzero;
      wordtableindicatorprob(j, t, &uone, &uzero);
      p = uone + uzero;
      *tip = uone/(uone + uzero);
    }
    return p/((double)ddS.NWt[t]+ddP.bwpar);
  }
  return ((double)ddS.Nwt[j][t]+ddP.betapr[j]) / ((double)ddS.NWt[t]+ddP.betatot);
}
Exemplo n.º 12
0
static int next_best(bestmerge_t *B) {
  int k, bk=-1;
  double v = 0;
  yap_message("merge buffer: ");
  for (k=0; k<ddN.T; k++) {
    if ( B[k].ml>0 ) {
      yap_message("%d+%d ", k, B[k].k2);
    }
  }
  yap_message("\n");
  for (k=0; k<ddN.T; k++) {
    if ( B[k].ml>v ) {
      bk = k;
      v = B[k].ml;
    }
  }
  return bk;
}
Exemplo n.º 13
0
void yap_probs() {
  int t;
  int empty = 0;
  double ent = 0;
  double factor = 0;
  double *vp = malloc(sizeof(*vp)*ddN.T);
  get_probs(vp);
  yap_message("probs = ");
  factor = ddP.alpha;
  for (t=0; t<ddN.T; t++) 
    if ( vp[t]>0 ) {
      yap_message(" %lf", vp[t]);
      ent -= vp[t]*log(vp[t]);
    } else {
      empty++;
      yap_message(" -");
    }
  yap_message("\nfactor = %lf, empty = %d, ent = %lf\n", 
	      factor, empty, exp(ent));
  free(vp);
}
Exemplo n.º 14
0
static double aterms_phi1(double mya, void *mydata) {
  int e, t, v;
  double val = 0;
#ifdef A_DEBUG
  float save_a = ddC.a_phi1->a;
  double like;
#endif
  S_remake(ddC.a_phi1, mya);
  for (e=0; e<ddN.E; e++) {
    for (t=0; t<ddN.T; t++) {
      if ( ddS.S_Vte[t][e]==0 )
	continue;
      val += poch(ddP.b_phi[e][t], mya, ddS.S_Vte[t][e]);
      for (v=0; v<ddN.W; v++) {
	if ( ddS.s_vte[v][t][e]==0 )
	  continue;
	if (e<ddN.E-1) {
	  val += S_S(ddC.a_phi1, ddS.m_vte[v][t][e] + ddS.s_vte[v][t][e+1] , ddS.s_vte[v][t][e]);
	} else {
	  val += S_S(ddC.a_phi1, ddS.m_vte[v][t][e], ddS.s_vte[v][t][e]);
	}
      }
    }
  }
  myarms_evals++;
#ifdef A_DEBUG
  yap_message("Eval aterms_phi1(%lf) = %lf (S had %f)", mya, val, save_a);
  ddP.a_phi1 = mya;
  cache_update("ap"1);
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like, 
		val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 15
0
/*
 *  assumes uniform prior Dirichlet
 */
void sample_alpha(double *alphatot) {
  double dirmax = DIR_TOTAL_MAX;
  if ( dirmax>ddN.T * DIR_MAX )
    dirmax = ddN.T * DIR_MAX;
#ifdef A_DEBUG
  last_val = 0;
  last_like = 0;
#endif
  if ( myarmsMH(DIR_MIN*ddN.T, dirmax,
                &alphaterms, NULL, alphatot, "alphatot",1) ) {
    yap_message("sample_alpha: error in result\n");
  }
  cache_update("alpha");
}
Exemplo n.º 16
0
static double alphaterms(double alphatot, void *mydata) {
  int t,s;
  double val = 0;
  double tot;
  double lga = lgamma(alphatot/ddN.T);
  double lgat = lgamma(alphatot); 
#ifdef A_DEBUG
  double like;
#endif
#ifdef CONJPRIOR
  val += ddN.T*(lgamma((alphatot+1.0)/ddN.T) - lga);
  val -= lgamma(alphatot+1.0) - lgamma(alphatot);
#endif
  for (s=0; s<ddN.DT; s++) {
    tot = 0;
    for (t=0; t<ddN.T; t++) {
      tot += alphatot/ddN.T+ddS.Ndt[s][t];
      val += gammadiff(ddS.Ndt[s][t],alphatot/ddN.T,lga);
    }
    val -= lgamma(tot) - lgat;
  }
  myarms_evals++;
  myarms_last = alphatot;
#ifdef A_DEBUG
  yap_message("Eval alphaterms(%lf) = %lf\n", alphatot, val);
  ddP.alphatot = alphatot;
  cache_update("alpha");
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like,
                val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  return val;
}
Exemplo n.º 17
0
static double a0terms(double mya0, void *mydata) {
  int i;
  double l1a0 = log(1-mya0);
  double l2a0 = log((1-mya0)*(2-mya0));
  double lga0 = lgamma(1-mya0);
  double val = 0;
#ifdef A_DEBUG
  double like;
#endif
  val += ddS.TDTnz*log(mya0) + lgamma(ddP.b0/mya0+ddS.TDTnz)
    - lgamma(ddP.b0/mya0);
  for (i=0; i<ddN.T; i++)
    /*  note the root node is a PDD so all t's = 1 */
    if ( ddS.TDt[i]>1 ) {
      if ( ddS.TDt[i]==2 )
        val += l1a0;
      else if ( ddS.TDt[i]==3 )
        val += l2a0;
      else
        val += lgamma(ddS.TDt[i]-mya0) - lga0;
    }
#ifdef A_DEBUG
  yap_message("Eval a0terms(%lf) = %lf", mya0, val);
  ddP.a0 = mya0;
  cache_update("a0");
  like = likelihood();
  if ( last_val != 0 ) {
    yap_message(", lp=%lf diffs=%lf vs %lf\n", like,
                val-last_val, like-last_like);
  }
  last_like = like;
  last_val = val;
#endif
  myarms_evals++;
  return val;
}
Exemplo n.º 18
0
void get_probs(double *vp) {
  int zerod = 1;
  int t;
  double tot = 0;
  if ( ddP.PYalpha==0 ) {
    get_probs_alpha(vp);
    return;
  } 
  for (t=0; t<ddN.T; t++) {
    if ( ddP.PYalpha!=H_HPDD || ddS.TDt[t]>0 || zerod ) 
      tot += vp[t] = alphabasetopicprob(t);
    else
      vp[t] = 0;
    if ( ddP.PYalpha==H_HPDD && ddS.TDt[t]==0 ) 
      zerod = 0;
  }
#ifndef NDEBUG
  if ( fabs(tot-1.0)>1e-4 ) {
    yap_message("get_probs() probs doesn't normalise, get %lf\n", tot);
  }
#endif
  for (t=0; t<ddN.T; t++) 
    vp[t] /= tot;
}
Exemplo n.º 19
0
/*
 *  print out the topic topk=10 words. report the PMI score. 
 */
double report_pmi(char *topfile,   /* name of topics file */
		  char *pmifile,  /* name of PMI file */
		  int T,          /* total topics */
		  int W,          /* total words */
		  int E,          /*  number of epochs */
		  int topk,
		  double *tp)
{
  int lineno = 0;
  int i,k, thee;
  /*
   *   mapping from local index to actual word index
   */
  uint32_t *wind = u32vec(topk*T*E);
  int n_wind = 0;
  /*
   *   boolean vector ... is word used
   */
  uint32_t *wuse = u32vec(W/32+1);
  /*
   *  PMI's by local index
   */
  uint32_t *topic = u32vec(topk);
  float *coherency = fvec(E);
  double **pmi;
  float ave = 0;

  char *line;
  size_t n_line;
  FILE *fr;
  if ( !wind || !wuse )
    yap_quit("Out of memory in report_pmi()\n");

  /*
   *   read in file of top word indices in topic
   */
  fr = fopen(topfile,"r");
  if ( !fr ) 
    yap_sysquit("Topic file '%s' not read\n", topfile);
  
  line = NULL;
  n_line = 0;
  lineno = 0;
  while ( getline(&line, &n_line, fr)>0 ) {
    char *buf = line;
    unsigned j;
    int e = 0;
    lineno ++;
    buf += strspn(buf," \t\n");    //   skip space
    if ( (E==1 && sscanf(buf, "%d: ", &k)<1) || 
	 (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) ) 
      yap_quit("Cannot read topic in topic line %d from file '%s'\n", 
	       lineno, topfile);
    if ( k<0 || k>=T )
      continue;
    if ( e<0 || e>=E )
      continue;
    for (i = 0; i<topk && *buf; i++) {
      buf = strpbrk(buf," \t\n");    //   skip to next space
      if ( sscanf(buf, " %u", &j) <1 ) {
	if ( verbose>2 ) 
	    yap_message("Cannot read word %d in topic line %d from file '%s'\n", 
		    i+1, lineno, topfile);
	break;
      }
      if ( j>=W) {
	yap_quit("Bad word %d in topic line %d from file '%s'\n", 
		 i+1, lineno, topfile);
      }
      buf += strspn(buf," \t\n");    //   skip space
      /*
       *   check if word exists, and set up its index
       */
      if ( wuse[j/32U] & (1U<<(j%32U)) ) {
	// yes, so search for it
	int ii;
	for (ii=0; ii<n_wind; ii++)
	  if ( wind[ii]==j )
	    break;
	if ( ii>=n_wind )
	  yap_quit("Lookup of word %d failed at line %d in report_pmi()\n", 
		   (int)j, lineno);
      } else {
	// no, so add it
	wuse[j/32U] |= (1U<<(j%32U));
	wind[n_wind] = j;	
	n_wind++;
      }
    }
    free(line);
    line = NULL;
    n_line = 0;
  }
  fclose(fr);

  pmi = dmat(n_wind,n_wind);
  /*
   *  build hash table now since we know size
   */
  hashsize = n_wind*2;
  hashtab = malloc(sizeof(*hashtab)*hashsize);
  if ( !pmi || !hashtab )
    yap_quit("Out of memory in report_pmi()\n");
  for (i=0; i<hashsize; i++)
    hashtab[i] = 0;
  for (i=0; i<n_wind; i++)
    addw(wind[i],i);

  /*
   *   load up PMI file, only keeping words mentioned in hash table
   */
  {
    unsigned t1, t2;
    double value;
    int zcat = 0;
    fr = fopen(pmifile,"r");
    if ( !fr ) {
      /*
       *    try to zcat it
       */
      char *cmd = malloc(strlen(pmifile)+20);
      sprintf(cmd,"%s.gz", pmifile);
      fr = fopen(cmd,"r");
      if ( !fr ) 
	yap_sysquit("Cannot open pmifile '%s' in report_pmi()\n", 
		    pmifile);
      fclose(fr);
      sprintf(cmd,"gunzip -c %s", pmifile);
      fr = popen(cmd,"r");
      if ( !fr )
	yap_sysquit("Cannot open or zcat pmifile '%s' in report_pmi()\n", 
		    pmifile);
      zcat = 1;
      free(cmd);
    }
    while (fscanf(fr, "%u %u %lg", &t1, &t2, &value)==3 ) { 
      if ( t1>=W || t2>= W )
	yap_quit("Illegal word index in report_pmi()\n");
      if ( t1!= t2 && ( wuse[t1/32U] & (1U<<(t1%32U)) ) 
	   && ( wuse[t2/32U] & (1U<<(t2%32U))) ) {
	int i1, i2;
	i1 = findw(t1,wind);
	i2 = findw(t2,wind);
	if ( i1==UINT32_MAX || i2==UINT32_MAX )
	  yap_quit("Could not locate word index in report_pmi()\n");
	pmi[i1][i2]=value;
	pmi[i2][i1]=value;
      }
    }
    if ( zcat )
      pclose(fr);
    else
      fclose(fr);
  }
  
  /*
   *    compute PMI score for each topic
   */

  fr = fopen(topfile,"r");
  if ( !fr ) 
    yap_sysquit("Topic file '%s' not read\n", topfile);
  line = NULL;
  n_line = 0;
  thee = 0;
  lineno = 0;
  if ( E>1 ) 
    yap_message("PMI %d:: ", 0);
  else
    yap_message("PMI :: ");

  while ( getline(&line, &n_line, fr)>0 ) {
    /*
     *  repeat logic above to read topic file again
     */
    char *buf = line;
    unsigned j;
    int cnt = 0;
    int e = 0;
    double coh = 0;
    buf += strspn(buf," \t\n");    //   skip space
    if ( (E==1 && sscanf(buf, "%d: ", &k)<1) || 
	 (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) ) 
      yap_quit("Cannot read topic in topic line %d from file '%s'\n", 
	       lineno, topfile);
    if ( k<0 || k>=T )
      continue;
    if ( e<0 || e>=E )
      continue;
    if ( e!=thee ) {
      thee = e;
      yap_message("\nPMI %d:: ", e);
    }
    for (i = 0; i<topk && *buf; i++) {
      buf = strpbrk(buf," \t\n");    //   skip to next space
      if ( sscanf(buf, " %u", &j) <1 ) {
	yap_message("Cannot read word %d in topic line %d from file '%s'\n", i+1, lineno, topfile);
	break;
      }
      if ( j>=W) {
	yap_quit("Bad word %d in topic line %d from file '%s'\n", i+1, lineno, topfile);
      }
      buf += strspn(buf," \t\n");    //   skip space
      topic[i] = findw(j,wind);
    }
    if ( i<topk )
      topic[i] = W;
    /*
     *  topics now read 
     */
    for (i=0; i<topk && topic[i]<W; i++) {
      for (j=i+1; j<topk && topic[j]<W; j++) {
	coh += pmi[topic[i]][topic[j]];
	cnt ++;
      }
    }
    if ( cnt>0 ) coh /= cnt;
    coherency[e] += coh * tp[k];
    yap_message(" %d:%.3lf", k, coh);
  }
  fclose(fr);
  yap_message("\nPMI =");
  if ( E==1 ) {
    yap_message(" %.3lf\n", coherency[0]);
    ave = coherency[0];
  } else {
    int e;
    for (e=0; e<E; e++) {
      ave += coherency[e];
      yap_message(" %.3lf", coherency[e]);
    }
    ave /= E;
    yap_message(" -> %.3lf\n", ave);
  }
      
  free(wind);
  free(coherency);
  free(wuse);
  free(topic);
  free(pmi[0]); free(pmi);
  free(hashtab);
  hashtab = NULL;
  hashsize = 0;
  return ave;
}
Exemplo n.º 20
0
void hca_displaytopics(char *stem, char *resstem, int topword, 
                       enum ScoreType scoretype, int pmicount, int fullreport) {
  int w,k;
  uint32_t *termindk = NULL;
  uint32_t *indk = NULL;
  int Nk_tot = 0;
  double (*termtscore)(int) = NULL;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  uint32_t *top1cnt = NULL;
  FILE *fp;
  float *tpmi = NULL;
  char *topfile;
  char *repfile;
  uint32_t *psort;
  FILE *rp = NULL;
  float *gtvec = globalprop();
//#define XTRA // prints model topic probs after observed
#ifdef XTRA
  double *gtavec = calloc(ddN.T,sizeof(gtavec[0]));
#endif
  float *gpvec = calloc(ddN.W,sizeof(gpvec[0]));
  float *pvec = calloc(ddN.W,sizeof(pvec[0]));
#ifdef KL
  float *dfvec = calloc(ddN.W,sizeof(dfvec[0]));
#endif
  double *ngalpha = NULL;
  T_stats_t *termstats;
  
#ifdef XTRA
  get_probs(gtavec);
#endif

  if ( pmicount>topword )
    pmicount = topword;
  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phirat ) {
    tscore = phiratioscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }  

  if ( ddS.TwT==NULL && ddP.phi==NULL && scoretype == ST_phirat ) 
	yap_quit("Cannot use '-orat' option with this model/settings.\n");	

  if ( ddP.PYalpha==H_NG ) {
    /*
     *  provide an estimate of alpha
     */
    ngalpha = dvec(ddN.T);
    get_probs(ngalpha);
    for (k=0; k<ddN.T; k++) {
      ddP.alphapr[k] = ngalpha[k];
    }
  }

  /*
   *  returns null if no relevant data file
   */
  termstats = tstats_init(ddS.z, ddD.NdTcum, ddN.T, ddN.DT, stem);
  if ( termstats ) {
    if ( scoretype == ST_idf ) {
      termtscore = termidfscore;
    } else 
      termtscore = termcountscore;
  }  

  
  /*
   *  first collect counts of each word/term,
   *  and build gpvec (mean word probs)
   */
  build_NwK();
  if ( termstats )
    build_termNwK(termstats);
  {
    /*
     *  gpvec[] is normalised NwK[]
     */
    double tot = 0;
    for (w=0; w<ddN.W; w++)
      tot += gpvec[w] = NwK[w]+0.1; 
    for (w=0; w<ddN.W; w++)
      gpvec[w] /= tot;
  }
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      Nk_tot += ddS.NWt[k];
    }
  } 
  
  psort = sorttops(gtvec, ddN.T);
  
  top1cnt = hca_top1cnt();
  if ( !top1cnt )
    yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n");

  if ( pmicount ) {
    tpmi = malloc(sizeof(*tpmi)*(ddN.T+1));
    if ( !tpmi )
      yap_quit("Cannot allocate tpmi in hca_displaytopics()\n");
  }
  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk in hca_displaytopics()\n");
  if ( termstats ) {
    termindk = malloc(sizeof(*indk)*termstats->K);
    if ( !termindk )
      yap_quit("Cannot allocate termindk in hca_displaytopics()\n");
  }

  
  data_df(stem);

#ifdef KL
  for (w=0; w<ddN.W; w++)
    dfvec[w] = ddD.df[w];
#endif
  
  /*
   *   two passes through, 
   *           first to build the top words and dump to file
   */
  repfile = yap_makename(resstem,".topset");
  topfile = yap_makename(resstem,".toplst");
  fp = fopen(topfile,"w");
  if ( !fp ) 
    yap_sysquit("Cannot open file '%s' for write\n", topfile);
  yap_message("\n");
  for (k=0; k<ddN.T; k++) {
    int cnt, termcnt = 0;
    tscorek = k;
    /*
     *    build sorted word list
     */
    cnt = buildindk(k, indk);
    topk(topword, cnt, indk, tscore);
    if ( cnt==0 )
      continue;
    if ( termstats ) {
      termcnt = buildtermindk(k, termindk, termstats);
      topk(topword, termcnt, termindk, termtscore);
    }
    /*
     *   dump words to file
     */
    fprintf(fp,"%d: ", k);
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    if ( termstats ) {
      for (w=0; w<topword && w<termcnt; w++) {
	fprintf(fp," %d", (int)termstats->Kmin+termindk[w]);
      }
    }
    fprintf(fp, "\n");
  }
  if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr)  ) {
    int cnt;
     /*
     *    dump root words
     */
    tscorek = -1;
    cnt = buildindk(-1, indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    fprintf(fp,"-1:");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  if ( verbose>1 ) yap_message("\n");

  if ( pmicount ) {
    /*
     * compute PMI
     */
    char *toppmifile;
    char *pmifile;
    double *tp;
    tp = dvec(ddN.T);
    pmifile=yap_makename(stem,".pmi");
    toppmifile=yap_makename(resstem,".toppmi");
    get_probs(tp);
    report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, 
               pmicount, tp, tpmi);
    free(toppmifile);
    free(pmifile);
    free(tp);
  }

  /*
   *   now report words and diagnostics
   */
  //ttop_open(topfile);
  if ( fullreport ) {
    rp = fopen(repfile,"w");
    if ( !rp ) 
      yap_sysquit("Cannot open file '%s' for write\n", repfile);
    fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one "
	    "dist-unif dist-unigrm");
    if ( PCTL_BURSTY() ) 
      fprintf(rp, " burst-concent");
    if ( ddN.tokens )  
      fprintf(rp, " ave-length");
    fprintf(rp, " coher");
    if ( pmicount ) 
      fprintf(rp, " pmi");
    fprintf(rp, "\n#word topic index rank");
    if ( ddS.Nwt )
      fprintf(rp, " count");
    fprintf(rp, " prop cumm df coher\n");
    
  }
  for (k=0; k<ddN.T; k++) {
    int cnt, termcnt = 0;
    int kk = psort[k];
    uint32_t **dfmtx;

    if ( ddP.phi==NULL && ddS.NWt[kk]==0 )
      continue;
    /*
     *   grab word prob vec for later use
     */
    if ( ddS.Nwt ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = wordprob(w,kk);
    } else if ( ddP.phi ) 
      fv_copy(pvec, ddP.phi[kk], ddN.W);
    else if ( ddS.phi ) 
      fv_copy(pvec, ddS.phi[kk], ddN.W);

    /*
     *  rebuild word list
     */
    tscorek = kk;
    cnt = buildindk(kk, indk);
    topk(topword, cnt, indk, tscore);
    if ( topword<cnt )
      cnt = topword;
    assert(cnt>0);
    if ( termstats ) {
      termcnt = buildtermindk(kk, termindk, termstats);
      topk(topword, termcnt, termindk, termtscore);
      if ( topword<termcnt )
	termcnt = topword;
    }
    /*
     *     df stats for topic returned as matrix
     */
    dfmtx = hca_dfmtx(indk, cnt, kk);

    if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) 
      underused++;
    /*
     *  print stats for topic
     *    Mallet:  tokens, doc_ent, ave-word-len, coher., 
     *             uni-dist, corp-dist, eff-no-words
     */
    yap_message("Topic %d/%d", kk, k);
    {
      /*
       *   compute diagnostics
       */
      double prop = gtvec[kk];
      float *dprop = docprop(kk);
      double spw = 0;
      double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT);
#ifdef KL
      double ew = fv_kl(dfvec,pvec,ddN.W);
#else
      double ew = exp(fv_entropy(pvec,ddN.W));
#endif
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W);
      double co = coherence(dfmtx, cnt);
      double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT;
#define MALLET_EW
#ifdef MALLET_EW
      double ewp = dprop?(1.0/fv_expprob(pvec,ddN.W)):ddN.W;
#endif
      double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0;
      sparsitydoc += spd;
      yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop);   
#ifdef XTRA
      yap_message((ddN.T>200)?"/%.3lf%%":"/%.2lf%%",100*gtavec[kk]);   
#endif
      if ( ddS.Nwt ) {
	spw = ((double)nonzero_Nwt(kk))/((double)ddN.W);
	sparsityword += spw;
	yap_message(" ws=%.1lf%%", 100*(1-spw));
      } 
      yap_message(" ds=%.1lf%%", 100*(1-spd) );
#ifdef KL
      yap_message(" ew=%lf", ew);
#else
      yap_message(" ew=%.0lf", ew);
#endif
#ifdef MALLET_EW
      yap_message(" ewp=%.1lf", ewp); 
#endif
      yap_message(" ed=%.1lf", ed); 
      yap_message(" da=%.0lf", da+0.1); 
      yap_message(" t1=%u", top1cnt[kk]); 
      yap_message(" ud=%.3lf", ud); 
      yap_message(" pd=%.3lf", pd); 
      if ( PCTL_BURSTY() ) 
	yap_message(" bd=%.3lf", ddP.bdk[kk]); 
      if ( ddP.NGbeta ) {
	/*
	 *   approx. as sqrt(var(lambda_k)/lambda-normaliser
	 */
	double ngvar = sqrt(ddP.NGalpha[kk])
	  * (ngalpha[kk]/ddP.NGalpha[kk]);
	yap_message(" ng=%.4lf,%.4lf", 
		    ngalpha[kk], ngvar/ngalpha[kk]);
	if ( ddS.sparse )
	    yap_message(",%.4f", 1-((float)ddS.sparseD[kk])/ddN.DTused);
	if ( verbose>2 )
	    yap_message(" ngl=%.4lf,%.4lf, nga=%.4lf,%.4lf", 
		    ddP.NGalpha[kk]/ddP.NGbeta[kk], 
		    sqrt(ddP.NGalpha[kk]/ddP.NGbeta[kk]/ddP.NGbeta[kk]),
		    ddP.NGalpha[kk], ddP.NGbeta[kk]); 
      }
      if ( ddN.tokens )  
	yap_message(" sl=%.2lf", sl); 
      yap_message(" co=%.3lf%%", co);
      if ( pmicount ) 
	yap_message(" pmi=%.3f", tpmi[kk]);
      if ( fullreport ) {
	fprintf(rp,"topic %d %d", kk, k);
	fprintf(rp," %.6lf", prop);   
	if ( ddS.Nwt ) {
	  fprintf(rp," %.6lf", (1-spw));
	} else {
	  fprintf(rp," 0");
	}
	fprintf(rp," %.6lf", (1-spd) );
#ifdef KL
	yap_message(" %lf", ew);
#else
	fprintf(rp," %.2lf", ew);
#endif
#ifdef MALLET_EW
	fprintf(rp," %.2lf", ewp); 
#endif
	fprintf(rp," %.2lf", ed); 
	fprintf(rp," %.0lf", da+0.1); 
	fprintf(rp," %u", top1cnt[kk]); 
	fprintf(rp," %.6lf", ud); 
	fprintf(rp," %.6lf", pd); 
	if ( PCTL_BURSTY() ) 
	  fprintf(rp," %.3lf", ddP.bdk[kk]); 
	fprintf(rp," %.4lf", (ddN.tokens)?sl:0); 
	fprintf(rp," %.6lf", co);
	if ( pmicount ) 
	  fprintf(rp," %.4f", tpmi[kk]);
	fprintf(rp,"\n");
      }
      if ( dprop) free(dprop);
    }
    if ( verbose>1 ) {
      double pcumm = 0;
      /*
       *   print top words:
       *     Mallet:   rank, count, prob, cumm, docs, coh
       */
      yap_message("\ntopic %d/%d", kk, k);
      yap_message(" words=");
      for (w=0; w<cnt; w++) {
	if ( w>0 ) yap_message(",");
	if ( ddN.tokens ) 
	  yap_message("%s", ddN.tokens[indk[w]]);
	else
	  yap_message("%d", indk[w]);
	if ( verbose>2 ) {
	  if ( scoretype == ST_count )
	    yap_message("(%d)", (int)(tscore(indk[w])+0.2));
	  else
	    yap_message("(%6lf)", tscore(indk[w]));
	}
	if ( fullreport ) {
	  fprintf(rp, "word %d %d %d", kk, indk[w], w);
	  if ( ddS.Nwt )
	    fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]);
	  pcumm += pvec[indk[w]];
	  fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	  fprintf(rp, " %d", dfmtx[w][w]); 
	  fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w));
	  if ( ddN.tokens ) 
	    fprintf(rp, " %s", ddN.tokens[indk[w]]);
	  fprintf(rp, "\n");
	}
      }
      if ( termstats ) {
	yap_message(" terms=");
	for (w=0; w<termcnt; w++) {
	  if ( w>0 ) yap_message(",");
	  if ( ddN.tokens ) 
	    yap_message("%s", termstats->tokens[termindk[w]]);
	  else
	    yap_message("%d", termstats->Kmin+termindk[w]);
	  if ( verbose>2 ) {
	    if ( scoretype == ST_count )
	      yap_message("(%d)", (int)(termtscore(termindk[w])+0.2));
	    else
	      yap_message("(%6lf)", termtscore(termindk[w]));
	  }
	  if ( fullreport ) {
	    fprintf(rp, "term %d %d %d", kk, termindk[w], w);
	    fprintf(rp, " %d", termstats->Nkt[termindk[w]][kk]);
	    fprintf(rp, " %s", termstats->tokens[termindk[w]]);
	    fprintf(rp, "\n");
	  }
	}
      }
    }
    yap_message("\n");
    free(dfmtx[0]); free(dfmtx); 
  }
  if ( verbose>1 && ddP.PYbeta ) {
    int cnt;
    double pcumm = 0;
     /*
     *    print root words
     */
    tscorek = -1;
    cnt = buildindk(-1,indk);
    /*  this case gives bad results */
    // if ( scoretype == ST_phirat ) topk(topword, cnt, indk, phiratioscore);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    /*
     *     cannot build df mtx for root because
     *     it is latent w.r.t. topics
     */
    yap_message("Topic root words=");
    if ( fullreport ) {
      int w;
      if ( ddP.phi && ddP.PYbeta!=H_PDP ) {
	for (w=0; w<ddN.W; w++)
	  pvec[w] = ddS.phi[ddN.T][w];
      } else {
	for (w=0; w<ddN.W; w++)
	  pvec[w] = betabasewordprob(w);
      }
#ifdef KL
      double ew = fv_kl(dfvec,pvec,ddN.W);
#else
      double ew = exp(fv_entropy(pvec,ddN.W));
#endif

      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      fprintf(rp,"topic -1 -1 0 0");
      fprintf(rp," %.4lf", ew); 
      fprintf(rp," %.6lf", ud); 
      fprintf(rp," %.6lf", pd); 
      fprintf(rp,"\n");
    }
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
	yap_message("%s", ddN.tokens[indk[w]]);
      else
	yap_message("%d", indk[w]);
      if ( verbose>2 && !ddP.phi )
	yap_message("(%6lf)", countscore(indk[w]));
      if ( fullreport ) {
	fprintf(rp, "word %d %d %d", -1, indk[w], w);
	if ( ddS.TwT )
	  fprintf(rp, " %d", ddS.TwT[w]);
	pcumm += pvec[indk[w]];
	fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	fprintf(rp, " 0 0"); 
	if ( ddN.tokens ) 
	  fprintf(rp, " %s", ddN.tokens[indk[w]]);
	fprintf(rp, "\n");
      }
    }
    yap_message("\nTopical words=");
    topk(topword, cnt, indk, phiinvratioscore);
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
	yap_message("%s", ddN.tokens[indk[w]]);
      else
	yap_message("%d", indk[w]);
    }
    yap_message("\n");
  }  
  yap_message("\n");
  if ( rp )
    fclose(rp);
	     
  if ( ddS.Nwt )
    yap_message("Average topicXword sparsity = %.2lf%%\n",
                100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%\n"
	      "Underused topics = %.1lf%%\n",
	      100*(1-sparsitydoc/ddN.T), 
	      100.0*underused/(double)ddN.T);
  if ( ddS.sparse && ddP.PYalpha==H_NG ) {
    double avesp = 0;
    // correct_docsp();
    for (k=0; k<ddN.T; k++) {
      avesp += gtvec[k];
    }
    // check gtvec[] sums to 1
    assert(fabs(avesp-1.0)<0.00001);
    avesp = 0;
    for (k=0; k<ddN.T; k++) {
        avesp += gtvec[k]*((float)ddS.sparseD[k])/ddN.DTused;
	assert(ddS.sparseD[k]<=ddN.DTused);
    }
    assert(avesp<=1.0);
    assert(avesp>0.0);
    yap_message("IBP sparsity = %.2lf%%\n", 100*(1-avesp));
  }
	
  if ( pmicount ) 
    yap_message("Average PMI = %.3f\n", tpmi[ddN.T]);

  /*
   *   print 
   */
  if ( 1 ) {
    float **cmtx = hca_topmtx();
    int t1, t2;
    int m1, m2;
    float mval;
    char *corfile = yap_makename(resstem,".topcor");
    fp = fopen(corfile,"w");
    if ( !fp ) 
      yap_sysquit("Cannot open file '%s' for write\n", corfile);
    /*
     *   print file
     */
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) 
	 if ( cmtx[t1][t2]>1.0e-7 ) 
	  fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]);
    }
    fclose(fp);
    free(corfile);
    /*
     *   display maximum
     */
    m1 = 1; m2 = 0;
    mval = cmtx[1][0];
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) {
	if ( mval<cmtx[t1][t2] ) {
	  mval = cmtx[t1][t2];
	  m1 = t1;
	  m2 = t2;
	}
      }
    }
    yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval);
    free(cmtx[0]); free(cmtx);
  }

  /*
   *  print burstiness report
   */
  if ( PCTL_BURSTY() ) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
	if ( M_multi(i) )
	  totmlttbl++;
	tottbl++;
      }
      if ( M_multi(i) )
	totmlt++;
    }
    yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
		100.0*((double)ddM.dim_multiind)/ddN.N,
		100.0*((double)tottbl)/ddN.NT,
		100.0*((double)totmlttbl)/totmlt);
  }
  yap_message("\n");

  free(topfile);
  if ( repfile ) free(repfile);
  if ( top1cnt ) free(top1cnt);
  free(indk);
  free(psort);
  if ( ngalpha )
    free(ngalpha);
  if ( pmicount )
    free(tpmi);
  if ( NwK ) {
    free(NwK);
    NwK = NULL;
  }
#ifdef KL
  free(dfvec);
#endif
  free(pvec); 
  free(gtvec);
  free(gpvec);
  tstats_free(termstats);
}
Exemplo n.º 21
0
/*
 *    run regular gibbs cycles on the data with phi used;
 *    the evaluation on each doc, and sample word probs
 *
 *    if qparts>0, split collection into parts and only search this
 *
 *    K = number of top results to retain
 */
void gibbs_query(int K, char *qname, int dots, int this_qpart, int qparts) {
  /*
   *    mapping from query word posn. to its mi in current doc
   *       >ddN.N  = not in current doc
   *       -ve  = has no mi since occurs just once, found at
   *              posn  (-map[]-1)
   *       non -ve = mi value
   */
  int     *mimap = NULL;
  /*
   *     usual stuff for Gibbs loop over docs
   */
  int i, j;
  float *fact = fvec(ddN.T*4);
  D_MiSi_t dD;
  /*
   *   an index into topk[] which maintains ordering
   */
  int     *topind;

  /*
   *    these store statistics of the results, for printing
   *    these are unordered, ordered by topind[]
   */
  /*      document score  */
  float   *topscore;
  /*      document number   */
  int     *topk;
  /*      flags if ord is irrelevant, thus not scored  */
  char    *wordunused;

  /*
   *    per word stats for top results saved
   */
  int     *found;
  float   *topcnt;
  float   *topwordscore;
  /*
   *    temporary versions for when gibbs running
   */
  int     *found_buf;
  float   *topcnt_buf;
  float   *topwordscore_buf;
  double  *logprob;
  /*
   *   search here
   */
  int startdoc = 0;
  int enddoc = ddN.DT;

  /*
   *    setup
   */
  topcnt = malloc(sizeof(topcnt[0])*K*ddP.n_words);
  topwordscore = malloc(sizeof(topwordscore[0])*K*ddP.n_words);
  found = malloc(sizeof(found)*ddP.n_words*K);
  wordunused = malloc(sizeof(wordunused[0])*ddP.n_words);

  topcnt_buf = malloc(sizeof(topcnt[0])*ddP.n_words);
  topwordscore_buf = malloc(sizeof(topwordscore[0])*ddP.n_words);
  found_buf = malloc(sizeof(found)*ddP.n_words);
  if ( !topcnt || !topwordscore || !found || 
       !topcnt_buf || !topwordscore_buf || !found_buf )
    yap_quit("Cannot allocate memory in gibbs_query()\n");

  logprob = malloc(sizeof(logprob[0])*ddP.n_query);
  topscore = malloc(sizeof(topscore[0])*K*ddP.n_query);
  topind = malloc(sizeof(topind[0])*K*ddP.n_query);
  topk = malloc(sizeof(topk[0])*K*ddP.n_query);
  if ( ddP.bdk!=NULL ) 
    mimap = malloc(sizeof(mimap[0])*ddP.n_words);
  if ( !topk || !topscore || !logprob || !topind )
    yap_quit("Cannot allocate memory in gibbs_query()\n");
  for (i=0; i<ddP.n_words; i++) {
    wordunused[i] = 0;
  }
  for (i=0; i<K*ddP.n_query; i++) {
    topind[i] = i%K;
    topk[i] = -1;
    topscore[i] = INFINITY;
  }
  
  /*
   *  check words to exclude using topics
   */
  if ( ddP.n_excludetopic>0 ) {
    double *tprob = malloc(sizeof(tprob[0])*ddN.T);
    get_probs(tprob);
    yap_probs();
    if ( verbose>1 )
      yap_message("Excluding words: ");
    for (i=0; i<ddP.n_words; i++) {
      int t = besttopic(ddP.qword[i],tprob);
      if ( Q_excludetopic(t) ) {
	wordunused[i] = 1;
	if ( verbose>1 )
	  yap_message(" %d/%d", (int)ddP.qword[i], t);
      }
    } 
    if ( verbose>1 )
      yap_message("\n");
    free(tprob);
  }
  
  if ( ddP.bdk!=NULL ) misi_init(&ddM,&dD);

  if ( qparts>0 ) {
    startdoc = ((double)this_qpart)/qparts * ddN.DT;
    enddoc = ((double)this_qpart+1.0)/qparts * ddN.DT;
  }
  for(i=startdoc; i<enddoc; i++) {
    int  thisw =  add_doc(i, GibbsNone);
    int  r;
    if ( thisw<=1 ) {
      remove_doc(i, GibbsNone);
      continue;
    }
    if ( ddP.bdk!=NULL ) 
      misi_build(&dD, i, 0);
    map_query(i, mimap, found_buf);
    for (j=0; j<ddP.n_words; j++) {
      topcnt_buf[j] = 0;
      topwordscore_buf[j] = 0;
    }
    
    for (r=0; r<ddP.queryiter; r++) {
      gibbs_lda(GibbsNone, ddN.T, i, ddD.NdT[i], fact, &dD, 0, 0);
      query_docprob(i, mimap, fact, &dD, topcnt_buf, topwordscore_buf);
    }  
    /*
     *  now adjust stats
     */
    for (j=0; j<ddP.n_query; j++) 
      logprob[j] = 0;
    for (j=0; j<ddP.n_words; j++) {
      if ( wordunused[j]>0 )
	continue;
      if ( ddP.query[ddP.qword[j]]==j ) {
	topcnt_buf[j] /= ddP.queryiter;
	topwordscore_buf[j] /= ddP.queryiter;
      } else {
	/*  word in previous query so copy  */
	int jj =  ddP.query[ddP.qword[j]];
	topcnt_buf[j] = topcnt_buf[jj];
	topwordscore_buf[j] = topwordscore_buf[jj];
	found_buf[j] = found_buf[jj];
      }
      if ( wordunused[j]==0 )
	logprob[ddP.qid[j]] += topwordscore_buf[j];
    }
    if ( dots>0 && i>0 && (i%dots==0) ) 
      yap_message(".");
    if ( ddP.bdk!=NULL ) misi_unbuild(&dD,i,0);
    remove_doc(i, GibbsNone);
    /*
     *   enter into the arrays
     */
    for (j=0; j<ddP.n_query; j++) {
      if ( i<K || logprob[j] < topscore[j*K+topind[j*K+K-1]] ) {
	int newind, l;
	/*
	 *   better than current lowest 
	 */
	newind = bubble((i<K)?(i+1):K, 
			&topind[j*K], &topscore[j*K], logprob[j]);
	/*
	 *   save the current details
	 */
	topscore[j*K+newind] = logprob[j];
	topk[j*K+newind] = i;
	for (l=ddP.qposn[j]; l<ddP.qposn[j+1]; l++) {
	  topcnt[newind*ddP.n_words+l] = topcnt_buf[l]; 
	  topwordscore[newind*ddP.n_words+l] = topwordscore_buf[l]; 
	  found[newind*ddP.n_words+l] = found_buf[l]; 
	}
      }
    }
  }
  if ( dots>0 ) yap_message("\n");
  
  /*
   *  write result
   */
  {
    float *ws = fvec(ddP.n_words);
    FILE *fp = fopen(qname,"w");
    int q;
    if ( !fp )
      yap_sysquit("Cannot write query results to '%s'\n", qname);
    for (q=0; q<ddP.n_query; q++) {
      int nw = ddP.qposn[q+1]-ddP.qposn[q];
      for (i=0; i<K && i<ddN.DT && topk[topind[q*K+i]]>=0; i++) {
	int l, ind = topind[q*K+i];
	double tfidf;
        tfidf = bm25(topk[q*K+ind],&found[ind*ddP.n_words+ddP.qposn[q]],
			    &ddP.qword[ddP.qposn[q]], nw, ws);
	assert(ind>=0 && ind<K);
	fprintf(fp, "%d %d ", q, topk[q*K+ind]);
	fprintf(fp, "%.4f %.4lf ", topscore[q*K+ind]/nw, tfidf);
        if ( verbose>1 ) {
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%d ", found[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topcnt[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topwordscore[ind*ddP.n_words+l]);
          for (l=0; l<nw; l++)
            fprintf(fp, "%lf ", ws[l]);
        }
        fprintf(fp, "\n");
      }
    }
    fclose(fp);
    free(ws);
  }

  /*
   *  clean up
   */
  free(fact);
  if ( ddP.bdk!=NULL ) misi_free(&dD);
  if ( mimap ) free(mimap);
  free(found);
  free(topwordscore);
  free(topcnt);
  free(found_buf);
  free(topwordscore_buf);
  free(topcnt_buf);
  free(topscore);
  free(topind);
  free(topk);
  free(logprob);
}
Exemplo n.º 22
0
/********************************
 *   code for LDA 
 *****************************/
double gibbs_lda(/*
      *  fix==GibbsNone for standard ML training/testing
      *  fix==GibbsHold for word hold-out testing,
                  *       same as GibbsNone but also handles
      *       train and test words differently
      */
     enum GibbsType fix,
     int did,    //  document index
     int words,  //  do this many
     float *p,    //  temp store
     D_MiSi_t *dD
     ) {
  int i, wid, t, mi=0;
  int e;
  double Z, tot;
  double logdoc = 0;
  int logdocinf = 0;
  int StartWord = ddD.N_dTcum[did];
  int EndWord = StartWord + words;
  float dtip[ddN.T];
#ifdef MH_STEP
  double doc_side_cache[ddN.T];
  for (t=0; t<ddN.T; t++) 
    doc_side_cache[t] = doc_side_fact(did,t);
#endif

  /*
   *   some of the latent variables are not sampled
   *   are kept in the testing version, uses enum GibbsType
   *      fix = global document setting
   *      fix_doc = settings for word in this doc
   *
   *   NB.   if fix==GibbsNone, then fix_doc==fix
   *         if fix==GibbsHold then fix_doc==GibbsHold or GibbsNone
   */
  enum GibbsType fix_doc = fix;

  if ( PCTL_BURSTY() ) {
    mi = ddM.MI[did];
  }
  e = ddD.e[did];

  for (i=StartWord; i<EndWord; i++) {
#ifdef MH_STEP
    int oldt;
#endif
    if ( fix==GibbsHold ) {
      if ( pctl_hold(i) )
	fix_doc = GibbsHold;  //   this word is a hold out
      else
	fix_doc = GibbsNone;
    }
    // check_m_vte(e);
    wid=ddD.w[i]; 
    /*******************
     *   first we remove affects of this word on the stats
     *******************/
#ifdef MH_STEP
    oldt = 
#endif
      t = Z_t(ddS.z[i]); 
    if ( fix_doc!=GibbsHold ) {
      if ( remove_topic(i, did, (!PCTL_BURSTY()||Z_issetr(ddS.z[i]))?wid:-1, 
                        t, mi, dD) ) {
	goto endword;
      }
    }
    /***********************
     *    get topic probabilities
     ***********************/
    // check_m_vte(e);
#ifdef MU_CACHE
    mu_side_fact_update(e);
#endif
#ifdef PHI_CACHE
    phi_norm_update(wid, e);
    phi_sum_update(wid, e, i);
#endif
    for (t=0, Z=0, tot=0; t<ddN.T; t++) {
#ifdef MH_STEP
      int saveback = ddP.back;
      if ( fix_doc!=GibbsHold )
        ddP.back = 0;
#endif
      /*
       *   (fix_doc==GibbsHold) =>
       *       doing estimation, not sampling so use prob versions
       *    else
       *        doing sampling so use fact versions
       */
#ifdef MH_STEP
      double tf = (fix_doc==GibbsHold)?doc_side_prob(did,t):
        doc_side_cache[t];
      if ( tf>0 ) {
        double wf = (fix_doc==GibbsHold)?word_side_prob(e, wid, t):
          word_side_fact(e, wid, t);
#else
      double tf = (fix_doc==GibbsHold)?doc_side_prob(did,t):
        doc_side_fact(did,t);
      if ( tf>0 ) {
        double wf = (fix_doc==GibbsHold)?word_side_prob(e, wid, t):
          word_side_fact(e, wid, t);
#endif
        tot += tf;
        if ( PCTL_BURSTY() ) 
          wf = (fix_doc==GibbsHold)?docprob(dD, t, i, mi, wf):
            docfact(dD, t, i, mi, wf, &dtip[t]);
        Z += p[t] = tf * wf;
      } else
        p[t] = 0;
#ifdef MH_STEP
      ddP.back = saveback;
#endif
    }
    if ( fix!=GibbsHold || fix_doc==GibbsHold )
      logdoc += log(Z/tot);
    if ( logdocinf==0 ) 
      if ( !finite(logdoc) ) {
	logdocinf++;
	yap_infinite(logdoc);
      }

    /*******************
     *   now sample t using p[] and install affects of this on the stats;
     *   but note this needs indicator to be set!
     *******************/
    if ( fix_doc!=GibbsHold ) {
      /*
       *  sample and update core stats 
       */
      t = samplet(p, Z, ddN.T, rng_unit(rngp));
#ifdef MH_STEP
      if ( t != oldt ) {
        double ratio  = p[oldt]/p[t];
        if ( PCTL_BURSTY() ) {
          ratio *= docfact(dD, t, i, mi, word_side_fact(e, wid, t), &dtip[t])
            * doc_side_fact(did,t);
          ratio /= docfact(dD, oldt, i, mi, word_side_fact(e, wid, oldt), &dtip[oldt])
            * doc_side_fact(did,oldt);
        } else {
          ratio *= word_side_fact(e, wid, t) * doc_side_fact(did, t);
          ratio /= word_side_fact(e, wid, oldt) * doc_side_fact(did, oldt);
        }
        if ( ratio<1 && ratio<rng_unit(rngp) )
          t = oldt;
      }
#endif
      Z_sett(ddS.z[i],t);
#ifdef TRACE_WT
      if ( wid==TR_W && t==TR_T )
        yap_message("update_topic(w=%d,t=%d,d=%d,l=%d,z=%d,N=%d,T=%d)\n",
                    wid,t,did,i,ddS.z[i],
                    (int)ddS.m_vte[wid][t][e],(int)ddS.s_vte[wid][t][e]);
#endif
      update_topic(i, did, wid, t, mi, dtip[t], dD);
#ifdef TRACE_WT
      if ( wid==TR_W && t==TR_T )
        yap_message("after update_topic(w=%d,t=%d,d=%d,l=%d,z=%d,N=%d,T=%d)\n",
                    wid,t,did,i,ddS.z[i],
                    (int)ddS.m_vte[wid][t][e],(int)ddS.s_vte[wid][t][e]);
#endif
    }
    endword:
    if ( PCTL_BURSTY() && M_multi(i) ) {
      mi++;
    }
  }
  return logdoc;
}
Exemplo n.º 23
0
void hca_displaytopics(char *stem, char *resstem, int topword, 
                       enum ScoreType scoretype, int pmicount, int fullreport) {
  int w,k;
  uint32_t *indk = NULL;
  int Nk_tot = 0;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  uint32_t *top1cnt = NULL;
  FILE *fp;
  float *tpmi = NULL;
  char *topfile;
  char *repfile;
  uint32_t *psort;
  FILE *rp = NULL;
  float *gtvec = globalprop();
  float *gpvec = calloc(ddN.W,sizeof(gpvec[0]));
  float *pvec = calloc(ddN.W,sizeof(pvec[0]));
  
  if ( pmicount>topword )
    pmicount = topword;
  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }    

  /*
   *  first collect counts of each word/term,
   *  and build gpvec (mean word probs)
   */
  build_NwK();
  {
    /*
     *  gpvec[] is normalised NwK[]
     */
    double tot = 0;
    for (w=0; w<ddN.W; w++)
      tot += gpvec[w] = NwK[w]+0.1; 
    for (w=0; w<ddN.W; w++)
      gpvec[w] /= tot;
  }
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      Nk_tot += ddS.NWt[k];
    }
  } 
  
  psort = sorttops(gtvec, ddN.T);
  
  top1cnt = hca_top1cnt();
  if ( !top1cnt )
    yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n");


  if ( pmicount ) {
    tpmi = malloc(sizeof(*tpmi)*(ddN.T+1));
    if ( !tpmi )
      yap_quit("Cannot allocate tpmi in hca_displaytopics()\n");
  }
  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk in hca_displaytopics()\n");

  /*
   *   two passes through, 
   *           first to build the top words and dump to file
   */
  repfile = yap_makename(resstem,".topset");
  topfile = yap_makename(resstem,".toplst");
  fp = fopen(topfile,"w");
  if ( !fp ) 
    yap_sysquit("Cannot open file '%s' for write\n", topfile);
  yap_message("\n");
  for (k=0; k<ddN.T; k++) {
    int cnt;
    tscorek = k;
    /*
     *    build sorted word list
     */
    cnt = buildindk(k, indk);
    topk(topword, cnt, indk, tscore);
    if ( cnt==0 )
      continue;
    /*
     *   dump words to file
     */
    fprintf(fp,"%d: ", k);
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr)  ) {
    int cnt;
     /*
     *    dump root words
     */
    tscorek = -1;
    cnt = buildindk(-1, indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    fprintf(fp,"-1:");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  if ( verbose>1 ) yap_message("\n");

  if ( pmicount ) {
    /*
     * compute PMI
     */
    char *toppmifile;
    char *pmifile;
    double *tp;
    tp = dvec(ddN.T);
    pmifile=yap_makename(stem,".pmi");
    toppmifile=yap_makename(resstem,".toppmi");
    get_probs(tp);
    report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, 
               pmicount, tp, tpmi);
    free(toppmifile);
    free(pmifile);
    free(tp);
  }

  /*
   *   now report words and diagnostics
   */
  //ttop_open(topfile);
  if ( fullreport ) {
    rp = fopen(repfile,"w");
    if ( !rp ) 
      yap_sysquit("Cannot open file '%s' for write\n", repfile);
    fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one "
	    "dist-unif dist-unigrm");
    if ( PCTL_BURSTY() ) 
      fprintf(rp, " burst-concent");
    if ( ddN.tokens )  
      fprintf(rp, " ave-length");
    fprintf(rp, " coher");
    if ( pmicount ) 
      fprintf(rp, " pmi");
    fprintf(rp, "\n#word topic index rank");
    if ( ddS.Nwt )
      fprintf(rp, " count");
    fprintf(rp, " prop cumm df coher\n");
    
  }
  for (k=0; k<ddN.T; k++) {
    int cnt;
    int kk = psort[k];
    uint32_t **dfmtx;

    if ( ddP.phi==NULL && ddS.NWt[kk]==0 )
      continue;
    /*
     *   grab word prob vec for later use
     */
    if ( ddS.Nwt ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = wordprob(w,kk);
    } else if ( ddP.phi ) 
      fv_copy(pvec, ddP.phi[kk], ddN.W);
    else if ( ddS.phi ) 
      fv_copy(pvec, ddS.phi[kk], ddN.W);

    /*
     *  rebuild word list
     */
    tscorek = kk;
    cnt = buildindk(kk, indk);
    topk(topword, cnt, indk, tscore);
    if ( topword<cnt )
      cnt = topword;
    assert(cnt>0);
    /*
     *     df stats for topic returned as matrix
     */
    dfmtx = hca_dfmtx(indk, cnt, kk);

    if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) 
      underused++;
    /*
     *  print stats for topic
     *    Mallet:  tokens, doc_ent, ave-word-len, coher., 
     *             uni-dist, corp-dist, eff-no-words
     */
    yap_message("Topic %d/%d", kk, k);
    {
      /*
       *   compute diagnostics
       */
      double prop = gtvec[kk];
      float *dprop = docprop(kk);
      double spw = 0;
      double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); 
      double ew = exp(fv_entropy(pvec,ddN.W));
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W);
      double co = coherence(dfmtx, cnt);
      double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT;
      double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0;
      sparsitydoc += spd;
      yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop);   
      if ( ddS.Nwt ) {
	spw = ((double)nonzero_Nwt(kk))/((double)ddN.W);
	sparsityword += spw;
	yap_message(" ws=%.1lf%%", 100*(1-spw));
      } 
      yap_message(" ds=%.1lf%%", 100*(1-spd) );
      yap_message(" ew=%.0lf", ew); 
      yap_message(" ed=%.1lf", ed); 
      yap_message(" da=%.0lf", da+0.1); 
      yap_message(" t1=%u", top1cnt[kk]); 
      yap_message(" ud=%.3lf", ud); 
      yap_message(" pd=%.3lf", pd); 
      if ( PCTL_BURSTY() ) 
	yap_message(" bd=%.3lf", ddP.bdk[kk]); 
      if ( ddN.tokens )  
	yap_message(" sl=%.2lf", sl); 
      yap_message(" co=%.3lf%%", co);
      if ( pmicount ) 
	yap_message(" pmi=%.3f", tpmi[kk]);
      if ( fullreport ) {
	fprintf(rp,"topic %d %d", kk, k);
	fprintf(rp," %.6lf", prop);   
	if ( ddS.Nwt ) {
	  fprintf(rp," %.6lf", (1-spw));
	} else {
	  fprintf(rp," 0");
	}
	fprintf(rp," %.6lf", (1-spd) );
	fprintf(rp," %.2lf", ew); 
	fprintf(rp," %.2lf", ed); 
	fprintf(rp," %.0lf", da+0.1); 
	fprintf(rp," %u", top1cnt[kk]); 
	fprintf(rp," %.6lf", ud); 
	fprintf(rp," %.6lf", pd); 
	if ( PCTL_BURSTY() ) 
	  fprintf(rp," %.3lf", ddP.bdk[kk]); 
	fprintf(rp," %.4lf", (ddN.tokens)?sl:0); 
	fprintf(rp," %.6lf", co);
	if ( pmicount ) 
	  fprintf(rp," %.4f", tpmi[kk]);
	fprintf(rp,"\n");
      }
      if ( dprop) free(dprop);
    }
    if ( verbose>1 ) {
      double pcumm = 0;
      /*
       *   print top words:
       *     Mallet:   rank, count, prob, cumm, docs, coh
       */
      yap_message("\ntopic %d/%d", kk, k);
      yap_message(" words=");
      for (w=0; w<cnt; w++) {
	if ( w>0 ) yap_message(",");
	if ( ddN.tokens ) 
	  yap_message("%s", ddN.tokens[indk[w]]);
	else
	  yap_message("%d", indk[w]);
	if ( verbose>2 )
	  yap_message("(%6lf)", tscore(indk[w]));
	if ( fullreport ) {
	  fprintf(rp, "word %d %d %d", kk, indk[w], w);
	  if ( ddS.Nwt )
	    fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]);
	  pcumm += pvec[indk[w]];
	  fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	  fprintf(rp, " %d", dfmtx[w][w]); 
	  fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w));
	  if ( ddN.tokens ) 
	    fprintf(rp, " %s", ddN.tokens[indk[w]]);
	  fprintf(rp, "\n");
	}
      }
    }
    yap_message("\n");
    free(dfmtx[0]); free(dfmtx); 
  }
  if ( verbose>1 && ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) {
    int cnt;
    double pcumm = 0;
     /*
     *    print root words
     */
    tscorek = -1;
    cnt = buildindk(-1,indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    /*
     *     cannot build df mtx for root because
     *     it is latent w.r.t. topics
     */
    yap_message("Topic root words=");
    if ( fullreport ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = betabasewordprob(w);
      double ew = exp(fv_entropy(pvec,ddN.W));
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      fprintf(rp,"topic -1 -1 0 0");
      fprintf(rp," %.4lf", ew); 
      fprintf(rp," %.6lf", ud); 
      fprintf(rp," %.6lf", pd); 
      fprintf(rp,"\n");
    }
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
	yap_message("%s", ddN.tokens[indk[w]]);
      else
	yap_message("%d", indk[w]);
      if ( verbose>2 )
	yap_message("(%6lf)", countscore(indk[w]));
      if ( fullreport ) {
	fprintf(rp, "word %d %d %d", -1, indk[w], w);
	if ( ddS.TwT )
	  fprintf(rp, " %d", ddS.TwT[w]);
	pcumm += pvec[indk[w]];
	fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	fprintf(rp, " 0 0"); 
	if ( ddN.tokens ) 
	  fprintf(rp, " %s", ddN.tokens[indk[w]]);
	fprintf(rp, "\n");
      }   
    }
    yap_message("\n");
  }
  yap_message("\n");
  if ( rp )
    fclose(rp);
	     
  if ( ddS.Nwt )
    yap_message("Average topicXword sparsity = %.2lf%%\n",
                100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%\n"
	      "Underused topics = %.1lf%%\n",
	      100*(1-sparsitydoc/ddN.T), 
	      100.0*underused/(double)ddN.T);
  if ( pmicount ) 
    yap_message("Average PMI = %.3f\n", tpmi[ddN.T]);

  /*
   *   print 
   */
  if ( 1 ) {
    float **cmtx = hca_topmtx();
    int t1, t2;
    int m1, m2;
    float mval;
    char *corfile = yap_makename(resstem,".topcor");
    fp = fopen(corfile,"w");
    if ( !fp ) 
      yap_sysquit("Cannot open file '%s' for write\n", corfile);
   /*
    *   print file
     */
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) 
	 if ( cmtx[t1][t2]>1.0e-3 ) 
	  fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]);
    }
    fclose(fp);
    free(corfile);
    /*
     *   display maximum
     */
    m1 = 1; m2 = 0;
    mval = cmtx[1][0];
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) {
	if ( mval<cmtx[t1][t2] ) {
	  mval = cmtx[t1][t2];
	  m1 = t1;
	  m2 = t2;
	}
      }
    }
    yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval);
    free(cmtx[0]); free(cmtx);
  }

  /*
   *  print burstiness report
   */
  if ( PCTL_BURSTY() ) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
	if ( M_multi(i) )
	  totmlttbl++;
	tottbl++;
      }
      if ( M_multi(i) )
	totmlt++;
    }
    yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
		100.0*((double)ddM.dim_multiind)/ddN.N,
		100.0*((double)tottbl)/ddN.NT,
		100.0*((double)totmlttbl)/totmlt);
  }
  yap_message("\n");

  free(topfile);
  if ( repfile ) free(repfile);
  if ( top1cnt ) free(top1cnt);
  free(indk);
  free(psort);
  if ( pmicount )
    free(tpmi);
  if ( NwK ) {
    free(NwK);
    NwK = NULL;
  }
  free(pvec); 
  free(gtvec);
  free(gpvec);
}
Exemplo n.º 24
0
/*==========================================
 * main
 *========================================== */
int main(int argc, char* argv[])
{
  int c, iter, ITER=0, seed=0;
  enum dataType data = LdaC;
  enum dataType testdata = LdaC;
  int dots = 0;

  enum GibbsType fix_hold = GibbsNone;
  char *stem;
  char *resstem;
  int topwords = 20;
  int noerrorlog = 0;
  int displayed = 0;
  int load_vocab = 0;
  int checkpoint = 0;
  int restart = 0;
  int dopmi = 0;
  int restart_hca = 0;
  int load_phi = 0;
  int load_mu = 0;
  int procs = 1;
  int maxW = 0;
  enum ScoreType score=ST_idf;
  
  double BM0val=0, BM1val =0, BP0val=0, BP1val=0;
  
  clock_t t1=0, t2=0, t3=0;
  double tot_time = 0;
  double psample_time = 0;
  enum ParType par;
  /*
   *  default values
   */
  ddN.T = 10;
  ITER = 100;
  ddN.TEST = 0;

  pctl_init();

  while ( (c=getopt(argc, argv,"b:c:C:d:ef:F:g:G:h:K:l:L:N:o:pq:vr:s:S:t:T:vVW:"))>=0 ) {
    switch ( c ) {
    case 'b':
      if ( !optarg || sscanf(optarg,"%d",&ddP.back)!=1 )
        yap_quit("Need a valid 'b' argument\n");
      break;
    case 'c':
      if ( !optarg || sscanf(optarg,"%d",&checkpoint)!=1 )
        yap_quit("Need a valid 'c' argument\n");
      break;
    case 'C':
      if ( !optarg || sscanf(optarg,"%d",&ITER)!=1 )
	yap_quit("Need a valid 'C' argument\n");
      break;
    case 'd':
      if ( !optarg || sscanf(optarg,"%d",&dots)!=1 )
	yap_quit("Need a valid 'd' argument\n");
      break;
    case 'e':
      noerrorlog++;
      break;
    case 'f':
      if ( strcmp(optarg,"witdit")==0 ) 
	data = WitDit;
      else if ( strcmp(optarg,"docword")==0 ) 
	data = Docword;
      else if ( strcmp(optarg,"ldac")==0 ) 
	data = LdaC;
      else if ( strcmp(optarg,"bag")==0 ) 
	data = TxtBag;
      else if ( strcmp(optarg,"lst")==0 ) 
	data = SeqTxtBag;
       else
	yap_quit("Illegal data type for -f\n");
      break;
    case 'F':
      if ( strcmp(optarg,"all")==0 ) {
	for (par=ParAM; par<=ParBB; par++) 
	  ddT[par].fix = 1;
      } else {
	par = findpar(optarg);
	if ( par==ParNone )
	  yap_quit("Illegal arg for -F\n");
	ddT[par].fix = 1;
      }
      break;
    case 'g':
	{
	  char var[100];
	  int st=0;
	  if ( !optarg || sscanf(optarg,"%[^, ],%d", &var[0], &st)<1  )
            yap_quit("Need a valid 'g' argument\n");
          par = findpar(var);
          if ( par==ParBP1 )
            ddP.kbatch = st;
          else
            yap_quit("Illegal var for -g\n");
        }
        break;      
    case 'G':
      {
	char var[100];
	int st=0, cy=0;
	if ( !optarg || sscanf(optarg,"%[^, ],%d,%d",
			       &var[0], &cy, &st)<2 || st<0 || cy<0 )
	  yap_quit("Need a valid 'G' argument\n");
	par = findpar(var);
	if ( par==ParNone || par==ParB0P || par==ParB0M )
	  yap_quit("Illegal var for -G\n");
        ddT[par].fix = 0;
	ddT[par].start = st;
	ddT[par].cycles = cy;
      }
      break;
    case 'h':
      {
	fix_hold = GibbsHold;
	if ( !optarg  )
	  yap_quit("Need a valid 'h' argument\n");
        if ( strncmp(optarg,"dict,",5)==0 ) {
          if ( sscanf(&optarg[5],"%d",&ddP.hold_dict)<1 || ddP.hold_dict<2 )
            yap_quit("Need a valid 'hdict' argument\n");
        } else if ( strncmp(optarg,"fract,",6)==0 ) {
          if ( sscanf(&optarg[6],"%lf",&ddP.hold_fraction)<1 
               || ddP.hold_fraction<=0 || ddP.hold_fraction>=1 )
            yap_quit("Need a valid 'hfract' argument\n");
        } else if ( strncmp(optarg,"doc,",4)==0 ) {
          if ( sscanf(&optarg[4],"%d",&ddP.hold_every)<1 || ddP.hold_every<2 )
            yap_quit("Need a valid 'hdoc' argument\n");
        } else
          yap_quit("Need a valid 'h' argument\n");
      }
      break;
   case 'K':
      if ( !optarg || sscanf(optarg,"%d",&ddN.T)!=1 )
	yap_quit("Need a valid 'K' argument\n");
      break;
    case 'l':
      if ( !optarg )
	yap_quit("Need a valid 'l ' argument\n");
      if ( strncmp(optarg,"phi,",4)==0 ) {
	if ( sscanf(&optarg[4],"%d,%d",&ddP.phiiter, &ddP.phiburn)<2 )
	  yap_quit("Need a valid 'l word,' argument\n");      
      } else if ( strncmp(optarg,"theta,",6)==0 ) {
	if ( sscanf(&optarg[6],"%d,%d",&ddP.thetaiter, &ddP.thetaburn)<2 )
	  yap_quit("Need a valid 'l word,' argument\n");      
      } else if ( strncmp(optarg,"mu,",3)==0 ) {
	if ( sscanf(&optarg[3],"%d,%d",&ddP.muiter, &ddP.muburn)<2 )
	  yap_quit("Need a valid 'l word,' argument\n");      
      } else if ( strncmp(optarg,"prog,",5)==0 ) {
	if ( sscanf(&optarg[5],"%d,%d",&ddP.progiter, &ddP.progburn)<2 )
	  yap_quit("Need a valid 'l prog,' argument\n");
      } else
	yap_quit("Need a valid DIAG code in 'l' argument\n");
      break;
    case 'L':
      if ( !optarg )
	yap_quit("Need a valid 'L ' argument\n");
      if ( strncmp(optarg,"like,",5)==0 ) {
	if ( sscanf(&optarg[5],"%d,%d",&ddP.mltiter, &ddP.mltburn)<1 )
	  yap_quit("Need a valid 'L like' argument\n");
      } else
	yap_quit("Need a valid DIAG code in 'L' argument\n");
      break;
    case 'N':
      if ( !optarg || sscanf(optarg,"%d,%d", &ddP.maxN, &ddP.maxM)<1 )
	yap_quit("Need a valid 'N' argument\n");
      break;
    case 'o':
      {
	char *ptr = strchr(optarg, ',');
	int len = strlen(optarg);
	if ( ptr ) 
	  len = ptr - optarg;
        if ( strncmp(optarg,"idf",len)==0 )
          score = ST_idf;
        else if ( strncmp(optarg,"count",len)==0 )
          score = ST_count;
        else if ( strncmp(optarg,"Q",len)==0 )
          score = ST_Q;
        else if ( strncmp(optarg,"cost",len)==0 )
          score = ST_cost;
        else
          yap_quit("Need a valid parameter for 'o' argument\n");
	if ( ptr ) {
	  /*  there was a second arg */
	  if ( sscanf(ptr+1, "%d", &topwords) != 1)
	    yap_quit("Need a valid second 'o' argument\n");
	}
      break;
      }
      break;
   case 'p':
      dopmi++;
      break;
   case 'q':
      if(!optarg || sscanf(optarg, "%d", &procs) != 1)
	yap_quit("Need a valid 'q' argument\n");
      break;
    case 'r':
      if(!optarg )
	yap_quit("Need a valid 'r' argument\n");
      if ( strcmp(optarg,"tca")==0 )
	restart++;
      else if ( strcmp(optarg,"hca")==0 )
	restart_hca++;
      else if ( strcmp(optarg,"phi")==0 )
	load_phi++;
      else if ( strcmp(optarg,"mu")==0 )
	load_mu++;
      else
	yap_quit("Need a valid 'r' argument\n");
      break;
    case 's':
      if ( !optarg || sscanf(optarg,"%d",&seed)!=1 )
	yap_quit("Need a valid 's' argument\n");
      break;
    case 'S':
      {
	char var[100];
	double vin=0;
	if ( !optarg || sscanf(optarg,"%[^=, ]=%lf",
			       &var[0], &vin)<2  )
	  yap_quit("Need a valid 'S' argument\n");
	par = findpar(var);
	if ( par==ParNone )
	  yap_quit("Illegal var for -S\n");
	else if ( par==ParBM0 ) 
	  BM0val = vin;
	else if ( par==ParBM1 ) 
	  BM1val = vin;
	else if ( par==ParBP0 ) 
	  BP0val = vin;
	else if ( par==ParBP1 ) 
	  BP1val = vin;
	else
	  *(ddT[par].ptr) = vin;
      }   
      break;
    case 't':
      if ( !optarg || sscanf(optarg,"%d",&ddP.training)!=1 )
	yap_quit("Need a valid 't' argument\n");
      break;
    case 'T':
      if ( !optarg )
	yap_quit("Need a valid 'T' argument\n");
      {
	char *tname = data_name(optarg,data);
	FILE *fp = fopen(tname,"r");
	if ( fp==NULL ) {
          free(tname);
	  tname = data_name(optarg,testdata);
	  fp = fopen(tname,"r");
        } else {
	  testdata = data;
        }
        free(tname);
	if ( fp!=NULL ) {
	  /*  its a valid test filename */
          ddP.teststem = optarg;
	  fclose(fp);
	} else if ( sscanf(optarg,"%d",&ddN.TEST)!=1 )
	  yap_quit("Need a valid 'T' argument\n");
      }
      break;
    case 'v':
      verbose++;
      break;
    case 'V':
      load_vocab = 1;
      break;
    case 'W':
      if ( !optarg || sscanf(optarg,"%d",&maxW)<1 )
	yap_quit("Need a valid 'W' argument\n");
      break;
    default:
      yap_quit("Unknown option '%c'\n", c);
    }
  }

  if (argc-optind != 2) {
    usage();
    exit(-1);
  }
  if ( optind>=argc ) {
    yap_quit("No arguments given\n");
  }
  stem = strdup(argv[optind++]);
  resstem = strdup(argv[optind++]);

  if ( dopmi )
    load_vocab = 1;
  if ( dopmi && verbose !=2 ) {
    /*  
     *   due to the use of the ".top" file
     *   its really multi-purpose 
     */
    yap_quit("When computing PMI verbose must be exactly 2\n");
  }

  if ( noerrorlog==0 ) {
    char *wname = yap_makename(resstem, ".log");
    yap_file(wname);
    free(wname);
  }
  
  yap_commandline(argc, argv);
#ifdef H_THREADS
  yap_message(" Threads,");
#endif

  if ( restart || restart_hca ) {
    char *fname = yap_makename(resstem,".par");
    FILE *fp = fopen(fname,"r");
    char *buf;
    if ( !fp ) 
      yap_quit("Parameter file '%s' doesn't exist\n", fname);
    fclose(fp);
    free(fname);
    buf = readpar(resstem,"T",50);
    if ( !buf ) 
      yap_quit("Parameter file '%s' has no T\n", fname);
    ddN.T = atoi(buf);
    free(buf);
    if ( restart ) {
      buf = readpar(resstem,"E",50);
      if ( !buf ) 
	yap_quit("Parameter file '%s' has no E\n", fname);
      ddN.E = atoi(buf);
      free(buf);
      pctl_read(resstem);
    }
    if ( maxW==0 ) {
      buf = readpar(resstem,"W",50);
      if ( buf ) {
	maxW = atoi(buf);
	free(buf);
      }
    }
    if ( ddP.training==0 ) {
      buf = readpar(resstem,"TRAIN",50);
      if ( buf ) {
	ddP.training = atoi(buf);
	free(buf);
      } 
    }
    if ( ddN.TEST==0 ) {
      buf = readpar(resstem,"TEST",50);
      if ( buf ) {
	ddN.TEST = atoi(buf);
	free(buf);
      }
    }
  } 

  assert(ddN.T>0);
  assert(ddN.TEST>=0);
  assert(restart || restart_hca || ITER>0);
	
  if ( load_phi && ddP.phiiter>0 )
    yap_quit("Options '-l phi,...' and '-r phi' incompatible\n");
  if ( load_mu && ddP.muiter>0 )
    yap_quit("Options '-l mu,...' and '-r mu' incompatible\n");

  /*
   *   set random number generator
   */
  if ( seed ) {
    rng_seed(rngp,seed);
  } else {
    rng_time(rngp,&seed);
  }
  yap_message("Setting seed = %lu\n", seed);
  
  /*
   *  read data and get dimensions
   */
  {
    D_bag_t *dbp = data_read(stem, data);
    int training = pctl_training(dbp->D);
    if ( ddP.teststem ) {
      D_bag_t *dbpt = data_read(ddP.teststem, testdata);
      /* need to load a separate test set, strip to bare training */
      data_shrink(dbp, training);
      ddN.TEST = dbpt->D;
      data_append(dbp, dbpt);
      free(dbpt->w);  free(dbpt->d); free(dbpt);
    }
    if ( maxW>0 ) {
      if ( dbp->W <= maxW ) 
        dbp->W = maxW;
      if ( dbp->W > maxW )
        data_vocabshrink(dbp, maxW);
    }
    /*
     *  transfer into system
     */
    ddN.D = dbp->D;
    ddN.W = dbp->W;    
    ddN.N = dbp->N;
    ddN.NT = dbp->N;
    ddN.DT = training;
    ddD.w = dbp->w;
    ddD.d = dbp->d;
    free(dbp);
    if ( ddN.DT<ddN.D ) {
      /*  recompute NT */
      int i;
      for (i=0; i<ddN.N; i++)
        if ( ddD.d[i]>=ddN.DT )
          break;
      ddN.NT = i;
    }
  }

  data_read_epoch(stem);

  /*
   *  at this point, dimensions are fixed, so load phi and mu if needed
   */
  if ( load_phi )
    pctl_loadphi(resstem);
  if ( load_mu )
    pctl_loadmu(resstem);

  /*
   *   correct parameters after command line
   */
  pctl_fix(ITER);
  if ( BM0val>0 ) {
    ddP.b_mu[0] = BM0val;
  }
  if ( BM1val>0 ) {
    int i;
    for (i=1; i<ddN.E; i++)
      ddP.b_mu[i] = BM1val;
  }
  if ( BP0val>0 ) {
    int i;
    for (i=0; i<ddN.T; i++)
      ddP.b_phi[0][i] = BP0val;
  }
  if ( BP1val>0 ) {
    int i;
    if ( ddN.E==1 )
      yap_quit("b_phi[1] invalid when epochs==1\n");
    for (i=0; i<ddN.T; i++)
      ddP.b_phi[1][i] = BP1val;
  }
  pctl_samplereport();

  /*
   *   all data structures
   */
  data_alloc();
  if ( ddP.phiiter>0 )
    phi_init(resstem);
  else 
    ddS.phi = NULL;
  if ( ddP.muiter>0 )
    mu_init(resstem);
  else 
    ddS.mu = NULL;
  if ( ddP.thetaiter>0 )
    theta_init(resstem);
  else 
    ddS.theta = NULL;
  tca_alloc();
  if ( PCTL_BURSTY() ) 
    dmi_init(&ddM, ddS.z, ddD.w, ddD.N_dTcum,
             ddN.T, ddN.N, ddN.W, ddN.D, ddN.DT,
	     (fix_hold==GibbsHold)?pctl_hold:NULL);
  if ( load_vocab ) {
    data_vocab(stem);
  }

  cache_init();
  
  /*
   *  yap some details
   */
  data_report(ITER, seed);
  pctl_report();
 
  /*
   *  load/init topic assignments and prepare statistics
   */
  if ( restart || restart_hca) {
    tca_read_z(resstem, 0, ddN.DT);
    tca_rand_z(ddN.T, ddN.DT, ddN.D);
  } else {
    tca_rand_z(ddN.T, 0, ddN.D);
  }
  tca_reset_stats(resstem, restart, 0);

  if ( (restart || restart_hca ) && ITER ) 
      yap_message("Initial log_2(perp)=%lf\n", -M_LOG2E * likelihood()/ddN.NT);

  if ( ITER )
      yap_report("cycles: ");
  
  for (iter=0; iter<ITER; iter++) {
    int  pro;
    double thislp = 0;
    int   thisNd = 0;
    int doc;
#ifdef H_THREADS
    pthread_t thread[procs];
#endif
    D_pargs_p parg[procs];
    
#ifdef MU_CACHE
    mu_side_fact_reinit();
#endif
#ifdef PHI_CACHE
    phi_cache_reinit();
#endif

    t1 = clock();
    
    /*
     *  sampling
     */
#ifdef IND_STATS
    ddP.doc_ind_stats = u32tri(ddN.T,ddN.E,ddN.E);
    ddP.word_ind_stats = u32tri(ddN.T,ddN.E,ddN.E);
#endif

   /*  a bit complex if no threads!  */
    doc = 0;
    for (pro = 0 ; pro < procs ; pro++){
      parg[pro].dots=dots;
      parg[pro].procs=procs;
      parg[pro].doc = &doc;
#ifndef H_THREADS
      sampling_p(&parg[pro]);
#else
      if ( procs==1 ) 
	sampling_p(&parg[pro]);
      else if( pthread_create(&thread[pro],NULL,sampling_p,(void*) &parg[pro]) != 0){
        yap_message("thread failed %d\n",pro+1 );
      }
#endif
    }
#ifdef H_THREADS
    if ( procs>1 ) {
       //waiting for threads to finish
       for (pro = 0; pro < procs; pro++){
         pthread_join(thread[pro], NULL);
       }
    }
#endif

    // getting lp, Nd and clock
    for(pro = 0; pro < procs; pro++){
      thislp +=  parg[pro].thislp;
      thisNd +=  parg[pro].thisNd;
      tot_time += parg[pro].tot_time;
    }
#ifdef H_THREADS
    if ( procs>1 )
      tca_reset_stats(NULL,1,1);
#endif
    /*
     *  full check
     */
#ifndef NDEBUG
    {
      int e, d;
      check_cp_et();
      for (e=0; e<ddN.E; e++)
        check_m_vte(e);
      for (d=0; d<ddN.DT; d++)
        check_n_dt(d);
    }
#endif

#ifdef IND_STATS
    {
      char *fname = yap_makename(resstem,".istats");
      FILE *ifp = fopen(fname,"a");
      int e1, e2, kk;
      fprintf(ifp,"Iteration %d\n", iter);
      for (kk=0; kk<ddN.T; kk++) {
	fprintf(ifp," Topic %d\n", kk);
	for (e1=0; e1<ddN.E; e1++) {
	  fprintf(ifp,"  Epoch %d\n     ", e1);
	  for (e2=0; e2<ddN.E; e2++)
	    fprintf(ifp," %u", (unsigned)ddP.doc_ind_stats[kk][e1][e2]);
	  fprintf(ifp,"\n     ");
	  for (e2=0; e2<ddN.E; e2++)
	    fprintf(ifp," %u", (unsigned)ddP.word_ind_stats[kk][e1][e2]);
	  fprintf(ifp,"\n");
	}
      }
      fclose(ifp);
      free(ddP.doc_ind_stats[0][0]); free(ddP.doc_ind_stats[0]); 
      free(ddP.doc_ind_stats); 
      free(ddP.word_ind_stats[0][0]); free(ddP.word_ind_stats[0]); 
      free(ddP.word_ind_stats);
      free(fname);
    }
#endif
    
    /*
     *   sample hyperparameters
     */
    t3 = clock();
    pctl_sample(iter, procs);
   
    /*
     *   do time calcs here to remove diagnostics+reporting
     */
    t2 = clock();
    tot_time += (double)(t2 - t1) / CLOCKS_PER_SEC;
    psample_time += (double)(t2 - t3) / CLOCKS_PER_SEC;
    /*
     *   progress reports
     */
    if ( ( iter>ddP.progburn && (iter%ddP.progiter)==0 ) || iter+1>=ITER ) {
      yap_message(" %d\nlog_2(perp)=%lf,%lf", iter, 
		  -M_LOG2E * likelihood()/ddN.NT, -M_LOG2E * thislp/thisNd);
      pctl_update(iter);
      if ( verbose && iter%10==0 )
	yap_probs();
      if ( iter>0 && verbose>1 ) {
	if ( ddN.tokens ) {
	  tca_displaytopics(resstem,topwords,score);
	  displayed++;
	}
      }
      if ( iter+1<ITER ) {
	// yap_message("\n");
	yap_report("cycles: ");
      }
    } else {
      yap_message(" %d", iter);
      if ( verbose>1)  yap_message("\n");
    }
  
    if ( checkpoint>0 && iter>0 && iter%checkpoint==0 ) {
      data_checkpoint(resstem, stem, iter+1);
      yap_message(" checkpointed\n");
      tca_report(resstem, stem, ITER, procs, fix_hold, 
		 (dopmi&&displayed>0)?1:0);
    }
    if ( ddP.phiiter>0 && iter>ddP.phiburn && (iter%ddP.phiiter)==0 )
      phi_update();
    if ( ddP.thetaiter>0 && iter>ddP.thetaburn && (iter%ddP.thetaiter)==0 )
      theta_update();
    if ( ddP.muiter>0 && iter>ddP.muburn && (iter%ddP.muiter)==0 )
      mu_update();
  } // over iter
  
  if ( ITER ) 
      yap_report("Finished after %d cycles on average of %lf+%lf(s) per cycle\n",
	     iter,  (tot_time-psample_time)/iter, psample_time/iter);
  
  if ( ( verbose==1 || ((iter+1)%5!=0 && verbose>1) ) ) {
    if ( ddN.tokens ) {
       tca_displaytopics(resstem,topwords,score);
       displayed++;
    }
  }

  yap_probs();

  if ( ITER>0 ) 
	data_checkpoint(resstem, stem, ITER);
 
  tca_report(resstem, stem, ITER, procs, fix_hold, (dopmi&&displayed>0)?1:0);

  if ( ddP.phiiter>0 )
      phi_save(resstem);
  if ( ddP.thetaiter>0 )
      theta_save(resstem);
  if ( ddP.muiter>0 )
      mu_save(resstem);

  /*
   *  free
   */
  phi_free();
  theta_free();
  mu_free();
  cache_free();
  pctl_free();
  data_free();
  dmi_free(&ddM);
  tca_free();
  free(stem);
  free(resstem);
  rng_free(rngp);

  return 0;
}
Exemplo n.º 25
0
int myarmsMH(double xl, double xr,
	     double (*myfunc)(double x, void *mydata), 
	     void *mydata, double *xval, char *label,
	     int doMH) {
  double result = *xval;
  double *resvec = NULL;
  double startval = *xval;
  int errcode;
  if ( fabs(*xval-xr)/(xr)<0.00001 ) {
    *xval = xr * 0.999 + xl * 0.001;
  }
  if ( fabs(*xval-xl)/xl<0.00001 ) {
    *xval = xl * 0.999 + xr * 0.001;
  }
  myarms_evals = 0;
  if ( doMH ) {
    resvec = malloc(sizeof(resvec[0])*NSAMP);
    errcode = myarms_simple(6, &xl, &xr, myfunc, mydata, doMH, xval, 
			    resvec, NSAMP);
    result = resvec[NSAMP-1];
    free(resvec);
  } else 
    errcode = myarms_simple(6, &xl, &xr, myfunc, mydata, 0, xval, &result, 1);
  /*
   *  1007, 1003 is out of bounds
   */
  if ( errcode && errcode!=1007 && errcode!=1003 && errcode!=2001
       && (errcode!=2000 || startval!=result  ) ) {
    yap_quit("   myarmsMH(%lf,%s)->%d = %lf,%lf%s->%lf, w %d calls, quitting\n", 
	     startval, label, errcode,
	     myarms_last, result, (!ISFINITE(result))?"(inf)":"",
	     *xval, myarms_evals);
  }
  if ( errcode==1007 || errcode==1003 ) {
    /*  
     *   hit bounds, for safety use start val 
     */
    yap_message("   error myarmsMH(%lf,%s)->%d = %lf,%lf%s->%lf, w %d calls, quitting\n",
		startval, label, errcode,
		myarms_last, result, (!ISFINITE(result))?"(inf)":"",
		*xval, myarms_evals);
    result = startval;
  } 
  if ( errcode==2001 ) {
    /* 
     *   too many calls in Metrop-Hastings 
     */
    yap_message("   error myarmsMH(%lf,%s)->%d = %lf,%lf%s->%lf, w %d calls, quitting\n",
		startval, label, errcode,
		myarms_last, result, (!ISFINITE(result))?"(inf)":"",
		*xval, myarms_evals);
  } 
  /*
   *    note, sometimes the value is returned
   *    unchanged .... seems to be when the 
   *    posterior is really focussed
   */
  if ( verbose>1 || !ISFINITE(result) || startval==result )
    yap_message("   myarmsMH(%s) = %lf%s<-%lf, w %d calls %s\n", 
		label,
		result, (!ISFINITE(result))?"(inf)":"",
		*xval, myarms_evals,
		(startval==result)?"UNCHANGED":"");
  if ( ISFINITE(result) ) {
    if ( result<xl )
      result = xl;
    else if ( result>xr )
      result = xr;
    *xval = result;
  }
  if ( !ISFINITE(result) ) {
    return 1;
  }
  return 0;
}
Exemplo n.º 26
0
void hca_displaytopics(char *resstem, int topword, enum ScoreType scoretype) {
  int w,k;
  int *indk = NULL;
  int Nk_tot = 0;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  char *fname = yap_makename(resstem,".top");
  int nophi = (ddP.phi==NULL) && (ddS.phi==NULL);
  FILE *fp;

  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }    

  fp = fopen(fname,"w");
  if ( !fp ) 
    yap_sysquit("Cannot open file '%s' for write\n", fname);

  /*
   *  first collect counts of each word/term
   */
  if ( scoretype != ST_count && scoretype != ST_phi ) {
    NwK = u32vec(ddN.W);
    if ( !NwK )
      yap_quit("Out of memory in hca_displaytopics()\n");
    for (w=0; w<ddN.W; w++) {
      NwK[w] = 0;
    }
    NWK = 0;
    for (w=0; w<ddN.W; w++) {
      for (k=0; k<ddN.T; k++) {
	NwK[w] += ddS.Nwt[w][k];    //  should use CCT_ReadN()
      }
      NWK += NwK[w];
    }
  }

  assert(ddN.tokens);

  for (k=0; k<ddN.T; k++) {
    Nk_tot += ddS.NWt[k];
  }

  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk\n");
  
  for (k=0; k<ddN.T; k++) {
    int cnt;
    double spw;
    double spd; 
    tscorek = k;
    /*
     *    print top words
     */
    cnt=0;
    if ( ddP.phi==NULL ) {
      for (w=0; w<ddN.W; w++) {
	if ( ddS.Nwt[w][k]>0 ) indk[cnt++] = w;
      }
    } else {
      float **phi;
      if ( ddP.phi )
	phi = ddP.phi;
      else
	phi = ddS.phi;
      for (w=0; w<ddN.W; w++) {
	if ( phi[k][w]>0.5/ddN.W ) indk[cnt++] = w;
      }
    }
    topk(topword, cnt, indk, tscore);
    spd = ((double)nonzero_Ndt(k))/((double)ddN.DT);
    sparsitydoc += spd;
    if ( nophi ) {
      spw = ((double)nonzero_Nwt(k))/((double)ddN.W);
      sparsityword += spw;
    }
    if ( ddS.NWt[k]*ddN.T*100<Nk_tot ) 
      underused++;
    yap_message("\nTopic %d (", k);
    if ( ddP.phi==NULL ) 
      yap_message((ddN.T>200)?"p=%.3lf%%,":"p=%.2lf%%,", 
		  100*((double)ddS.NWt[k])/(double)Nk_tot);   
    if ( nophi ) 
      yap_message("ws=%.1lf%%,", 100*(1-spw));
    else
      yap_message("#=%.0lf,", exp(phi_entropy(k)));
    yap_message("ds=%.1lf%%", 100*(1-spd) );
    fprintf(fp,"%d: ", k);
    yap_message(") words =");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
      if ( verbose>2 ) {
	double score = tscore(indk[w]);
	yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score);
      } else
	yap_message(",%s", ddN.tokens[indk[w]]);
    }
    yap_message("\n");
    fprintf(fp, "\n");
  }
	     
  if ( ddP.PYbeta && nophi ) {
    int cnt;
     /*
     *    print root words
     */
    tscorek = -1;
    cnt=0;
    for (w=0; w<ddN.W; w++) {
      if ( ddS.TwT[w]>0 ) indk[cnt++] = w;
    }
    topk(topword, cnt, indk, tscore);
    yap_message("\nTopic root words =");
    fprintf(fp,"-1:");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
      if ( verbose>2 ) {
	double score = tscore(indk[w]);
	yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score);
      } else
	yap_message(",%s", ddN.tokens[indk[w]]);
    }
    yap_message("\n");
    fprintf(fp, "\n");
  }
  if ( nophi )
    yap_message("Average topicXword sparsity = %.2lf%%, ",
		100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%, "
	      "underused topics = %.1lf%%\n",
	      100*(1-sparsitydoc/ddN.T), 
	      100.0*underused/(double)ddN.T);
  if ( ddP.bdk!=NULL) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
	if ( M_multi(i) )
	  totmlttbl++;
	tottbl++;
      }
      if ( M_multi(i) )
	totmlt++;
    }
    yap_message("doc PYP report:   multis=%.2lf%%,  tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
		100.0*((double)ddM.dim_multiind)/ddN.N,
		100.0*((double)tottbl)/ddN.NT,
		100.0*((double)totmlttbl)/totmlt);
  }
  fclose(fp);
  free(fname);
  free(indk);
  if ( scoretype != ST_count ) {
    free(NwK);
    NwK = NULL;
  }
}
Exemplo n.º 27
0
void like_merge(float minprop, double scale, int best) {
  int k1, k2;
  double realdiff = 0;
  double likediff;
  int got=0;
  /*  only use this for reporting ; should disable in production */
  float **cmtx;
  int title = 0;
  int mincount = minprop * ddN.NT;
  bestmerge_t B[ddN.T];

  if ( mincount<5 )
    mincount = 5;

  assert(ddP.phi==NULL);
  assert(ddP.theta==NULL);

  for (k1=0; k1<ddN.T; k1++) 
    B[k1].ml = 0;

  cmtx = hca_topmtx();
  if ( !cmtx )
    yap_quit("Out of memory in like_merge()\n");
 
  if ( ddP.PYbeta!=H_None ) 
    yap_quit("Non-parametric beta unimplemented with merge\n");

  for (k1=1; k1<ddN.T; k1++) {
    if ( ddS.NWt[k1]<=mincount ) 
      continue;
    for (k2=0; k2<k1; k2++) {
      if ( ddS.NWt[k2]<=mincount )
	continue;
      /*  now have a pair to check (k1,k2) with OK counts */
      if ( ddP.PYalpha==H_None )
	likediff = likemerge_DIRalpha(k1,k2);
      else
	likediff = likemerge_alpha(k1, k2);
      if ( ddP.PYbeta==H_None )
        likediff += likemerge_DIRbeta(k1,k2);
      else
	likediff += likemerge_beta(k1, k2);
      if ( likediff>0 ) {
	got++;
	if ( title==0 && verbose ) {
	  double like = scale * likelihood();
	  yap_message("\nPre merge log_2(perp)=%.4lf", like);
	  realdiff = like;
	}
	if ( verbose>1 ) {
	  if ( title==0 ) 
	    yap_message(", merge report:\n");
	  yap_message("   %d+%d cor=%0.6f like+=%0.6g", k1, k2, cmtx[k1][k2], 
		      scale * likediff);
	}     
	title = 1;
	if ( likediff>B[k1].ml ) {
	  B[k1].ml = likediff;
	  B[k1].k2 = k2; 
	  if ( verbose>1 ) yap_message("  stored");
	}
	if ( likediff>B[k2].ml ) {
	  B[k2].ml = likediff;
	  B[k2].k2 = k1; 
	  if ( verbose>1 ) yap_message("  stored");
	}
	if ( verbose>1 ) yap_message("\n");
      } else if ( verbose>2 ) {
        yap_message("   %d+%d cor=%0.6f like+=%0.6g\n", k1, k2, cmtx[k1][k2], 
                    scale * likediff);
      }
    }
  }
  while ( got && best-->0 && (k1=next_best(&B[0]))>=0 ) {
    /*
     *   have a good merge at position k1;
     */
    merge_alpha_t Ma;
    merge_beta_t Mb;
    k2 = B[k1].k2;
    yap_message("  best merge is %d+%d giving diff=%lf\n", k1, k2,
		scale* B[k1].ml);
    Ma.Tdt = NULL;
    Mb.Twt = NULL;
    if ( ddP.PYalpha ) 
      merge_init_Tdt(k1, k2, &Ma);
    if ( ddP.PYbeta ) 
      merge_init_Twt(k1, k2, &Mb);	       
    //  WRAY:  need to checkk what this does, it it why change?
    hca_merge_stats(k1, k2, Ma.Tdt, Mb.Twt);
    // hca_correct_tdt(0);
    if ( ddP.PYalpha ) 
      merge_free_Tdt(&Ma);
    if ( ddP.PYbeta ) 
      merge_free_Twt(&Mb);
    /*  block them from getting picked again */
    B[k1].ml = 0;
    B[k2].ml = 0;
    {
      int k;
      for (k=0; k<ddN.T; k++) {
	if ( B[k].k2==k1 || B[k].k2==k2 )
	  B[k].ml = 0;
      } 
    }
  }	
  if ( got && verbose ) {
    double like = scale * likelihood();
    realdiff -= like;
    yap_message("\nPost merge log_2(perp)=%.4lf (%.6lf)", like, realdiff);
  }  
  if ( got==0 && verbose ) {
    yap_message("Merge found no candidates\n");
  }
  free(cmtx[0]); free(cmtx);
}