示例#1
0
/*
 *  Set up the word-side statistics of *M for the topic formed by
 *  merging topics k1 and k2.  The three per-word vectors are
 *  allocated here (ddN.W entries each) and every count starts out
 *  as the plain sum of the two topics' counts.
 *  Calls yap_quit() if any of the vectors cannot be allocated.
 */
static void merge_init_Twt(int k1, int k2, merge_beta_t *M) {
  int w;

  /*  local per-word store  */
  M->Twt = u16vec(ddN.W);
  M->TwT = u32vec(ddN.W);
  M->Nwt = u32vec(ddN.W);
  if ( M->Twt==NULL || M->TwT==NULL || M->Nwt==NULL )
    yap_quit("Out of memory in likemerge()\n");

  /*  topic totals: simple sums; TWTm is ddS.TWT less the merged TWt  */
  M->NWt = ddS.NWt[k1] + ddS.NWt[k2];
  M->TWt = ddS.TWt[k1] + ddS.TWt[k2];
  M->TWTm = ddS.TWT - M->TWt;

  /*  per-word entries: simple sums, with TwT copied over unchanged  */
  for (w=0; w<ddN.W; w++) {
    M->TwT[w] = ddS.TwT[w];
    M->Nwt[w] = ddS.Nwt[w][k1] + ddS.Nwt[w][k2];
    M->Twt[w] = ddS.Twt[w][k1] + ddS.Twt[w][k2];
    assert(M->Twt[w]<=M->Nwt[w]);
  }
}
示例#2
0
/*
 *   Global probability vector over the ddN.T topics.
 *   The weights are taken, in order of preference, from:
 *     - ddS.NWt[] when ddS.Nwt exists;
 *     - else the per-topic totals of ddS.Ndt[][] over the ddN.DT docs;
 *     - else ddP.alphapr[];
 *     - else nothing is written into the vector.
 *   When the weights have a positive total they are divided by it,
 *   otherwise the vector is returned as it stands.
 *
 *   Returns a vector obtained from fvec() that the caller owns,
 *   or NULL if memory could not be allocated.
 */
static float *globalprop() {
  float *vec = fvec(ddN.T);
  double tot = 0;
  int k;
  if ( !vec )
    return NULL;
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      vec[k] = ddS.NWt[k];
      tot += vec[k];
    }
  } else if ( ddS.Ndt ) {
    /*  have to total from Ndt */
    int d;
    uint32_t *uvec = u32vec(ddN.T);
    if ( !uvec ) {
      /*  report failure the same way as for vec, without leaking it  */
      free(vec);
      return NULL;
    }
    /*  clear the counters explicitly rather than rely on u32vec()  */
    for (k=0; k<ddN.T; k++)
      uvec[k] = 0;
    for (d=0; d<ddN.DT; d++) {
      for (k=0; k<ddN.T; k++)
        uvec[k] += ddS.Ndt[d][k];
    }
    for (k=0; k<ddN.T; k++) {
      vec[k] = uvec[k];
      tot += vec[k];
    }
    free(uvec);
  } else if ( ddP.alphapr ) {
    /*
     *   this is rather poor, and should be rewritten
     */
    for (k=0; k<ddN.T; k++) {
      vec[k] = ddP.alphapr[k];
      tot += vec[k];
    }
  }
  /*  nothing to normalise by, so hand back the raw vector  */
  if ( tot<=0 )
    return vec;
  for (k=0; k<ddN.T; k++)
    vec[k] /= tot;
  return vec;
}
示例#3
0
/*
 *  Fill the globals NwK[] (one total per word, ddN.W entries) and
 *  NWK (the sum of all NwK[]).  When ddS.Nwt exists the totals are
 *  summed over the ddN.T topics, otherwise they are counted from
 *  the ddN.NT entries of ddD.w[].
 *  Quits if the vector cannot be allocated or the total is zero.
 */
static void build_NwK() {
  int w;
  NwK = u32vec(ddN.W);
  if ( NwK==NULL )
    yap_quit("Out of memory in hca_displaytopics()\n");
  for (w=0; w<ddN.W; w++)
    NwK[w] = 0;
  NWK = 0;
  if ( ddS.Nwt ) {
    /*  per-topic counts are at hand, so just add them up  */
    for (w=0; w<ddN.W; w++) {
      int t;
      for (t=0; t<ddN.T; t++)
        NwK[w] += ddS.Nwt[w][t];    //  should use CCT_ReadN()
      NWK += NwK[w];
    }
  } else {
    /*  no statistics, so recount from the word sequence itself  */
    int pos;
    for (pos=0; pos<ddN.NT; pos++)
      NwK[ddD.w[pos]]++;
    for (w=0; w<ddN.W; w++)
      NWK += NwK[w];
  }
  if ( NWK==0 )
    yap_quit("empty NWK in build_NwK()\n");
}
示例#4
0
/*
 *  Return a newly allocated permutation of 0..K-1, ordered by
 *  qsort() with the comparator pcompar.  The file-level pointer
 *  stvec is set to vec before sorting; presumably pcompar reads the
 *  scores through it (confirm against pcompar).
 *
 *  The caller owns the returned vector.  Returns NULL when the
 *  vector cannot be allocated.
 */
static uint32_t *sorttops(float *vec, int K) {
  uint32_t *psort = u32vec(K);
  int k;
  /*  do not write through a failed allocation  */
  if ( !psort )
    return NULL;
  for (k=0; k<K; k++)
    psort[k] = k;
  stvec = vec;
  qsort(psort, K, sizeof(*psort), pcompar);
  return psort;
}
示例#5
0
/*
 *   Build a vector giving number of times each topic is the
 *   most common in a doc.
 *
 *   For each of the ddN.DT docs the vector from topprop() is scanned
 *   and the first topic holding the largest value gets one count.
 *   The caller owns the returned vector of ddN.T counts.
 *   Returns NULL if the counts, or the vector for some doc,
 *   cannot be obtained.
 */
static uint32_t *hca_top1cnt() {
  uint32_t *cnt = u32vec(ddN.T);
  int i, t;
  if ( !cnt )
    return NULL;
  /*  start from zero explicitly rather than rely on u32vec()  */
  for (t=0; t<ddN.T; t++)
    cnt[t] = 0;
  for (i=0; i<ddN.DT; i++) {
    float *tvec = topprop(i);
    int maxt = 0;
    if ( !tvec ) {
      /*  no vector for this doc, so give up without leaking cnt  */
      free(cnt);
      return NULL;
    }
    /*  strict comparison keeps the lowest index on ties  */
    for (t=1; t<ddN.T; t++) {
      if ( tvec[t]>tvec[maxt] )
        maxt = t;
    }
    cnt[maxt]++;
    free(tvec);
  }
  return cnt;
}
示例#6
0
/*
 *  Build the global per-term totals from the statistics in *ptr:
 *  termNwK[j] is the sum of ptr->Nkt[j][t] over the ddN.T topics
 *  for each of the ptr->K terms, termNWK is the sum of all the
 *  termNwK[], and termNwk is pointed at ptr->Nkt itself.
 *  Quits if the vector cannot be allocated or the total is zero.
 */
static void build_termNwK(T_stats_t *ptr) {
  int term;
  termNwK = u32vec(ptr->K);
  if ( termNwK==NULL )
    yap_quit("Out of memory in hca_displaytopics()\n");
  termNWK = 0;
  for (term=0; term<ptr->K; term++) {
    int t;
    termNwK[term] = 0;
    for (t=0; t<ddN.T; t++)
      termNwK[term] += ptr->Nkt[term][t];
    termNWK += termNwK[term];
  }
  if ( termNWK==0 )
    yap_quit("empty termNWK in build_termNwK(), collocations empty!\n");
  termNwk = ptr->Nkt;
}
示例#7
0
/*
 *  print out the topic topk=10 words. report the PMI score.
 *
 *  The topics file is read twice.  Each line is "k: w1 w2 ..." when
 *  E==1 or "e,k: w1 w2 ..." when E>1, the w's being word indices
 *  below W; lines whose topic or epoch is out of range are skipped
 *  and at most topk words are taken from a line.
 *    Pass 1 collects the distinct words into wind[] and hashes them.
 *    The PMI file ("word1 word2 value" triples) is then loaded,
 *    keeping only pairs of distinct collected words.  If the file
 *    cannot be opened, "<pmifile>.gz" must exist and the data is
 *    read from the command "gunzip -c <pmifile>".
 *    Pass 2 averages the PMI over the word pairs of each topic,
 *    prints it, and adds it into the epoch's score weighted by tp[k].
 *
 *  Returns the score of the single epoch, or the mean over the E
 *  epochs when E>1.  Quits through yap_quit()/yap_sysquit() on
 *  unreadable files, malformed topic lines, bad word indices or
 *  lack of memory.  Uses the file-level hashtab/hashsize while
 *  running and resets them before returning.
 *
 *  NOTE(review): wuse[], coherency[] and the pmi matrix are used
 *  without being cleared here, so u32vec(), fvec() and dmat() are
 *  assumed to return zero-filled storage -- confirm.
 *  NOTE(review): pmifile is pasted unquoted into a shell command.
 */
double report_pmi(char *topfile,   /* name of topics file */
		  char *pmifile,  /* name of PMI file */
		  int T,          /* total topics */
		  int W,          /* total words */
		  int E,          /*  number of epochs */
		  int topk,
		  double *tp)
{
  int lineno = 0;
  int i,k, thee;
  /*
   *   mapping from local index to actual word index
   */
  uint32_t *wind = u32vec(topk*T*E);
  int n_wind = 0;
  /*
   *   boolean vector ... is word used (one bit per word)
   */
  uint32_t *wuse = u32vec(W/32+1);
  /*
   *  local indices of the words of the topic being scored
   */
  uint32_t *topic = u32vec(topk);
  /*
   *  score accumulated for each epoch
   */
  float *coherency = fvec(E);
  /*
   *  PMI's by local index
   */
  double **pmi;
  float ave = 0;

  char *line;
  size_t n_line;
  FILE *fr;
  if ( !wind || !wuse || !topic || !coherency )
    yap_quit("Out of memory in report_pmi()\n");

  /*
   *   read in file of top word indices in topic
   */
  fr = fopen(topfile,"r");
  if ( !fr )
    yap_sysquit("Topic file '%s' not read\n", topfile);

  line = NULL;
  n_line = 0;
  lineno = 0;
  while ( getline(&line, &n_line, fr)>0 ) {
    char *buf = line;
    unsigned j;
    int e = 0;
    lineno ++;
    buf += strspn(buf," \t\n");    //   skip space
    if ( (E==1 && sscanf(buf, "%d: ", &k)<1) ||
         (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) )
      yap_quit("Cannot read topic in topic line %d from file '%s'\n",
               lineno, topfile);
    if ( k<0 || k>=T )
      continue;
    if ( e<0 || e>=E )
      continue;
    for (i = 0; i<topk && *buf; i++) {
      buf = strpbrk(buf," \t\n");    //   skip to next space
      if ( !buf )
        break;        //  line ended right after the previous token
      if ( sscanf(buf, " %u", &j) <1 ) {
        if ( verbose>2 )
          yap_message("Cannot read word %d in topic line %d from file '%s'\n",
                      i+1, lineno, topfile);
        break;
      }
      if ( j>=W) {
        yap_quit("Bad word %d in topic line %d from file '%s'\n",
                 i+1, lineno, topfile);
      }
      buf += strspn(buf," \t\n");    //   skip space
      /*
       *   check if word exists, and set up its index
       */
      if ( wuse[j/32U] & (1U<<(j%32U)) ) {
        // yes, so search for it
        int ii;
        for (ii=0; ii<n_wind; ii++)
          if ( wind[ii]==j )
            break;
        if ( ii>=n_wind )
          yap_quit("Lookup of word %d failed at line %d in report_pmi()\n",
                   (int)j, lineno);
      } else {
        // no, so add it
        wuse[j/32U] |= (1U<<(j%32U));
        wind[n_wind] = j;
        n_wind++;
      }
    }
  }
  /*
   *  getline() reuses the buffer from line to line and may leave one
   *  allocated even when it fails, so release it once here
   */
  free(line);
  line = NULL;
  n_line = 0;
  fclose(fr);

  pmi = dmat(n_wind,n_wind);
  /*
   *  build hash table now since we know size
   */
  hashsize = n_wind*2;
  hashtab = malloc(sizeof(*hashtab)*hashsize);
  if ( !pmi || !hashtab )
    yap_quit("Out of memory in report_pmi()\n");
  for (i=0; i<hashsize; i++)
    hashtab[i] = 0;
  for (i=0; i<n_wind; i++)
    addw(wind[i],i);

  /*
   *   load up PMI file, only keeping words mentioned in hash table
   */
  {
    unsigned t1, t2;
    double value;
    int zcat = 0;
    fr = fopen(pmifile,"r");
    if ( !fr ) {
      /*
       *    try to zcat it
       */
      size_t n_cmd = strlen(pmifile)+20;
      char *cmd = malloc(n_cmd);
      if ( !cmd )
        yap_quit("Out of memory in report_pmi()\n");
      /*  first make sure the compressed file is there at all  */
      snprintf(cmd, n_cmd, "%s.gz", pmifile);
      fr = fopen(cmd,"r");
      if ( !fr )
        yap_sysquit("Cannot open pmifile '%s' in report_pmi()\n",
                    pmifile);
      fclose(fr);
      snprintf(cmd, n_cmd, "gunzip -c %s", pmifile);
      fr = popen(cmd,"r");
      if ( !fr )
        yap_sysquit("Cannot open or zcat pmifile '%s' in report_pmi()\n",
                    pmifile);
      zcat = 1;
      free(cmd);
    }
    while (fscanf(fr, "%u %u %lg", &t1, &t2, &value)==3 ) {
      if ( t1>=W || t2>= W )
        yap_quit("Illegal word index in report_pmi()\n");
      if ( t1!= t2 && ( wuse[t1/32U] & (1U<<(t1%32U)) )
           && ( wuse[t2/32U] & (1U<<(t2%32U))) ) {
        /*  unsigned, so the test against UINT32_MAX is like for like  */
        uint32_t i1, i2;
        i1 = findw(t1,wind);
        i2 = findw(t2,wind);
        if ( i1==UINT32_MAX || i2==UINT32_MAX )
          yap_quit("Could not locate word index in report_pmi()\n");
        pmi[i1][i2]=value;
        pmi[i2][i1]=value;
      }
    }
    if ( zcat )
      pclose(fr);
    else
      fclose(fr);
  }

  /*
   *    compute PMI score for each topic
   */

  fr = fopen(topfile,"r");
  if ( !fr )
    yap_sysquit("Topic file '%s' not read\n", topfile);
  line = NULL;
  n_line = 0;
  thee = 0;
  lineno = 0;
  if ( E>1 )
    yap_message("PMI %d:: ", 0);
  else
    yap_message("PMI :: ");

  while ( getline(&line, &n_line, fr)>0 ) {
    /*
     *  repeat logic above to read topic file again
     */
    char *buf = line;
    unsigned j;
    int cnt = 0;
    int e = 0;
    double coh = 0;
    lineno ++;        //  so the messages below name the right line
    buf += strspn(buf," \t\n");    //   skip space
    if ( (E==1 && sscanf(buf, "%d: ", &k)<1) ||
         (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) )
      yap_quit("Cannot read topic in topic line %d from file '%s'\n",
               lineno, topfile);
    if ( k<0 || k>=T )
      continue;
    if ( e<0 || e>=E )
      continue;
    if ( e!=thee ) {
      thee = e;
      yap_message("\nPMI %d:: ", e);
    }
    for (i = 0; i<topk && *buf; i++) {
      buf = strpbrk(buf," \t\n");    //   skip to next space
      if ( !buf )
        break;        //  line ended right after the previous token
      if ( sscanf(buf, " %u", &j) <1 ) {
        yap_message("Cannot read word %d in topic line %d from file '%s'\n", i+1, lineno, topfile);
        break;
      }
      if ( j>=W) {
        yap_quit("Bad word %d in topic line %d from file '%s'\n", i+1, lineno, topfile);
      }
      buf += strspn(buf," \t\n");    //   skip space
      topic[i] = findw(j,wind);
    }
    /*
     *  a short list is closed off with W, which no local index reaches
     */
    if ( i<topk )
      topic[i] = W;
    /*
     *  topics now read, so average the PMI over all pairs of its words
     */
    for (i=0; i<topk && topic[i]<W; i++) {
      for (j=i+1; j<topk && topic[j]<W; j++) {
        coh += pmi[topic[i]][topic[j]];
        cnt ++;
      }
    }
    if ( cnt>0 ) coh /= cnt;
    coherency[e] += coh * tp[k];
    yap_message(" %d:%.3lf", k, coh);
  }
  free(line);
  line = NULL;
  fclose(fr);
  yap_message("\nPMI =");
  if ( E==1 ) {
    yap_message(" %.3lf\n", coherency[0]);
    ave = coherency[0];
  } else {
    int e;
    for (e=0; e<E; e++) {
      ave += coherency[e];
      yap_message(" %.3lf", coherency[e]);
    }
    ave /= E;
    yap_message(" -> %.3lf\n", ave);
  }

  free(wind);
  free(coherency);
  free(wuse);
  free(topic);
  free(pmi[0]); free(pmi);
  free(hashtab);
  hashtab = NULL;
  hashsize = 0;
  return ave;
}
示例#8
0
/*
 *  Emit the first min(topword,cnt) entries of indk[]: the word index
 *  to fp, the token (with its score when verbose>2) to the message
 *  log, then end the line on both.
 */
static void displaytopics_wordlist(FILE *fp, int topword, int cnt,
                                   int *indk, double (*tscore)(int)) {
  int w;
  for (w=0; w<topword && w<cnt; w++) {
    fprintf(fp," %d", (int)indk[w]);
    if ( verbose>2 ) {
      double score = tscore(indk[w]);
      yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score);
    } else
      yap_message(",%s", ddN.tokens[indk[w]]);
  }
  yap_message("\n");
  fprintf(fp, "\n");
}

/*
 *  Report the top words of every topic.
 *
 *  For each of the ddN.T topics the candidate words are ranked with
 *  topk() under the scoring function chosen by scoretype, and the
 *  best topword of them are written as "k: w1 w2 ..." to the file
 *  named by yap_makename(resstem,".top") and, as tokens together
 *  with some per-topic statistics, to the message log.  A "-1:" line
 *  for the root words follows when ddP.PYbeta is set and no phi
 *  matrix is in use, then the overall sparsity figures and, when
 *  ddP.bdk is set, a report on the doc PYP.
 *
 *    resstem    stem for the name of the output file
 *    topword    number of words to list per topic
 *    scoretype  which scoring function ranks the words
 *
 *  Uses the file-level NwK/NWK, tscorek and lowerQ as working state
 *  for the scoring functions.  Quits if the file cannot be opened or
 *  memory runs out.
 *
 *  NOTE(review): tscore stays NULL for a scoretype not listed below
 *  and is then handed to topk() as is.
 */
void hca_displaytopics(char *resstem, int topword, enum ScoreType scoretype) {
  int w,k;
  int *indk = NULL;
  int Nk_tot = 0;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  char *fname = yap_makename(resstem,".top");
  int nophi = (ddP.phi==NULL) && (ddS.phi==NULL);
  /*  word totals are only built for scores other than count and phi  */
  int useNwK = (scoretype != ST_count) && (scoretype != ST_phi);
  FILE *fp;

  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }

  fp = fopen(fname,"w");
  if ( !fp )
    yap_sysquit("Cannot open file '%s' for write\n", fname);

  /*
   *  first collect counts of each word/term
   */
  if ( useNwK ) {
    NwK = u32vec(ddN.W);
    if ( !NwK )
      yap_quit("Out of memory in hca_displaytopics()\n");
    for (w=0; w<ddN.W; w++) {
      NwK[w] = 0;
    }
    NWK = 0;
    for (w=0; w<ddN.W; w++) {
      for (k=0; k<ddN.T; k++) {
        NwK[w] += ddS.Nwt[w][k];    //  should use CCT_ReadN()
      }
      NWK += NwK[w];
    }
  }

  assert(ddN.tokens);

  for (k=0; k<ddN.T; k++) {
    Nk_tot += ddS.NWt[k];
  }

  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk\n");

  for (k=0; k<ddN.T; k++) {
    int cnt;
    double spw = 0;
    double spd;
    tscorek = k;
    /*
     *    print top words
     */
    cnt=0;
    if ( ddP.phi==NULL ) {
      /*  candidates are the words with a count in this topic  */
      for (w=0; w<ddN.W; w++) {
        if ( ddS.Nwt[w][k]>0 ) indk[cnt++] = w;
      }
    } else {
      /*
       *  candidates are the words whose phi exceeds 0.5/ddN.W;
       *  ddP.phi is known to be set on this branch
       */
      float **phi = ddP.phi;
      for (w=0; w<ddN.W; w++) {
        if ( phi[k][w]>0.5/ddN.W ) indk[cnt++] = w;
      }
    }
    topk(topword, cnt, indk, tscore);
    spd = ((double)nonzero_Ndt(k))/((double)ddN.DT);
    sparsitydoc += spd;
    if ( nophi ) {
      spw = ((double)nonzero_Nwt(k))/((double)ddN.W);
      sparsityword += spw;
    }
    /*  a topic holding under 1/(100*T) of the total counts as underused  */
    if ( ddS.NWt[k]*ddN.T*100<Nk_tot )
      underused++;
    yap_message("\nTopic %d (", k);
    if ( ddP.phi==NULL )
      yap_message((ddN.T>200)?"p=%.3lf%%,":"p=%.2lf%%,",
                  100*((double)ddS.NWt[k])/(double)Nk_tot);
    if ( nophi )
      yap_message("ws=%.1lf%%,", 100*(1-spw));
    else
      yap_message("#=%.0lf,", exp(phi_entropy(k)));
    yap_message("ds=%.1lf%%", 100*(1-spd) );
    fprintf(fp,"%d: ", k);
    yap_message(") words =");
    displaytopics_wordlist(fp, topword, cnt, indk, tscore);
  }

  if ( ddP.PYbeta && nophi ) {
    int cnt;
    /*
     *    print root words
     */
    tscorek = -1;
    cnt=0;
    for (w=0; w<ddN.W; w++) {
      if ( ddS.TwT[w]>0 ) indk[cnt++] = w;
    }
    topk(topword, cnt, indk, tscore);
    yap_message("\nTopic root words =");
    fprintf(fp,"-1:");
    displaytopics_wordlist(fp, topword, cnt, indk, tscore);
  }
  if ( nophi )
    yap_message("Average topicXword sparsity = %.2lf%%, ",
                100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%, "
              "underused topics = %.1lf%%\n",
              100*(1-sparsitydoc/ddN.T),
              100.0*underused/(double)ddN.T);
  if ( ddP.bdk!=NULL) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
        if ( M_multi(i) )
          totmlttbl++;
        tottbl++;
      }
      if ( M_multi(i) )
        totmlt++;
    }
    yap_message("doc PYP report:   multis=%.2lf%%,  tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
                100.0*((double)ddM.dim_multiind)/ddN.N,
                100.0*((double)tottbl)/ddN.NT,
                100.0*((double)totmlttbl)/totmlt);
  }
  fclose(fp);
  free(fname);
  free(indk);
  /*  release NwK under the very condition it was allocated  */
  if ( useNwK ) {
    free(NwK);
    NwK = NULL;
  }
}
示例#9
0
/*
 *  Greedily adjust the per-document counts M->Tdt[] (together with
 *  M->TdT[] and the total M->TDt) one unit at a time.
 *
 *  Every doc d carries two scores:
 *    score_up[d]    for raising Tdt[d] by one, 0 when Tdt[d]==Ndt[d];
 *    score_down[d]  for lowering Tdt[d] by one, 0 unless Tdt[d]>1.
 *  Each step takes the doc at the front of each heap, scales its score
 *  by merge_alphabasetopicprob() to give upv and downv, and applies
 *  whichever move is strictly larger than the other and exceeds 1.0;
 *  the loop stops as soon as neither move qualifies.
 *
 *  k1 is passed on to merge_alphabasetopicprob(); k2 is not
 *  referenced in the body.  Quits via yap_quit() on lack of memory.
 */
static void merge_opt_Tdt(int k1, int k2, merge_alpha_t *M) {
int d;
  struct heap_s up;
  struct heap_s down;
  /*
   *    sorting on moves,
   *    stores doc index for each of the two heaps
   */
  uint32_t *Tdt_up;
  uint32_t *Tdt_down;
  /*
   *     change from incr/decr this docs Tdt
   */
  float *score_up;
  float *score_down;

  Tdt_up = u32vec(ddN.DT);
  Tdt_down = u32vec(ddN.DT);
  score_up = fvec(ddN.DT);
  score_down = fvec(ddN.DT);
  if ( !score_down || !score_up || !Tdt_up || !Tdt_down )
    yap_quit("Out of memory in likemerge()\n");
  /*
   *  initialise sort
   */
  for (d=0; d<ddN.DT; d++) {
    assert(M->Tdt[d]<=M->Ndt[d]);
    /*   don't change for some docs */
    Tdt_up[d] = d;
    Tdt_down[d] = d;
    /*  a zero score marks a move that is not allowed for this doc  */
    if ( M->Tdt[d]<M->Ndt[d] )
      score_up[d] = (ddP.bpar + ddP.apar*M->TdT[d]) 
	* S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1);
    else 
      score_up[d] = 0;
    if ( M->Tdt[d]>1 )
      score_down[d] = 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d])
	/(ddP.bpar + ddP.apar*(M->TdT[d]-1));
    else
      score_down[d] = 0;    
    assert((M->Tdt[d]>1)||score_down[d]==0);
    assert((M->Tdt[d]<M->Ndt[d])||score_up[d]==0);
    assert(M->Tdt[d]<=M->Ndt[d]);
  }
  assert(M->TDt>0);

  /*  
   *  use a heap, so only top of heap is least 
   *  (doc indices ordered by fveccmp over the score vectors)
   */
  heap_init(&up, Tdt_up, ddN.DT, fveccmp, (void *)score_up);
  heap_init(&down, Tdt_down, ddN.DT, fveccmp, (void *)score_down);

  while ( 1 ) {
    float upv;
    float downv;
    /*  value of the move at the front of each heap  */
    upv = merge_alphabasetopicprob(M->TDTm+M->TDt, M->TDt, k1)
      *score_up[heap_front(&up)];
    if ( M->TDt>1 )
      downv = score_down[heap_front(&down)] 
        / merge_alphabasetopicprob(M->TDTm+M->TDt-1, M->TDt-1, k1);
    else
      downv = 0.0;
    if ( downv>upv && downv>1.0 ){
      //  decrement this
      d = heap_front(&down); 
      M->TdT[d]--;
      M->Tdt[d]--;
      assert(M->Tdt[d]>0);
      M->TDt--;
      /*  take d out of both heaps while its scores are recomputed  */
      heap_pop(&down);
      heap_remove(&up,d);
    } else if ( downv<upv && upv>1.0 ){
      //  increment this
      d = heap_front(&up);
      M->TdT[d]++;
      M->Tdt[d]++;
      assert(M->Tdt[d]<=M->Ndt[d]);
      M->TDt++;
      /*  take d out of both heaps while its scores are recomputed  */
      heap_pop(&up);
      heap_remove(&down,d);
    } else {
      //  none are better so quit
      break;
    }
    /*  same scoring as in the initialisation, for the changed doc only  */
    if ( M->Tdt[d]<M->Ndt[d] )
      score_up[d] = (ddP.bpar + ddP.apar*M->TdT[d]) 
	* S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1);
    else 
      score_up[d] = 0;
    if ( M->Tdt[d]>1 )
      score_down[d] = 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d])
	/(ddP.bpar + ddP.apar*(M->TdT[d]-1));
    else
      score_down[d] = 0;
    assert(M->Tdt[d]>1||score_down[d]==0);
    assert(M->Tdt[d]<M->Ndt[d] ||score_up[d]==0);
    assert(M->Tdt[d]<=M->Ndt[d]);
    /*
     *  now adjust the two heaps for new vals for [d]
     */
    heap_push(&down,d);
    heap_push(&up,d);
  }  
  /*
   *  NOTE(review): Tdt_up and Tdt_down are not freed here; presumably
   *  heap_free() releases the index vectors given to heap_init() -- confirm
   */
  free(score_up);
  free(score_down);
  heap_free(&up);
  heap_free(&down);
}