示例#1
0
/*
 *   post. prob of word given topic used in sampling
 */
double wordfact(int j, int t, float *tip) {
  if ( ddP.phi!=NULL ) {
    assert(ddP.phi);
    return ddP.phi[t][j];
  } 
  if ( ddP.PYbeta ) {
    double p;
    if ( ddS.Twt[j][t]==0 ) {
#ifndef NDEBUG
      if ( ddS.Nwt[j][t]>0 ) {
	yap_message("ddS.Nwt[%d][%d]==%d\n", j, t, ddS.Nwt[j][t]);
	assert(ddS.Nwt[j][t]==0);
      }
#endif
      p = ((double)ddP.bwpar+ddP.awpar*ddS.TWt[t]) * betabasewordprob(j);
      *tip = 1.0;
    } else {
      double uone, uzero;
      wordtableindicatorprob(j, t, &uone, &uzero);
      p = uone + uzero;
      *tip = uone/(uone + uzero);
    }
    return p/((double)ddS.NWt[t]+ddP.bwpar);
  }
  return ((double)ddS.Nwt[j][t]+ddP.betapr[j]) / ((double)ddS.NWt[t]+ddP.betatot);
}
示例#2
0
/*
 *    the word table indicator is increased, but not forced
 */
void wordtableindicatorprob(int j, int t, double *uone, double *uzero) {
  int nn = ddS.Nwt[j][t];
  int tt = ddS.Twt[j][t];
  double e1, e0;
  /*
   *   fudge to handle multi-threading  case where constraints violated
   */
  if ( nn>0 && tt==0 ) tt=1;
  if ( tt>nn ) tt = nn;
  if ( nn==0 ) {
    /*
     *    should only happen during multi-threading
     */
    tt = 0;
    e1 = 1;
    e0 = 0;
  } else {
    e1 = S_UV(ddC.SY,nn,tt+1);
    if ( tt==1 )
      e0 = nn - ddP.awpar;
    else
      e0 = S_U(ddC.SY,nn,tt);
  }
  *uone = e1 * (ddP.bwpar+ddP.awpar*ddS.TWt[t]) * betabasewordprob(j) 
    * (tt+1)/(nn+1);
  *uzero = e0 * (nn-tt+1)/(nn+1);
}
示例#3
0
/*
 *   probability of word given topic used for estimation,
 *   which means the topic parts are fixed!
 *   unseen words can still be in the topic, just with
 *   less probability
 */
double wordprob(int j, int t) {
  if ( ddP.phi!=NULL )
    return ddP.phi[t][j];
  if ( ddP.PYbeta ) {
    double pnew = ((double)ddP.bwpar+ddP.awpar*ddS.TWt[t]) * betabasewordprob(j);
    double pold = 0;
    if ( ddS.Nwt[j][t]>0 ) 
      pold = (double)ddS.Nwt[j][t]-ddS.Twt[j][t]*ddP.awpar;
    return (pnew+pold)/((double)ddS.NWt[t]+ddP.bwpar);
  }
  return ((double)ddS.Nwt[j][t]+ddP.betapr[j]) / ((double)ddS.NWt[t]+ddP.betatot);
}
示例#4
0
void hca_displaytopics(char *stem, char *resstem, int topword, 
                       enum ScoreType scoretype, int pmicount, int fullreport) {
  int w,k;
  uint32_t *termindk = NULL;
  uint32_t *indk = NULL;
  int Nk_tot = 0;
  double (*termtscore)(int) = NULL;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  uint32_t *top1cnt = NULL;
  FILE *fp;
  float *tpmi = NULL;
  char *topfile;
  char *repfile;
  uint32_t *psort;
  FILE *rp = NULL;
  float *gtvec = globalprop();
//#define XTRA // prints model topic probs after observed
#ifdef XTRA
  double *gtavec = calloc(ddN.T,sizeof(gtavec[0]));
#endif
  float *gpvec = calloc(ddN.W,sizeof(gpvec[0]));
  float *pvec = calloc(ddN.W,sizeof(pvec[0]));
#ifdef KL
  float *dfvec = calloc(ddN.W,sizeof(dfvec[0]));
#endif
  double *ngalpha = NULL;
  T_stats_t *termstats;
  
#ifdef XTRA
  get_probs(gtavec);
#endif

  if ( pmicount>topword )
    pmicount = topword;
  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phirat ) {
    tscore = phiratioscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }  

  if ( ddS.TwT==NULL && ddP.phi==NULL && scoretype == ST_phirat ) 
	yap_quit("Cannot use '-orat' option with this model/settings.\n");	

  if ( ddP.PYalpha==H_NG ) {
    /*
     *  provide an estimate of alpha
     */
    ngalpha = dvec(ddN.T);
    get_probs(ngalpha);
    for (k=0; k<ddN.T; k++) {
      ddP.alphapr[k] = ngalpha[k];
    }
  }

  /*
   *  returns null if no relevant data file
   */
  termstats = tstats_init(ddS.z, ddD.NdTcum, ddN.T, ddN.DT, stem);
  if ( termstats ) {
    if ( scoretype == ST_idf ) {
      termtscore = termidfscore;
    } else 
      termtscore = termcountscore;
  }  

  
  /*
   *  first collect counts of each word/term,
   *  and build gpvec (mean word probs)
   */
  build_NwK();
  if ( termstats )
    build_termNwK(termstats);
  {
    /*
     *  gpvec[] is normalised NwK[]
     */
    double tot = 0;
    for (w=0; w<ddN.W; w++)
      tot += gpvec[w] = NwK[w]+0.1; 
    for (w=0; w<ddN.W; w++)
      gpvec[w] /= tot;
  }
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      Nk_tot += ddS.NWt[k];
    }
  } 
  
  psort = sorttops(gtvec, ddN.T);
  
  top1cnt = hca_top1cnt();
  if ( !top1cnt )
    yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n");

  if ( pmicount ) {
    tpmi = malloc(sizeof(*tpmi)*(ddN.T+1));
    if ( !tpmi )
      yap_quit("Cannot allocate tpmi in hca_displaytopics()\n");
  }
  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk in hca_displaytopics()\n");
  if ( termstats ) {
    termindk = malloc(sizeof(*indk)*termstats->K);
    if ( !termindk )
      yap_quit("Cannot allocate termindk in hca_displaytopics()\n");
  }

  
  data_df(stem);

#ifdef KL
  for (w=0; w<ddN.W; w++)
    dfvec[w] = ddD.df[w];
#endif
  
  /*
   *   two passes through, 
   *           first to build the top words and dump to file
   */
  repfile = yap_makename(resstem,".topset");
  topfile = yap_makename(resstem,".toplst");
  fp = fopen(topfile,"w");
  if ( !fp ) 
    yap_sysquit("Cannot open file '%s' for write\n", topfile);
  yap_message("\n");
  for (k=0; k<ddN.T; k++) {
    int cnt, termcnt = 0;
    tscorek = k;
    /*
     *    build sorted word list
     */
    cnt = buildindk(k, indk);
    topk(topword, cnt, indk, tscore);
    if ( cnt==0 )
      continue;
    if ( termstats ) {
      termcnt = buildtermindk(k, termindk, termstats);
      topk(topword, termcnt, termindk, termtscore);
    }
    /*
     *   dump words to file
     */
    fprintf(fp,"%d: ", k);
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    if ( termstats ) {
      for (w=0; w<topword && w<termcnt; w++) {
	fprintf(fp," %d", (int)termstats->Kmin+termindk[w]);
      }
    }
    fprintf(fp, "\n");
  }
  if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr)  ) {
    int cnt;
     /*
     *    dump root words
     */
    tscorek = -1;
    cnt = buildindk(-1, indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    fprintf(fp,"-1:");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  if ( verbose>1 ) yap_message("\n");

  if ( pmicount ) {
    /*
     * compute PMI
     */
    char *toppmifile;
    char *pmifile;
    double *tp;
    tp = dvec(ddN.T);
    pmifile=yap_makename(stem,".pmi");
    toppmifile=yap_makename(resstem,".toppmi");
    get_probs(tp);
    report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, 
               pmicount, tp, tpmi);
    free(toppmifile);
    free(pmifile);
    free(tp);
  }

  /*
   *   now report words and diagnostics
   */
  //ttop_open(topfile);
  if ( fullreport ) {
    rp = fopen(repfile,"w");
    if ( !rp ) 
      yap_sysquit("Cannot open file '%s' for write\n", repfile);
    fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one "
	    "dist-unif dist-unigrm");
    if ( PCTL_BURSTY() ) 
      fprintf(rp, " burst-concent");
    if ( ddN.tokens )  
      fprintf(rp, " ave-length");
    fprintf(rp, " coher");
    if ( pmicount ) 
      fprintf(rp, " pmi");
    fprintf(rp, "\n#word topic index rank");
    if ( ddS.Nwt )
      fprintf(rp, " count");
    fprintf(rp, " prop cumm df coher\n");
    
  }
  for (k=0; k<ddN.T; k++) {
    int cnt, termcnt = 0;
    int kk = psort[k];
    uint32_t **dfmtx;

    if ( ddP.phi==NULL && ddS.NWt[kk]==0 )
      continue;
    /*
     *   grab word prob vec for later use
     */
    if ( ddS.Nwt ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = wordprob(w,kk);
    } else if ( ddP.phi ) 
      fv_copy(pvec, ddP.phi[kk], ddN.W);
    else if ( ddS.phi ) 
      fv_copy(pvec, ddS.phi[kk], ddN.W);

    /*
     *  rebuild word list
     */
    tscorek = kk;
    cnt = buildindk(kk, indk);
    topk(topword, cnt, indk, tscore);
    if ( topword<cnt )
      cnt = topword;
    assert(cnt>0);
    if ( termstats ) {
      termcnt = buildtermindk(kk, termindk, termstats);
      topk(topword, termcnt, termindk, termtscore);
      if ( topword<termcnt )
	termcnt = topword;
    }
    /*
     *     df stats for topic returned as matrix
     */
    dfmtx = hca_dfmtx(indk, cnt, kk);

    if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) 
      underused++;
    /*
     *  print stats for topic
     *    Mallet:  tokens, doc_ent, ave-word-len, coher., 
     *             uni-dist, corp-dist, eff-no-words
     */
    yap_message("Topic %d/%d", kk, k);
    {
      /*
       *   compute diagnostics
       */
      double prop = gtvec[kk];
      float *dprop = docprop(kk);
      double spw = 0;
      double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT);
#ifdef KL
      double ew = fv_kl(dfvec,pvec,ddN.W);
#else
      double ew = exp(fv_entropy(pvec,ddN.W));
#endif
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W);
      double co = coherence(dfmtx, cnt);
      double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT;
#define MALLET_EW
#ifdef MALLET_EW
      double ewp = dprop?(1.0/fv_expprob(pvec,ddN.W)):ddN.W;
#endif
      double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0;
      sparsitydoc += spd;
      yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop);   
#ifdef XTRA
      yap_message((ddN.T>200)?"/%.3lf%%":"/%.2lf%%",100*gtavec[kk]);   
#endif
      if ( ddS.Nwt ) {
	spw = ((double)nonzero_Nwt(kk))/((double)ddN.W);
	sparsityword += spw;
	yap_message(" ws=%.1lf%%", 100*(1-spw));
      } 
      yap_message(" ds=%.1lf%%", 100*(1-spd) );
#ifdef KL
      yap_message(" ew=%lf", ew);
#else
      yap_message(" ew=%.0lf", ew);
#endif
#ifdef MALLET_EW
      yap_message(" ewp=%.1lf", ewp); 
#endif
      yap_message(" ed=%.1lf", ed); 
      yap_message(" da=%.0lf", da+0.1); 
      yap_message(" t1=%u", top1cnt[kk]); 
      yap_message(" ud=%.3lf", ud); 
      yap_message(" pd=%.3lf", pd); 
      if ( PCTL_BURSTY() ) 
	yap_message(" bd=%.3lf", ddP.bdk[kk]); 
      if ( ddP.NGbeta ) {
	/*
	 *   approx. as sqrt(var(lambda_k)/lambda-normaliser
	 */
	double ngvar = sqrt(ddP.NGalpha[kk])
	  * (ngalpha[kk]/ddP.NGalpha[kk]);
	yap_message(" ng=%.4lf,%.4lf", 
		    ngalpha[kk], ngvar/ngalpha[kk]);
	if ( ddS.sparse )
	    yap_message(",%.4f", 1-((float)ddS.sparseD[kk])/ddN.DTused);
	if ( verbose>2 )
	    yap_message(" ngl=%.4lf,%.4lf, nga=%.4lf,%.4lf", 
		    ddP.NGalpha[kk]/ddP.NGbeta[kk], 
		    sqrt(ddP.NGalpha[kk]/ddP.NGbeta[kk]/ddP.NGbeta[kk]),
		    ddP.NGalpha[kk], ddP.NGbeta[kk]); 
      }
      if ( ddN.tokens )  
	yap_message(" sl=%.2lf", sl); 
      yap_message(" co=%.3lf%%", co);
      if ( pmicount ) 
	yap_message(" pmi=%.3f", tpmi[kk]);
      if ( fullreport ) {
	fprintf(rp,"topic %d %d", kk, k);
	fprintf(rp," %.6lf", prop);   
	if ( ddS.Nwt ) {
	  fprintf(rp," %.6lf", (1-spw));
	} else {
	  fprintf(rp," 0");
	}
	fprintf(rp," %.6lf", (1-spd) );
#ifdef KL
	yap_message(" %lf", ew);
#else
	fprintf(rp," %.2lf", ew);
#endif
#ifdef MALLET_EW
	fprintf(rp," %.2lf", ewp); 
#endif
	fprintf(rp," %.2lf", ed); 
	fprintf(rp," %.0lf", da+0.1); 
	fprintf(rp," %u", top1cnt[kk]); 
	fprintf(rp," %.6lf", ud); 
	fprintf(rp," %.6lf", pd); 
	if ( PCTL_BURSTY() ) 
	  fprintf(rp," %.3lf", ddP.bdk[kk]); 
	fprintf(rp," %.4lf", (ddN.tokens)?sl:0); 
	fprintf(rp," %.6lf", co);
	if ( pmicount ) 
	  fprintf(rp," %.4f", tpmi[kk]);
	fprintf(rp,"\n");
      }
      if ( dprop) free(dprop);
    }
    if ( verbose>1 ) {
      double pcumm = 0;
      /*
       *   print top words:
       *     Mallet:   rank, count, prob, cumm, docs, coh
       */
      yap_message("\ntopic %d/%d", kk, k);
      yap_message(" words=");
      for (w=0; w<cnt; w++) {
	if ( w>0 ) yap_message(",");
	if ( ddN.tokens ) 
	  yap_message("%s", ddN.tokens[indk[w]]);
	else
	  yap_message("%d", indk[w]);
	if ( verbose>2 ) {
	  if ( scoretype == ST_count )
	    yap_message("(%d)", (int)(tscore(indk[w])+0.2));
	  else
	    yap_message("(%6lf)", tscore(indk[w]));
	}
	if ( fullreport ) {
	  fprintf(rp, "word %d %d %d", kk, indk[w], w);
	  if ( ddS.Nwt )
	    fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]);
	  pcumm += pvec[indk[w]];
	  fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	  fprintf(rp, " %d", dfmtx[w][w]); 
	  fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w));
	  if ( ddN.tokens ) 
	    fprintf(rp, " %s", ddN.tokens[indk[w]]);
	  fprintf(rp, "\n");
	}
      }
      if ( termstats ) {
	yap_message(" terms=");
	for (w=0; w<termcnt; w++) {
	  if ( w>0 ) yap_message(",");
	  if ( ddN.tokens ) 
	    yap_message("%s", termstats->tokens[termindk[w]]);
	  else
	    yap_message("%d", termstats->Kmin+termindk[w]);
	  if ( verbose>2 ) {
	    if ( scoretype == ST_count )
	      yap_message("(%d)", (int)(termtscore(termindk[w])+0.2));
	    else
	      yap_message("(%6lf)", termtscore(termindk[w]));
	  }
	  if ( fullreport ) {
	    fprintf(rp, "term %d %d %d", kk, termindk[w], w);
	    fprintf(rp, " %d", termstats->Nkt[termindk[w]][kk]);
	    fprintf(rp, " %s", termstats->tokens[termindk[w]]);
	    fprintf(rp, "\n");
	  }
	}
      }
    }
    yap_message("\n");
    free(dfmtx[0]); free(dfmtx); 
  }
  if ( verbose>1 && ddP.PYbeta ) {
    int cnt;
    double pcumm = 0;
     /*
     *    print root words
     */
    tscorek = -1;
    cnt = buildindk(-1,indk);
    /*  this case gives bad results */
    // if ( scoretype == ST_phirat ) topk(topword, cnt, indk, phiratioscore);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    /*
     *     cannot build df mtx for root because
     *     it is latent w.r.t. topics
     */
    yap_message("Topic root words=");
    if ( fullreport ) {
      int w;
      if ( ddP.phi && ddP.PYbeta!=H_PDP ) {
	for (w=0; w<ddN.W; w++)
	  pvec[w] = ddS.phi[ddN.T][w];
      } else {
	for (w=0; w<ddN.W; w++)
	  pvec[w] = betabasewordprob(w);
      }
#ifdef KL
      double ew = fv_kl(dfvec,pvec,ddN.W);
#else
      double ew = exp(fv_entropy(pvec,ddN.W));
#endif

      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      fprintf(rp,"topic -1 -1 0 0");
      fprintf(rp," %.4lf", ew); 
      fprintf(rp," %.6lf", ud); 
      fprintf(rp," %.6lf", pd); 
      fprintf(rp,"\n");
    }
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
	yap_message("%s", ddN.tokens[indk[w]]);
      else
	yap_message("%d", indk[w]);
      if ( verbose>2 && !ddP.phi )
	yap_message("(%6lf)", countscore(indk[w]));
      if ( fullreport ) {
	fprintf(rp, "word %d %d %d", -1, indk[w], w);
	if ( ddS.TwT )
	  fprintf(rp, " %d", ddS.TwT[w]);
	pcumm += pvec[indk[w]];
	fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	fprintf(rp, " 0 0"); 
	if ( ddN.tokens ) 
	  fprintf(rp, " %s", ddN.tokens[indk[w]]);
	fprintf(rp, "\n");
      }
    }
    yap_message("\nTopical words=");
    topk(topword, cnt, indk, phiinvratioscore);
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
	yap_message("%s", ddN.tokens[indk[w]]);
      else
	yap_message("%d", indk[w]);
    }
    yap_message("\n");
  }  
  yap_message("\n");
  if ( rp )
    fclose(rp);
	     
  if ( ddS.Nwt )
    yap_message("Average topicXword sparsity = %.2lf%%\n",
                100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%\n"
	      "Underused topics = %.1lf%%\n",
	      100*(1-sparsitydoc/ddN.T), 
	      100.0*underused/(double)ddN.T);
  if ( ddS.sparse && ddP.PYalpha==H_NG ) {
    double avesp = 0;
    // correct_docsp();
    for (k=0; k<ddN.T; k++) {
      avesp += gtvec[k];
    }
    // check gtvec[] sums to 1
    assert(fabs(avesp-1.0)<0.00001);
    avesp = 0;
    for (k=0; k<ddN.T; k++) {
        avesp += gtvec[k]*((float)ddS.sparseD[k])/ddN.DTused;
	assert(ddS.sparseD[k]<=ddN.DTused);
    }
    assert(avesp<=1.0);
    assert(avesp>0.0);
    yap_message("IBP sparsity = %.2lf%%\n", 100*(1-avesp));
  }
	
  if ( pmicount ) 
    yap_message("Average PMI = %.3f\n", tpmi[ddN.T]);

  /*
   *   print 
   */
  if ( 1 ) {
    float **cmtx = hca_topmtx();
    int t1, t2;
    int m1, m2;
    float mval;
    char *corfile = yap_makename(resstem,".topcor");
    fp = fopen(corfile,"w");
    if ( !fp ) 
      yap_sysquit("Cannot open file '%s' for write\n", corfile);
    /*
     *   print file
     */
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) 
	 if ( cmtx[t1][t2]>1.0e-7 ) 
	  fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]);
    }
    fclose(fp);
    free(corfile);
    /*
     *   display maximum
     */
    m1 = 1; m2 = 0;
    mval = cmtx[1][0];
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) {
	if ( mval<cmtx[t1][t2] ) {
	  mval = cmtx[t1][t2];
	  m1 = t1;
	  m2 = t2;
	}
      }
    }
    yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval);
    free(cmtx[0]); free(cmtx);
  }

  /*
   *  print burstiness report
   */
  if ( PCTL_BURSTY() ) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
	if ( M_multi(i) )
	  totmlttbl++;
	tottbl++;
      }
      if ( M_multi(i) )
	totmlt++;
    }
    yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
		100.0*((double)ddM.dim_multiind)/ddN.N,
		100.0*((double)tottbl)/ddN.NT,
		100.0*((double)totmlttbl)/totmlt);
  }
  yap_message("\n");

  free(topfile);
  if ( repfile ) free(repfile);
  if ( top1cnt ) free(top1cnt);
  free(indk);
  free(psort);
  if ( ngalpha )
    free(ngalpha);
  if ( pmicount )
    free(tpmi);
  if ( NwK ) {
    free(NwK);
    NwK = NULL;
  }
#ifdef KL
  free(dfvec);
#endif
  free(pvec); 
  free(gtvec);
  free(gpvec);
  tstats_free(termstats);
}
示例#5
0
void hca_displaytopics(char *stem, char *resstem, int topword, 
                       enum ScoreType scoretype, int pmicount, int fullreport) {
  int w,k;
  uint32_t *indk = NULL;
  int Nk_tot = 0;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  uint32_t *top1cnt = NULL;
  FILE *fp;
  float *tpmi = NULL;
  char *topfile;
  char *repfile;
  uint32_t *psort;
  FILE *rp = NULL;
  float *gtvec = globalprop();
  float *gpvec = calloc(ddN.W,sizeof(gpvec[0]));
  float *pvec = calloc(ddN.W,sizeof(pvec[0]));
  
  if ( pmicount>topword )
    pmicount = topword;
  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }    

  /*
   *  first collect counts of each word/term,
   *  and build gpvec (mean word probs)
   */
  build_NwK();
  {
    /*
     *  gpvec[] is normalised NwK[]
     */
    double tot = 0;
    for (w=0; w<ddN.W; w++)
      tot += gpvec[w] = NwK[w]+0.1; 
    for (w=0; w<ddN.W; w++)
      gpvec[w] /= tot;
  }
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      Nk_tot += ddS.NWt[k];
    }
  } 
  
  psort = sorttops(gtvec, ddN.T);
  
  top1cnt = hca_top1cnt();
  if ( !top1cnt )
    yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n");


  if ( pmicount ) {
    tpmi = malloc(sizeof(*tpmi)*(ddN.T+1));
    if ( !tpmi )
      yap_quit("Cannot allocate tpmi in hca_displaytopics()\n");
  }
  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk in hca_displaytopics()\n");

  /*
   *   two passes through, 
   *           first to build the top words and dump to file
   */
  repfile = yap_makename(resstem,".topset");
  topfile = yap_makename(resstem,".toplst");
  fp = fopen(topfile,"w");
  if ( !fp ) 
    yap_sysquit("Cannot open file '%s' for write\n", topfile);
  yap_message("\n");
  for (k=0; k<ddN.T; k++) {
    int cnt;
    tscorek = k;
    /*
     *    build sorted word list
     */
    cnt = buildindk(k, indk);
    topk(topword, cnt, indk, tscore);
    if ( cnt==0 )
      continue;
    /*
     *   dump words to file
     */
    fprintf(fp,"%d: ", k);
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr)  ) {
    int cnt;
     /*
     *    dump root words
     */
    tscorek = -1;
    cnt = buildindk(-1, indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    fprintf(fp,"-1:");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  if ( verbose>1 ) yap_message("\n");

  if ( pmicount ) {
    /*
     * compute PMI
     */
    char *toppmifile;
    char *pmifile;
    double *tp;
    tp = dvec(ddN.T);
    pmifile=yap_makename(stem,".pmi");
    toppmifile=yap_makename(resstem,".toppmi");
    get_probs(tp);
    report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, 
               pmicount, tp, tpmi);
    free(toppmifile);
    free(pmifile);
    free(tp);
  }

  /*
   *   now report words and diagnostics
   */
  //ttop_open(topfile);
  if ( fullreport ) {
    rp = fopen(repfile,"w");
    if ( !rp ) 
      yap_sysquit("Cannot open file '%s' for write\n", repfile);
    fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one "
	    "dist-unif dist-unigrm");
    if ( PCTL_BURSTY() ) 
      fprintf(rp, " burst-concent");
    if ( ddN.tokens )  
      fprintf(rp, " ave-length");
    fprintf(rp, " coher");
    if ( pmicount ) 
      fprintf(rp, " pmi");
    fprintf(rp, "\n#word topic index rank");
    if ( ddS.Nwt )
      fprintf(rp, " count");
    fprintf(rp, " prop cumm df coher\n");
    
  }
  for (k=0; k<ddN.T; k++) {
    int cnt;
    int kk = psort[k];
    uint32_t **dfmtx;

    if ( ddP.phi==NULL && ddS.NWt[kk]==0 )
      continue;
    /*
     *   grab word prob vec for later use
     */
    if ( ddS.Nwt ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = wordprob(w,kk);
    } else if ( ddP.phi ) 
      fv_copy(pvec, ddP.phi[kk], ddN.W);
    else if ( ddS.phi ) 
      fv_copy(pvec, ddS.phi[kk], ddN.W);

    /*
     *  rebuild word list
     */
    tscorek = kk;
    cnt = buildindk(kk, indk);
    topk(topword, cnt, indk, tscore);
    if ( topword<cnt )
      cnt = topword;
    assert(cnt>0);
    /*
     *     df stats for topic returned as matrix
     */
    dfmtx = hca_dfmtx(indk, cnt, kk);

    if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) 
      underused++;
    /*
     *  print stats for topic
     *    Mallet:  tokens, doc_ent, ave-word-len, coher., 
     *             uni-dist, corp-dist, eff-no-words
     */
    yap_message("Topic %d/%d", kk, k);
    {
      /*
       *   compute diagnostics
       */
      double prop = gtvec[kk];
      float *dprop = docprop(kk);
      double spw = 0;
      double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); 
      double ew = exp(fv_entropy(pvec,ddN.W));
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W);
      double co = coherence(dfmtx, cnt);
      double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT;
      double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0;
      sparsitydoc += spd;
      yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop);   
      if ( ddS.Nwt ) {
	spw = ((double)nonzero_Nwt(kk))/((double)ddN.W);
	sparsityword += spw;
	yap_message(" ws=%.1lf%%", 100*(1-spw));
      } 
      yap_message(" ds=%.1lf%%", 100*(1-spd) );
      yap_message(" ew=%.0lf", ew); 
      yap_message(" ed=%.1lf", ed); 
      yap_message(" da=%.0lf", da+0.1); 
      yap_message(" t1=%u", top1cnt[kk]); 
      yap_message(" ud=%.3lf", ud); 
      yap_message(" pd=%.3lf", pd); 
      if ( PCTL_BURSTY() ) 
	yap_message(" bd=%.3lf", ddP.bdk[kk]); 
      if ( ddN.tokens )  
	yap_message(" sl=%.2lf", sl); 
      yap_message(" co=%.3lf%%", co);
      if ( pmicount ) 
	yap_message(" pmi=%.3f", tpmi[kk]);
      if ( fullreport ) {
	fprintf(rp,"topic %d %d", kk, k);
	fprintf(rp," %.6lf", prop);   
	if ( ddS.Nwt ) {
	  fprintf(rp," %.6lf", (1-spw));
	} else {
	  fprintf(rp," 0");
	}
	fprintf(rp," %.6lf", (1-spd) );
	fprintf(rp," %.2lf", ew); 
	fprintf(rp," %.2lf", ed); 
	fprintf(rp," %.0lf", da+0.1); 
	fprintf(rp," %u", top1cnt[kk]); 
	fprintf(rp," %.6lf", ud); 
	fprintf(rp," %.6lf", pd); 
	if ( PCTL_BURSTY() ) 
	  fprintf(rp," %.3lf", ddP.bdk[kk]); 
	fprintf(rp," %.4lf", (ddN.tokens)?sl:0); 
	fprintf(rp," %.6lf", co);
	if ( pmicount ) 
	  fprintf(rp," %.4f", tpmi[kk]);
	fprintf(rp,"\n");
      }
      if ( dprop) free(dprop);
    }
    if ( verbose>1 ) {
      double pcumm = 0;
      /*
       *   print top words:
       *     Mallet:   rank, count, prob, cumm, docs, coh
       */
      yap_message("\ntopic %d/%d", kk, k);
      yap_message(" words=");
      for (w=0; w<cnt; w++) {
	if ( w>0 ) yap_message(",");
	if ( ddN.tokens ) 
	  yap_message("%s", ddN.tokens[indk[w]]);
	else
	  yap_message("%d", indk[w]);
	if ( verbose>2 )
	  yap_message("(%6lf)", tscore(indk[w]));
	if ( fullreport ) {
	  fprintf(rp, "word %d %d %d", kk, indk[w], w);
	  if ( ddS.Nwt )
	    fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]);
	  pcumm += pvec[indk[w]];
	  fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	  fprintf(rp, " %d", dfmtx[w][w]); 
	  fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w));
	  if ( ddN.tokens ) 
	    fprintf(rp, " %s", ddN.tokens[indk[w]]);
	  fprintf(rp, "\n");
	}
      }
    }
    yap_message("\n");
    free(dfmtx[0]); free(dfmtx); 
  }
  if ( verbose>1 && ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) {
    int cnt;
    double pcumm = 0;
     /*
     *    print root words
     */
    tscorek = -1;
    cnt = buildindk(-1,indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    /*
     *     cannot build df mtx for root because
     *     it is latent w.r.t. topics
     */
    yap_message("Topic root words=");
    if ( fullreport ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = betabasewordprob(w);
      double ew = exp(fv_entropy(pvec,ddN.W));
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      fprintf(rp,"topic -1 -1 0 0");
      fprintf(rp," %.4lf", ew); 
      fprintf(rp," %.6lf", ud); 
      fprintf(rp," %.6lf", pd); 
      fprintf(rp,"\n");
    }
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
	yap_message("%s", ddN.tokens[indk[w]]);
      else
	yap_message("%d", indk[w]);
      if ( verbose>2 )
	yap_message("(%6lf)", countscore(indk[w]));
      if ( fullreport ) {
	fprintf(rp, "word %d %d %d", -1, indk[w], w);
	if ( ddS.TwT )
	  fprintf(rp, " %d", ddS.TwT[w]);
	pcumm += pvec[indk[w]];
	fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	fprintf(rp, " 0 0"); 
	if ( ddN.tokens ) 
	  fprintf(rp, " %s", ddN.tokens[indk[w]]);
	fprintf(rp, "\n");
      }   
    }
    yap_message("\n");
  }
  yap_message("\n");
  if ( rp )
    fclose(rp);
	     
  if ( ddS.Nwt )
    yap_message("Average topicXword sparsity = %.2lf%%\n",
                100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%\n"
	      "Underused topics = %.1lf%%\n",
	      100*(1-sparsitydoc/ddN.T), 
	      100.0*underused/(double)ddN.T);
  if ( pmicount ) 
    yap_message("Average PMI = %.3f\n", tpmi[ddN.T]);

  /*
   *   print 
   */
  if ( 1 ) {
    float **cmtx = hca_topmtx();
    int t1, t2;
    int m1, m2;
    float mval;
    char *corfile = yap_makename(resstem,".topcor");
    fp = fopen(corfile,"w");
    if ( !fp ) 
      yap_sysquit("Cannot open file '%s' for write\n", corfile);
   /*
    *   print file
     */
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) 
	 if ( cmtx[t1][t2]>1.0e-3 ) 
	  fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]);
    }
    fclose(fp);
    free(corfile);
    /*
     *   display maximum
     */
    m1 = 1; m2 = 0;
    mval = cmtx[1][0];
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) {
	if ( mval<cmtx[t1][t2] ) {
	  mval = cmtx[t1][t2];
	  m1 = t1;
	  m2 = t2;
	}
      }
    }
    yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval);
    free(cmtx[0]); free(cmtx);
  }

  /*
   *  print burstiness report
   */
  if ( PCTL_BURSTY() ) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
	if ( M_multi(i) )
	  totmlttbl++;
	tottbl++;
      }
      if ( M_multi(i) )
	totmlt++;
    }
    yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
		100.0*((double)ddM.dim_multiind)/ddN.N,
		100.0*((double)tottbl)/ddN.NT,
		100.0*((double)totmlttbl)/totmlt);
  }
  yap_message("\n");

  free(topfile);
  if ( repfile ) free(repfile);
  if ( top1cnt ) free(top1cnt);
  free(indk);
  free(psort);
  if ( pmicount )
    free(tpmi);
  if ( NwK ) {
    free(NwK);
    NwK = NULL;
  }
  free(pvec); 
  free(gtvec);
  free(gpvec);
}