Exemple #1
0
/*
 * Print win-or-draw probabilities for a four-card hole hand.
 *
 * argv[1..4] are parsed as the hole cards (required).  When present,
 * argv[5..7] are added to the board as a group, then argv[8], then argv[9]
 * (presumably flop, turn and river).  After each stage the value
 * probs.win + probs.draw returned by get_probs() is printed to stdout,
 * space separated, followed by a final newline.
 *
 * If fewer than four card arguments are supplied a message is written to
 * stderr and nothing is printed to stdout.
 */
static inline void calc(int argc, char** argv) {
    StdDeck_CardMask hole, board;
    StdDeck_CardMask card;
    StdDeck_CardMask_RESET(hole);
    StdDeck_CardMask_RESET(board);
    int i, cardi;

    /* argv[1]..argv[4] are read unconditionally below, so they must exist */
    if (argc < 5) {
        fprintf(stderr, "calc: expected at least 4 hole cards, got %d\n",
                (argc > 0) ? argc - 1 : 0);
        return;
    }

    /* NOTE(review): the result of StdDeck_stringToCard() is ignored, so an
     * unparsable card string is not detected here -- confirm its failure
     * convention before adding a check. */
    for (i = 1; i < 5; ++i) {
        StdDeck_stringToCard(argv[i], &cardi);
        card = StdDeck_MASK(cardi);
        StdDeck_CardMask_OR(hole, hole, card);
    }
    po_probs probs;
    probs = get_probs(hole, board);
    printf("%.4f", probs.win + probs.draw);

    /* three board cards at once: argv[5], argv[6], argv[7] */
    if (argc > 7) {
        for (i = 5; i < 8; ++i) {
            StdDeck_stringToCard(argv[i], &cardi);
            card = StdDeck_MASK(cardi);
            StdDeck_CardMask_OR(board, board, card);
        }
        probs = get_probs(hole, board);
        printf(" %.4f", probs.win + probs.draw);
    }

    /* fourth board card: argv[8] */
    if (argc > 8) {
        StdDeck_stringToCard(argv[8], &cardi);
        card = StdDeck_MASK(cardi);
        StdDeck_CardMask_OR(board, board, card);

        probs = get_probs(hole, board);
        printf(" %.4f", probs.win + probs.draw);
    }

    /* fifth board card: argv[9] */
    if (argc > 9) {
        StdDeck_stringToCard(argv[9], &cardi);
        card = StdDeck_MASK(cardi);
        StdDeck_CardMask_OR(board, board, card);

        probs = get_probs(hole, board);
        printf(" %.4f", probs.win + probs.draw);
    }

    printf("\n");
}
Exemple #2
0
/*
 *  Write the vector filled in by get_probs() to fp.
 *
 *  Output is a "factor = ..." line holding ddP.alpha, then a "probs ="
 *  line with one entry per topic (ddN.T of them): the value if it is
 *  positive, otherwise "-".  A final "# topics:" line gives the number
 *  of positive entries.
 *
 *  If the work vector cannot be allocated a message goes to stderr and
 *  nothing is written to fp.
 */
void print_probs(FILE *fp) {
  int t, num = 0;
  double *vp = malloc(sizeof(*vp)*ddN.T);
  if ( !vp ) {
    /*  get_probs() would write through a NULL pointer  */
    fprintf(stderr, "Cannot allocate probability vector in print_probs()\n");
    return;
  }
  get_probs(vp);
  fprintf(fp, "factor = %lf\nprobs = ", ddP.alpha);
  for (t=0; t<ddN.T; t++) {
    if ( vp[t]>0 ) {
      fprintf(fp, " %lf", vp[t]);
      num++;
    } else {
      fprintf(fp, " -");
    }
  }
  fprintf(fp, "\n");
  fprintf(fp, "# topics: %d\n", num);
  free(vp);
}
Exemple #3
0
/*
 *  Report the vector filled in by get_probs() through yap_message().
 *
 *  Prints one entry per topic (the value if positive, otherwise "-"),
 *  then a summary line with ddP.alpha ("factor"), the number of
 *  non-positive entries ("empty") and exp() of the entropy of the
 *  positive entries ("ent").
 *
 *  If the work vector cannot be allocated only a message is printed.
 */
void yap_probs() {
  int t;
  int empty = 0;
  double ent = 0;
  double factor = 0;
  double *vp = malloc(sizeof(*vp)*ddN.T);
  if ( !vp ) {
    /*  get_probs() would write through a NULL pointer  */
    yap_message("Cannot allocate probability vector in yap_probs()\n");
    return;
  }
  get_probs(vp);
  yap_message("probs = ");
  factor = ddP.alpha;
  for (t=0; t<ddN.T; t++) {
    if ( vp[t]>0 ) {
      yap_message(" %lf", vp[t]);
      /*  accumulate -sum p*log(p) over the positive entries  */
      ent -= vp[t]*log(vp[t]);
    } else {
      empty++;
      yap_message(" -");
    }
  }
  yap_message("\nfactor = %lf, empty = %d, ent = %lf\n", 
              factor, empty, exp(ent));
  free(vp);
}
Exemple #4
0
/*
 *  Report on the topics of the current model.
 *
 *  Work done, in order:
 *    - writes the top words (and terms, if term statistics are
 *      available) of every topic to the file named by
 *      yap_makename(resstem,".toplst");
 *    - if pmicount>0, runs report_pmi() using the ".pmi" file for stem
 *      and the ".toppmi" file for resstem;
 *    - prints per-topic diagnostics through yap_message() and, when
 *      fullreport is non-zero, also writes them with per-word detail to
 *      the ".topset" file for resstem;
 *    - prints summary sparsity figures, writes pairwise topic
 *      correlations to the ".topcor" file for resstem, and prints a
 *      burstiness report when PCTL_BURSTY() holds.
 *
 *  stem       - data stem, passed to tstats_init() and data_df()
 *  resstem    - stem used to name all output files
 *  topword    - maximum number of words/terms listed per topic
 *  scoretype  - selects the scoring function used to rank words
 *  pmicount   - number of words for PMI (clamped to topword), 0 to skip
 *  fullreport - non-zero to write the ".topset" report
 *
 *  Failures to allocate or to open an output file end in
 *  yap_quit()/yap_sysquit().
 */
void hca_displaytopics(char *stem, char *resstem, int topword, 
                       enum ScoreType scoretype, int pmicount, int fullreport) {
  int w,k;
  uint32_t *termindk = NULL;
  uint32_t *indk = NULL;
  int Nk_tot = 0;
  double (*termtscore)(int) = NULL;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  uint32_t *top1cnt = NULL;
  FILE *fp;
  float *tpmi = NULL;
  char *topfile;
  char *repfile;
  uint32_t *psort;
  FILE *rp = NULL;
  float *gtvec = globalprop();
//#define XTRA // prints model topic probs after observed
#ifdef XTRA
  double *gtavec = calloc(ddN.T,sizeof(gtavec[0]));
#endif
  float *gpvec = calloc(ddN.W,sizeof(gpvec[0]));
  float *pvec = calloc(ddN.W,sizeof(pvec[0]));
#ifdef KL
  float *dfvec = calloc(ddN.W,sizeof(dfvec[0]));
#endif
  double *ngalpha = NULL;
  T_stats_t *termstats;
  
#ifdef XTRA
  get_probs(gtavec);
#endif

  if ( pmicount>topword )
    pmicount = topword;
  /*
   *  pick the word scoring function
   */
  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phirat ) {
    tscore = phiratioscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }  

  if ( ddS.TwT==NULL && ddP.phi==NULL && scoretype == ST_phirat ) 
    yap_quit("Cannot use '-orat' option with this model/settings.\n");

  if ( ddP.PYalpha==H_NG ) {
    /*
     *  provide an estimate of alpha
     */
    ngalpha = dvec(ddN.T);
    get_probs(ngalpha);
    for (k=0; k<ddN.T; k++) {
      ddP.alphapr[k] = ngalpha[k];
    }
  }

  /*
   *  returns null if no relevant data file
   */
  termstats = tstats_init(ddS.z, ddD.NdTcum, ddN.T, ddN.DT, stem);
  if ( termstats ) {
    if ( scoretype == ST_idf ) {
      termtscore = termidfscore;
    } else 
      termtscore = termcountscore;
  }  

  
  /*
   *  first collect counts of each word/term,
   *  and build gpvec (mean word probs)
   */
  build_NwK();
  if ( termstats )
    build_termNwK(termstats);
  {
    /*
     *  gpvec[] is normalised NwK[]
     */
    double tot = 0;
    for (w=0; w<ddN.W; w++)
      tot += gpvec[w] = NwK[w]+0.1; 
    for (w=0; w<ddN.W; w++)
      gpvec[w] /= tot;
  }
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      Nk_tot += ddS.NWt[k];
    }
  } 
  
  psort = sorttops(gtvec, ddN.T);
  
  top1cnt = hca_top1cnt();
  if ( !top1cnt )
    yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n");

  if ( pmicount ) {
    /*  one extra slot: tpmi[ddN.T] is reported as the average below  */
    tpmi = malloc(sizeof(*tpmi)*(ddN.T+1));
    if ( !tpmi )
      yap_quit("Cannot allocate tpmi in hca_displaytopics()\n");
  }
  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk in hca_displaytopics()\n");
  if ( termstats ) {
    termindk = malloc(sizeof(*termindk)*termstats->K);
    if ( !termindk )
      yap_quit("Cannot allocate termindk in hca_displaytopics()\n");
  }

  
  data_df(stem);

#ifdef KL
  for (w=0; w<ddN.W; w++)
    dfvec[w] = ddD.df[w];
#endif
  
  /*
   *   two passes through, 
   *           first to build the top words and dump to file
   */
  repfile = yap_makename(resstem,".topset");
  topfile = yap_makename(resstem,".toplst");
  fp = fopen(topfile,"w");
  if ( !fp ) 
    yap_sysquit("Cannot open file '%s' for write\n", topfile);
  yap_message("\n");
  for (k=0; k<ddN.T; k++) {
    int cnt, termcnt = 0;
    tscorek = k;
    /*
     *    build sorted word list
     */
    cnt = buildindk(k, indk);
    topk(topword, cnt, indk, tscore);
    if ( cnt==0 )
      continue;
    if ( termstats ) {
      termcnt = buildtermindk(k, termindk, termstats);
      topk(topword, termcnt, termindk, termtscore);
    }
    /*
     *   dump words to file
     */
    fprintf(fp,"%d: ", k);
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    if ( termstats ) {
      for (w=0; w<topword && w<termcnt; w++) {
        fprintf(fp," %d", (int)termstats->Kmin+termindk[w]);
      }
    }
    fprintf(fp, "\n");
  }
  if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr)  ) {
    int cnt;
     /*
     *    dump root words
     */
    tscorek = -1;
    cnt = buildindk(-1, indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    fprintf(fp,"-1:");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  if ( verbose>1 ) yap_message("\n");

  if ( pmicount ) {
    /*
     * compute PMI
     */
    char *toppmifile;
    char *pmifile;
    double *tp;
    tp = dvec(ddN.T);
    pmifile=yap_makename(stem,".pmi");
    toppmifile=yap_makename(resstem,".toppmi");
    get_probs(tp);
    report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, 
               pmicount, tp, tpmi);
    free(toppmifile);
    free(pmifile);
    free(tp);
  }

  /*
   *   now report words and diagnostics
   */
  //ttop_open(topfile);
  if ( fullreport ) {
    rp = fopen(repfile,"w");
    if ( !rp ) 
      yap_sysquit("Cannot open file '%s' for write\n", repfile);
    fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one "
            "dist-unif dist-unigrm");
    if ( PCTL_BURSTY() ) 
      fprintf(rp, " burst-concent");
    if ( ddN.tokens )  
      fprintf(rp, " ave-length");
    fprintf(rp, " coher");
    if ( pmicount ) 
      fprintf(rp, " pmi");
    fprintf(rp, "\n#word topic index rank");
    if ( ddS.Nwt )
      fprintf(rp, " count");
    fprintf(rp, " prop cumm df coher\n");
    
  }
  /*
   *   second pass, visiting topics in the order given by psort[]
   */
  for (k=0; k<ddN.T; k++) {
    int cnt, termcnt = 0;
    int kk = psort[k];
    uint32_t **dfmtx;

    if ( ddP.phi==NULL && ddS.NWt[kk]==0 )
      continue;
    /*
     *   grab word prob vec for later use
     */
    if ( ddS.Nwt ) {
      for (w=0; w<ddN.W; w++)
        pvec[w] = wordprob(w,kk);
    } else if ( ddP.phi ) 
      fv_copy(pvec, ddP.phi[kk], ddN.W);
    else if ( ddS.phi ) 
      fv_copy(pvec, ddS.phi[kk], ddN.W);

    /*
     *  rebuild word list
     */
    tscorek = kk;
    cnt = buildindk(kk, indk);
    topk(topword, cnt, indk, tscore);
    if ( topword<cnt )
      cnt = topword;
    assert(cnt>0);
    if ( termstats ) {
      termcnt = buildtermindk(kk, termindk, termstats);
      topk(topword, termcnt, termindk, termtscore);
      if ( topword<termcnt )
        termcnt = topword;
    }
    /*
     *     df stats for topic returned as matrix
     */
    dfmtx = hca_dfmtx(indk, cnt, kk);

    if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) 
      underused++;
    /*
     *  print stats for topic
     *    Mallet:  tokens, doc_ent, ave-word-len, coher., 
     *             uni-dist, corp-dist, eff-no-words
     */
    yap_message("Topic %d/%d", kk, k);
    {
      /*
       *   compute diagnostics
       */
      double prop = gtvec[kk];
      float *dprop = docprop(kk);
      double spw = 0;
      double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT);
#ifdef KL
      double ew = fv_kl(dfvec,pvec,ddN.W);
#else
      double ew = exp(fv_entropy(pvec,ddN.W));
#endif
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W);
      double co = coherence(dfmtx, cnt);
      double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT;
#define MALLET_EW
#ifdef MALLET_EW
      double ewp = dprop?(1.0/fv_expprob(pvec,ddN.W)):ddN.W;
#endif
      double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0;
      sparsitydoc += spd;
      yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop);   
#ifdef XTRA
      yap_message((ddN.T>200)?"/%.3lf%%":"/%.2lf%%",100*gtavec[kk]);   
#endif
      if ( ddS.Nwt ) {
        spw = ((double)nonzero_Nwt(kk))/((double)ddN.W);
        sparsityword += spw;
        yap_message(" ws=%.1lf%%", 100*(1-spw));
      } 
      yap_message(" ds=%.1lf%%", 100*(1-spd) );
#ifdef KL
      yap_message(" ew=%lf", ew);
#else
      yap_message(" ew=%.0lf", ew);
#endif
#ifdef MALLET_EW
      yap_message(" ewp=%.1lf", ewp); 
#endif
      yap_message(" ed=%.1lf", ed); 
      yap_message(" da=%.0lf", da+0.1); 
      yap_message(" t1=%u", top1cnt[kk]); 
      yap_message(" ud=%.3lf", ud); 
      yap_message(" pd=%.3lf", pd); 
      if ( PCTL_BURSTY() ) 
        yap_message(" bd=%.3lf", ddP.bdk[kk]); 
      if ( ddP.NGbeta ) {
        /*
         *   approx. as sqrt(var(lambda_k)/lambda-normaliser
         *
         *   NOTE(review): ngalpha is only allocated above when
         *   ddP.PYalpha==H_NG; presumably ddP.NGbeta is only set in
         *   that case -- confirm.
         */
        double ngvar = sqrt(ddP.NGalpha[kk])
          * (ngalpha[kk]/ddP.NGalpha[kk]);
        yap_message(" ng=%.4lf,%.4lf", 
                    ngalpha[kk], ngvar/ngalpha[kk]);
        if ( ddS.sparse )
            yap_message(",%.4f", 1-((float)ddS.sparseD[kk])/ddN.DTused);
        if ( verbose>2 )
            yap_message(" ngl=%.4lf,%.4lf, nga=%.4lf,%.4lf", 
                    ddP.NGalpha[kk]/ddP.NGbeta[kk], 
                    sqrt(ddP.NGalpha[kk]/ddP.NGbeta[kk]/ddP.NGbeta[kk]),
                    ddP.NGalpha[kk], ddP.NGbeta[kk]); 
      }
      if ( ddN.tokens )  
        yap_message(" sl=%.2lf", sl); 
      yap_message(" co=%.3lf%%", co);
      if ( pmicount ) 
        yap_message(" pmi=%.3f", tpmi[kk]);
      if ( fullreport ) {
        fprintf(rp,"topic %d %d", kk, k);
        fprintf(rp," %.6lf", prop);   
        if ( ddS.Nwt ) {
          fprintf(rp," %.6lf", (1-spw));
        } else {
          fprintf(rp," 0");
        }
        fprintf(rp," %.6lf", (1-spd) );
#ifdef KL
        /*  belongs in the report file like its neighbours  */
        fprintf(rp," %lf", ew);
#else
        fprintf(rp," %.2lf", ew);
#endif
#ifdef MALLET_EW
        fprintf(rp," %.2lf", ewp); 
#endif
        fprintf(rp," %.2lf", ed); 
        fprintf(rp," %.0lf", da+0.1); 
        fprintf(rp," %u", top1cnt[kk]); 
        fprintf(rp," %.6lf", ud); 
        fprintf(rp," %.6lf", pd); 
        if ( PCTL_BURSTY() ) 
          fprintf(rp," %.3lf", ddP.bdk[kk]); 
        fprintf(rp," %.4lf", (ddN.tokens)?sl:0); 
        fprintf(rp," %.6lf", co);
        if ( pmicount ) 
          fprintf(rp," %.4f", tpmi[kk]);
        fprintf(rp,"\n");
      }
      if ( dprop) free(dprop);
    }
    if ( verbose>1 ) {
      double pcumm = 0;
      /*
       *   print top words:
       *     Mallet:   rank, count, prob, cumm, docs, coh
       */
      yap_message("\ntopic %d/%d", kk, k);
      yap_message(" words=");
      for (w=0; w<cnt; w++) {
        if ( w>0 ) yap_message(",");
        if ( ddN.tokens ) 
          yap_message("%s", ddN.tokens[indk[w]]);
        else
          yap_message("%d", indk[w]);
        if ( verbose>2 ) {
          if ( scoretype == ST_count )
            yap_message("(%d)", (int)(tscore(indk[w])+0.2));
          else
            yap_message("(%6lf)", tscore(indk[w]));
        }
        if ( fullreport ) {
          fprintf(rp, "word %d %d %d", kk, indk[w], w);
          if ( ddS.Nwt )
            fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]);
          pcumm += pvec[indk[w]];
          fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
          fprintf(rp, " %d", dfmtx[w][w]); 
          fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w));
          if ( ddN.tokens ) 
            fprintf(rp, " %s", ddN.tokens[indk[w]]);
          fprintf(rp, "\n");
        }
      }
      if ( termstats ) {
        yap_message(" terms=");
        for (w=0; w<termcnt; w++) {
          if ( w>0 ) yap_message(",");
          if ( ddN.tokens ) 
            yap_message("%s", termstats->tokens[termindk[w]]);
          else
            yap_message("%d", termstats->Kmin+termindk[w]);
          if ( verbose>2 ) {
            if ( scoretype == ST_count )
              yap_message("(%d)", (int)(termtscore(termindk[w])+0.2));
            else
              yap_message("(%6lf)", termtscore(termindk[w]));
          }
          if ( fullreport ) {
            fprintf(rp, "term %d %d %d", kk, termindk[w], w);
            fprintf(rp, " %d", termstats->Nkt[termindk[w]][kk]);
            fprintf(rp, " %s", termstats->tokens[termindk[w]]);
            fprintf(rp, "\n");
          }
        }
      }
    }
    yap_message("\n");
    free(dfmtx[0]); free(dfmtx); 
  }
  if ( verbose>1 && ddP.PYbeta ) {
    int cnt;
    double pcumm = 0;
     /*
     *    print root words
     */
    tscorek = -1;
    cnt = buildindk(-1,indk);
    /*  this case gives bad results */
    // if ( scoretype == ST_phirat ) topk(topword, cnt, indk, phiratioscore);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    /*
     *     cannot build df mtx for root because
     *     it is latent w.r.t. topics
     */
    yap_message("Topic root words=");
    if ( fullreport ) {
      if ( ddP.phi && ddP.PYbeta!=H_PDP ) {
        for (w=0; w<ddN.W; w++)
          pvec[w] = ddS.phi[ddN.T][w];
      } else {
        for (w=0; w<ddN.W; w++)
          pvec[w] = betabasewordprob(w);
      }
#ifdef KL
      double ew = fv_kl(dfvec,pvec,ddN.W);
#else
      double ew = exp(fv_entropy(pvec,ddN.W));
#endif

      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      fprintf(rp,"topic -1 -1 0 0");
      fprintf(rp," %.4lf", ew); 
      fprintf(rp," %.6lf", ud); 
      fprintf(rp," %.6lf", pd); 
      fprintf(rp,"\n");
    }
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
        yap_message("%s", ddN.tokens[indk[w]]);
      else
        yap_message("%d", indk[w]);
      if ( verbose>2 && !ddP.phi )
        yap_message("(%6lf)", countscore(indk[w]));
      if ( fullreport ) {
        fprintf(rp, "word %d %d %d", -1, indk[w], w);
        /*
         *  NOTE(review): indexed by rank w here, whereas the per-topic
         *  report above indexes its count by word indk[w] -- confirm
         *  whether ddS.TwT[indk[w]] was intended.
         */
        if ( ddS.TwT )
          fprintf(rp, " %d", ddS.TwT[w]);
        pcumm += pvec[indk[w]];
        fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
        fprintf(rp, " 0 0"); 
        if ( ddN.tokens ) 
          fprintf(rp, " %s", ddN.tokens[indk[w]]);
        fprintf(rp, "\n");
      }
    }
    yap_message("\nTopical words=");
    topk(topword, cnt, indk, phiinvratioscore);
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
        yap_message("%s", ddN.tokens[indk[w]]);
      else
        yap_message("%d", indk[w]);
    }
    yap_message("\n");
  }  
  yap_message("\n");
  if ( rp )
    fclose(rp);

  if ( ddS.Nwt )
    yap_message("Average topicXword sparsity = %.2lf%%\n",
                100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%\n"
              "Underused topics = %.1lf%%\n",
              100*(1-sparsitydoc/ddN.T), 
              100.0*underused/(double)ddN.T);
  if ( ddS.sparse && ddP.PYalpha==H_NG ) {
    double avesp = 0;
    // correct_docsp();
    for (k=0; k<ddN.T; k++) {
      avesp += gtvec[k];
    }
    // check gtvec[] sums to 1
    assert(fabs(avesp-1.0)<0.00001);
    avesp = 0;
    for (k=0; k<ddN.T; k++) {
        avesp += gtvec[k]*((float)ddS.sparseD[k])/ddN.DTused;
        assert(ddS.sparseD[k]<=ddN.DTused);
    }
    assert(avesp<=1.0);
    assert(avesp>0.0);
    yap_message("IBP sparsity = %.2lf%%\n", 100*(1-avesp));
  }

  if ( pmicount ) 
    yap_message("Average PMI = %.3f\n", tpmi[ddN.T]);

  /*
   *   print topic correlations
   */
  if ( 1 ) {
    float **cmtx = hca_topmtx();
    int t1, t2;
    char *corfile = yap_makename(resstem,".topcor");
    fp = fopen(corfile,"w");
    if ( !fp ) 
      yap_sysquit("Cannot open file '%s' for write\n", corfile);
    /*
     *   print file, lower triangle only
     */
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) 
         if ( cmtx[t1][t2]>1.0e-7 ) 
          fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]);
    }
    fclose(fp);
    free(corfile);
    /*
     *   display maximum;
     *   a pair only exists with two or more topics, and
     *   cmtx[1][0] must not be read otherwise
     */
    if ( ddN.T>=2 ) {
      int m1 = 1, m2 = 0;
      float mval = cmtx[1][0];
      for (t1=0; t1<ddN.T; t1++) {
        for (t2=0; t2<t1; t2++) {
          if ( mval<cmtx[t1][t2] ) {
            mval = cmtx[t1][t2];
            m1 = t1;
            m2 = t2;
          }
        }
      }
      yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval);
    }
    free(cmtx[0]); free(cmtx);
  }

  /*
   *  print burstiness report
   */
  if ( PCTL_BURSTY() ) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
        if ( M_multi(i) )
          totmlttbl++;
        tottbl++;
      }
      if ( M_multi(i) )
        totmlt++;
    }
    yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
                100.0*((double)ddM.dim_multiind)/ddN.N,
                100.0*((double)tottbl)/ddN.NT,
                100.0*((double)totmlttbl)/totmlt);
  }
  yap_message("\n");

  /*
   *  release everything allocated above
   */
  free(topfile);
  if ( repfile ) free(repfile);
  if ( top1cnt ) free(top1cnt);
  free(indk);
  free(termindk);     /*  NULL when there are no term stats  */
  free(psort);
  if ( ngalpha )
    free(ngalpha);
  if ( pmicount )
    free(tpmi);
  if ( NwK ) {
    free(NwK);
    NwK = NULL;
  }
#ifdef XTRA
  free(gtavec);
#endif
#ifdef KL
  free(dfvec);
#endif
  free(pvec); 
  free(gtvec);
  free(gpvec);
  tstats_free(termstats);
}
Exemple #5
0
void extract_boxes(char *cfgfile, char *weightfile)
{
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    srand(time(0));

    char *val_images = "/home/pjreddie/data/voc/test/train.txt";
    list *plist = get_paths(val_images);
    char **paths = (char **)list_to_array(plist);

    layer l = net.layers[net.n - 1];

    int num_boxes = l.side;
    int num = l.n;
    int classes = l.classes;

    int j;

    box *boxes = calloc(num_boxes*num_boxes*num, sizeof(box));
    float **probs = calloc(num_boxes*num_boxes*num, sizeof(float *));
    for(j = 0; j < num_boxes*num_boxes*num; ++j) probs[j] = calloc(classes+1, sizeof(float *));

    int N = plist->size;
    int i=0;
    int k;

    int count = 0;
    float iou_thresh = .3;

    for (i = 0; i < N; ++i) {
        fprintf(stderr, "%5d %5d\n", i, count);
        char *path = paths[i];
        image orig = load_image_color(path, 0, 0);
        image resized = resize_image(orig, net.w, net.h);

        float *X = resized.data;
        float *predictions = network_predict(net, X);
        get_boxes(predictions+1+classes, num, num_boxes, 5+classes, boxes);
        get_probs(predictions, num*num_boxes*num_boxes, classes, 5+classes, probs);

        char *labelpath = find_replace(path, "images", "labels");
        labelpath = find_replace(labelpath, "JPEGImages", "labels");
        labelpath = find_replace(labelpath, ".jpg", ".txt");
        labelpath = find_replace(labelpath, ".JPEG", ".txt");

        int num_labels = 0;
        box_label *truth = read_boxes(labelpath, &num_labels);
        FILE *label = stdin;
        for(k = 0; k < num_boxes*num_boxes*num; ++k){
            int overlaps = 0;
            for (j = 0; j < num_labels; ++j) {
                box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
                float iou = box_iou(boxes[k], t);
                if (iou > iou_thresh){
                    if (!overlaps) {
                        char buff[256];
                        sprintf(buff, "/data/extracted/labels/%d.txt", count);
                        label = fopen(buff, "w");
                        overlaps = 1;
                    }
                    fprintf(label, "%d %f\n", truth[j].id, iou);
                }
            }
            if (overlaps) {
                char buff[256];
                sprintf(buff, "/data/extracted/imgs/%d", count++);
                int dx = (boxes[k].x - boxes[k].w/2) * orig.w;
                int dy = (boxes[k].y - boxes[k].h/2) * orig.h;
                int w = boxes[k].w * orig.w;
                int h = boxes[k].h * orig.h;
                image cropped = crop_image(orig, dx, dy, w, h);
                image sized = resize_image(cropped, 224, 224);
#ifdef OPENCV
                save_image_jpg(sized, buff);
#endif
                free_image(sized);
                free_image(cropped);
                fclose(label);
            }
        }
        free(truth);
        free_image(orig);
        free_image(resized);
    }
}
Exemple #6
0
/*
 * Measure proposal recall of the network over the images listed in a
 * hard-coded path file.
 *
 * For every image the predicted boxes are (optionally) passed through
 * do_nms(); each prediction whose probs[k][0] exceeds thresh (0.1) counts
 * as a proposal and, while `save` is set, is cropped to 224x224 (saved
 * only when built with OPENCV) and described in
 * "/data/extracted/nms_pred_boxes/<n>.txt".  Each ground-truth box is
 * matched against the proposals; it counts as correct when the best IoU
 * exceeds iou_thresh (0.5).  A running summary line is written to stderr
 * after each image.
 *
 * cfgfile    - network configuration, given to parse_network_cfg()
 * weightfile - weights to load, or NULL to skip loading
 */
void validate_recall(char *cfgfile, char *weightfile)
{
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    srand(time(0));

    char *val_images = "/home/pjreddie/data/voc/test/2007_test.txt";
    list *plist = get_paths(val_images);
    char **paths = (char **)list_to_array(plist);

    layer l = net.layers[net.n - 1];

    int num_boxes = l.side;
    int num = l.n;
    int classes = l.classes;

    int j;

    /* number of predicted boxes per image */
    int n_preds = num_boxes*num_boxes*num;

    box *boxes = calloc(n_preds, sizeof(box));
    float **probs = calloc(n_preds, sizeof(float *));
    if (n_preds > 0 && (!boxes || !probs)) {
        fprintf(stderr, "validate_recall: cannot allocate prediction buffers\n");
        free(boxes);
        free(probs);
        return;
    }
    for(j = 0; j < n_preds; ++j) {
        /* each row holds floats, not pointers */
        probs[j] = calloc(classes+1, sizeof(float));
        if (!probs[j]) {
            fprintf(stderr, "validate_recall: cannot allocate prediction buffers\n");
            while (j > 0) free(probs[--j]);
            free(probs);
            free(boxes);
            return;
        }
    }

    int N = plist->size;
    int i=0;
    int k;

    float iou_thresh = .5;
    float thresh = .1;
    int total = 0;
    int correct = 0;
    float avg_iou = 0;
    int nms = 1;
    int proposals = 0;
    int save = 1;

    for (i = 0; i < N; ++i) {
        char *path = paths[i];
        image orig = load_image_color(path, 0, 0);
        image resized = resize_image(orig, net.w, net.h);

        float *X = resized.data;
        float *predictions = network_predict(net, X);
        get_boxes(predictions+1+classes, num, num_boxes, 5+classes, boxes);
        get_probs(predictions, num*num_boxes*num_boxes, classes, 5+classes, probs);
        if (nms) do_nms(boxes, probs, num*num_boxes*num_boxes, (classes>0) ? classes : 1, iou_thresh);

        /* derive the label file path from the image path */
        char *labelpath = find_replace(path, "images", "labels");
        labelpath = find_replace(labelpath, "JPEGImages", "labels");
        labelpath = find_replace(labelpath, ".jpg", ".txt");
        labelpath = find_replace(labelpath, ".JPEG", ".txt");

        int num_labels = 0;
        box_label *truth = read_boxes(labelpath, &num_labels);

        /* count (and optionally dump) every proposal above thresh */
        for(k = 0; k < n_preds; ++k){
            if(probs[k][0] > thresh){
                ++proposals;
                if(save){
                    char buff[256];
                    snprintf(buff, sizeof buff, "/data/extracted/nms_preds/%d", proposals);
                    /* box is centre/size in relative units; convert to a pixel rectangle */
                    int dx = (boxes[k].x - boxes[k].w/2) * orig.w;
                    int dy = (boxes[k].y - boxes[k].h/2) * orig.h;
                    int w = boxes[k].w * orig.w;
                    int h = boxes[k].h * orig.h;
                    image cropped = crop_image(orig, dx, dy, w, h);
                    image sized = resize_image(cropped, 224, 224);
#ifdef OPENCV
                    save_image_jpg(sized, buff);
#endif
                    free_image(sized);
                    free_image(cropped);
                    snprintf(buff, sizeof buff, "/data/extracted/nms_pred_boxes/%d.txt", proposals);
                    char *im_id = basecfg(path);
                    FILE *fp = fopen(buff, "w");
                    if (fp) {
                        fprintf(fp, "%s %d %d %d %d\n", im_id, dx, dy, dx+w, dy+h);
                        fclose(fp);
                    } else {
                        /* keep evaluating; only this box file is lost */
                        fprintf(stderr, "Cannot open file '%s' for write\n", buff);
                    }
                    free(im_id);
                }
            }
        }

        /* match every truth box against the proposals */
        for (j = 0; j < num_labels; ++j) {
            ++total;
            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
            float best_iou = 0;
            for(k = 0; k < n_preds; ++k){
                float iou = box_iou(boxes[k], t);
                if(probs[k][0] > thresh && iou > best_iou){
                    best_iou = iou;
                }
            }
            avg_iou += best_iou;
            if(best_iou > iou_thresh){
                ++correct;
            }
        }
        free(truth);
        free_image(orig);
        free_image(resized);
        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
    }

    for(j = 0; j < n_preds; ++j) free(probs[j]);
    free(probs);
    free(boxes);
}
/*---------------------------------------------------------------
 Routine : calculate_probs
 Purpose : Calculate the probabilities for the tree, based upon
 the minimal cut sets, and write them as a report to a file.

 The report contains a header, the probability of every minimal
 cut set, the series of bounds on the top level event probability
 (one line per term evaluated) and a per-event contribution table.

 Returns TRUE when the full report was written.  Returns FALSE if
 the file cannot be opened, the tree has no primary events, no
 cut sets of order <= max_order exist, or the user interrupts the
 calculation.  Every FALSE return after the file was opened closes
 it again; no working memory is held on return.
---------------------------------------------------------------*/
BOOL
  calculate_probs(
    char *filename,       /* IN - filename to write report to */
    TREE *tree,           /* IN - tree                         */
    int   max_order,      /* IN - max order of cut sets to use */
    int   prob_n_terms,   /* IN - number of terms in expansion */
    float unit_time )     /* IN - unit time factor to be applied */
{
    BitArray *stop;       /* 1-bit zero, marks the end of the mcs list */
    FILE  *file;
    Expr   e;
    Group *g;
    float *probs, *cp, *imp;
    float  p;
    int    num_bas, num_mcs, i, j;
    time_t tp;
    float  one_increment; /* value for one increment of the progress bar */

    if ( (file = fopen(filename, "w")) == NULL) {
        printf("*** calculate_probs : error opening file\n");
        return FALSE;
    }

 /* include transfered-in trees and build the primary event list
  *
  * We need to do something different to deal with Common Cause Analysis
  * We don't need the tree, but we do need the primary event list.
  * Need to add the common cause events into the primary event list.
  */
 /* if necessary, expand tree */
    expand_tree(tree);

 /* set probs in BASLIST from the events database */
    set_bas_prob( unit_time );

 /* get number of primary events */
    if ((num_bas = tree->num_bas) == 0) {
        fclose( file );
        return FALSE;
    }

    if (GenerateNumericalProbabilityCheckForInterrupt()) {
        fclose( file );
        return FALSE;
    }

 /* how many mcs are actually used?
  * Counted before anything is allocated so that the "no cut sets"
  * exit has only the file to release. */
    num_mcs = ExprCountOrder(tree->mcs_expr, max_order);

 /* make sure that prob_n_terms does not exceed number of mcs */
 /* if number of mcs is zero then return FALSE */
    if (num_mcs == 0) {
        fclose( file );
        return FALSE;
    } else if (prob_n_terms > num_mcs) {
        prob_n_terms = num_mcs;
    }

 /* create array of probabilities of primary events */
    if ( !fNewMemory( (void *)&probs, ( num_bas * sizeof(float) ) ) ) 
    {
        printf("\n*** calculate_probs 1 : malloc failed ***\n");
        exit(1);
    }

    if ( !fNewMemory( (void *)&imp, ( num_bas * sizeof(float) ) ) ) 
    {
        printf("\n*** calculate_probs : malloc failed ***\n");
        exit(1);
    }

 /* fill array */
    get_probs( probs );

 /* get mcs list */
    e = tree->mcs_expr;

 /* created only now, so the early returns above have nothing to free;
  * from here on CleanUpOperations() receives it on every exit */
    stop = BitCreate(1);

 /* initialise Working dialog */
 /* most of the cpu time is taken up in the ExprProb() function */
 /* the working dialog is incremented in the combs() function */
    one_increment = 0.0;
    for (i = 1; i <= prob_n_terms; i++) {
        one_increment += nCr(num_mcs, i);
    }

 /* set up progress bar */
    one_increment /= 100.0;
    set_one_increment(one_increment);
    GenerateNumericalProbabilitySetProgressBarMax(100);

 /* print header */
    fprintf(file,
        "Probabilities Analysis\n"
        "======================\n\n");
    fprintf(file, "Tree   : %s\n", tree->name);
    time(&tp);
    fprintf(file, "Time   : %s\n", ctime(&tp));

    fprintf(file, "Number of primary events   = %d\n",   num_bas);
    fprintf(file, "Number of minimal cut sets = %d\n",   num_mcs);
    fprintf(file, "Order of minimal cut sets  = %d\n",   tree->max_order);
    if (max_order < tree->max_order) {
        fprintf(file, "               (order <= %d used)\n\n", max_order);
    } else {
        fprintf(file, "\n");
    }

    fprintf(file, "Unit time span         = %f\n\n",   unit_time);

 /* calculate cut set probabilities - use ALL the cut sets */
    cp = ExprCutsetProbs(e, probs);

    fprintf(file, "Minimal cut set probabilities :\n\n");

    i = 0;
    for (g = e; !BitEquals(g->b, stop); g = g->next) {
        char **fp = BitPara( g->b, 30 );

        if (GenerateNumericalProbabilityCheckForInterrupt()) {
            CleanUpOperations(
                file,
                probs,
                cp,
                imp,
                stop);
            ParaDestroy(fp);
            return FALSE;
        }

     /* first line carries the index and probability,
        continuation lines the rest of the cut set text */
        fprintf(file,
               "%3d   %-30s   %E\n",
               i+1,
               fp[0],
               cp[i]);

        for (j = 1; fp[j] != NULL; j++) {
            fprintf(file,
                   "      %-20s\n", fp[j]);
        }

        ParaDestroy(fp);
        i++;
    }

 /* calculate top level probability  - use only up to max_order cut sets */
    fprintf(file, "\n\n"
                  "Probability of top level event "
                  "(minimal cut sets up to order %d used):\n\n", max_order);

    p = 0;

    for (i = 1; i <= prob_n_terms && i <= num_mcs && !GenerateNumericalProbabilityCheckForInterrupt(); i++) {
        float term;
        char *sign, *s, *bound;

     /* odd terms are reported as upper bounds, even terms as lower bounds */
        p += (term = ExprProb(e, probs, max_order, i));
        sign       = ((i % 2) ? "+" :  "-" );
        s          = ((i > 1) ? "s" :  " " );
        bound      = ((i % 2) ? "upper" :  "lower" );

        fprintf(file, "%2d term%s   %s%E   = %E (%s bound)\n",
                i, s, sign, fabs(term), p, bound);
    }

    if (prob_n_terms >= num_mcs) {
        fprintf(file, "\nExact value : %E\n", p);
    }

    if (GenerateNumericalProbabilityCheckForInterrupt()) {
        CleanUpOperations(
            file,
            probs,
            cp,
            imp,
            stop);
        return FALSE;
    }

 /* calculate importances of individual events */

    for (j = 0; j < num_bas; j++) {
        imp[j] = 0;
    }

 /* each event accumulates the probability of every cut set it appears in */
    i = 0;
    for (g = e; !BitEquals(g->b, stop); g = g->next) {
        for (j = 0; j < g->b->n; j++) {
            if ( BitGet(g->b, (g->b->n-1) - j) ) {
               imp[j] += cp[i];
            }
        }
        i++;
    }

    if (GenerateNumericalProbabilityCheckForInterrupt()) {
        CleanUpOperations(
            file,
            probs,
            cp,
            imp,
            stop);
        return FALSE;
    }

    fprintf(file, "\n\nPrimary Event Analysis:\n\n");

    fprintf(file, " Event          "
                  "Failure contrib.    "
                  "Importance\n\n");

    for (i = 0; i < num_bas; i++) {
        char *fs = BasicString(num_bas, i);
        fprintf(file, "%-15s %E            %5.2f%%\n",
                fs, imp[i], 100 * imp[i] / p);
        strfree(fs);
    }

    CleanUpOperations(
        file,
        probs,
        cp,
        imp,
        stop);

    return ( TRUE );

} /* calculate_probs */
/*
 * probs_estimate
 *
 * Estimate how long the probability calculation will take for the given
 * tree by timing calc_sub_term() once per term count in
 * [min_term, max_term] and scaling each measurement by the number of
 * combinations nCr(num_mcs, i).
 *
 * max_term is clamped to the number of cut sets counted by
 * ExprCountOrder(). If there are no cut sets the estimate is 0.
 * The process exits with status 1 if working memory cannot be allocated.
 */
float                              /* RET - time (seconds)                   */
  probs_estimate(
    TREE *tree,         /* IN  - tree                             */
    int max_order,      /* IN  - number of cut sets               */
    int min_term,       /* IN  - min number of terms to evaluate  */
    int max_term)       /* IN  - max number of terms to evaluate  */
{
    int num_mcs;            /* number of cut sets used               */
    float t = 0;            /* accumulated estimate, in clock ticks  */
    int i, j;
    Group **index;          /* index to the groups                   */
    int *z;                 /* z[i] == i, passed to calc_sub_term()  */
    Group *p;               /* walks the cut set list                */
    clock_t time1, time2;
    float *probs;           /* basic event probabilities             */

    /* find out how many cut sets are actually used */
    num_mcs = ExprCountOrder(tree->mcs_expr, max_order);

    /* make sure that max_term does not exceed number of mcs */
    /* if number of mcs is zero then return 0 */
    if (num_mcs == 0) {
        return 0.0f;
    } else if (max_term > num_mcs) {
        max_term = num_mcs;
    }

    /* allocate memory required for testing */
    if ( !fNewMemory( (void *)&probs, ( tree->num_bas * sizeof(float) ) ) )
    {
        exit(1);
    }
    if ( !fNewMemory( (void *)&index, ( num_mcs * sizeof(Group *) ) ) )
    {
        exit(1);
    }
    if ( !fNewMemory( (void *)&z, ( num_mcs * sizeof(int) ) ) )
    {
        exit(1);
    }

    /* populate the arrays with default data */
    for (i = 0, p = tree->mcs_expr; i < num_mcs; i++, p = p->next) {
        index[i] = p;
        z[i] = i;
    }

    /* fill the probs array */
    get_probs( probs );

    /* set the static variables to sensible values */
    set_basic_n(tree->num_bas);
    set_prob_term(0.0);
    set_basic_prob(probs);

    /* The function that takes most of the time is calc_sub_term().
       Run this function for each number of terms required. Run it
       enough times for the CPU clock to change. */
    for (i = min_term; i <= max_term; i++) {
        time1 = clock();
        j = 0;
        do {
            calc_sub_term(z, i, index);
            j++;
            time2 = clock();
        } while (time1 == time2);

        /* (time2 - time1) / j is the measured cost of one call;
           scale it by the number of i-term combinations */
        t += nCr(num_mcs, i) * (time2 - time1) / j;
    }

    FreeMemory(index);
    FreeMemory(z);
    FreeMemory(probs);

    return t/CLOCKS_PER_SEC;

} /* probs_estimate */
Exemple #9
0
/*
 *  Report on the current topics: write the top words per topic to
 *  file, optionally compute PMI, print per-topic diagnostics via
 *  yap_message(), optionally write a detailed report file, write
 *  the topic correlation file, and print a burstiness summary.
 *
 *    stem       = stem used to build the ".pmi" filename handed
 *                 to report_pmi()
 *    resstem    = stem used to build the ".toplst", ".topset",
 *                 ".toppmi" and ".topcor" filenames
 *    topword    = maximum number of top words kept per topic
 *    scoretype  = selects the word scoring function used to rank
 *                 words within a topic
 *    pmicount   = if nonzero, PMI is computed (value is clipped
 *                 to topword)
 *    fullreport = if nonzero, the ".topset" report is written
 *
 *  Quits via yap_quit()/yap_sysquit() on allocation or file
 *  open failure.
 */
void hca_displaytopics(char *stem, char *resstem, int topword, 
                       enum ScoreType scoretype, int pmicount, int fullreport) {
  int w,k;
  uint32_t *indk = NULL;
  int Nk_tot = 0;
  double (*tscore)(int) = NULL;
  double sparsityword = 0;
  double sparsitydoc = 0;
  double underused = 0;
  uint32_t *top1cnt = NULL;
  FILE *fp;
  float *tpmi = NULL;
  char *topfile;
  char *repfile;
  uint32_t *psort;
  FILE *rp = NULL;
  float *gtvec = globalprop();
  float *gpvec = calloc(ddN.W,sizeof(gpvec[0]));
  float *pvec = calloc(ddN.W,sizeof(pvec[0]));
  
  /*
   *  both vectors are written below, so fail now like the
   *  other allocations in this function
   */
  if ( !gpvec || !pvec )
    yap_quit("Cannot allocate gpvec/pvec in hca_displaytopics()\n");

  if ( pmicount>topword )
    pmicount = topword;
  /*
   *  NOTE(review): tscore stays NULL if scoretype is none of
   *  the values below -- confirm the enum has no other members
   */
  if ( scoretype == ST_idf ) {
    tscore = idfscore;
  } else if ( scoretype == ST_phi ) {
    tscore = phiscore;
  } else if ( scoretype == ST_count ) {
    tscore = countscore;
  } else if ( scoretype == ST_cost ) {
    tscore = costscore;
  } else if ( scoretype == ST_Q ) {
    tscore = Qscore;
    lowerQ = 1.0/ddN.T;
  }    

  /*
   *  first collect counts of each word/term,
   *  and build gpvec (mean word probs)
   */
  build_NwK();
  {
    /*
     *  gpvec[] is normalised NwK[]
     *  (each entry offset by 0.1 before normalising)
     */
    double tot = 0;
    for (w=0; w<ddN.W; w++)
      tot += gpvec[w] = NwK[w]+0.1; 
    for (w=0; w<ddN.W; w++)
      gpvec[w] /= tot;
  }
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      Nk_tot += ddS.NWt[k];
    }
  } 
  
  /*
   *  psort[] gives the order in which topics are reported;
   *  presumably sorted by gtvec[] -- verify against sorttops()
   */
  psort = sorttops(gtvec, ddN.T);
  
  top1cnt = hca_top1cnt();
  if ( !top1cnt )
    yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n");


  if ( pmicount ) {
    /*  one entry per topic plus a final entry for the average */
    tpmi = malloc(sizeof(*tpmi)*(ddN.T+1));
    if ( !tpmi )
      yap_quit("Cannot allocate tpmi in hca_displaytopics()\n");
  }
  indk = malloc(sizeof(*indk)*ddN.W);
  if ( !indk )
    yap_quit("Cannot allocate indk in hca_displaytopics()\n");

  /*
   *   two passes through, 
   *           first to build the top words and dump to file
   */
  repfile = yap_makename(resstem,".topset");
  topfile = yap_makename(resstem,".toplst");
  fp = fopen(topfile,"w");
  if ( !fp ) 
    yap_sysquit("Cannot open file '%s' for write\n", topfile);
  yap_message("\n");
  for (k=0; k<ddN.T; k++) {
    int cnt;
    tscorek = k;
    /*
     *    build sorted word list
     */
    cnt = buildindk(k, indk);
    topk(topword, cnt, indk, tscore);
    if ( cnt==0 )
      continue;
    /*
     *   dump words to file
     */
    fprintf(fp,"%d: ", k);
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr)  ) {
    int cnt;
     /*
     *    dump root words
     */
    tscorek = -1;
    cnt = buildindk(-1, indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    fprintf(fp,"-1:");
    for (w=0; w<topword && w<cnt; w++) {
      fprintf(fp," %d", (int)indk[w]);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  if ( verbose>1 ) yap_message("\n");

  if ( pmicount ) {
    /*
     * compute PMI
     */
    char *toppmifile;
    char *pmifile;
    double *tp;
    tp = dvec(ddN.T);
    pmifile=yap_makename(stem,".pmi");
    toppmifile=yap_makename(resstem,".toppmi");
    get_probs(tp);
    report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, 
               pmicount, tp, tpmi);
    free(toppmifile);
    free(pmifile);
    free(tp);
  }

  /*
   *   now report words and diagnostics
   */
  //ttop_open(topfile);
  if ( fullreport ) {
    rp = fopen(repfile,"w");
    if ( !rp ) 
      yap_sysquit("Cannot open file '%s' for write\n", repfile);
    /*
     *  two header lines: the columns of "topic" records and
     *  the columns of "word" records
     */
    fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one "
	    "dist-unif dist-unigrm");
    if ( PCTL_BURSTY() ) 
      fprintf(rp, " burst-concent");
    if ( ddN.tokens )  
      fprintf(rp, " ave-length");
    fprintf(rp, " coher");
    if ( pmicount ) 
      fprintf(rp, " pmi");
    fprintf(rp, "\n#word topic index rank");
    if ( ddS.Nwt )
      fprintf(rp, " count");
    fprintf(rp, " prop cumm df coher\n");
    
  }
  /*
   *   second pass, topics visited in psort[] order
   */
  for (k=0; k<ddN.T; k++) {
    int cnt;
    int kk = psort[k];
    uint32_t **dfmtx;

    /*  skip topics with no words assigned when counts are in use */
    if ( ddP.phi==NULL && ddS.NWt[kk]==0 )
      continue;
    /*
     *   grab word prob vec for later use
     */
    if ( ddS.Nwt ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = wordprob(w,kk);
    } else if ( ddP.phi ) 
      fv_copy(pvec, ddP.phi[kk], ddN.W);
    else if ( ddS.phi ) 
      fv_copy(pvec, ddS.phi[kk], ddN.W);

    /*
     *  rebuild word list
     */
    tscorek = kk;
    cnt = buildindk(kk, indk);
    topk(topword, cnt, indk, tscore);
    if ( topword<cnt )
      cnt = topword;
    assert(cnt>0);
    /*
     *     df stats for topic returned as matrix
     */
    dfmtx = hca_dfmtx(indk, cnt, kk);

    /*
     *  underused: topic holds less than 1/(100*T) of the total
     *  count, or fewer than 5 in absolute terms
     */
    if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) 
      underused++;
    /*
     *  print stats for topic
     *    Mallet:  tokens, doc_ent, ave-word-len, coher., 
     *             uni-dist, corp-dist, eff-no-words
     */
    yap_message("Topic %d/%d", kk, k);
    {
      /*
       *   compute diagnostics
       */
      double prop = gtvec[kk];
      float *dprop = docprop(kk);
      double spw = 0;
      double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); 
      double ew = exp(fv_entropy(pvec,ddN.W));
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W);
      double co = coherence(dfmtx, cnt);
      double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT;
      double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0;
      sparsitydoc += spd;
      yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop);   
      if ( ddS.Nwt ) {
	spw = ((double)nonzero_Nwt(kk))/((double)ddN.W);
	sparsityword += spw;
	yap_message(" ws=%.1lf%%", 100*(1-spw));
      } 
      yap_message(" ds=%.1lf%%", 100*(1-spd) );
      yap_message(" ew=%.0lf", ew); 
      yap_message(" ed=%.1lf", ed); 
      yap_message(" da=%.0lf", da+0.1); 
      yap_message(" t1=%u", top1cnt[kk]); 
      yap_message(" ud=%.3lf", ud); 
      yap_message(" pd=%.3lf", pd); 
      if ( PCTL_BURSTY() ) 
	yap_message(" bd=%.3lf", ddP.bdk[kk]); 
      if ( ddN.tokens )  
	yap_message(" sl=%.2lf", sl); 
      yap_message(" co=%.3lf%%", co);
      if ( pmicount ) 
	yap_message(" pmi=%.3f", tpmi[kk]);
      if ( fullreport ) {
	/*  same values as above, as one "topic" record */
	fprintf(rp,"topic %d %d", kk, k);
	fprintf(rp," %.6lf", prop);   
	if ( ddS.Nwt ) {
	  fprintf(rp," %.6lf", (1-spw));
	} else {
	  fprintf(rp," 0");
	}
	fprintf(rp," %.6lf", (1-spd) );
	fprintf(rp," %.2lf", ew); 
	fprintf(rp," %.2lf", ed); 
	fprintf(rp," %.0lf", da+0.1); 
	fprintf(rp," %u", top1cnt[kk]); 
	fprintf(rp," %.6lf", ud); 
	fprintf(rp," %.6lf", pd); 
	if ( PCTL_BURSTY() ) 
	  fprintf(rp," %.3lf", ddP.bdk[kk]); 
	fprintf(rp," %.4lf", (ddN.tokens)?sl:0); 
	fprintf(rp," %.6lf", co);
	if ( pmicount ) 
	  fprintf(rp," %.4f", tpmi[kk]);
	fprintf(rp,"\n");
      }
      if ( dprop) free(dprop);
    }
    if ( verbose>1 ) {
      double pcumm = 0;
      /*
       *   print top words:
       *     Mallet:   rank, count, prob, cumm, docs, coh
       */
      yap_message("\ntopic %d/%d", kk, k);
      yap_message(" words=");
      for (w=0; w<cnt; w++) {
	if ( w>0 ) yap_message(",");
	if ( ddN.tokens ) 
	  yap_message("%s", ddN.tokens[indk[w]]);
	else
	  yap_message("%d", indk[w]);
	if ( verbose>2 )
	  yap_message("(%6lf)", tscore(indk[w]));
	if ( fullreport ) {
	  fprintf(rp, "word %d %d %d", kk, indk[w], w);
	  if ( ddS.Nwt )
	    fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]);
	  pcumm += pvec[indk[w]];
	  fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	  fprintf(rp, " %d", dfmtx[w][w]); 
	  fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w));
	  if ( ddN.tokens ) 
	    fprintf(rp, " %s", ddN.tokens[indk[w]]);
	  fprintf(rp, "\n");
	}
      }
    }
    yap_message("\n");
    free(dfmtx[0]); free(dfmtx); 
  }
  if ( verbose>1 && ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) {
    int cnt;
    double pcumm = 0;
     /*
     *    print root words
     */
    tscorek = -1;
    cnt = buildindk(-1,indk);
    topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore);
    /*
     *     cannot build df mtx for root because
     *     it is latent w.r.t. topics
     */
    yap_message("Topic root words=");
    if ( fullreport ) {
      int w;
      for (w=0; w<ddN.W; w++)
	pvec[w] = betabasewordprob(w);
      double ew = exp(fv_entropy(pvec,ddN.W));
      double ud = fv_helldistunif(pvec,ddN.W);
      double pd = fv_helldist(pvec,gpvec,ddN.W);
      fprintf(rp,"topic -1 -1 0 0");
      fprintf(rp," %.4lf", ew); 
      fprintf(rp," %.6lf", ud); 
      fprintf(rp," %.6lf", pd); 
      fprintf(rp,"\n");
    }
    for (w=0; w<topword && w<cnt; w++) {
      if ( w>0 ) yap_message(",");
      if ( ddN.tokens )
	yap_message("%s", ddN.tokens[indk[w]]);
      else
	yap_message("%d", indk[w]);
      if ( verbose>2 )
	yap_message("(%6lf)", countscore(indk[w]));
      if ( fullreport ) {
	fprintf(rp, "word %d %d %d", -1, indk[w], w);
	/*
	 *  w is the rank here; the word id is indk[w], as for
	 *  every other per-word lookup in this loop
	 */
	if ( ddS.TwT )
	  fprintf(rp, " %d", ddS.TwT[indk[w]]);
	pcumm += pvec[indk[w]];
	fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm);
	fprintf(rp, " 0 0"); 
	if ( ddN.tokens ) 
	  fprintf(rp, " %s", ddN.tokens[indk[w]]);
	fprintf(rp, "\n");
      }   
    }
    yap_message("\n");
  }
  yap_message("\n");
  if ( rp )
    fclose(rp);
	     
  if ( ddS.Nwt )
    yap_message("Average topicXword sparsity = %.2lf%%\n",
                100*(1-sparsityword/ddN.T) );
  yap_message("Average docXtopic sparsity = %.2lf%%\n"
	      "Underused topics = %.1lf%%\n",
	      100*(1-sparsitydoc/ddN.T), 
	      100.0*underused/(double)ddN.T);
  if ( pmicount ) 
    yap_message("Average PMI = %.3f\n", tpmi[ddN.T]);

  /*
   *   write topic correlations to file and report the largest
   */
  if ( 1 ) {
    float **cmtx = hca_topmtx();
    int t1, t2;
    int m1, m2;
    float mval;
    char *corfile = yap_makename(resstem,".topcor");
    fp = fopen(corfile,"w");
    if ( !fp ) 
      yap_sysquit("Cannot open file '%s' for write\n", corfile);
   /*
    *   print file, lower triangle only, small values dropped
     */
    for (t1=0; t1<ddN.T; t1++) {
      for (t2=0; t2<t1; t2++) 
	 if ( cmtx[t1][t2]>1.0e-3 ) 
	  fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]);
    }
    fclose(fp);
    free(corfile);
    /*
     *   display maximum;
     *   needs at least two topics since the search is
     *   seeded from cmtx[1][0]
     */
    if ( ddN.T>1 ) {
      m1 = 1; m2 = 0;
      mval = cmtx[1][0];
      for (t1=0; t1<ddN.T; t1++) {
	for (t2=0; t2<t1; t2++) {
	  if ( mval<cmtx[t1][t2] ) {
	    mval = cmtx[t1][t2];
	    m1 = t1;
	    m2 = t2;
	  }
	}
      }
      yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval);
    }
    free(cmtx[0]); free(cmtx);
  }

  /*
   *  print burstiness report
   */
  if ( PCTL_BURSTY() ) {
    int tottbl = 0;
    int totmlttbl = 0;
    int totmlt = 0;
    int i;
    for (i=0; i<ddN.NT; i++) {
      if ( Z_issetr(ddS.z[i]) ) {
	if ( M_multi(i) )
	  totmlttbl++;
	tottbl++;
      }
      if ( M_multi(i) )
	totmlt++;
    }
    yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n",
		100.0*((double)ddM.dim_multiind)/ddN.N,
		100.0*((double)tottbl)/ddN.NT,
		100.0*((double)totmlttbl)/totmlt);
  }
  yap_message("\n");

  /*
   *  clean up, including the global NwK[] built by build_NwK()
   */
  free(topfile);
  if ( repfile ) free(repfile);
  if ( top1cnt ) free(top1cnt);
  free(indk);
  free(psort);
  if ( pmicount )
    free(tpmi);
  if ( NwK ) {
    free(NwK);
    NwK = NULL;
  }
  free(pvec); 
  free(gtvec);
  free(gpvec);
}
Exemple #10
0
/*
 *    run regular gibbs cycles on the data with phi used;
 *    then evaluate the queries on each doc, sampling word probs,
 *    and write the best K docs per query to file
 *
 *    K          = number of top results to retain per query
 *    qname      = name of the file the results are written to
 *    dots       = if >0, print a "." every dots documents
 *    this_qpart, qparts
 *               = if qparts>0, split collection into qparts parts 
 *                 and only search part this_qpart
 *
 *    quits via yap_quit()/yap_sysquit() on allocation failure
 *    or if the results file cannot be opened
 */
void gibbs_query(int K, char *qname, int dots, int this_qpart, int qparts) {
  /*
   *    mapping from query word posn. to its mi in current doc
   *       >ddN.N  = not in current doc
   *       -ve  = has no mi since occurs just once, found at
   *              posn  (-map[]-1)
   *       non -ve = mi value
   */
  int     *mimap = NULL;
  /*
   *     usual stuff for Gibbs loop over docs
   */
  int i, j;
  float *fact = fvec(ddN.T*4);
  D_MiSi_t dD;
  /*
   *   an index into topk[] which maintains ordering
   */
  int     *topind;

  /*
   *    these store statistics of the results, for printing
   *    these are unordered, ordered by topind[]
   */
  /*      document score  */
  float   *topscore;
  /*      document number   */
  int     *topk;
  /*      flags if ord is irrelevant, thus not scored  */
  char    *wordunused;

  /*
   *    per word stats for top results saved
   */
  int     *found;
  float   *topcnt;
  float   *topwordscore;
  /*
   *    temporary versions for when gibbs running
   */
  int     *found_buf;
  float   *topcnt_buf;
  float   *topwordscore_buf;
  double  *logprob;
  /*
   *   search here
   */
  int startdoc = 0;
  int enddoc = ddN.DT;

  /*
   *    setup;
   *    per word arrays hold K rows of ddP.n_words entries,
   *    the *_buf versions hold a single row
   */
  topcnt = malloc(sizeof(topcnt[0])*K*ddP.n_words);
  topwordscore = malloc(sizeof(topwordscore[0])*K*ddP.n_words);
  found = malloc(sizeof(found[0])*ddP.n_words*K);
  wordunused = malloc(sizeof(wordunused[0])*ddP.n_words);

  topcnt_buf = malloc(sizeof(topcnt_buf[0])*ddP.n_words);
  topwordscore_buf = malloc(sizeof(topwordscore_buf[0])*ddP.n_words);
  found_buf = malloc(sizeof(found_buf[0])*ddP.n_words);
  if ( !topcnt || !topwordscore || !found || !wordunused ||
       !topcnt_buf || !topwordscore_buf || !found_buf )
    yap_quit("Cannot allocate memory in gibbs_query()\n");

  /*
   *    per query arrays hold ddP.n_query blocks of K entries
   */
  logprob = malloc(sizeof(logprob[0])*ddP.n_query);
  topscore = malloc(sizeof(topscore[0])*K*ddP.n_query);
  topind = malloc(sizeof(topind[0])*K*ddP.n_query);
  topk = malloc(sizeof(topk[0])*K*ddP.n_query);
  if ( ddP.bdk!=NULL ) 
    mimap = malloc(sizeof(mimap[0])*ddP.n_words);
  if ( !topk || !topscore || !logprob || !topind ||
       (ddP.bdk!=NULL && !mimap) )
    yap_quit("Cannot allocate memory in gibbs_query()\n");
  for (i=0; i<ddP.n_words; i++) {
    wordunused[i] = 0;
  }
  /*
   *    empty slots are marked by topk[]==-1 
   */
  for (i=0; i<K*ddP.n_query; i++) {
    topind[i] = i%K;
    topk[i] = -1;
    topscore[i] = INFINITY;
  }
  
  /*
   *  check words to exclude using topics
   */
  if ( ddP.n_excludetopic>0 ) {
    double *tprob = malloc(sizeof(tprob[0])*ddN.T);
    get_probs(tprob);
    yap_probs();
    if ( verbose>1 )
      yap_message("Excluding words: ");
    for (i=0; i<ddP.n_words; i++) {
      int t = besttopic(ddP.qword[i],tprob);
      if ( Q_excludetopic(t) ) {
	wordunused[i] = 1;
	if ( verbose>1 )
	  yap_message(" %d/%d", (int)ddP.qword[i], t);
      }
    } 
    if ( verbose>1 )
      yap_message("\n");
    free(tprob);
  }
  
  if ( ddP.bdk!=NULL ) misi_init(&ddM,&dD);

  if ( qparts>0 ) {
    startdoc = ((double)this_qpart)/qparts * ddN.DT;
    enddoc = ((double)this_qpart+1.0)/qparts * ddN.DT;
  }
  for(i=startdoc; i<enddoc; i++) {
    int  thisw =  add_doc(i, GibbsNone);
    int  r;
    if ( thisw<=1 ) {
      remove_doc(i, GibbsNone);
      continue;
    }
    if ( ddP.bdk!=NULL ) 
      misi_build(&dD, i, 0);
    map_query(i, mimap, found_buf);
    for (j=0; j<ddP.n_words; j++) {
      topcnt_buf[j] = 0;
      topwordscore_buf[j] = 0;
    }
    
    /*
     *  accumulate stats over ddP.queryiter Gibbs cycles
     */
    for (r=0; r<ddP.queryiter; r++) {
      gibbs_lda(GibbsNone, ddN.T, i, ddD.NdT[i], fact, &dD, 0, 0);
      query_docprob(i, mimap, fact, &dD, topcnt_buf, topwordscore_buf);
    }  
    /*
     *  now adjust stats
     */
    for (j=0; j<ddP.n_query; j++) 
      logprob[j] = 0;
    for (j=0; j<ddP.n_words; j++) {
      /*  wordunused[] only ever holds 0 or 1  */
      if ( wordunused[j]>0 )
	continue;
      if ( ddP.query[ddP.qword[j]]==j ) {
	/*  turn sums over cycles into averages  */
	topcnt_buf[j] /= ddP.queryiter;
	topwordscore_buf[j] /= ddP.queryiter;
      } else {
	/*  word in previous query so copy  */
	int jj =  ddP.query[ddP.qword[j]];
	topcnt_buf[j] = topcnt_buf[jj];
	topwordscore_buf[j] = topwordscore_buf[jj];
	found_buf[j] = found_buf[jj];
      }
      logprob[ddP.qid[j]] += topwordscore_buf[j];
    }
    if ( dots>0 && i>0 && (i%dots==0) ) 
      yap_message(".");
    if ( ddP.bdk!=NULL ) misi_unbuild(&dD,i,0);
    remove_doc(i, GibbsNone);
    /*
     *   enter into the arrays
     */
    for (j=0; j<ddP.n_query; j++) {
      if ( i<K || logprob[j] < topscore[j*K+topind[j*K+K-1]] ) {
	int newind, l;
	/*
	 *   better than current lowest 
	 */
	newind = bubble((i<K)?(i+1):K, 
			&topind[j*K], &topscore[j*K], logprob[j]);
	/*
	 *   save the current details
	 */
	topscore[j*K+newind] = logprob[j];
	topk[j*K+newind] = i;
	for (l=ddP.qposn[j]; l<ddP.qposn[j+1]; l++) {
	  topcnt[newind*ddP.n_words+l] = topcnt_buf[l]; 
	  topwordscore[newind*ddP.n_words+l] = topwordscore_buf[l]; 
	  found[newind*ddP.n_words+l] = found_buf[l]; 
	}
      }
    }
  }
  if ( dots>0 ) yap_message("\n");
  
  /*
   *  write result
   */
  {
    float *ws = fvec(ddP.n_words);
    FILE *fp = fopen(qname,"w");
    int q;
    if ( !fp )
      yap_sysquit("Cannot write query results to '%s'\n", qname);
    for (q=0; q<ddP.n_query; q++) {
      int nw = ddP.qposn[q+1]-ddP.qposn[q];
      /*
       *  stop at the first empty slot of *this* query's block,
       *  so topk[] needs the q*K offset like topscore[] does
       */
      for (i=0; i<K && i<ddN.DT && topk[q*K+topind[q*K+i]]>=0; i++) {
	int l, ind = topind[q*K+i];
	double tfidf;
	/*  check before ind is used as an index  */
	assert(ind>=0 && ind<K);
        tfidf = bm25(topk[q*K+ind],&found[ind*ddP.n_words+ddP.qposn[q]],
			    &ddP.qword[ddP.qposn[q]], nw, ws);
	fprintf(fp, "%d %d ", q, topk[q*K+ind]);
	fprintf(fp, "%.4f %.4lf ", topscore[q*K+ind]/nw, tfidf);
        if ( verbose>1 ) {
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%d ", found[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topcnt[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topwordscore[ind*ddP.n_words+l]);
          for (l=0; l<nw; l++)
            fprintf(fp, "%lf ", ws[l]);
        }
        fprintf(fp, "\n");
      }
    }
    fclose(fp);
    free(ws);
  }

  /*
   *  clean up
   */
  free(fact);
  if ( ddP.bdk!=NULL ) misi_free(&dD);
  if ( mimap ) free(mimap);
  free(wordunused);
  free(found);
  free(topwordscore);
  free(topcnt);
  free(found_buf);
  free(topwordscore_buf);
  free(topcnt_buf);
  free(topscore);
  free(topind);
  free(topk);
  free(logprob);
}