/*
 *  Look up a "par = value" entry in a parameter file.
 *
 *  The file name is built by yap_makename(stem,ext).  The file is read
 *  line by line; the first line that begins with "par" immediately
 *  followed by ' ' or '=' is selected, and the text after the first '='
 *  on that line is copied into buf, truncated to at most len-1
 *  characters and always NUL terminated.  Nothing is stripped from the
 *  value, so leading blanks and the newline kept by fgets() are
 *  returned as they appear in the file.
 *
 *     stem, ext - pieces of the file name
 *     par       - parameter name to search for
 *     buf       - caller supplied output buffer of at least len bytes
 *     len       - size of buf
 *
 *  Returns buf when a non-empty value was stored, otherwise NULL
 *  (parameter missing, empty value, or len<1).
 *
 *  A missing file, a read error, a failed allocation or a matching line
 *  without '=' is reported through yap_sysquit()/yap_quit(); those are
 *  presumably fatal and do not return -- TODO confirm.
 */
static char *readextpar(char *stem, char *ext, char *par, char *buf, int len) {
  char *file;
  FILE *fp;
  int parlen;
  int buflen;
  char *linebuf;

  /*  no room for even the terminator, so nothing can be returned  */
  if ( len<1 )
    return NULL;

  file = yap_makename(stem,ext);
  fp = fopen(file,"r");
  parlen = strlen(par);
  /*  room for the name, the value and some separator slack  */
  buflen = len+parlen+50;
  linebuf = malloc(buflen+2);
  if ( !fp )
    yap_sysquit("Cannot open parameter file '%s' ", file);
  if ( !linebuf )
    yap_quit("Out of memory in readpar(%s)\n", par);
  buf[0] = 0;
  linebuf[0] = 0;
  while ( fgets(linebuf,buflen,fp) ) {
    char *ret;
    /*  the line must start with the parameter name ...  */
    if ( strncmp(linebuf,par,parlen)!=0 )
      continue;
    /*  ... as a whole word, not as a prefix of a longer name  */
    if ( linebuf[parlen]!=' ' && linebuf[parlen]!='=' )
      continue;
    ret = strchr(linebuf,'=');
    if ( !ret ) {
      yap_quit("Badly formatted parameter file '%s': %s\n",
               file, linebuf);
    }
    ret++;
    /*
     *  cut the value in place first so that strncpy() always finds
     *  a terminator within len bytes and buf ends up NUL terminated
     */
    if ( (int)strlen(ret)>=len )
      ret[len-1] = 0;
    strncpy(buf, ret, len);
    break;
  }
  if ( ferror(fp) )
    yap_sysquit("Error on parameter file '%s' ", file);
  fclose(fp);
  free(file);
  free(linebuf);
  if ( buf[0] )
    return &buf[0];
  else
    return NULL;
}
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *termindk = NULL; uint32_t *indk = NULL; int Nk_tot = 0; double (*termtscore)(int) = NULL; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); //#define XTRA // prints model topic probs after observed #ifdef XTRA double *gtavec = calloc(ddN.T,sizeof(gtavec[0])); #endif float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); #ifdef KL float *dfvec = calloc(ddN.W,sizeof(dfvec[0])); #endif double *ngalpha = NULL; T_stats_t *termstats; #ifdef XTRA get_probs(gtavec); #endif if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phirat ) { tscore = phiratioscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } if ( ddS.TwT==NULL && ddP.phi==NULL && scoretype == ST_phirat ) yap_quit("Cannot use '-orat' option with this model/settings.\n"); if ( ddP.PYalpha==H_NG ) { /* * provide an estimate of alpha */ ngalpha = dvec(ddN.T); get_probs(ngalpha); for (k=0; k<ddN.T; k++) { ddP.alphapr[k] = ngalpha[k]; } } /* * returns null if no relevant data file */ termstats = tstats_init(ddS.z, ddD.NdTcum, ddN.T, ddN.DT, stem); if ( termstats ) { if ( scoretype == ST_idf ) { termtscore = termidfscore; } else termtscore = termcountscore; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); if ( termstats ) build_termNwK(termstats); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += 
gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); if ( termstats ) { termindk = malloc(sizeof(*indk)*termstats->K); if ( !termindk ) yap_quit("Cannot allocate termindk in hca_displaytopics()\n"); } data_df(stem); #ifdef KL for (w=0; w<ddN.W; w++) dfvec[w] = ddD.df[w]; #endif /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; if ( termstats ) { termcnt = buildtermindk(k, termindk, termstats); topk(topword, termcnt, termindk, termtscore); } /* * dump words to file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } if ( termstats ) { for (w=0; w<topword && w<termcnt; w++) { fprintf(fp," %d", (int)termstats->Kmin+termindk[w]); } } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp 
= dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); if ( termstats ) { termcnt = buildtermindk(kk, termindk, termstats); topk(topword, termcnt, termindk, termtscore); if ( topword<termcnt ) termcnt = topword; } /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); 
#else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; #define MALLET_EW #ifdef MALLET_EW double ewp = dprop?(1.0/fv_expprob(pvec,ddN.W)):ddN.W; #endif double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); #ifdef XTRA yap_message((ddN.T>200)?"/%.3lf%%":"/%.2lf%%",100*gtavec[kk]); #endif if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); #ifdef KL yap_message(" ew=%lf", ew); #else yap_message(" ew=%.0lf", ew); #endif #ifdef MALLET_EW yap_message(" ewp=%.1lf", ewp); #endif yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddP.NGbeta ) { /* * approx. 
as sqrt(var(lambda_k)/lambda-normaliser */ double ngvar = sqrt(ddP.NGalpha[kk]) * (ngalpha[kk]/ddP.NGalpha[kk]); yap_message(" ng=%.4lf,%.4lf", ngalpha[kk], ngvar/ngalpha[kk]); if ( ddS.sparse ) yap_message(",%.4f", 1-((float)ddS.sparseD[kk])/ddN.DTused); if ( verbose>2 ) yap_message(" ngl=%.4lf,%.4lf, nga=%.4lf,%.4lf", ddP.NGalpha[kk]/ddP.NGbeta[kk], sqrt(ddP.NGalpha[kk]/ddP.NGbeta[kk]/ddP.NGbeta[kk]), ddP.NGalpha[kk], ddP.NGbeta[kk]); } if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); #ifdef KL yap_message(" %lf", ew); #else fprintf(rp," %.2lf", ew); #endif #ifdef MALLET_EW fprintf(rp," %.2lf", ewp); #endif fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(tscore(indk[w])+0.2)); else yap_message("(%6lf)", tscore(indk[w])); } if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( 
ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } if ( termstats ) { yap_message(" terms="); for (w=0; w<termcnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", termstats->tokens[termindk[w]]); else yap_message("%d", termstats->Kmin+termindk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(termtscore(termindk[w])+0.2)); else yap_message("(%6lf)", termtscore(termindk[w])); } if ( fullreport ) { fprintf(rp, "term %d %d %d", kk, termindk[w], w); fprintf(rp, " %d", termstats->Nkt[termindk[w]][kk]); fprintf(rp, " %s", termstats->tokens[termindk[w]]); fprintf(rp, "\n"); } } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); /* this case gives bad results */ // if ( scoretype == ST_phirat ) topk(topword, cnt, indk, phiratioscore); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. 
topics */ yap_message("Topic root words="); if ( fullreport ) { int w; if ( ddP.phi && ddP.PYbeta!=H_PDP ) { for (w=0; w<ddN.W; w++) pvec[w] = ddS.phi[ddN.T][w]; } else { for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); } #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); #else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 && !ddP.phi ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\nTopical words="); topk(topword, cnt, indk, phiinvratioscore); for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddS.sparse && ddP.PYalpha==H_NG ) { double avesp = 0; // correct_docsp(); for (k=0; k<ddN.T; k++) { avesp += gtvec[k]; } // check gtvec[] sums to 1 assert(fabs(avesp-1.0)<0.00001); avesp = 0; for (k=0; k<ddN.T; k++) { avesp += gtvec[k]*((float)ddS.sparseD[k])/ddN.DTused; assert(ddS.sparseD[k]<=ddN.DTused); } assert(avesp<=1.0); assert(avesp>0.0); yap_message("IBP sparsity = %.2lf%%\n", 
100*(1-avesp)); } if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-7 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( ngalpha ) free(ngalpha); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } #ifdef KL free(dfvec); #endif free(pvec); free(gtvec); free(gpvec); tstats_free(termstats); }
void hca_displaytopics(char *resstem, int topword, enum ScoreType scoretype) { int w,k; int *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; char *fname = yap_makename(resstem,".top"); int nophi = (ddP.phi==NULL) && (ddS.phi==NULL); FILE *fp; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } fp = fopen(fname,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", fname); /* * first collect counts of each word/term */ if ( scoretype != ST_count && scoretype != ST_phi ) { NwK = u32vec(ddN.W); if ( !NwK ) yap_quit("Out of memory in hca_displaytopics()\n"); for (w=0; w<ddN.W; w++) { NwK[w] = 0; } NWK = 0; for (w=0; w<ddN.W; w++) { for (k=0; k<ddN.T; k++) { NwK[w] += ddS.Nwt[w][k]; // should use CCT_ReadN() } NWK += NwK[w]; } } assert(ddN.tokens); for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk\n"); for (k=0; k<ddN.T; k++) { int cnt; double spw; double spd; tscorek = k; /* * print top words */ cnt=0; if ( ddP.phi==NULL ) { for (w=0; w<ddN.W; w++) { if ( ddS.Nwt[w][k]>0 ) indk[cnt++] = w; } } else { float **phi; if ( ddP.phi ) phi = ddP.phi; else phi = ddS.phi; for (w=0; w<ddN.W; w++) { if ( phi[k][w]>0.5/ddN.W ) indk[cnt++] = w; } } topk(topword, cnt, indk, tscore); spd = ((double)nonzero_Ndt(k))/((double)ddN.DT); sparsitydoc += spd; if ( nophi ) { spw = ((double)nonzero_Nwt(k))/((double)ddN.W); sparsityword += spw; } if ( ddS.NWt[k]*ddN.T*100<Nk_tot ) underused++; yap_message("\nTopic %d (", k); if ( ddP.phi==NULL ) yap_message((ddN.T>200)?"p=%.3lf%%,":"p=%.2lf%%,", 100*((double)ddS.NWt[k])/(double)Nk_tot); if ( nophi ) yap_message("ws=%.1lf%%,", 100*(1-spw)); 
else yap_message("#=%.0lf,", exp(phi_entropy(k))); yap_message("ds=%.1lf%%", 100*(1-spd) ); fprintf(fp,"%d: ", k); yap_message(") words ="); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( ddP.PYbeta && nophi ) { int cnt; /* * print root words */ tscorek = -1; cnt=0; for (w=0; w<ddN.W; w++) { if ( ddS.TwT[w]>0 ) indk[cnt++] = w; } topk(topword, cnt, indk, tscore); yap_message("\nTopic root words ="); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( nophi ) yap_message("Average topicXword sparsity = %.2lf%%, ", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%, " "underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddP.bdk!=NULL) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("doc PYP report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } fclose(fp); free(fname); free(indk); if ( scoretype != ST_count ) { free(NwK); NwK = NULL; } }
/*========================================== * main *========================================== */ int main(int argc, char* argv[]) { int c, iter, ITER=0, seed=0; enum dataType data = LdaC; enum dataType testdata = LdaC; int dots = 0; enum GibbsType fix_hold = GibbsNone; char *stem; char *resstem; int topwords = 20; int noerrorlog = 0; int displayed = 0; int load_vocab = 0; int checkpoint = 0; int restart = 0; int dopmi = 0; int restart_hca = 0; int load_phi = 0; int load_mu = 0; int procs = 1; int maxW = 0; enum ScoreType score=ST_idf; double BM0val=0, BM1val =0, BP0val=0, BP1val=0; clock_t t1=0, t2=0, t3=0; double tot_time = 0; double psample_time = 0; enum ParType par; /* * default values */ ddN.T = 10; ITER = 100; ddN.TEST = 0; pctl_init(); while ( (c=getopt(argc, argv,"b:c:C:d:ef:F:g:G:h:K:l:L:N:o:pq:vr:s:S:t:T:vVW:"))>=0 ) { switch ( c ) { case 'b': if ( !optarg || sscanf(optarg,"%d",&ddP.back)!=1 ) yap_quit("Need a valid 'b' argument\n"); break; case 'c': if ( !optarg || sscanf(optarg,"%d",&checkpoint)!=1 ) yap_quit("Need a valid 'c' argument\n"); break; case 'C': if ( !optarg || sscanf(optarg,"%d",&ITER)!=1 ) yap_quit("Need a valid 'C' argument\n"); break; case 'd': if ( !optarg || sscanf(optarg,"%d",&dots)!=1 ) yap_quit("Need a valid 'd' argument\n"); break; case 'e': noerrorlog++; break; case 'f': if ( strcmp(optarg,"witdit")==0 ) data = WitDit; else if ( strcmp(optarg,"docword")==0 ) data = Docword; else if ( strcmp(optarg,"ldac")==0 ) data = LdaC; else if ( strcmp(optarg,"bag")==0 ) data = TxtBag; else if ( strcmp(optarg,"lst")==0 ) data = SeqTxtBag; else yap_quit("Illegal data type for -f\n"); break; case 'F': if ( strcmp(optarg,"all")==0 ) { for (par=ParAM; par<=ParBB; par++) ddT[par].fix = 1; } else { par = findpar(optarg); if ( par==ParNone ) yap_quit("Illegal arg for -F\n"); ddT[par].fix = 1; } break; case 'g': { char var[100]; int st=0; if ( !optarg || sscanf(optarg,"%[^, ],%d", &var[0], &st)<1 ) yap_quit("Need a valid 'g' argument\n"); par = 
findpar(var); if ( par==ParBP1 ) ddP.kbatch = st; else yap_quit("Illegal var for -g\n"); } break; case 'G': { char var[100]; int st=0, cy=0; if ( !optarg || sscanf(optarg,"%[^, ],%d,%d", &var[0], &cy, &st)<2 || st<0 || cy<0 ) yap_quit("Need a valid 'G' argument\n"); par = findpar(var); if ( par==ParNone || par==ParB0P || par==ParB0M ) yap_quit("Illegal var for -G\n"); ddT[par].fix = 0; ddT[par].start = st; ddT[par].cycles = cy; } break; case 'h': { fix_hold = GibbsHold; if ( !optarg ) yap_quit("Need a valid 'h' argument\n"); if ( strncmp(optarg,"dict,",5)==0 ) { if ( sscanf(&optarg[5],"%d",&ddP.hold_dict)<1 || ddP.hold_dict<2 ) yap_quit("Need a valid 'hdict' argument\n"); } else if ( strncmp(optarg,"fract,",6)==0 ) { if ( sscanf(&optarg[6],"%lf",&ddP.hold_fraction)<1 || ddP.hold_fraction<=0 || ddP.hold_fraction>=1 ) yap_quit("Need a valid 'hfract' argument\n"); } else if ( strncmp(optarg,"doc,",4)==0 ) { if ( sscanf(&optarg[4],"%d",&ddP.hold_every)<1 || ddP.hold_every<2 ) yap_quit("Need a valid 'hdoc' argument\n"); } else yap_quit("Need a valid 'h' argument\n"); } break; case 'K': if ( !optarg || sscanf(optarg,"%d",&ddN.T)!=1 ) yap_quit("Need a valid 'K' argument\n"); break; case 'l': if ( !optarg ) yap_quit("Need a valid 'l ' argument\n"); if ( strncmp(optarg,"phi,",4)==0 ) { if ( sscanf(&optarg[4],"%d,%d",&ddP.phiiter, &ddP.phiburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"theta,",6)==0 ) { if ( sscanf(&optarg[6],"%d,%d",&ddP.thetaiter, &ddP.thetaburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"mu,",3)==0 ) { if ( sscanf(&optarg[3],"%d,%d",&ddP.muiter, &ddP.muburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"prog,",5)==0 ) { if ( sscanf(&optarg[5],"%d,%d",&ddP.progiter, &ddP.progburn)<2 ) yap_quit("Need a valid 'l prog,' argument\n"); } else yap_quit("Need a valid DIAG code in 'l' argument\n"); break; case 'L': if ( !optarg ) yap_quit("Need a valid 'L ' 
argument\n"); if ( strncmp(optarg,"like,",5)==0 ) { if ( sscanf(&optarg[5],"%d,%d",&ddP.mltiter, &ddP.mltburn)<1 ) yap_quit("Need a valid 'L like' argument\n"); } else yap_quit("Need a valid DIAG code in 'L' argument\n"); break; case 'N': if ( !optarg || sscanf(optarg,"%d,%d", &ddP.maxN, &ddP.maxM)<1 ) yap_quit("Need a valid 'N' argument\n"); break; case 'o': { char *ptr = strchr(optarg, ','); int len = strlen(optarg); if ( ptr ) len = ptr - optarg; if ( strncmp(optarg,"idf",len)==0 ) score = ST_idf; else if ( strncmp(optarg,"count",len)==0 ) score = ST_count; else if ( strncmp(optarg,"Q",len)==0 ) score = ST_Q; else if ( strncmp(optarg,"cost",len)==0 ) score = ST_cost; else yap_quit("Need a valid parameter for 'o' argument\n"); if ( ptr ) { /* there was a second arg */ if ( sscanf(ptr+1, "%d", &topwords) != 1) yap_quit("Need a valid second 'o' argument\n"); } break; } break; case 'p': dopmi++; break; case 'q': if(!optarg || sscanf(optarg, "%d", &procs) != 1) yap_quit("Need a valid 'q' argument\n"); break; case 'r': if(!optarg ) yap_quit("Need a valid 'r' argument\n"); if ( strcmp(optarg,"tca")==0 ) restart++; else if ( strcmp(optarg,"hca")==0 ) restart_hca++; else if ( strcmp(optarg,"phi")==0 ) load_phi++; else if ( strcmp(optarg,"mu")==0 ) load_mu++; else yap_quit("Need a valid 'r' argument\n"); break; case 's': if ( !optarg || sscanf(optarg,"%d",&seed)!=1 ) yap_quit("Need a valid 's' argument\n"); break; case 'S': { char var[100]; double vin=0; if ( !optarg || sscanf(optarg,"%[^=, ]=%lf", &var[0], &vin)<2 ) yap_quit("Need a valid 'S' argument\n"); par = findpar(var); if ( par==ParNone ) yap_quit("Illegal var for -S\n"); else if ( par==ParBM0 ) BM0val = vin; else if ( par==ParBM1 ) BM1val = vin; else if ( par==ParBP0 ) BP0val = vin; else if ( par==ParBP1 ) BP1val = vin; else *(ddT[par].ptr) = vin; } break; case 't': if ( !optarg || sscanf(optarg,"%d",&ddP.training)!=1 ) yap_quit("Need a valid 't' argument\n"); break; case 'T': if ( !optarg ) yap_quit("Need a 
valid 'T' argument\n"); { char *tname = data_name(optarg,data); FILE *fp = fopen(tname,"r"); if ( fp==NULL ) { free(tname); tname = data_name(optarg,testdata); fp = fopen(tname,"r"); } else { testdata = data; } free(tname); if ( fp!=NULL ) { /* its a valid test filename */ ddP.teststem = optarg; fclose(fp); } else if ( sscanf(optarg,"%d",&ddN.TEST)!=1 ) yap_quit("Need a valid 'T' argument\n"); } break; case 'v': verbose++; break; case 'V': load_vocab = 1; break; case 'W': if ( !optarg || sscanf(optarg,"%d",&maxW)<1 ) yap_quit("Need a valid 'W' argument\n"); break; default: yap_quit("Unknown option '%c'\n", c); } } if (argc-optind != 2) { usage(); exit(-1); } if ( optind>=argc ) { yap_quit("No arguments given\n"); } stem = strdup(argv[optind++]); resstem = strdup(argv[optind++]); if ( dopmi ) load_vocab = 1; if ( dopmi && verbose !=2 ) { /* * due to the use of the ".top" file * its really multi-purpose */ yap_quit("When computing PMI verbose must be exactly 2\n"); } if ( noerrorlog==0 ) { char *wname = yap_makename(resstem, ".log"); yap_file(wname); free(wname); } yap_commandline(argc, argv); #ifdef H_THREADS yap_message(" Threads,"); #endif if ( restart || restart_hca ) { char *fname = yap_makename(resstem,".par"); FILE *fp = fopen(fname,"r"); char *buf; if ( !fp ) yap_quit("Parameter file '%s' doesn't exist\n", fname); fclose(fp); free(fname); buf = readpar(resstem,"T",50); if ( !buf ) yap_quit("Parameter file '%s' has no T\n", fname); ddN.T = atoi(buf); free(buf); if ( restart ) { buf = readpar(resstem,"E",50); if ( !buf ) yap_quit("Parameter file '%s' has no E\n", fname); ddN.E = atoi(buf); free(buf); pctl_read(resstem); } if ( maxW==0 ) { buf = readpar(resstem,"W",50); if ( buf ) { maxW = atoi(buf); free(buf); } } if ( ddP.training==0 ) { buf = readpar(resstem,"TRAIN",50); if ( buf ) { ddP.training = atoi(buf); free(buf); } } if ( ddN.TEST==0 ) { buf = readpar(resstem,"TEST",50); if ( buf ) { ddN.TEST = atoi(buf); free(buf); } } } assert(ddN.T>0); 
assert(ddN.TEST>=0); assert(restart || restart_hca || ITER>0); if ( load_phi && ddP.phiiter>0 ) yap_quit("Options '-l phi,...' and '-r phi' incompatible\n"); if ( load_mu && ddP.muiter>0 ) yap_quit("Options '-l mu,...' and '-r mu' incompatible\n"); /* * set random number generator */ if ( seed ) { rng_seed(rngp,seed); } else { rng_time(rngp,&seed); } yap_message("Setting seed = %lu\n", seed); /* * read data and get dimensions */ { D_bag_t *dbp = data_read(stem, data); int training = pctl_training(dbp->D); if ( ddP.teststem ) { D_bag_t *dbpt = data_read(ddP.teststem, testdata); /* need to load a separate test set, strip to bare training */ data_shrink(dbp, training); ddN.TEST = dbpt->D; data_append(dbp, dbpt); free(dbpt->w); free(dbpt->d); free(dbpt); } if ( maxW>0 ) { if ( dbp->W <= maxW ) dbp->W = maxW; if ( dbp->W > maxW ) data_vocabshrink(dbp, maxW); } /* * transfer into system */ ddN.D = dbp->D; ddN.W = dbp->W; ddN.N = dbp->N; ddN.NT = dbp->N; ddN.DT = training; ddD.w = dbp->w; ddD.d = dbp->d; free(dbp); if ( ddN.DT<ddN.D ) { /* recompute NT */ int i; for (i=0; i<ddN.N; i++) if ( ddD.d[i]>=ddN.DT ) break; ddN.NT = i; } } data_read_epoch(stem); /* * at this point, dimensions are fixed, so load phi and mu if needed */ if ( load_phi ) pctl_loadphi(resstem); if ( load_mu ) pctl_loadmu(resstem); /* * correct parameters after command line */ pctl_fix(ITER); if ( BM0val>0 ) { ddP.b_mu[0] = BM0val; } if ( BM1val>0 ) { int i; for (i=1; i<ddN.E; i++) ddP.b_mu[i] = BM1val; } if ( BP0val>0 ) { int i; for (i=0; i<ddN.T; i++) ddP.b_phi[0][i] = BP0val; } if ( BP1val>0 ) { int i; if ( ddN.E==1 ) yap_quit("b_phi[1] invalid when epochs==1\n"); for (i=0; i<ddN.T; i++) ddP.b_phi[1][i] = BP1val; } pctl_samplereport(); /* * all data structures */ data_alloc(); if ( ddP.phiiter>0 ) phi_init(resstem); else ddS.phi = NULL; if ( ddP.muiter>0 ) mu_init(resstem); else ddS.mu = NULL; if ( ddP.thetaiter>0 ) theta_init(resstem); else ddS.theta = NULL; tca_alloc(); if ( PCTL_BURSTY() ) 
dmi_init(&ddM, ddS.z, ddD.w, ddD.N_dTcum, ddN.T, ddN.N, ddN.W, ddN.D, ddN.DT, (fix_hold==GibbsHold)?pctl_hold:NULL); if ( load_vocab ) { data_vocab(stem); } cache_init(); /* * yap some details */ data_report(ITER, seed); pctl_report(); /* * load/init topic assignments and prepare statistics */ if ( restart || restart_hca) { tca_read_z(resstem, 0, ddN.DT); tca_rand_z(ddN.T, ddN.DT, ddN.D); } else { tca_rand_z(ddN.T, 0, ddN.D); } tca_reset_stats(resstem, restart, 0); if ( (restart || restart_hca ) && ITER ) yap_message("Initial log_2(perp)=%lf\n", -M_LOG2E * likelihood()/ddN.NT); if ( ITER ) yap_report("cycles: "); for (iter=0; iter<ITER; iter++) { int pro; double thislp = 0; int thisNd = 0; int doc; #ifdef H_THREADS pthread_t thread[procs]; #endif D_pargs_p parg[procs]; #ifdef MU_CACHE mu_side_fact_reinit(); #endif #ifdef PHI_CACHE phi_cache_reinit(); #endif t1 = clock(); /* * sampling */ #ifdef IND_STATS ddP.doc_ind_stats = u32tri(ddN.T,ddN.E,ddN.E); ddP.word_ind_stats = u32tri(ddN.T,ddN.E,ddN.E); #endif /* a bit complex if no threads! 
*/ doc = 0; for (pro = 0 ; pro < procs ; pro++){ parg[pro].dots=dots; parg[pro].procs=procs; parg[pro].doc = &doc; #ifndef H_THREADS sampling_p(&parg[pro]); #else if ( procs==1 ) sampling_p(&parg[pro]); else if( pthread_create(&thread[pro],NULL,sampling_p,(void*) &parg[pro]) != 0){ yap_message("thread failed %d\n",pro+1 ); } #endif } #ifdef H_THREADS if ( procs>1 ) { //waiting for threads to finish for (pro = 0; pro < procs; pro++){ pthread_join(thread[pro], NULL); } } #endif // getting lp, Nd and clock for(pro = 0; pro < procs; pro++){ thislp += parg[pro].thislp; thisNd += parg[pro].thisNd; tot_time += parg[pro].tot_time; } #ifdef H_THREADS if ( procs>1 ) tca_reset_stats(NULL,1,1); #endif /* * full check */ #ifndef NDEBUG { int e, d; check_cp_et(); for (e=0; e<ddN.E; e++) check_m_vte(e); for (d=0; d<ddN.DT; d++) check_n_dt(d); } #endif #ifdef IND_STATS { char *fname = yap_makename(resstem,".istats"); FILE *ifp = fopen(fname,"a"); int e1, e2, kk; fprintf(ifp,"Iteration %d\n", iter); for (kk=0; kk<ddN.T; kk++) { fprintf(ifp," Topic %d\n", kk); for (e1=0; e1<ddN.E; e1++) { fprintf(ifp," Epoch %d\n ", e1); for (e2=0; e2<ddN.E; e2++) fprintf(ifp," %u", (unsigned)ddP.doc_ind_stats[kk][e1][e2]); fprintf(ifp,"\n "); for (e2=0; e2<ddN.E; e2++) fprintf(ifp," %u", (unsigned)ddP.word_ind_stats[kk][e1][e2]); fprintf(ifp,"\n"); } } fclose(ifp); free(ddP.doc_ind_stats[0][0]); free(ddP.doc_ind_stats[0]); free(ddP.doc_ind_stats); free(ddP.word_ind_stats[0][0]); free(ddP.word_ind_stats[0]); free(ddP.word_ind_stats); free(fname); } #endif /* * sample hyperparameters */ t3 = clock(); pctl_sample(iter, procs); /* * do time calcs here to remove diagnostics+reporting */ t2 = clock(); tot_time += (double)(t2 - t1) / CLOCKS_PER_SEC; psample_time += (double)(t2 - t3) / CLOCKS_PER_SEC; /* * progress reports */ if ( ( iter>ddP.progburn && (iter%ddP.progiter)==0 ) || iter+1>=ITER ) { yap_message(" %d\nlog_2(perp)=%lf,%lf", iter, -M_LOG2E * likelihood()/ddN.NT, -M_LOG2E * thislp/thisNd); 
pctl_update(iter); if ( verbose && iter%10==0 ) yap_probs(); if ( iter>0 && verbose>1 ) { if ( ddN.tokens ) { tca_displaytopics(resstem,topwords,score); displayed++; } } if ( iter+1<ITER ) { // yap_message("\n"); yap_report("cycles: "); } } else { yap_message(" %d", iter); if ( verbose>1) yap_message("\n"); } if ( checkpoint>0 && iter>0 && iter%checkpoint==0 ) { data_checkpoint(resstem, stem, iter+1); yap_message(" checkpointed\n"); tca_report(resstem, stem, ITER, procs, fix_hold, (dopmi&&displayed>0)?1:0); } if ( ddP.phiiter>0 && iter>ddP.phiburn && (iter%ddP.phiiter)==0 ) phi_update(); if ( ddP.thetaiter>0 && iter>ddP.thetaburn && (iter%ddP.thetaiter)==0 ) theta_update(); if ( ddP.muiter>0 && iter>ddP.muburn && (iter%ddP.muiter)==0 ) mu_update(); } // over iter if ( ITER ) yap_report("Finished after %d cycles on average of %lf+%lf(s) per cycle\n", iter, (tot_time-psample_time)/iter, psample_time/iter); if ( ( verbose==1 || ((iter+1)%5!=0 && verbose>1) ) ) { if ( ddN.tokens ) { tca_displaytopics(resstem,topwords,score); displayed++; } } yap_probs(); if ( ITER>0 ) data_checkpoint(resstem, stem, ITER); tca_report(resstem, stem, ITER, procs, fix_hold, (dopmi&&displayed>0)?1:0); if ( ddP.phiiter>0 ) phi_save(resstem); if ( ddP.thetaiter>0 ) theta_save(resstem); if ( ddP.muiter>0 ) mu_save(resstem); /* * free */ phi_free(); theta_free(); mu_free(); cache_free(); pctl_free(); data_free(); dmi_free(&ddM); tca_free(); free(stem); free(resstem); rng_free(rngp); return 0; }
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; /* * dump words to 
file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp = dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || 
ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); yap_message(" ew=%.0lf", ew); yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); fprintf(rp," %.2lf", ew); fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ 
yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", tscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. topics */ yap_message("Topic root words="); if ( fullreport ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 
100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-3 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } free(pvec); free(gtvec); free(gpvec); }