static double adkterms(double mya, void *mydata) { double val = 0; uint16_t **docstats = (uint16_t **)mydata; #ifdef A_DEBUG float save_a = ddC.SD->a; double like; #endif ddP.ad = mya; cache_update("ad"); val = dmi_likelihood_aterms(&ddM, docstats, pctl_gammaprior, ddP.ad, ddP.bdk, ddC.SD); myarms_evals++; #ifdef A_DEBUG yap_message("Eval adkterms(%lf) = %lf", mya, val); like = likelihood_bdk(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } else yap_message("\n"); last_like = like; last_val = val; #endif return val; }
static double aterms_burst(double mya, void *mydata) { double b[ddM.T]; double val = 0; uint16_t **docstats = (uint16_t **)mydata; #ifdef A_DEBUG float save_a = ddC.a_burst->a; double like; #endif int t; for (t=0; t<ddM.T; t++) b[t] = ddP.b_burst; cache_update("ab"); val = dmi_likelihood_aterms(&ddM, docstats, pctl_gammaprior, mya, b, ddC.a_burst); myarms_evals++; #ifdef A_DEBUG yap_message("Eval adkterms(%lf) = %lf", mya, val); like = dmi_likelihood(&ddM,pctl_gammaprior,mya,b,ddC.a_burst); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } else yap_message("\n"); last_like = like; last_val = val; #endif return val; }
static double aterms(double mya, void *mydata) { int i, t; double val = 0; double la = log(mya); #ifdef A_DEBUG float save_a = ddC.SX->a; double like; #endif S_remake(ddC.SX, mya); for (i=0; i<ddN.DT; i++) { uint32_t Td_ = 0; for (t=0; t<ddN.T; t++) { Td_ += ddS.Tdt[i][t]; if ( ddS.Ndt[i][t]>1 ) { val += S_S(ddC.SX,ddS.Ndt[i][t],ddS.Tdt[i][t]); } } val += Td_*la + lgamma(ddP.bpar/mya+Td_) - lgamma(ddP.bpar/mya); } myarms_evals++; #ifdef A_DEBUG yap_message("Eval aterms(%lf) = %lf (S had %f)", mya, val, save_a); ddP.apar = mya; cache_update("a"); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif return val; }
static double aterms_theta(double mya, void *mydata) { int i, t; double val = 0; #ifdef A_DEBUG float save_a = ddC.a_theta->a; double like; #endif S_remake(ddC.a_theta, mya); for (i=0; i<ddN.DT; i++) { for (t=0; t<ddN.T; t++) { if ( ddS.n_dt[i][t]>1 ) { val += S_S(ddC.a_theta,ddS.n_dt[i][t],ddS.c_dt[i][t]); } } val += poch(ddP.b_theta, mya, ddS.C_dT[i]); } myarms_evals++; #ifdef A_DEBUG yap_message("Eval aterms_theta(%lf) = %lf (S had %f)", mya, val, save_a); ddP.a_theta = mya; cache_update("at"); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif return val; }
static double aterms_phi0(double mya, void *mydata) { int v; double val = 0; #ifdef A_DEBUG float save_a = ddC.a_phi0->a; double like; #endif S_remake(ddC.a_phi0, mya); val += poch(ddP.b_phi0, mya, ddS.S_0_nz); for (v=0; v<ddN.W; v++) { if ( ddS.S_0vT[v]>1 ) val += S_S(ddC.a_phi0, ddS.S_0vT[v], 1); } myarms_evals++; #ifdef A_DEBUG yap_message("Eval aterms_phi0(%lf) = %lf (S had %f)", mya, val, save_a); ddP.a_phi0 = mya; cache_update("ap0"); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif return val; }
static double aterms_mu(double mya, void *mydata) { int e, t; double val = 0; #ifdef A_DEBUG float save_a = ddC.a_mu->a; double like; #endif S_remake(ddC.a_mu, mya); for (e=0; e<ddN.E; e++) { for (t=0; t<ddN.T; t++) { if ( ddS.cp_et[e][t]==0 ) continue; if (e==ddN.E-1) val += S_S(ddC.a_mu, ddS.C_eDt[e][t], ddS.cp_et[e][t]); else val += S_S(ddC.a_mu, ddS.C_eDt[e][t] + ddS.cp_et[e+1][t], ddS.cp_et[e][t]); } val += poch(ddP.b_mu[e], mya, ddS.Cp_e[e]); } myarms_evals++; #ifdef A_DEBUG yap_message("Eval aterms_mu(%lf) = %lf (S had %f)", mya, val, save_a); ddP.a_mu = mya; cache_update("am"); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif return val; }
/*
 * Report, for each topic, the entropy (in bits) of the class counts in
 * the matrix returned by classbytopic(resstem), followed by the mean of
 * those entropies over the ddN.T topics.  The matrix is released before
 * returning.
 */
void hca_displayclass(char *resstem)
{
  int cls, topic;
  double ent_sum = 0;
  /*
   *  write topic by class confusion matrix
   */
  uint32_t **counts = classbytopic(resstem);

  /*
   *  now report entropies
   */
  yap_message("Class entropies by topic: ");
  for (topic=0; topic<ddN.T; topic++) {
    double topic_ent = 0;
    double row_total = 0;

    for (cls=0; cls<ddN.C; cls++)
      row_total += counts[topic][cls];
    for (cls=0; cls<ddN.C; cls++) {
      double p;
      if ( counts[topic][cls]<=0 )
        continue;
      p = ((double)counts[topic][cls])/row_total;
      topic_ent -= p * log(p) * M_LOG2E;
    }
    ent_sum += topic_ent;
    yap_message(" %.3lf", topic_ent);
  }
  yap_message(" -> %.3lf\n", ent_sum/ddN.T);
  free(counts[0]);
  free(counts);
}
static double awterms(double myaw, void *mydata) { int i, t; double val = 0; double law = log(myaw); #ifdef A_DEBUG float save_a = ddC.SY->a; double like; #endif S_remake(ddC.SY, myaw); for (t=0; t<ddN.T; t++) { uint32_t Tw_ = 0; for (i=0; i<ddN.W; i++) { Tw_ += ddS.Twt[i][t]; if ( ddS.Nwt[i][t]>1 ) { val += S_S(ddC.SY,ddS.Nwt[i][t],ddS.Twt[i][t]); } } val += Tw_*law + lgamma(ddP_bwpar(t)/myaw+Tw_) - lgamma(ddP_bwpar(t)/myaw); } myarms_evals++; #ifdef A_DEBUG yap_message("Eval awterms(%lf) = %lf (S had %f)", myaw, val, save_a); ddP.awpar = myaw; cache_update("aw"); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif return val; }
/*
 * Resample *mya by calling myarms() with aterms() as the evaluation
 * function over [PYP_DISC_MIN, PYP_DISC_MAX], then refresh the "a" cache.
 * When verbose>1 the value and likelihood() are reported before and after.
 */
void sample_a(double *mya)
{
#ifdef A_DEBUG
  /* start a fresh debug trace for aterms() */
  last_val = 0;
  last_like = 0;
#endif
  if ( verbose>1 ) {
    yap_message("sample_a (pre):  a=%lf, lp=%lf\n", *mya, likelihood());
  }

  myarms(PYP_DISC_MIN, PYP_DISC_MAX, &aterms, NULL, mya, "a");
  cache_update("a");

  if ( verbose>1 ) {
    yap_message("sample_a (post):  a=%lf, lp=%lf\n", *mya, likelihood());
  }
}
void *sampling_p(void *pargs) { int i; float *p = fvec(ddN.T * 4); D_MiSi_t dD; D_pargs_p *par =(D_pargs_p *) pargs; clock_t t1 = clock(); if ( PCTL_BURSTY() ) misi_init(&ddM,&dD); /* * sampling */ par->thislp = 0; par->thisNd = 0; while ( (i=atomic_incr(*par->doc)-1)<ddN.DT ) { if ( PCTL_BURSTY() ) misi_build(&dD,i,0); par->thislp += gibbs_lda(GibbsNone, i, ddD.N_dT[i], p, &dD); par->thisNd += ddD.N_dT[i]; if ( par->dots>0 && i>0 && (i%par->dots==0) ) yap_message("."); if ( PCTL_BURSTY() ) misi_unbuild(&dD,i,0); } free(p); if ( PCTL_BURSTY() ) misi_free(&dD); par->tot_time = (double)(clock() - t1) / CLOCKS_PER_SEC; return NULL; }
/*
 *    post. prob of word given topic used in sampling
 *
 *    j = word index, t = topic index.
 *    Three cases, checked in this order:
 *      - ddP.phi is set: return ddP.phi[t][j] directly;
 *      - ddP.PYbeta is zero: return the ratio built from ddS.Nwt,
 *        ddP.betapr and ddP.betatot;
 *      - otherwise: use the table statistics and store a value in *tip.
 *    *tip is written only in the last case.
 */
double wordfact(int j, int t, float *tip)
{
  double p;

  if ( ddP.phi!=NULL ) {
    assert(ddP.phi);
    return ddP.phi[t][j];
  }
  if ( !ddP.PYbeta ) {
    return ((double)ddS.Nwt[j][t]+ddP.betapr[j])
      / ((double)ddS.NWt[t]+ddP.betatot);
  }

  if ( ddS.Twt[j][t]!=0 ) {
    double uone, uzero;
    wordtableindicatorprob(j, t, &uone, &uzero);
    p = uone + uzero;
    *tip = uone/(uone + uzero);
  } else {
#ifndef NDEBUG
    /* no tables for this word/topic, so there should be no counts */
    if ( ddS.Nwt[j][t]>0 ) {
      yap_message("ddS.Nwt[%d][%d]==%d\n", j, t, ddS.Nwt[j][t]);
      assert(ddS.Nwt[j][t]==0);
    }
#endif
    p = ((double)ddP.bwpar+ddP.awpar*ddS.TWt[t]) * betabasewordprob(j);
    *tip = 1.0;
  }
  return p/((double)ddS.NWt[t]+ddP.bwpar);
}
/*
 * Print every entry of B[0..ddN.T-1] whose ml field is positive as
 * "k+k2", then return the index of the entry with the largest positive
 * ml (the first one on ties), or -1 when no ml exceeds 0.
 */
static int next_best(bestmerge_t *B)
{
  int idx;
  int winner = -1;
  double top = 0;

  yap_message("merge buffer: ");
  for (idx=0; idx<ddN.T; idx++) {
    if ( B[idx].ml>0 )
      yap_message("%d+%d ", idx, B[idx].k2);
    /* selection produces no output, so it can share the loop */
    if ( B[idx].ml>top ) {
      winner = idx;
      top = B[idx].ml;
    }
  }
  yap_message("\n");
  return winner;
}
/*
 * Print the topic probabilities returned by get_probs().
 *
 * Positive entries are printed and contribute -p*log(p) to an entropy
 * sum; other entries are printed as "-" and counted as empty.  The final
 * line reports ddP.alpha as "factor", the empty count, and exp() of the
 * entropy sum as "ent".
 */
void yap_probs()
{
  int t;
  int empty = 0;
  double ent = 0;
  double factor = 0;
  double *vp = malloc(sizeof(*vp)*ddN.T);

  /* get_probs() writes into vp, so an allocation failure must stop here */
  if ( !vp )
    yap_quit("Cannot allocate memory in yap_probs()\n");
  get_probs(vp);
  yap_message("probs = ");
  factor = ddP.alpha;
  for (t=0; t<ddN.T; t++) {
    if ( vp[t]>0 ) {
      yap_message(" %lf", vp[t]);
      ent -= vp[t]*log(vp[t]);
    } else {
      empty++;
      yap_message(" -");
    }
  }
  yap_message("\nfactor = %lf, empty = %d, ent = %lf\n",
              factor, empty, exp(ent));
  free(vp);
}
static double aterms_phi1(double mya, void *mydata) { int e, t, v; double val = 0; #ifdef A_DEBUG float save_a = ddC.a_phi1->a; double like; #endif S_remake(ddC.a_phi1, mya); for (e=0; e<ddN.E; e++) { for (t=0; t<ddN.T; t++) { if ( ddS.S_Vte[t][e]==0 ) continue; val += poch(ddP.b_phi[e][t], mya, ddS.S_Vte[t][e]); for (v=0; v<ddN.W; v++) { if ( ddS.s_vte[v][t][e]==0 ) continue; if (e<ddN.E-1) { val += S_S(ddC.a_phi1, ddS.m_vte[v][t][e] + ddS.s_vte[v][t][e+1] , ddS.s_vte[v][t][e]); } else { val += S_S(ddC.a_phi1, ddS.m_vte[v][t][e], ddS.s_vte[v][t][e]); } } } } myarms_evals++; #ifdef A_DEBUG yap_message("Eval aterms_phi1(%lf) = %lf (S had %f)", mya, val, save_a); ddP.a_phi1 = mya; cache_update("ap"1); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif return val; }
/* * assumes uniform prior Dirichlet */ void sample_alpha(double *alphatot) { double dirmax = DIR_TOTAL_MAX; if ( dirmax>ddN.T * DIR_MAX ) dirmax = ddN.T * DIR_MAX; #ifdef A_DEBUG last_val = 0; last_like = 0; #endif if ( myarmsMH(DIR_MIN*ddN.T, dirmax, &alphaterms, NULL, alphatot, "alphatot",1) ) { yap_message("sample_alpha: error in result\n"); } cache_update("alpha"); }
static double alphaterms(double alphatot, void *mydata) { int t,s; double val = 0; double tot; double lga = lgamma(alphatot/ddN.T); double lgat = lgamma(alphatot); #ifdef A_DEBUG double like; #endif #ifdef CONJPRIOR val += ddN.T*(lgamma((alphatot+1.0)/ddN.T) - lga); val -= lgamma(alphatot+1.0) - lgamma(alphatot); #endif for (s=0; s<ddN.DT; s++) { tot = 0; for (t=0; t<ddN.T; t++) { tot += alphatot/ddN.T+ddS.Ndt[s][t]; val += gammadiff(ddS.Ndt[s][t],alphatot/ddN.T,lga); } val -= lgamma(tot) - lgat; } myarms_evals++; myarms_last = alphatot; #ifdef A_DEBUG yap_message("Eval alphaterms(%lf) = %lf\n", alphatot, val); ddP.alphatot = alphatot; cache_update("alpha"); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif return val; }
static double a0terms(double mya0, void *mydata) { int i; double l1a0 = log(1-mya0); double l2a0 = log((1-mya0)*(2-mya0)); double lga0 = lgamma(1-mya0); double val = 0; #ifdef A_DEBUG double like; #endif val += ddS.TDTnz*log(mya0) + lgamma(ddP.b0/mya0+ddS.TDTnz) - lgamma(ddP.b0/mya0); for (i=0; i<ddN.T; i++) /* note the root node is a PDD so all t's = 1 */ if ( ddS.TDt[i]>1 ) { if ( ddS.TDt[i]==2 ) val += l1a0; else if ( ddS.TDt[i]==3 ) val += l2a0; else val += lgamma(ddS.TDt[i]-mya0) - lga0; } #ifdef A_DEBUG yap_message("Eval a0terms(%lf) = %lf", mya0, val); ddP.a0 = mya0; cache_update("a0"); like = likelihood(); if ( last_val != 0 ) { yap_message(", lp=%lf diffs=%lf vs %lf\n", like, val-last_val, like-last_like); } last_like = like; last_val = val; #endif myarms_evals++; return val; }
/*
 * Fill vp[0..ddN.T-1] with topic probabilities.
 *
 * When ddP.PYalpha is 0 the work is delegated to get_probs_alpha().
 * Otherwise each entry is alphabasetopicprob(t), except that under
 * H_HPDD only the first topic with a zero ddS.TDt count keeps its value
 * and later zero-count topics are set to 0.  The vector is then divided
 * by its sum; unless NDEBUG is defined a message is printed when that sum
 * differs from 1 by more than 1e-4.
 */
void get_probs(double *vp)
{
  int k;
  int empty_allowed = 1;   /* cleared once a zero-count topic was seen */
  double sum = 0;

  if ( ddP.PYalpha==0 ) {
    get_probs_alpha(vp);
    return;
  }
  for (k=0; k<ddN.T; k++) {
    if ( ddP.PYalpha!=H_HPDD || ddS.TDt[k]>0 || empty_allowed ) {
      vp[k] = alphabasetopicprob(k);
      sum += vp[k];
    } else {
      vp[k] = 0;
    }
    if ( ddP.PYalpha==H_HPDD && ddS.TDt[k]==0 )
      empty_allowed = 0;
  }
#ifndef NDEBUG
  if ( fabs(sum-1.0)>1e-4 ) {
    yap_message("get_probs() probs doesn't normalise, get %lf\n", sum);
  }
#endif
  for (k=0; k<ddN.T; k++) {
    vp[k] /= sum;
  }
}
/* * print out the topic topk=10 words. report the PMI score. */ double report_pmi(char *topfile, /* name of topics file */ char *pmifile, /* name of PMI file */ int T, /* total topics */ int W, /* total words */ int E, /* number of epochs */ int topk, double *tp) { int lineno = 0; int i,k, thee; /* * mapping from local index to actual word index */ uint32_t *wind = u32vec(topk*T*E); int n_wind = 0; /* * boolean vector ... is word used */ uint32_t *wuse = u32vec(W/32+1); /* * PMI's by local index */ uint32_t *topic = u32vec(topk); float *coherency = fvec(E); double **pmi; float ave = 0; char *line; size_t n_line; FILE *fr; if ( !wind || !wuse ) yap_quit("Out of memory in report_pmi()\n"); /* * read in file of top word indices in topic */ fr = fopen(topfile,"r"); if ( !fr ) yap_sysquit("Topic file '%s' not read\n", topfile); line = NULL; n_line = 0; lineno = 0; while ( getline(&line, &n_line, fr)>0 ) { char *buf = line; unsigned j; int e = 0; lineno ++; buf += strspn(buf," \t\n"); // skip space if ( (E==1 && sscanf(buf, "%d: ", &k)<1) || (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) ) yap_quit("Cannot read topic in topic line %d from file '%s'\n", lineno, topfile); if ( k<0 || k>=T ) continue; if ( e<0 || e>=E ) continue; for (i = 0; i<topk && *buf; i++) { buf = strpbrk(buf," \t\n"); // skip to next space if ( sscanf(buf, " %u", &j) <1 ) { if ( verbose>2 ) yap_message("Cannot read word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); break; } if ( j>=W) { yap_quit("Bad word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); } buf += strspn(buf," \t\n"); // skip space /* * check if word exists, and set up its index */ if ( wuse[j/32U] & (1U<<(j%32U)) ) { // yes, so search for it int ii; for (ii=0; ii<n_wind; ii++) if ( wind[ii]==j ) break; if ( ii>=n_wind ) yap_quit("Lookup of word %d failed at line %d in report_pmi()\n", (int)j, lineno); } else { // no, so add it wuse[j/32U] |= (1U<<(j%32U)); wind[n_wind] = j; n_wind++; } } free(line); line = 
NULL; n_line = 0; } fclose(fr); pmi = dmat(n_wind,n_wind); /* * build hash table now since we know size */ hashsize = n_wind*2; hashtab = malloc(sizeof(*hashtab)*hashsize); if ( !pmi || !hashtab ) yap_quit("Out of memory in report_pmi()\n"); for (i=0; i<hashsize; i++) hashtab[i] = 0; for (i=0; i<n_wind; i++) addw(wind[i],i); /* * load up PMI file, only keeping words mentioned in hash table */ { unsigned t1, t2; double value; int zcat = 0; fr = fopen(pmifile,"r"); if ( !fr ) { /* * try to zcat it */ char *cmd = malloc(strlen(pmifile)+20); sprintf(cmd,"%s.gz", pmifile); fr = fopen(cmd,"r"); if ( !fr ) yap_sysquit("Cannot open pmifile '%s' in report_pmi()\n", pmifile); fclose(fr); sprintf(cmd,"gunzip -c %s", pmifile); fr = popen(cmd,"r"); if ( !fr ) yap_sysquit("Cannot open or zcat pmifile '%s' in report_pmi()\n", pmifile); zcat = 1; free(cmd); } while (fscanf(fr, "%u %u %lg", &t1, &t2, &value)==3 ) { if ( t1>=W || t2>= W ) yap_quit("Illegal word index in report_pmi()\n"); if ( t1!= t2 && ( wuse[t1/32U] & (1U<<(t1%32U)) ) && ( wuse[t2/32U] & (1U<<(t2%32U))) ) { int i1, i2; i1 = findw(t1,wind); i2 = findw(t2,wind); if ( i1==UINT32_MAX || i2==UINT32_MAX ) yap_quit("Could not locate word index in report_pmi()\n"); pmi[i1][i2]=value; pmi[i2][i1]=value; } } if ( zcat ) pclose(fr); else fclose(fr); } /* * compute PMI score for each topic */ fr = fopen(topfile,"r"); if ( !fr ) yap_sysquit("Topic file '%s' not read\n", topfile); line = NULL; n_line = 0; thee = 0; lineno = 0; if ( E>1 ) yap_message("PMI %d:: ", 0); else yap_message("PMI :: "); while ( getline(&line, &n_line, fr)>0 ) { /* * repeat logic above to read topic file again */ char *buf = line; unsigned j; int cnt = 0; int e = 0; double coh = 0; buf += strspn(buf," \t\n"); // skip space if ( (E==1 && sscanf(buf, "%d: ", &k)<1) || (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) ) yap_quit("Cannot read topic in topic line %d from file '%s'\n", lineno, topfile); if ( k<0 || k>=T ) continue; if ( e<0 || e>=E ) continue; if ( 
e!=thee ) { thee = e; yap_message("\nPMI %d:: ", e); } for (i = 0; i<topk && *buf; i++) { buf = strpbrk(buf," \t\n"); // skip to next space if ( sscanf(buf, " %u", &j) <1 ) { yap_message("Cannot read word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); break; } if ( j>=W) { yap_quit("Bad word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); } buf += strspn(buf," \t\n"); // skip space topic[i] = findw(j,wind); } if ( i<topk ) topic[i] = W; /* * topics now read */ for (i=0; i<topk && topic[i]<W; i++) { for (j=i+1; j<topk && topic[j]<W; j++) { coh += pmi[topic[i]][topic[j]]; cnt ++; } } if ( cnt>0 ) coh /= cnt; coherency[e] += coh * tp[k]; yap_message(" %d:%.3lf", k, coh); } fclose(fr); yap_message("\nPMI ="); if ( E==1 ) { yap_message(" %.3lf\n", coherency[0]); ave = coherency[0]; } else { int e; for (e=0; e<E; e++) { ave += coherency[e]; yap_message(" %.3lf", coherency[e]); } ave /= E; yap_message(" -> %.3lf\n", ave); } free(wind); free(coherency); free(wuse); free(topic); free(pmi[0]); free(pmi); free(hashtab); hashtab = NULL; hashsize = 0; return ave; }
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *termindk = NULL; uint32_t *indk = NULL; int Nk_tot = 0; double (*termtscore)(int) = NULL; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); //#define XTRA // prints model topic probs after observed #ifdef XTRA double *gtavec = calloc(ddN.T,sizeof(gtavec[0])); #endif float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); #ifdef KL float *dfvec = calloc(ddN.W,sizeof(dfvec[0])); #endif double *ngalpha = NULL; T_stats_t *termstats; #ifdef XTRA get_probs(gtavec); #endif if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phirat ) { tscore = phiratioscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } if ( ddS.TwT==NULL && ddP.phi==NULL && scoretype == ST_phirat ) yap_quit("Cannot use '-orat' option with this model/settings.\n"); if ( ddP.PYalpha==H_NG ) { /* * provide an estimate of alpha */ ngalpha = dvec(ddN.T); get_probs(ngalpha); for (k=0; k<ddN.T; k++) { ddP.alphapr[k] = ngalpha[k]; } } /* * returns null if no relevant data file */ termstats = tstats_init(ddS.z, ddD.NdTcum, ddN.T, ddN.DT, stem); if ( termstats ) { if ( scoretype == ST_idf ) { termtscore = termidfscore; } else termtscore = termcountscore; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); if ( termstats ) build_termNwK(termstats); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += 
gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); if ( termstats ) { termindk = malloc(sizeof(*indk)*termstats->K); if ( !termindk ) yap_quit("Cannot allocate termindk in hca_displaytopics()\n"); } data_df(stem); #ifdef KL for (w=0; w<ddN.W; w++) dfvec[w] = ddD.df[w]; #endif /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; if ( termstats ) { termcnt = buildtermindk(k, termindk, termstats); topk(topword, termcnt, termindk, termtscore); } /* * dump words to file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } if ( termstats ) { for (w=0; w<topword && w<termcnt; w++) { fprintf(fp," %d", (int)termstats->Kmin+termindk[w]); } } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp 
= dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); if ( termstats ) { termcnt = buildtermindk(kk, termindk, termstats); topk(topword, termcnt, termindk, termtscore); if ( topword<termcnt ) termcnt = topword; } /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); 
#else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; #define MALLET_EW #ifdef MALLET_EW double ewp = dprop?(1.0/fv_expprob(pvec,ddN.W)):ddN.W; #endif double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); #ifdef XTRA yap_message((ddN.T>200)?"/%.3lf%%":"/%.2lf%%",100*gtavec[kk]); #endif if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); #ifdef KL yap_message(" ew=%lf", ew); #else yap_message(" ew=%.0lf", ew); #endif #ifdef MALLET_EW yap_message(" ewp=%.1lf", ewp); #endif yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddP.NGbeta ) { /* * approx. 
as sqrt(var(lambda_k)/lambda-normaliser */ double ngvar = sqrt(ddP.NGalpha[kk]) * (ngalpha[kk]/ddP.NGalpha[kk]); yap_message(" ng=%.4lf,%.4lf", ngalpha[kk], ngvar/ngalpha[kk]); if ( ddS.sparse ) yap_message(",%.4f", 1-((float)ddS.sparseD[kk])/ddN.DTused); if ( verbose>2 ) yap_message(" ngl=%.4lf,%.4lf, nga=%.4lf,%.4lf", ddP.NGalpha[kk]/ddP.NGbeta[kk], sqrt(ddP.NGalpha[kk]/ddP.NGbeta[kk]/ddP.NGbeta[kk]), ddP.NGalpha[kk], ddP.NGbeta[kk]); } if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); #ifdef KL yap_message(" %lf", ew); #else fprintf(rp," %.2lf", ew); #endif #ifdef MALLET_EW fprintf(rp," %.2lf", ewp); #endif fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(tscore(indk[w])+0.2)); else yap_message("(%6lf)", tscore(indk[w])); } if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( 
ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } if ( termstats ) { yap_message(" terms="); for (w=0; w<termcnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", termstats->tokens[termindk[w]]); else yap_message("%d", termstats->Kmin+termindk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(termtscore(termindk[w])+0.2)); else yap_message("(%6lf)", termtscore(termindk[w])); } if ( fullreport ) { fprintf(rp, "term %d %d %d", kk, termindk[w], w); fprintf(rp, " %d", termstats->Nkt[termindk[w]][kk]); fprintf(rp, " %s", termstats->tokens[termindk[w]]); fprintf(rp, "\n"); } } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); /* this case gives bad results */ // if ( scoretype == ST_phirat ) topk(topword, cnt, indk, phiratioscore); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. 
topics */ yap_message("Topic root words="); if ( fullreport ) { int w; if ( ddP.phi && ddP.PYbeta!=H_PDP ) { for (w=0; w<ddN.W; w++) pvec[w] = ddS.phi[ddN.T][w]; } else { for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); } #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); #else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 && !ddP.phi ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\nTopical words="); topk(topword, cnt, indk, phiinvratioscore); for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddS.sparse && ddP.PYalpha==H_NG ) { double avesp = 0; // correct_docsp(); for (k=0; k<ddN.T; k++) { avesp += gtvec[k]; } // check gtvec[] sums to 1 assert(fabs(avesp-1.0)<0.00001); avesp = 0; for (k=0; k<ddN.T; k++) { avesp += gtvec[k]*((float)ddS.sparseD[k])/ddN.DTused; assert(ddS.sparseD[k]<=ddN.DTused); } assert(avesp<=1.0); assert(avesp>0.0); yap_message("IBP sparsity = %.2lf%%\n", 
100*(1-avesp)); } if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-7 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( ngalpha ) free(ngalpha); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } #ifdef KL free(dfvec); #endif free(pvec); free(gtvec); free(gpvec); tstats_free(termstats); }
/* * run regular gibbs cycles on the data with phi used; * the evaluation on each doc, and sample word probs * * if qparts>0, split collection into parts and only search this * * K = number of top results to retain */ void gibbs_query(int K, char *qname, int dots, int this_qpart, int qparts) { /* * mapping from query word posn. to its mi in current doc * >ddN.N = not in current doc * -ve = has no mi since occurs just once, found at * posn (-map[]-1) * non -ve = mi value */ int *mimap = NULL; /* * usual stuff for Gibbs loop over docs */ int i, j; float *fact = fvec(ddN.T*4); D_MiSi_t dD; /* * an index into topk[] which maintains ordering */ int *topind; /* * these store statistics of the results, for printing * these are unordered, ordered by topind[] */ /* document score */ float *topscore; /* document number */ int *topk; /* flags if ord is irrelevant, thus not scored */ char *wordunused; /* * per word stats for top results saved */ int *found; float *topcnt; float *topwordscore; /* * temporary versions for when gibbs running */ int *found_buf; float *topcnt_buf; float *topwordscore_buf; double *logprob; /* * search here */ int startdoc = 0; int enddoc = ddN.DT; /* * setup */ topcnt = malloc(sizeof(topcnt[0])*K*ddP.n_words); topwordscore = malloc(sizeof(topwordscore[0])*K*ddP.n_words); found = malloc(sizeof(found)*ddP.n_words*K); wordunused = malloc(sizeof(wordunused[0])*ddP.n_words); topcnt_buf = malloc(sizeof(topcnt[0])*ddP.n_words); topwordscore_buf = malloc(sizeof(topwordscore[0])*ddP.n_words); found_buf = malloc(sizeof(found)*ddP.n_words); if ( !topcnt || !topwordscore || !found || !topcnt_buf || !topwordscore_buf || !found_buf ) yap_quit("Cannot allocate memory in gibbs_query()\n"); logprob = malloc(sizeof(logprob[0])*ddP.n_query); topscore = malloc(sizeof(topscore[0])*K*ddP.n_query); topind = malloc(sizeof(topind[0])*K*ddP.n_query); topk = malloc(sizeof(topk[0])*K*ddP.n_query); if ( ddP.bdk!=NULL ) mimap = malloc(sizeof(mimap[0])*ddP.n_words); if ( !topk || 
!topscore || !logprob || !topind ) yap_quit("Cannot allocate memory in gibbs_query()\n"); for (i=0; i<ddP.n_words; i++) { wordunused[i] = 0; } for (i=0; i<K*ddP.n_query; i++) { topind[i] = i%K; topk[i] = -1; topscore[i] = INFINITY; } /* * check words to exclude using topics */ if ( ddP.n_excludetopic>0 ) { double *tprob = malloc(sizeof(tprob[0])*ddN.T); get_probs(tprob); yap_probs(); if ( verbose>1 ) yap_message("Excluding words: "); for (i=0; i<ddP.n_words; i++) { int t = besttopic(ddP.qword[i],tprob); if ( Q_excludetopic(t) ) { wordunused[i] = 1; if ( verbose>1 ) yap_message(" %d/%d", (int)ddP.qword[i], t); } } if ( verbose>1 ) yap_message("\n"); free(tprob); } if ( ddP.bdk!=NULL ) misi_init(&ddM,&dD); if ( qparts>0 ) { startdoc = ((double)this_qpart)/qparts * ddN.DT; enddoc = ((double)this_qpart+1.0)/qparts * ddN.DT; } for(i=startdoc; i<enddoc; i++) { int thisw = add_doc(i, GibbsNone); int r; if ( thisw<=1 ) { remove_doc(i, GibbsNone); continue; } if ( ddP.bdk!=NULL ) misi_build(&dD, i, 0); map_query(i, mimap, found_buf); for (j=0; j<ddP.n_words; j++) { topcnt_buf[j] = 0; topwordscore_buf[j] = 0; } for (r=0; r<ddP.queryiter; r++) { gibbs_lda(GibbsNone, ddN.T, i, ddD.NdT[i], fact, &dD, 0, 0); query_docprob(i, mimap, fact, &dD, topcnt_buf, topwordscore_buf); } /* * now adjust stats */ for (j=0; j<ddP.n_query; j++) logprob[j] = 0; for (j=0; j<ddP.n_words; j++) { if ( wordunused[j]>0 ) continue; if ( ddP.query[ddP.qword[j]]==j ) { topcnt_buf[j] /= ddP.queryiter; topwordscore_buf[j] /= ddP.queryiter; } else { /* word in previous query so copy */ int jj = ddP.query[ddP.qword[j]]; topcnt_buf[j] = topcnt_buf[jj]; topwordscore_buf[j] = topwordscore_buf[jj]; found_buf[j] = found_buf[jj]; } if ( wordunused[j]==0 ) logprob[ddP.qid[j]] += topwordscore_buf[j]; } if ( dots>0 && i>0 && (i%dots==0) ) yap_message("."); if ( ddP.bdk!=NULL ) misi_unbuild(&dD,i,0); remove_doc(i, GibbsNone); /* * enter into the arrays */ for (j=0; j<ddP.n_query; j++) { if ( i<K || logprob[j] < 
topscore[j*K+topind[j*K+K-1]] ) { int newind, l; /* * better than current lowest */ newind = bubble((i<K)?(i+1):K, &topind[j*K], &topscore[j*K], logprob[j]); /* * save the current details */ topscore[j*K+newind] = logprob[j]; topk[j*K+newind] = i; for (l=ddP.qposn[j]; l<ddP.qposn[j+1]; l++) { topcnt[newind*ddP.n_words+l] = topcnt_buf[l]; topwordscore[newind*ddP.n_words+l] = topwordscore_buf[l]; found[newind*ddP.n_words+l] = found_buf[l]; } } } } if ( dots>0 ) yap_message("\n"); /* * write result */ { float *ws = fvec(ddP.n_words); FILE *fp = fopen(qname,"w"); int q; if ( !fp ) yap_sysquit("Cannot write query results to '%s'\n", qname); for (q=0; q<ddP.n_query; q++) { int nw = ddP.qposn[q+1]-ddP.qposn[q]; for (i=0; i<K && i<ddN.DT && topk[topind[q*K+i]]>=0; i++) { int l, ind = topind[q*K+i]; double tfidf; tfidf = bm25(topk[q*K+ind],&found[ind*ddP.n_words+ddP.qposn[q]], &ddP.qword[ddP.qposn[q]], nw, ws); assert(ind>=0 && ind<K); fprintf(fp, "%d %d ", q, topk[q*K+ind]); fprintf(fp, "%.4f %.4lf ", topscore[q*K+ind]/nw, tfidf); if ( verbose>1 ) { for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%d ", found[ind*ddP.n_words+l]); for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%f ", topcnt[ind*ddP.n_words+l]); for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%f ", topwordscore[ind*ddP.n_words+l]); for (l=0; l<nw; l++) fprintf(fp, "%lf ", ws[l]); } fprintf(fp, "\n"); } } fclose(fp); free(ws); } /* * clean up */ free(fact); if ( ddP.bdk!=NULL ) misi_free(&dD); if ( mimap ) free(mimap); free(found); free(topwordscore); free(topcnt); free(found_buf); free(topwordscore_buf); free(topcnt_buf); free(topscore); free(topind); free(topk); free(logprob); }
/********************************
 *   code for LDA
 *****************************/
/*
 *  Run one Gibbs pass over the first `words` words of document `did`.
 *
 *  For each word position i the current topic is read from ddS.z[i];
 *  unless the word is treated as hold-out, its effect is removed from
 *  the statistics with remove_topic(), the unnormalised topic
 *  probabilities are written to p[0..ddN.T-1], a topic is drawn with
 *  samplet() and installed with Z_sett()/update_topic().
 *
 *  Returns the accumulated sum of log(Z/tot) over the words that are
 *  scored (see the test on fix/fix_doc below), where Z is the sum of
 *  p[] and tot is the sum of the document-side terms.
 *
 *  When MH_STEP is defined the document-side factors are cached up
 *  front and the sampled topic is passed through an accept/reject
 *  step against the previous topic.
 */
double gibbs_lda(/*
                  *  fix==GibbsNone for standard ML training/testing
                  *  fix==GibbsHold for word hold-out testing,
                  *       same as GibbsNone but also handles
                  *       train and test words differently
                  */
                 enum GibbsType fix,
                 int did,    //  document index
                 int words,  //  do this many
                 float *p,   //  temp store, written for all ddN.T topics
                 D_MiSi_t *dD   //  passed through to remove_topic(),
                                //  docprob(), docfact() and update_topic()
                 ) {
  int i, wid, t, mi=0;
  int e;
  double Z, tot;
  double logdoc = 0;
  /*  set once logdoc has been reported as non-finite, so
   *  yap_infinite() is only called once per call */
  int logdocinf = 0;
  int StartWord = ddD.N_dTcum[did];
  int EndWord = StartWord + words;
  /*  per-topic value filled in by docfact() and handed to
   *  update_topic() for the sampled topic; only written on the
   *  PCTL_BURSTY() sampling path */
  float dtip[ddN.T];
#ifdef MH_STEP
  double doc_side_cache[ddN.T];
  for (t=0; t<ddN.T; t++)
    doc_side_cache[t] = doc_side_fact(did,t);
#endif

  /*
   *   some of the latent variables are not sampled
   *   are kept in the testing version, uses enum GibbsType
   *      fix = global document setting
   *      fix_doc = settings for word in this doc
   *
   *   NB.   if fix==GibbsNone, then fix_doc==fix
   *         if fix==GibbsHold then fix_doc==GibbsHold or GibbsNone
   */
  enum GibbsType fix_doc = fix;

  if ( PCTL_BURSTY() ) {
    /*  starting multi index for this document  */
    mi = ddM.MI[did];
  }
  e = ddD.e[did];

  for (i=StartWord; i<EndWord; i++) {
#ifdef MH_STEP
    int oldt;
#endif
    if ( fix==GibbsHold ) {
      if ( pctl_hold(i) )
        fix_doc = GibbsHold;  //   this word is a hold out
      else
        fix_doc = GibbsNone;
    }
    // check_m_vte(e);
    wid=ddD.w[i];
    /*******************
     *   first we remove affects of this word on the stats
     *******************/
    /*  with MH_STEP the two lines below form the single statement
     *  "oldt = t = Z_t(...)", remembering the previous topic  */
#ifdef MH_STEP
    oldt =
#endif
      t = Z_t(ddS.z[i]);
    if ( fix_doc!=GibbsHold ) {
      /*  a non-zero return from remove_topic() skips resampling
       *  of this word  */
      if ( remove_topic(i, did, (!PCTL_BURSTY()||Z_issetr(ddS.z[i]))?wid:-1,
                        t, mi, dD) ) {
        goto endword;
      }
    }
    /***********************
     *    get topic probabilities
     ***********************/
    // check_m_vte(e);
#ifdef MU_CACHE
    mu_side_fact_update(e);
#endif
#ifdef PHI_CACHE
    phi_norm_update(wid, e);
    phi_sum_update(wid, e, i);
#endif
    for (t=0, Z=0, tot=0; t<ddN.T; t++) {
#ifdef MH_STEP
      /*  ddP.back is zeroed while the proposal is computed and
       *  restored at the bottom of the loop body  */
      int saveback = ddP.back;
      if ( fix_doc!=GibbsHold )
        ddP.back = 0;
#endif
      /*
       *   (fix_doc==GibbsHold) =>
       *       doing estimation, not sampling so use prob versions
       *    else
       *        doing sampling so use fact versions
       */
      /*  NB. both preprocessor branches open the same
       *  "if ( tf>0 ) {" block, which is closed after the #endif  */
#ifdef MH_STEP
      double tf = (fix_doc==GibbsHold)?doc_side_prob(did,t):
        doc_side_cache[t];
      if ( tf>0 ) {
        double wf = (fix_doc==GibbsHold)?word_side_prob(e, wid, t):
          word_side_fact(e, wid, t);
#else
      double tf = (fix_doc==GibbsHold)?doc_side_prob(did,t):
        doc_side_fact(did,t);
      if ( tf>0 ) {
        double wf = (fix_doc==GibbsHold)?word_side_prob(e, wid, t):
          word_side_fact(e, wid, t);
#endif
        tot += tf;
        if ( PCTL_BURSTY() )
          wf = (fix_doc==GibbsHold)?docprob(dD, t, i, mi, wf):
            docfact(dD, t, i, mi, wf, &dtip[t]);
        Z += p[t] = tf * wf;
      } else
        p[t] = 0;
#ifdef MH_STEP
      ddP.back = saveback;
#endif
    }
    /*  score every word in normal runs; in hold-out runs score
     *  only the held-out words  */
    if ( fix!=GibbsHold || fix_doc==GibbsHold )
      logdoc += log(Z/tot);
    if ( logdocinf==0 )
      if ( !finite(logdoc) ) {
        logdocinf++;
        yap_infinite(logdoc);
      }

    /*******************
     *   now sample t using p[] and install affects of this on the stats;
     *   but note this needs indicator to be set!
     *******************/
    if ( fix_doc!=GibbsHold ) {
      /*
       *  sample and update core stats
       */
      t = samplet(p, Z, ddN.T, rng_unit(rngp));
#ifdef MH_STEP
      if ( t != oldt ) {
        /*  accept/reject the move from oldt to t; ratio starts as
         *  the reverse proposal ratio and is multiplied by the
         *  factors recomputed with ddP.back restored  */
        double ratio = p[oldt]/p[t];
        if ( PCTL_BURSTY() ) {
          ratio *= docfact(dD, t, i, mi, word_side_fact(e, wid, t), &dtip[t])
            * doc_side_fact(did,t);
          ratio /= docfact(dD, oldt, i, mi, word_side_fact(e, wid, oldt), &dtip[oldt])
            * doc_side_fact(did,oldt);
        } else {
          ratio *= word_side_fact(e, wid, t) * doc_side_fact(did, t);
          ratio /= word_side_fact(e, wid, oldt) * doc_side_fact(did, oldt);
        }
        if ( ratio<1 && ratio<rng_unit(rngp) )
          t = oldt;
      }
#endif
      Z_sett(ddS.z[i],t);
#ifdef TRACE_WT
      if ( wid==TR_W && t==TR_T )
        yap_message("update_topic(w=%d,t=%d,d=%d,l=%d,z=%d,N=%d,T=%d)\n",
                    wid,t,did,i,ddS.z[i],
                    (int)ddS.m_vte[wid][t][e],(int)ddS.s_vte[wid][t][e]);
#endif
      update_topic(i, did, wid, t, mi, dtip[t], dD);
#ifdef TRACE_WT
      if ( wid==TR_W && t==TR_T )
        yap_message("after update_topic(w=%d,t=%d,d=%d,l=%d,z=%d,N=%d,T=%d)\n",
                    wid,t,did,i,ddS.z[i],
                    (int)ddS.m_vte[wid][t][e],(int)ddS.s_vte[wid][t][e]);
#endif
    }
  endword:
    /*  advance the multi index past this word when it is a multi  */
    if ( PCTL_BURSTY() && M_multi(i) ) {
      mi++;
    }
  }
  return logdoc;
}
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; /* * dump words to 
file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp = dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || 
ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); yap_message(" ew=%.0lf", ew); yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); fprintf(rp," %.2lf", ew); fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ 
yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", tscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. topics */ yap_message("Topic root words="); if ( fullreport ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 
100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-3 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } free(pvec); free(gtvec); free(gpvec); }
/*========================================== * main *========================================== */ int main(int argc, char* argv[]) { int c, iter, ITER=0, seed=0; enum dataType data = LdaC; enum dataType testdata = LdaC; int dots = 0; enum GibbsType fix_hold = GibbsNone; char *stem; char *resstem; int topwords = 20; int noerrorlog = 0; int displayed = 0; int load_vocab = 0; int checkpoint = 0; int restart = 0; int dopmi = 0; int restart_hca = 0; int load_phi = 0; int load_mu = 0; int procs = 1; int maxW = 0; enum ScoreType score=ST_idf; double BM0val=0, BM1val =0, BP0val=0, BP1val=0; clock_t t1=0, t2=0, t3=0; double tot_time = 0; double psample_time = 0; enum ParType par; /* * default values */ ddN.T = 10; ITER = 100; ddN.TEST = 0; pctl_init(); while ( (c=getopt(argc, argv,"b:c:C:d:ef:F:g:G:h:K:l:L:N:o:pq:vr:s:S:t:T:vVW:"))>=0 ) { switch ( c ) { case 'b': if ( !optarg || sscanf(optarg,"%d",&ddP.back)!=1 ) yap_quit("Need a valid 'b' argument\n"); break; case 'c': if ( !optarg || sscanf(optarg,"%d",&checkpoint)!=1 ) yap_quit("Need a valid 'c' argument\n"); break; case 'C': if ( !optarg || sscanf(optarg,"%d",&ITER)!=1 ) yap_quit("Need a valid 'C' argument\n"); break; case 'd': if ( !optarg || sscanf(optarg,"%d",&dots)!=1 ) yap_quit("Need a valid 'd' argument\n"); break; case 'e': noerrorlog++; break; case 'f': if ( strcmp(optarg,"witdit")==0 ) data = WitDit; else if ( strcmp(optarg,"docword")==0 ) data = Docword; else if ( strcmp(optarg,"ldac")==0 ) data = LdaC; else if ( strcmp(optarg,"bag")==0 ) data = TxtBag; else if ( strcmp(optarg,"lst")==0 ) data = SeqTxtBag; else yap_quit("Illegal data type for -f\n"); break; case 'F': if ( strcmp(optarg,"all")==0 ) { for (par=ParAM; par<=ParBB; par++) ddT[par].fix = 1; } else { par = findpar(optarg); if ( par==ParNone ) yap_quit("Illegal arg for -F\n"); ddT[par].fix = 1; } break; case 'g': { char var[100]; int st=0; if ( !optarg || sscanf(optarg,"%[^, ],%d", &var[0], &st)<1 ) yap_quit("Need a valid 'g' argument\n"); par = 
findpar(var); if ( par==ParBP1 ) ddP.kbatch = st; else yap_quit("Illegal var for -g\n"); } break; case 'G': { char var[100]; int st=0, cy=0; if ( !optarg || sscanf(optarg,"%[^, ],%d,%d", &var[0], &cy, &st)<2 || st<0 || cy<0 ) yap_quit("Need a valid 'G' argument\n"); par = findpar(var); if ( par==ParNone || par==ParB0P || par==ParB0M ) yap_quit("Illegal var for -G\n"); ddT[par].fix = 0; ddT[par].start = st; ddT[par].cycles = cy; } break; case 'h': { fix_hold = GibbsHold; if ( !optarg ) yap_quit("Need a valid 'h' argument\n"); if ( strncmp(optarg,"dict,",5)==0 ) { if ( sscanf(&optarg[5],"%d",&ddP.hold_dict)<1 || ddP.hold_dict<2 ) yap_quit("Need a valid 'hdict' argument\n"); } else if ( strncmp(optarg,"fract,",6)==0 ) { if ( sscanf(&optarg[6],"%lf",&ddP.hold_fraction)<1 || ddP.hold_fraction<=0 || ddP.hold_fraction>=1 ) yap_quit("Need a valid 'hfract' argument\n"); } else if ( strncmp(optarg,"doc,",4)==0 ) { if ( sscanf(&optarg[4],"%d",&ddP.hold_every)<1 || ddP.hold_every<2 ) yap_quit("Need a valid 'hdoc' argument\n"); } else yap_quit("Need a valid 'h' argument\n"); } break; case 'K': if ( !optarg || sscanf(optarg,"%d",&ddN.T)!=1 ) yap_quit("Need a valid 'K' argument\n"); break; case 'l': if ( !optarg ) yap_quit("Need a valid 'l ' argument\n"); if ( strncmp(optarg,"phi,",4)==0 ) { if ( sscanf(&optarg[4],"%d,%d",&ddP.phiiter, &ddP.phiburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"theta,",6)==0 ) { if ( sscanf(&optarg[6],"%d,%d",&ddP.thetaiter, &ddP.thetaburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"mu,",3)==0 ) { if ( sscanf(&optarg[3],"%d,%d",&ddP.muiter, &ddP.muburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"prog,",5)==0 ) { if ( sscanf(&optarg[5],"%d,%d",&ddP.progiter, &ddP.progburn)<2 ) yap_quit("Need a valid 'l prog,' argument\n"); } else yap_quit("Need a valid DIAG code in 'l' argument\n"); break; case 'L': if ( !optarg ) yap_quit("Need a valid 'L ' 
argument\n"); if ( strncmp(optarg,"like,",5)==0 ) { if ( sscanf(&optarg[5],"%d,%d",&ddP.mltiter, &ddP.mltburn)<1 ) yap_quit("Need a valid 'L like' argument\n"); } else yap_quit("Need a valid DIAG code in 'L' argument\n"); break; case 'N': if ( !optarg || sscanf(optarg,"%d,%d", &ddP.maxN, &ddP.maxM)<1 ) yap_quit("Need a valid 'N' argument\n"); break; case 'o': { char *ptr = strchr(optarg, ','); int len = strlen(optarg); if ( ptr ) len = ptr - optarg; if ( strncmp(optarg,"idf",len)==0 ) score = ST_idf; else if ( strncmp(optarg,"count",len)==0 ) score = ST_count; else if ( strncmp(optarg,"Q",len)==0 ) score = ST_Q; else if ( strncmp(optarg,"cost",len)==0 ) score = ST_cost; else yap_quit("Need a valid parameter for 'o' argument\n"); if ( ptr ) { /* there was a second arg */ if ( sscanf(ptr+1, "%d", &topwords) != 1) yap_quit("Need a valid second 'o' argument\n"); } break; } break; case 'p': dopmi++; break; case 'q': if(!optarg || sscanf(optarg, "%d", &procs) != 1) yap_quit("Need a valid 'q' argument\n"); break; case 'r': if(!optarg ) yap_quit("Need a valid 'r' argument\n"); if ( strcmp(optarg,"tca")==0 ) restart++; else if ( strcmp(optarg,"hca")==0 ) restart_hca++; else if ( strcmp(optarg,"phi")==0 ) load_phi++; else if ( strcmp(optarg,"mu")==0 ) load_mu++; else yap_quit("Need a valid 'r' argument\n"); break; case 's': if ( !optarg || sscanf(optarg,"%d",&seed)!=1 ) yap_quit("Need a valid 's' argument\n"); break; case 'S': { char var[100]; double vin=0; if ( !optarg || sscanf(optarg,"%[^=, ]=%lf", &var[0], &vin)<2 ) yap_quit("Need a valid 'S' argument\n"); par = findpar(var); if ( par==ParNone ) yap_quit("Illegal var for -S\n"); else if ( par==ParBM0 ) BM0val = vin; else if ( par==ParBM1 ) BM1val = vin; else if ( par==ParBP0 ) BP0val = vin; else if ( par==ParBP1 ) BP1val = vin; else *(ddT[par].ptr) = vin; } break; case 't': if ( !optarg || sscanf(optarg,"%d",&ddP.training)!=1 ) yap_quit("Need a valid 't' argument\n"); break; case 'T': if ( !optarg ) yap_quit("Need a 
valid 'T' argument\n"); { char *tname = data_name(optarg,data); FILE *fp = fopen(tname,"r"); if ( fp==NULL ) { free(tname); tname = data_name(optarg,testdata); fp = fopen(tname,"r"); } else { testdata = data; } free(tname); if ( fp!=NULL ) { /* its a valid test filename */ ddP.teststem = optarg; fclose(fp); } else if ( sscanf(optarg,"%d",&ddN.TEST)!=1 ) yap_quit("Need a valid 'T' argument\n"); } break; case 'v': verbose++; break; case 'V': load_vocab = 1; break; case 'W': if ( !optarg || sscanf(optarg,"%d",&maxW)<1 ) yap_quit("Need a valid 'W' argument\n"); break; default: yap_quit("Unknown option '%c'\n", c); } } if (argc-optind != 2) { usage(); exit(-1); } if ( optind>=argc ) { yap_quit("No arguments given\n"); } stem = strdup(argv[optind++]); resstem = strdup(argv[optind++]); if ( dopmi ) load_vocab = 1; if ( dopmi && verbose !=2 ) { /* * due to the use of the ".top" file * its really multi-purpose */ yap_quit("When computing PMI verbose must be exactly 2\n"); } if ( noerrorlog==0 ) { char *wname = yap_makename(resstem, ".log"); yap_file(wname); free(wname); } yap_commandline(argc, argv); #ifdef H_THREADS yap_message(" Threads,"); #endif if ( restart || restart_hca ) { char *fname = yap_makename(resstem,".par"); FILE *fp = fopen(fname,"r"); char *buf; if ( !fp ) yap_quit("Parameter file '%s' doesn't exist\n", fname); fclose(fp); free(fname); buf = readpar(resstem,"T",50); if ( !buf ) yap_quit("Parameter file '%s' has no T\n", fname); ddN.T = atoi(buf); free(buf); if ( restart ) { buf = readpar(resstem,"E",50); if ( !buf ) yap_quit("Parameter file '%s' has no E\n", fname); ddN.E = atoi(buf); free(buf); pctl_read(resstem); } if ( maxW==0 ) { buf = readpar(resstem,"W",50); if ( buf ) { maxW = atoi(buf); free(buf); } } if ( ddP.training==0 ) { buf = readpar(resstem,"TRAIN",50); if ( buf ) { ddP.training = atoi(buf); free(buf); } } if ( ddN.TEST==0 ) { buf = readpar(resstem,"TEST",50); if ( buf ) { ddN.TEST = atoi(buf); free(buf); } } } assert(ddN.T>0); 
assert(ddN.TEST>=0); assert(restart || restart_hca || ITER>0); if ( load_phi && ddP.phiiter>0 ) yap_quit("Options '-l phi,...' and '-r phi' incompatible\n"); if ( load_mu && ddP.muiter>0 ) yap_quit("Options '-l mu,...' and '-r mu' incompatible\n"); /* * set random number generator */ if ( seed ) { rng_seed(rngp,seed); } else { rng_time(rngp,&seed); } yap_message("Setting seed = %lu\n", seed); /* * read data and get dimensions */ { D_bag_t *dbp = data_read(stem, data); int training = pctl_training(dbp->D); if ( ddP.teststem ) { D_bag_t *dbpt = data_read(ddP.teststem, testdata); /* need to load a separate test set, strip to bare training */ data_shrink(dbp, training); ddN.TEST = dbpt->D; data_append(dbp, dbpt); free(dbpt->w); free(dbpt->d); free(dbpt); } if ( maxW>0 ) { if ( dbp->W <= maxW ) dbp->W = maxW; if ( dbp->W > maxW ) data_vocabshrink(dbp, maxW); } /* * transfer into system */ ddN.D = dbp->D; ddN.W = dbp->W; ddN.N = dbp->N; ddN.NT = dbp->N; ddN.DT = training; ddD.w = dbp->w; ddD.d = dbp->d; free(dbp); if ( ddN.DT<ddN.D ) { /* recompute NT */ int i; for (i=0; i<ddN.N; i++) if ( ddD.d[i]>=ddN.DT ) break; ddN.NT = i; } } data_read_epoch(stem); /* * at this point, dimensions are fixed, so load phi and mu if needed */ if ( load_phi ) pctl_loadphi(resstem); if ( load_mu ) pctl_loadmu(resstem); /* * correct parameters after command line */ pctl_fix(ITER); if ( BM0val>0 ) { ddP.b_mu[0] = BM0val; } if ( BM1val>0 ) { int i; for (i=1; i<ddN.E; i++) ddP.b_mu[i] = BM1val; } if ( BP0val>0 ) { int i; for (i=0; i<ddN.T; i++) ddP.b_phi[0][i] = BP0val; } if ( BP1val>0 ) { int i; if ( ddN.E==1 ) yap_quit("b_phi[1] invalid when epochs==1\n"); for (i=0; i<ddN.T; i++) ddP.b_phi[1][i] = BP1val; } pctl_samplereport(); /* * all data structures */ data_alloc(); if ( ddP.phiiter>0 ) phi_init(resstem); else ddS.phi = NULL; if ( ddP.muiter>0 ) mu_init(resstem); else ddS.mu = NULL; if ( ddP.thetaiter>0 ) theta_init(resstem); else ddS.theta = NULL; tca_alloc(); if ( PCTL_BURSTY() ) 
dmi_init(&ddM, ddS.z, ddD.w, ddD.N_dTcum, ddN.T, ddN.N, ddN.W, ddN.D, ddN.DT, (fix_hold==GibbsHold)?pctl_hold:NULL); if ( load_vocab ) { data_vocab(stem); } cache_init(); /* * yap some details */ data_report(ITER, seed); pctl_report(); /* * load/init topic assignments and prepare statistics */ if ( restart || restart_hca) { tca_read_z(resstem, 0, ddN.DT); tca_rand_z(ddN.T, ddN.DT, ddN.D); } else { tca_rand_z(ddN.T, 0, ddN.D); } tca_reset_stats(resstem, restart, 0); if ( (restart || restart_hca ) && ITER ) yap_message("Initial log_2(perp)=%lf\n", -M_LOG2E * likelihood()/ddN.NT); if ( ITER ) yap_report("cycles: "); for (iter=0; iter<ITER; iter++) { int pro; double thislp = 0; int thisNd = 0; int doc; #ifdef H_THREADS pthread_t thread[procs]; #endif D_pargs_p parg[procs]; #ifdef MU_CACHE mu_side_fact_reinit(); #endif #ifdef PHI_CACHE phi_cache_reinit(); #endif t1 = clock(); /* * sampling */ #ifdef IND_STATS ddP.doc_ind_stats = u32tri(ddN.T,ddN.E,ddN.E); ddP.word_ind_stats = u32tri(ddN.T,ddN.E,ddN.E); #endif /* a bit complex if no threads! 
*/ doc = 0; for (pro = 0 ; pro < procs ; pro++){ parg[pro].dots=dots; parg[pro].procs=procs; parg[pro].doc = &doc; #ifndef H_THREADS sampling_p(&parg[pro]); #else if ( procs==1 ) sampling_p(&parg[pro]); else if( pthread_create(&thread[pro],NULL,sampling_p,(void*) &parg[pro]) != 0){ yap_message("thread failed %d\n",pro+1 ); } #endif } #ifdef H_THREADS if ( procs>1 ) { //waiting for threads to finish for (pro = 0; pro < procs; pro++){ pthread_join(thread[pro], NULL); } } #endif // getting lp, Nd and clock for(pro = 0; pro < procs; pro++){ thislp += parg[pro].thislp; thisNd += parg[pro].thisNd; tot_time += parg[pro].tot_time; } #ifdef H_THREADS if ( procs>1 ) tca_reset_stats(NULL,1,1); #endif /* * full check */ #ifndef NDEBUG { int e, d; check_cp_et(); for (e=0; e<ddN.E; e++) check_m_vte(e); for (d=0; d<ddN.DT; d++) check_n_dt(d); } #endif #ifdef IND_STATS { char *fname = yap_makename(resstem,".istats"); FILE *ifp = fopen(fname,"a"); int e1, e2, kk; fprintf(ifp,"Iteration %d\n", iter); for (kk=0; kk<ddN.T; kk++) { fprintf(ifp," Topic %d\n", kk); for (e1=0; e1<ddN.E; e1++) { fprintf(ifp," Epoch %d\n ", e1); for (e2=0; e2<ddN.E; e2++) fprintf(ifp," %u", (unsigned)ddP.doc_ind_stats[kk][e1][e2]); fprintf(ifp,"\n "); for (e2=0; e2<ddN.E; e2++) fprintf(ifp," %u", (unsigned)ddP.word_ind_stats[kk][e1][e2]); fprintf(ifp,"\n"); } } fclose(ifp); free(ddP.doc_ind_stats[0][0]); free(ddP.doc_ind_stats[0]); free(ddP.doc_ind_stats); free(ddP.word_ind_stats[0][0]); free(ddP.word_ind_stats[0]); free(ddP.word_ind_stats); free(fname); } #endif /* * sample hyperparameters */ t3 = clock(); pctl_sample(iter, procs); /* * do time calcs here to remove diagnostics+reporting */ t2 = clock(); tot_time += (double)(t2 - t1) / CLOCKS_PER_SEC; psample_time += (double)(t2 - t3) / CLOCKS_PER_SEC; /* * progress reports */ if ( ( iter>ddP.progburn && (iter%ddP.progiter)==0 ) || iter+1>=ITER ) { yap_message(" %d\nlog_2(perp)=%lf,%lf", iter, -M_LOG2E * likelihood()/ddN.NT, -M_LOG2E * thislp/thisNd); 
pctl_update(iter); if ( verbose && iter%10==0 ) yap_probs(); if ( iter>0 && verbose>1 ) { if ( ddN.tokens ) { tca_displaytopics(resstem,topwords,score); displayed++; } } if ( iter+1<ITER ) { // yap_message("\n"); yap_report("cycles: "); } } else { yap_message(" %d", iter); if ( verbose>1) yap_message("\n"); } if ( checkpoint>0 && iter>0 && iter%checkpoint==0 ) { data_checkpoint(resstem, stem, iter+1); yap_message(" checkpointed\n"); tca_report(resstem, stem, ITER, procs, fix_hold, (dopmi&&displayed>0)?1:0); } if ( ddP.phiiter>0 && iter>ddP.phiburn && (iter%ddP.phiiter)==0 ) phi_update(); if ( ddP.thetaiter>0 && iter>ddP.thetaburn && (iter%ddP.thetaiter)==0 ) theta_update(); if ( ddP.muiter>0 && iter>ddP.muburn && (iter%ddP.muiter)==0 ) mu_update(); } // over iter if ( ITER ) yap_report("Finished after %d cycles on average of %lf+%lf(s) per cycle\n", iter, (tot_time-psample_time)/iter, psample_time/iter); if ( ( verbose==1 || ((iter+1)%5!=0 && verbose>1) ) ) { if ( ddN.tokens ) { tca_displaytopics(resstem,topwords,score); displayed++; } } yap_probs(); if ( ITER>0 ) data_checkpoint(resstem, stem, ITER); tca_report(resstem, stem, ITER, procs, fix_hold, (dopmi&&displayed>0)?1:0); if ( ddP.phiiter>0 ) phi_save(resstem); if ( ddP.thetaiter>0 ) theta_save(resstem); if ( ddP.muiter>0 ) mu_save(resstem); /* * free */ phi_free(); theta_free(); mu_free(); cache_free(); pctl_free(); data_free(); dmi_free(&ddM); tca_free(); free(stem); free(resstem); rng_free(rngp); return 0; }
int myarmsMH(double xl, double xr, double (*myfunc)(double x, void *mydata), void *mydata, double *xval, char *label, int doMH) { double result = *xval; double *resvec = NULL; double startval = *xval; int errcode; if ( fabs(*xval-xr)/(xr)<0.00001 ) { *xval = xr * 0.999 + xl * 0.001; } if ( fabs(*xval-xl)/xl<0.00001 ) { *xval = xl * 0.999 + xr * 0.001; } myarms_evals = 0; if ( doMH ) { resvec = malloc(sizeof(resvec[0])*NSAMP); errcode = myarms_simple(6, &xl, &xr, myfunc, mydata, doMH, xval, resvec, NSAMP); result = resvec[NSAMP-1]; free(resvec); } else errcode = myarms_simple(6, &xl, &xr, myfunc, mydata, 0, xval, &result, 1); /* * 1007, 1003 is out of bounds */ if ( errcode && errcode!=1007 && errcode!=1003 && errcode!=2001 && (errcode!=2000 || startval!=result ) ) { yap_quit(" myarmsMH(%lf,%s)->%d = %lf,%lf%s->%lf, w %d calls, quitting\n", startval, label, errcode, myarms_last, result, (!ISFINITE(result))?"(inf)":"", *xval, myarms_evals); } if ( errcode==1007 || errcode==1003 ) { /* * hit bounds, for safety use start val */ yap_message(" error myarmsMH(%lf,%s)->%d = %lf,%lf%s->%lf, w %d calls, quitting\n", startval, label, errcode, myarms_last, result, (!ISFINITE(result))?"(inf)":"", *xval, myarms_evals); result = startval; } if ( errcode==2001 ) { /* * too many calls in Metrop-Hastings */ yap_message(" error myarmsMH(%lf,%s)->%d = %lf,%lf%s->%lf, w %d calls, quitting\n", startval, label, errcode, myarms_last, result, (!ISFINITE(result))?"(inf)":"", *xval, myarms_evals); } /* * note, sometimes the value is returned * unchanged .... seems to be when the * posterior is really focussed */ if ( verbose>1 || !ISFINITE(result) || startval==result ) yap_message(" myarmsMH(%s) = %lf%s<-%lf, w %d calls %s\n", label, result, (!ISFINITE(result))?"(inf)":"", *xval, myarms_evals, (startval==result)?"UNCHANGED":""); if ( ISFINITE(result) ) { if ( result<xl ) result = xl; else if ( result>xr ) result = xr; *xval = result; } if ( !ISFINITE(result) ) { return 1; } return 0; }
void hca_displaytopics(char *resstem, int topword, enum ScoreType scoretype) { int w,k; int *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; char *fname = yap_makename(resstem,".top"); int nophi = (ddP.phi==NULL) && (ddS.phi==NULL); FILE *fp; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } fp = fopen(fname,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", fname); /* * first collect counts of each word/term */ if ( scoretype != ST_count && scoretype != ST_phi ) { NwK = u32vec(ddN.W); if ( !NwK ) yap_quit("Out of memory in hca_displaytopics()\n"); for (w=0; w<ddN.W; w++) { NwK[w] = 0; } NWK = 0; for (w=0; w<ddN.W; w++) { for (k=0; k<ddN.T; k++) { NwK[w] += ddS.Nwt[w][k]; // should use CCT_ReadN() } NWK += NwK[w]; } } assert(ddN.tokens); for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk\n"); for (k=0; k<ddN.T; k++) { int cnt; double spw; double spd; tscorek = k; /* * print top words */ cnt=0; if ( ddP.phi==NULL ) { for (w=0; w<ddN.W; w++) { if ( ddS.Nwt[w][k]>0 ) indk[cnt++] = w; } } else { float **phi; if ( ddP.phi ) phi = ddP.phi; else phi = ddS.phi; for (w=0; w<ddN.W; w++) { if ( phi[k][w]>0.5/ddN.W ) indk[cnt++] = w; } } topk(topword, cnt, indk, tscore); spd = ((double)nonzero_Ndt(k))/((double)ddN.DT); sparsitydoc += spd; if ( nophi ) { spw = ((double)nonzero_Nwt(k))/((double)ddN.W); sparsityword += spw; } if ( ddS.NWt[k]*ddN.T*100<Nk_tot ) underused++; yap_message("\nTopic %d (", k); if ( ddP.phi==NULL ) yap_message((ddN.T>200)?"p=%.3lf%%,":"p=%.2lf%%,", 100*((double)ddS.NWt[k])/(double)Nk_tot); if ( nophi ) yap_message("ws=%.1lf%%,", 100*(1-spw)); 
else yap_message("#=%.0lf,", exp(phi_entropy(k))); yap_message("ds=%.1lf%%", 100*(1-spd) ); fprintf(fp,"%d: ", k); yap_message(") words ="); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( ddP.PYbeta && nophi ) { int cnt; /* * print root words */ tscorek = -1; cnt=0; for (w=0; w<ddN.W; w++) { if ( ddS.TwT[w]>0 ) indk[cnt++] = w; } topk(topword, cnt, indk, tscore); yap_message("\nTopic root words ="); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( nophi ) yap_message("Average topicXword sparsity = %.2lf%%, ", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%, " "underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddP.bdk!=NULL) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("doc PYP report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } fclose(fp); free(fname); free(indk); if ( scoretype != ST_count ) { free(NwK); NwK = NULL; } }
void like_merge(float minprop, double scale, int best) { int k1, k2; double realdiff = 0; double likediff; int got=0; /* only use this for reporting ; should disable in production */ float **cmtx; int title = 0; int mincount = minprop * ddN.NT; bestmerge_t B[ddN.T]; if ( mincount<5 ) mincount = 5; assert(ddP.phi==NULL); assert(ddP.theta==NULL); for (k1=0; k1<ddN.T; k1++) B[k1].ml = 0; cmtx = hca_topmtx(); if ( !cmtx ) yap_quit("Out of memory in like_merge()\n"); if ( ddP.PYbeta!=H_None ) yap_quit("Non-parametric beta unimplemented with merge\n"); for (k1=1; k1<ddN.T; k1++) { if ( ddS.NWt[k1]<=mincount ) continue; for (k2=0; k2<k1; k2++) { if ( ddS.NWt[k2]<=mincount ) continue; /* now have a pair to check (k1,k2) with OK counts */ if ( ddP.PYalpha==H_None ) likediff = likemerge_DIRalpha(k1,k2); else likediff = likemerge_alpha(k1, k2); if ( ddP.PYbeta==H_None ) likediff += likemerge_DIRbeta(k1,k2); else likediff += likemerge_beta(k1, k2); if ( likediff>0 ) { got++; if ( title==0 && verbose ) { double like = scale * likelihood(); yap_message("\nPre merge log_2(perp)=%.4lf", like); realdiff = like; } if ( verbose>1 ) { if ( title==0 ) yap_message(", merge report:\n"); yap_message(" %d+%d cor=%0.6f like+=%0.6g", k1, k2, cmtx[k1][k2], scale * likediff); } title = 1; if ( likediff>B[k1].ml ) { B[k1].ml = likediff; B[k1].k2 = k2; if ( verbose>1 ) yap_message(" stored"); } if ( likediff>B[k2].ml ) { B[k2].ml = likediff; B[k2].k2 = k1; if ( verbose>1 ) yap_message(" stored"); } if ( verbose>1 ) yap_message("\n"); } else if ( verbose>2 ) { yap_message(" %d+%d cor=%0.6f like+=%0.6g\n", k1, k2, cmtx[k1][k2], scale * likediff); } } } while ( got && best-->0 && (k1=next_best(&B[0]))>=0 ) { /* * have a good merge at position k1; */ merge_alpha_t Ma; merge_beta_t Mb; k2 = B[k1].k2; yap_message(" best merge is %d+%d giving diff=%lf\n", k1, k2, scale* B[k1].ml); Ma.Tdt = NULL; Mb.Twt = NULL; if ( ddP.PYalpha ) merge_init_Tdt(k1, k2, &Ma); if ( ddP.PYbeta ) merge_init_Twt(k1, k2, 
&Mb); // WRAY: need to checkk what this does, it it why change? hca_merge_stats(k1, k2, Ma.Tdt, Mb.Twt); // hca_correct_tdt(0); if ( ddP.PYalpha ) merge_free_Tdt(&Ma); if ( ddP.PYbeta ) merge_free_Twt(&Mb); /* block them from getting picked again */ B[k1].ml = 0; B[k2].ml = 0; { int k; for (k=0; k<ddN.T; k++) { if ( B[k].k2==k1 || B[k].k2==k2 ) B[k].ml = 0; } } } if ( got && verbose ) { double like = scale * likelihood(); realdiff -= like; yap_message("\nPost merge log_2(perp)=%.4lf (%.6lf)", like, realdiff); } if ( got==0 && verbose ) { yap_message("Merge found no candidates\n"); } free(cmtx[0]); free(cmtx); }