/**
 * Answer a top-k query over the box [queryLow, queryHigh].
 *
 * scanIndex() fills a per-id counter map for the query box (its exact
 * counting semantics live in scanIndex, not shown here); the k entries
 * that rank first under compfunc are then copied out in sorted order.
 *
 * @param queryLow   lower bounds of the query, forwarded to scanIndex()
 * @param queryHigh  upper bounds of the query, forwarded to scanIndex()
 * @param k          number of result slots
 * @return a vector of exactly k (id, count) pairs.  When the counter map
 *         holds fewer than k ids, partial_sort_copy only overwrites the
 *         leading slots, so the trailing ones keep their value-initialised
 *         contents.
 */
vector<pair<T_ID, uint32> > InvertedIndexMulDim::topkQuery(vector<int>& queryLow, vector<int>& queryHigh, uint32 k){
    // id -> number of hits collected by the index scan
    map<T_ID,uint32> hitsPerId;
    scanIndex(queryLow, queryHigh, hitsPerId);

    // k pre-sized result slots; the algorithm fills at most
    // min(k, hitsPerId.size()) of them, ordered by compfunc
    vector<pair<T_ID, uint32> > best(k);
    partial_sort_copy(hitsPerId.begin(), hitsPerId.end(),
                      best.begin(), best.end(),
                      compfunc);

    cout<<"topk.size():"<<best.size()<<endl;
    return best;
}
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *termindk = NULL; uint32_t *indk = NULL; int Nk_tot = 0; double (*termtscore)(int) = NULL; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); //#define XTRA // prints model topic probs after observed #ifdef XTRA double *gtavec = calloc(ddN.T,sizeof(gtavec[0])); #endif float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); #ifdef KL float *dfvec = calloc(ddN.W,sizeof(dfvec[0])); #endif double *ngalpha = NULL; T_stats_t *termstats; #ifdef XTRA get_probs(gtavec); #endif if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phirat ) { tscore = phiratioscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } if ( ddS.TwT==NULL && ddP.phi==NULL && scoretype == ST_phirat ) yap_quit("Cannot use '-orat' option with this model/settings.\n"); if ( ddP.PYalpha==H_NG ) { /* * provide an estimate of alpha */ ngalpha = dvec(ddN.T); get_probs(ngalpha); for (k=0; k<ddN.T; k++) { ddP.alphapr[k] = ngalpha[k]; } } /* * returns null if no relevant data file */ termstats = tstats_init(ddS.z, ddD.NdTcum, ddN.T, ddN.DT, stem); if ( termstats ) { if ( scoretype == ST_idf ) { termtscore = termidfscore; } else termtscore = termcountscore; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); if ( termstats ) build_termNwK(termstats); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += 
gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); if ( termstats ) { termindk = malloc(sizeof(*indk)*termstats->K); if ( !termindk ) yap_quit("Cannot allocate termindk in hca_displaytopics()\n"); } data_df(stem); #ifdef KL for (w=0; w<ddN.W; w++) dfvec[w] = ddD.df[w]; #endif /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; if ( termstats ) { termcnt = buildtermindk(k, termindk, termstats); topk(topword, termcnt, termindk, termtscore); } /* * dump words to file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } if ( termstats ) { for (w=0; w<topword && w<termcnt; w++) { fprintf(fp," %d", (int)termstats->Kmin+termindk[w]); } } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp 
= dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); if ( termstats ) { termcnt = buildtermindk(kk, termindk, termstats); topk(topword, termcnt, termindk, termtscore); if ( topword<termcnt ) termcnt = topword; } /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); 
#else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; #define MALLET_EW #ifdef MALLET_EW double ewp = dprop?(1.0/fv_expprob(pvec,ddN.W)):ddN.W; #endif double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); #ifdef XTRA yap_message((ddN.T>200)?"/%.3lf%%":"/%.2lf%%",100*gtavec[kk]); #endif if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); #ifdef KL yap_message(" ew=%lf", ew); #else yap_message(" ew=%.0lf", ew); #endif #ifdef MALLET_EW yap_message(" ewp=%.1lf", ewp); #endif yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddP.NGbeta ) { /* * approx. 
as sqrt(var(lambda_k)/lambda-normaliser */ double ngvar = sqrt(ddP.NGalpha[kk]) * (ngalpha[kk]/ddP.NGalpha[kk]); yap_message(" ng=%.4lf,%.4lf", ngalpha[kk], ngvar/ngalpha[kk]); if ( ddS.sparse ) yap_message(",%.4f", 1-((float)ddS.sparseD[kk])/ddN.DTused); if ( verbose>2 ) yap_message(" ngl=%.4lf,%.4lf, nga=%.4lf,%.4lf", ddP.NGalpha[kk]/ddP.NGbeta[kk], sqrt(ddP.NGalpha[kk]/ddP.NGbeta[kk]/ddP.NGbeta[kk]), ddP.NGalpha[kk], ddP.NGbeta[kk]); } if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); #ifdef KL yap_message(" %lf", ew); #else fprintf(rp," %.2lf", ew); #endif #ifdef MALLET_EW fprintf(rp," %.2lf", ewp); #endif fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(tscore(indk[w])+0.2)); else yap_message("(%6lf)", tscore(indk[w])); } if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( 
ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } if ( termstats ) { yap_message(" terms="); for (w=0; w<termcnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", termstats->tokens[termindk[w]]); else yap_message("%d", termstats->Kmin+termindk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(termtscore(termindk[w])+0.2)); else yap_message("(%6lf)", termtscore(termindk[w])); } if ( fullreport ) { fprintf(rp, "term %d %d %d", kk, termindk[w], w); fprintf(rp, " %d", termstats->Nkt[termindk[w]][kk]); fprintf(rp, " %s", termstats->tokens[termindk[w]]); fprintf(rp, "\n"); } } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); /* this case gives bad results */ // if ( scoretype == ST_phirat ) topk(topword, cnt, indk, phiratioscore); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. 
topics */ yap_message("Topic root words="); if ( fullreport ) { int w; if ( ddP.phi && ddP.PYbeta!=H_PDP ) { for (w=0; w<ddN.W; w++) pvec[w] = ddS.phi[ddN.T][w]; } else { for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); } #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); #else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 && !ddP.phi ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\nTopical words="); topk(topword, cnt, indk, phiinvratioscore); for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddS.sparse && ddP.PYalpha==H_NG ) { double avesp = 0; // correct_docsp(); for (k=0; k<ddN.T; k++) { avesp += gtvec[k]; } // check gtvec[] sums to 1 assert(fabs(avesp-1.0)<0.00001); avesp = 0; for (k=0; k<ddN.T; k++) { avesp += gtvec[k]*((float)ddS.sparseD[k])/ddN.DTused; assert(ddS.sparseD[k]<=ddN.DTused); } assert(avesp<=1.0); assert(avesp>0.0); yap_message("IBP sparsity = %.2lf%%\n", 
100*(1-avesp)); } if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-7 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( ngalpha ) free(ngalpha); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } #ifdef KL free(dfvec); #endif free(pvec); free(gtvec); free(gpvec); tstats_free(termstats); }
void hca_displaytopics(char *resstem, int topword, enum ScoreType scoretype) { int w,k; int *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; char *fname = yap_makename(resstem,".top"); int nophi = (ddP.phi==NULL) && (ddS.phi==NULL); FILE *fp; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } fp = fopen(fname,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", fname); /* * first collect counts of each word/term */ if ( scoretype != ST_count && scoretype != ST_phi ) { NwK = u32vec(ddN.W); if ( !NwK ) yap_quit("Out of memory in hca_displaytopics()\n"); for (w=0; w<ddN.W; w++) { NwK[w] = 0; } NWK = 0; for (w=0; w<ddN.W; w++) { for (k=0; k<ddN.T; k++) { NwK[w] += ddS.Nwt[w][k]; // should use CCT_ReadN() } NWK += NwK[w]; } } assert(ddN.tokens); for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk\n"); for (k=0; k<ddN.T; k++) { int cnt; double spw; double spd; tscorek = k; /* * print top words */ cnt=0; if ( ddP.phi==NULL ) { for (w=0; w<ddN.W; w++) { if ( ddS.Nwt[w][k]>0 ) indk[cnt++] = w; } } else { float **phi; if ( ddP.phi ) phi = ddP.phi; else phi = ddS.phi; for (w=0; w<ddN.W; w++) { if ( phi[k][w]>0.5/ddN.W ) indk[cnt++] = w; } } topk(topword, cnt, indk, tscore); spd = ((double)nonzero_Ndt(k))/((double)ddN.DT); sparsitydoc += spd; if ( nophi ) { spw = ((double)nonzero_Nwt(k))/((double)ddN.W); sparsityword += spw; } if ( ddS.NWt[k]*ddN.T*100<Nk_tot ) underused++; yap_message("\nTopic %d (", k); if ( ddP.phi==NULL ) yap_message((ddN.T>200)?"p=%.3lf%%,":"p=%.2lf%%,", 100*((double)ddS.NWt[k])/(double)Nk_tot); if ( nophi ) yap_message("ws=%.1lf%%,", 100*(1-spw)); 
else yap_message("#=%.0lf,", exp(phi_entropy(k))); yap_message("ds=%.1lf%%", 100*(1-spd) ); fprintf(fp,"%d: ", k); yap_message(") words ="); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( ddP.PYbeta && nophi ) { int cnt; /* * print root words */ tscorek = -1; cnt=0; for (w=0; w<ddN.W; w++) { if ( ddS.TwT[w]>0 ) indk[cnt++] = w; } topk(topword, cnt, indk, tscore); yap_message("\nTopic root words ="); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( nophi ) yap_message("Average topicXword sparsity = %.2lf%%, ", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%, " "underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddP.bdk!=NULL) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("doc PYP report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } fclose(fp); free(fname); free(indk); if ( scoretype != ST_count ) { free(NwK); NwK = NULL; } }
/*
 * Fit the models selected by topk() by least squares on the normal
 * equations and return the per-model summaries as a named R list.
 *
 * Y           response vector (its length is taken as the number of rows)
 * X           design matrix; the column count is read from its dim attribute
 * Rprobinit   initial inclusion probabilities; a duplicate is handed to
 *             sortvars()/topk()/compute_margprobs_old() and returned
 *             as "probne0"
 * Rmodeldim   its length fixes how many model slots are allocated; a
 *             duplicate is filled with the model sizes ("size")
 * incint      not referenced in this function
 * Ralpha      first element is passed to gexpectations() as alpha
 * method      first element is passed to gexpectations()
 * modelprior  passed through to compute_prior_probs()
 *
 * Returns a list with the 12 components probne0, which, logmarg,
 * postprobs, priorprobs, sampleprobs, mse, ols, ols.se, shrinkage,
 * size and R2.
 */
SEXP deterministic(SEXP Y, SEXP X, SEXP Rprobinit, SEXP Rmodeldim, SEXP incint, SEXP Ralpha, SEXP method, SEXP modelprior)
{
  /* working copies of the inputs, then every result container;
     each allocation is protected until the final UNPROTECT */
  SEXP RXwork = PROTECT(duplicate(X)), RYwork = PROTECT(duplicate(Y));
  int nProtected = 2;
  int nModels = LENGTH(Rmodeldim);

  SEXP ANS = PROTECT(allocVector(VECSXP, 12));               nProtected++;
  SEXP ANS_names = PROTECT(allocVector(STRSXP, 12));         nProtected++;
  SEXP Rprobs = PROTECT(duplicate(Rprobinit));               nProtected++;
  SEXP R2 = PROTECT(allocVector(REALSXP, nModels));          nProtected++;
  SEXP shrinkage = PROTECT(allocVector(REALSXP, nModels));   nProtected++;
  SEXP modelspace = PROTECT(allocVector(VECSXP, nModels));   nProtected++;
  SEXP modeldim = PROTECT(duplicate(Rmodeldim));             nProtected++;
  SEXP beta = PROTECT(allocVector(VECSXP, nModels));         nProtected++;
  SEXP se = PROTECT(allocVector(VECSXP, nModels));           nProtected++;
  SEXP mse = PROTECT(allocVector(REALSXP, nModels));         nProtected++;
  SEXP modelprobs = PROTECT(allocVector(REALSXP, nModels));  nProtected++;
  SEXP priorprobs = PROTECT(allocVector(REALSXP, nModels));  nProtected++;
  SEXP logmarg = PROTECT(allocVector(REALSXP, nModels));     nProtected++;
  SEXP sampleprobs = PROTECT(allocVector(REALSXP, nModels)); nProtected++;

  /* problem dimensions */
  int nobs = LENGTH(Y);
  int p = INTEGER(getAttrib(X, R_DimSymbol))[1];
  int k = LENGTH(modelprobs);

  double *Ywork = REAL(RYwork);
  double *Xwork = REAL(RXwork);

  /* cross-product storage plus scratch copies used for each fit */
  double *XtX = (double *) R_alloc(p * p, sizeof(double));
  double *XtXwork = (double *) R_alloc(p * p, sizeof(double));
  double *XtY = vecalloc(p);
  double *XtYwork = vecalloc(p);

  int p2 = p * p;
  int cell;
  for (cell = 0; cell < p2; cell++) {
    XtX[cell] = 0.0;
  }

  /* BLAS arguments are passed by address */
  double one = 1.0, zero = 0.0;
  int inc = 1;
  char uplo[] = "U", trans[] = "T";

  /* X'X via dsyrk (uplo "U"), y'y via ddot */
  F77_NAME(dsyrk)(uplo, trans, &p, &nobs, &one, Xwork, &nobs, &zero, XtX, &p);
  double yty = F77_NAME(ddot)(&nobs, Ywork, &inc, Ywork, &inc);

  /* mean of y and the centred sum of squares */
  double ybar = 0.0;
  int row;
  for (row = 0; row < nobs; row++) {
    ybar += Ywork[row];
  }
  ybar = ybar / (double) nobs;
  double SSY = yty - (double) nobs * ybar * ybar;

  /* X'y */
  F77_NAME(dgemv)(trans, &nobs, &p, &one, Xwork, &nobs, Ywork, &inc, &zero, XtY, &inc);

  double alpha = REAL(Ralpha)[0];

  struct Var *vars = (struct Var *) R_alloc(p, sizeof(struct Var)); /* info about the model variables */
  double *probs = REAL(Rprobs);
  int n = sortvars(vars, probs, p);

  /* storage for the models; topk() reports how many it produced */
  Bit **models = cmatalloc(k, p);
  int *model = (int *) R_alloc(p, sizeof(int));
  k = topk(models, probs, k, vars, n, p);

  /* R^2 of the full model; taken as 1 when there are no more rows than columns */
  double RSquareFull;
  if (nobs > p) {
    SEXP Rcoef_full = PROTECT(NEW_NUMERIC(p));
    SEXP Rse_full = PROTECT(NEW_NUMERIC(p));
    double *coef_full = REAL(Rcoef_full);
    double *se_full = REAL(Rse_full);
    double mse_full = yty;

    memcpy(coef_full, XtY, p * sizeof(double));
    memcpy(XtXwork, XtX, p2 * sizeof(double));
    memcpy(XtYwork, XtY, p * sizeof(double));
    cholreg(XtYwork, XtXwork, coef_full, se_full, &mse_full, p, nobs);

    RSquareFull = 1.0 - (mse_full * (double) (nobs - p)) / SSY;
    UNPROTECT(2);
  } else {
    RSquareFull = 1.0;
  }

  /* fit each of the k models in turn */
  int m;
  for (m = 0; m < k; m++) {
    Bit *included = models[m];
    int pmodel = 0;
    double pigamma = 1.0;
    int col;

    /* model size, and the product over variables of probs[col] for an
       included variable and 1-probs[col] for an excluded one */
    for (col = 0; col < p; col++) {
      int flag = (int) included[col];
      model[col] = flag;
      pmodel += flag;
      pigamma *= (double) flag * probs[col] + (1.0 - (double) flag) * (1.0 - probs[col]);
    }
    REAL(sampleprobs)[m] = pigamma;
    INTEGER(modeldim)[m] = pmodel;

    /* indices of the included columns */
    SEXP Rmodel_m = PROTECT(NEW_INTEGER(pmodel));
    int *model_m = INTEGER(Rmodel_m);
    int next = 0;
    for (col = 0; col < p; col++) {
      if (included[col]) {
        model_m[next++] = col;
      }
    }
    SET_ELEMENT(modelspace, m, Rmodel_m);
    UNPROTECT(1);   /* still reachable through modelspace */

    SEXP Rcoef_m = PROTECT(NEW_NUMERIC(pmodel));
    SEXP Rse_m = PROTECT(NEW_NUMERIC(pmodel));
    double *coefficients = REAL(Rcoef_m);
    double *se_m = REAL(Rse_m);

    /* pull this model's rows/columns out of X'X and X'y */
    int r, c;
    for (r = 0; r < pmodel; r++) {
      XtYwork[r] = XtY[model_m[r]];
      for (c = 0; c < pmodel; c++) {
        XtXwork[r * pmodel + c] = XtX[model_m[r] * p + model_m[c]];
      }
    }

    double mse_m = yty;
    memcpy(coefficients, XtYwork, sizeof(double) * pmodel);
    cholreg(XtYwork, XtXwork, coefficients, se_m, &mse_m, pmodel, nobs);

    double R2_m = 1.0 - (mse_m * (double) (nobs - pmodel)) / SSY;

    SET_ELEMENT(beta, m, Rcoef_m);
    SET_ELEMENT(se, m, Rse_m);
    REAL(R2)[m] = R2_m;
    REAL(mse)[m] = mse_m;

    double logmarg_m, shrinkage_m;
    gexpectations(p, pmodel, nobs, R2_m, alpha, INTEGER(method)[0], RSquareFull, SSY, &logmarg_m, &shrinkage_m);
    REAL(logmarg)[m] = logmarg_m;

    double prior_m = compute_prior_probs(model, pmodel, p, modelprior);
    REAL(priorprobs)[m] = prior_m;
    REAL(shrinkage)[m] = shrinkage_m;
    UNPROTECT(2);
  }

  compute_modelprobs(modelprobs, logmarg, priorprobs, k);
  compute_margprobs_old(models, modelprobs, probs, k, p);
  /* models is not released here (freechmat call left disabled) */

  /* assemble the named result list, slot by slot */
  {
    struct { SEXP value; const char *name; } slots[] = {
      { Rprobs,      "probne0" },
      { modelspace,  "which" },
      { logmarg,     "logmarg" },
      { modelprobs,  "postprobs" },
      { priorprobs,  "priorprobs" },
      { sampleprobs, "sampleprobs" },
      { mse,         "mse" },
      { beta,        "ols" },
      { se,          "ols.se" },
      { shrinkage,   "shrinkage" },
      { modeldim,    "size" },
      { R2,          "R2" }
    };
    int nslots = (int) (sizeof(slots) / sizeof(slots[0]));
    int s;
    for (s = 0; s < nslots; s++) {
      SET_VECTOR_ELT(ANS, s, slots[s].value);
      SET_STRING_ELT(ANS_names, s, mkChar(slots[s].name));
    }
  }
  setAttrib(ANS, R_NamesSymbol, ANS_names);

  UNPROTECT(nProtected);
  return(ANS);
}
/*
 * GLM counterpart of the deterministic model search: fit the models
 * selected by topk() with glm_FitModel() and return the per-model
 * summaries as a named R list.
 *
 * Y, X, Roffset, Rweights, Rcontrol, Rlaplace
 *             forwarded unchanged to glm_FitModel(); the number of
 *             columns is read from the dim attribute of X
 * Rprobinit   initial inclusion probabilities; a duplicate is handed to
 *             sortvars()/topk()/compute_margprobs_old() and returned
 *             as "probne0"
 * Rmodeldim   its length fixes how many model slots are allocated; a
 *             duplicate is returned as "size"
 * modelprior  passed through to compute_prior_probs()
 * betaprior   used with family to build the beta-prior structure
 * family      used to build the GLM family structure
 *
 * Returns a list with the 14 components probne0, which, logmarg,
 * postprobs, priorprobs, sampleprobs, deviance, mle, mle.se, shrinkage,
 * size, R2, Q and intercept.
 */
SEXP glm_deterministic(SEXP Y, SEXP X, SEXP Roffset, SEXP Rweights, SEXP Rprobinit, SEXP Rmodeldim, SEXP modelprior, SEXP betaprior, SEXP family, SEXP Rcontrol, SEXP Rlaplace)
{
  int nProtected = 0;
  int nModels = LENGTH(Rmodeldim);

  glmstptr *glmfamily = make_glmfamily_structure(family);
  betapriorptr *betapriorfamily = make_betaprior_structure(betaprior, family);

  /* result containers; each stays protected until the final UNPROTECT */
  SEXP ANS = PROTECT(allocVector(VECSXP, 14));               nProtected++;
  SEXP ANS_names = PROTECT(allocVector(STRSXP, 14));         nProtected++;
  SEXP Rprobs = PROTECT(duplicate(Rprobinit));               nProtected++;
  SEXP R2 = PROTECT(allocVector(REALSXP, nModels));          nProtected++;
  SEXP shrinkage = PROTECT(allocVector(REALSXP, nModels));   nProtected++;
  SEXP modelspace = PROTECT(allocVector(VECSXP, nModels));   nProtected++;
  SEXP modeldim = PROTECT(duplicate(Rmodeldim));             nProtected++;
  SEXP beta = PROTECT(allocVector(VECSXP, nModels));         nProtected++;
  SEXP se = PROTECT(allocVector(VECSXP, nModels));           nProtected++;
  SEXP deviance = PROTECT(allocVector(REALSXP, nModels));    nProtected++;
  SEXP modelprobs = PROTECT(allocVector(REALSXP, nModels));  nProtected++;
  SEXP priorprobs = PROTECT(allocVector(REALSXP, nModels));  nProtected++;
  SEXP logmarg = PROTECT(allocVector(REALSXP, nModels));     nProtected++;
  SEXP sampleprobs = PROTECT(allocVector(REALSXP, nModels)); nProtected++;
  SEXP Q = PROTECT(allocVector(REALSXP, nModels));           nProtected++;
  SEXP Rintercept = PROTECT(allocVector(REALSXP, nModels));  nProtected++;

  /* problem dimensions */
  int p = INTEGER(getAttrib(X, R_DimSymbol))[1];
  int k = LENGTH(modelprobs);

  struct Var *vars = (struct Var *) R_alloc(p, sizeof(struct Var)); /* info about the model variables */
  double *probs = REAL(Rprobs);
  int n = sortvars(vars, probs, p);

  /* storage for the models; topk() reports how many it produced */
  Bit **models = cmatalloc(k, p);
  int *model = (int *) R_alloc(p, sizeof(int));
  k = topk(models, probs, k, vars, n, p);

  /* fit each of the k models in turn */
  for (int m = 0; m < k; m++) {
    Bit *included = models[m];
    int pmodel = 0;
    double pigamma = 1.0;

    /* model size, and the product over variables of probs[col] for an
       included variable and 1-probs[col] for an excluded one */
    for (int col = 0; col < p; col++) {
      int flag = (int) included[col];
      model[col] = flag;
      pmodel += flag;
      pigamma *= (double) flag * probs[col] + (1.0 - (double) flag) * (1.0 - probs[col]);
    }

    SEXP Rmodel_m = PROTECT(allocVector(INTSXP, pmodel));
    GetModel_m(Rmodel_m, model, p);

    /* evaluate the fit, then read the log marginal and shrinkage
       from the "lpy" component of the returned list */
    SEXP glm_fit = PROTECT(glm_FitModel(X, Y, Rmodel_m, Roffset, Rweights,
                                        glmfamily, Rcontrol, Rlaplace,
                                        betapriorfamily));
    double prior_m = compute_prior_probs(model, pmodel, p, modelprior);
    double logmargy = REAL(getListElement(getListElement(glm_fit, "lpy"), "lpY"))[0];
    double shrinkage_m = REAL(getListElement(getListElement(glm_fit, "lpy"), "shrinkage"))[0];

    SetModel2(logmargy, shrinkage_m, prior_m, sampleprobs, logmarg, shrinkage, priorprobs, m);
    /* written after SetModel2(), so this is the value that ends up in slot m */
    REAL(sampleprobs)[m] = pigamma;
    SetModel1(glm_fit, Rmodel_m, beta, se, modelspace, deviance, R2, Q, Rintercept, m);
    UNPROTECT(2);
  }

  compute_modelprobs(modelprobs, logmarg, priorprobs, k);
  compute_margprobs_old(models, modelprobs, probs, k, p);
  /* models is not released here (freechmat call left disabled) */

  /* assemble the named result list, slot by slot */
  {
    struct { SEXP value; const char *name; } slots[] = {
      { Rprobs,      "probne0" },
      { modelspace,  "which" },
      { logmarg,     "logmarg" },
      { modelprobs,  "postprobs" },
      { priorprobs,  "priorprobs" },
      { sampleprobs, "sampleprobs" },
      { deviance,    "deviance" },
      { beta,        "mle" },
      { se,          "mle.se" },
      { shrinkage,   "shrinkage" },
      { modeldim,    "size" },
      { R2,          "R2" },
      { Q,           "Q" },
      { Rintercept,  "intercept" }
    };
    int nslots = (int) (sizeof(slots) / sizeof(slots[0]));
    for (int s = 0; s < nslots; s++) {
      SET_VECTOR_ELT(ANS, s, slots[s].value);
      SET_STRING_ELT(ANS_names, s, mkChar(slots[s].name));
    }
  }
  setAttrib(ANS, R_NamesSymbol, ANS_names);

  UNPROTECT(nProtected);
  return(ANS);
}
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; /* * dump words to 
file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp = dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || 
ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); yap_message(" ew=%.0lf", ew); yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); fprintf(rp," %.2lf", ew); fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ 
yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", tscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. topics */ yap_message("Topic root words="); if ( fullreport ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 
100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-3 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } free(pvec); free(gtvec); free(gpvec); }