/*
 *  Fill in the word-side statistics of M for a proposed merge of
 *  topics k1 and k2.
 *
 *  Allocates M->Twt, M->TwT and M->Nwt (one entry per word, ddN.W in
 *  all) and sets the merged values to the plain sum of the two topics'
 *  entries in ddS;  M->TwT[] is copied unchanged from ddS.TwT[].
 *  M->TWTm is set to ddS.TWT less the merged total M->TWt.
 *  Quits if any allocation fails.
 */
static void merge_init_Twt(int k1, int k2, merge_beta_t *M) {
  int word;

  /*
   *  local store, one slot per word
   */
  M->Twt = u16vec(ddN.W);
  M->TwT = u32vec(ddN.W);
  M->Nwt = u32vec(ddN.W);
  if ( M->Twt==NULL || M->TwT==NULL || M->Nwt==NULL )
    yap_quit("Out of memory in likemerge()\n");

  /*
   *  per-word entries start out as the simple sum over both topics
   */
  for (word=0; word<ddN.W; word++) {
    M->Nwt[word] = ddS.Nwt[word][k1] + ddS.Nwt[word][k2];
    M->Twt[word] = ddS.Twt[word][k1] + ddS.Twt[word][k2];
    M->TwT[word] = ddS.TwT[word];
  }

  /*
   *  and likewise the totals
   */
  M->NWt = ddS.NWt[k1] + ddS.NWt[k2];
  M->TWt = ddS.TWt[k1] + ddS.TWt[k2];
  M->TWTm = ddS.TWT - M->TWt;

#ifndef NDEBUG
  /*  the merged Twt[] may never exceed the merged Nwt[]  */
  for (word=0; word<ddN.W; word++) {
    assert(M->Twt[word]<=M->Nwt[word]);
  }
#endif
}
/*
 *  Global probability for each topic, returned as a newly allocated
 *  vector of ddN.T floats which the caller must free().
 *
 *  The values come from the first source that is available:
 *     ddS.Nwt set      ->  the topic totals ddS.NWt[]
 *     ddS.Ndt set      ->  ddS.Ndt[][] summed over the ddN.DT documents
 *     ddP.alphapr set  ->  ddP.alphapr[]
 *     none of these    ->  all entries are 0
 *  When the total is positive the vector is normalised to sum to one,
 *  otherwise it is returned unnormalised.
 *
 *  Returns NULL if memory cannot be allocated.
 */
static float *globalprop() {
  float *vec = fvec(ddN.T);
  double tot = 0;
  int k;
  if ( !vec )
    return NULL;
  /*
   *  clear explicitly so the "else is 0" case holds without
   *  depending on what fvec() hands back
   */
  for (k=0; k<ddN.T; k++)
    vec[k] = 0;
  if ( ddS.Nwt ) {
    for (k=0; k<ddN.T; k++) {
      tot += vec[k] = ddS.NWt[k];
    }
  } else if ( ddS.Ndt ) {
    /*  have to total from Ndt  */
    int d;
    uint32_t *uvec = u32vec(ddN.T);
    if ( !uvec ) {
      /*  don't leak vec, and report failure the same way as above  */
      free(vec);
      return NULL;
    }
    /*  uvec is accumulated with +=, so it must start from zero  */
    for (k=0; k<ddN.T; k++)
      uvec[k] = 0;
    for (d=0; d<ddN.DT; d++) {
      for (k=0; k<ddN.T; k++)
        uvec[k] += ddS.Ndt[d][k];
    }
    for (k=0; k<ddN.T; k++)
      tot += vec[k] = uvec[k];
    free(uvec);
  } else if ( ddP.alphapr ) {
    /*
     *  this is rather poor, and should be rewritten
     */
    for (k=0; k<ddN.T; k++)
      tot += vec[k] = ddP.alphapr[k];
  }
  if ( tot<=0 )
    return vec;
  for (k=0; k<ddN.T; k++)
    vec[k] /= tot;
  return vec;
}
static void build_NwK() { int w, k; NwK = u32vec(ddN.W); if ( !NwK ) yap_quit("Out of memory in hca_displaytopics()\n"); for (w=0; w<ddN.W; w++) { NwK[w] = 0; } NWK = 0; if ( !ddS.Nwt ) { /* * recompute from scratch */ int i; for (i=0; i<ddN.NT; i++) NwK[ddD.w[i]]++; for (w=0; w<ddN.W; w++) NWK += NwK[w]; } else { for (w=0; w<ddN.W; w++) { for (k=0; k<ddN.T; k++) { NwK[w] += ddS.Nwt[w][k]; // should use CCT_ReadN() } NWK += NwK[w]; } } if ( NWK==0 ) yap_quit("empty NWK in build_NwK()\n"); }
/*
 *  Return a newly allocated vector holding the indices 0..K-1 sorted
 *  with qsort() using pcompar(), which is given access to vec[]
 *  through the file-level pointer stvec.  The resulting order is
 *  whatever pcompar() defines.
 *
 *  The caller must free() the result.  Quits if memory runs out,
 *  so the return value is never NULL.
 */
static uint32_t *sorttops(float *vec, int K) {
  uint32_t *psort = u32vec(K);
  int k;
  /*  the index vector is written to straight away, so check it first  */
  if ( !psort )
    yap_quit("Out of memory in sorttops()\n");
  for (k=0; k<K; k++)
    psort[k] = k;
  /*  pcompar() reads the values to compare via stvec  */
  stvec = vec;
  qsort(psort, K, sizeof(*psort), pcompar);
  return psort;
}
/*
 *  Build a vector giving number of times each topic is the
 *  most common in a doc.
 *
 *  For each of the ddN.DT documents the topic proportions are fetched
 *  with topprop(), and the count of the topic with the largest
 *  proportion is incremented (the lowest index wins a tie).
 *
 *  Returns a newly allocated vector of ddN.T counts which the caller
 *  must free().  Quits if memory runs out.
 */
static uint32_t *hca_top1cnt() {
  uint32_t *cnt = u32vec(ddN.T);
  int i, t;
  if ( !cnt )
    yap_quit("Out of memory in hca_top1cnt()\n");
  /*  counts are incremented below, so make sure they start at zero  */
  for (t=0; t<ddN.T; t++)
    cnt[t] = 0;
  for (i=0; i<ddN.DT; i++) {
    float *tvec = topprop(i);
    int maxt = 0;
    /*
     *  NOTE(review): assumes topprop() signals failure by returning
     *  NULL, as globalprop() in this file does -- confirm
     */
    if ( !tvec )
      yap_quit("Out of memory in hca_top1cnt()\n");
    for (t=0; t<ddN.T; t++)
      if ( tvec[t]>tvec[maxt] )
        maxt = t;
    cnt[maxt]++;
    free(tvec);
  }
  return cnt;
}
/*
 *  Build the global per-term totals termNwK[] (ptr->K entries) and
 *  their grand total termNWK from the counts ptr->Nkt[term][topic],
 *  summed over the ddN.T topics.  Also points the global termNwk at
 *  ptr->Nkt.
 *  Quits if the allocation fails or the grand total comes out as zero.
 */
static void build_termNwK(T_stats_t *ptr) {
  int term;

  termNwK = u32vec(ptr->K);
  if ( termNwK==NULL )
    yap_quit("Out of memory in hca_displaytopics()\n");

  termNWK = 0;
  for (term=0; term<ptr->K; term++) {
    int topic;
    termNwK[term] = 0;
    for (topic=0; topic<ddN.T; topic++) {
      termNwK[term] += ptr->Nkt[term][topic];
    }
    termNWK += termNwK[term];
  }
  if ( termNWK==0 )
    yap_quit("empty termNWK in build_termNwK(), collocations empty!\n");

  termNwk = ptr->Nkt;
}
/* * print out the topic topk=10 words. report the PMI score. */ double report_pmi(char *topfile, /* name of topics file */ char *pmifile, /* name of PMI file */ int T, /* total topics */ int W, /* total words */ int E, /* number of epochs */ int topk, double *tp) { int lineno = 0; int i,k, thee; /* * mapping from local index to actual word index */ uint32_t *wind = u32vec(topk*T*E); int n_wind = 0; /* * boolean vector ... is word used */ uint32_t *wuse = u32vec(W/32+1); /* * PMI's by local index */ uint32_t *topic = u32vec(topk); float *coherency = fvec(E); double **pmi; float ave = 0; char *line; size_t n_line; FILE *fr; if ( !wind || !wuse ) yap_quit("Out of memory in report_pmi()\n"); /* * read in file of top word indices in topic */ fr = fopen(topfile,"r"); if ( !fr ) yap_sysquit("Topic file '%s' not read\n", topfile); line = NULL; n_line = 0; lineno = 0; while ( getline(&line, &n_line, fr)>0 ) { char *buf = line; unsigned j; int e = 0; lineno ++; buf += strspn(buf," \t\n"); // skip space if ( (E==1 && sscanf(buf, "%d: ", &k)<1) || (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) ) yap_quit("Cannot read topic in topic line %d from file '%s'\n", lineno, topfile); if ( k<0 || k>=T ) continue; if ( e<0 || e>=E ) continue; for (i = 0; i<topk && *buf; i++) { buf = strpbrk(buf," \t\n"); // skip to next space if ( sscanf(buf, " %u", &j) <1 ) { if ( verbose>2 ) yap_message("Cannot read word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); break; } if ( j>=W) { yap_quit("Bad word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); } buf += strspn(buf," \t\n"); // skip space /* * check if word exists, and set up its index */ if ( wuse[j/32U] & (1U<<(j%32U)) ) { // yes, so search for it int ii; for (ii=0; ii<n_wind; ii++) if ( wind[ii]==j ) break; if ( ii>=n_wind ) yap_quit("Lookup of word %d failed at line %d in report_pmi()\n", (int)j, lineno); } else { // no, so add it wuse[j/32U] |= (1U<<(j%32U)); wind[n_wind] = j; n_wind++; } } free(line); line = 
NULL; n_line = 0; } fclose(fr); pmi = dmat(n_wind,n_wind); /* * build hash table now since we know size */ hashsize = n_wind*2; hashtab = malloc(sizeof(*hashtab)*hashsize); if ( !pmi || !hashtab ) yap_quit("Out of memory in report_pmi()\n"); for (i=0; i<hashsize; i++) hashtab[i] = 0; for (i=0; i<n_wind; i++) addw(wind[i],i); /* * load up PMI file, only keeping words mentioned in hash table */ { unsigned t1, t2; double value; int zcat = 0; fr = fopen(pmifile,"r"); if ( !fr ) { /* * try to zcat it */ char *cmd = malloc(strlen(pmifile)+20); sprintf(cmd,"%s.gz", pmifile); fr = fopen(cmd,"r"); if ( !fr ) yap_sysquit("Cannot open pmifile '%s' in report_pmi()\n", pmifile); fclose(fr); sprintf(cmd,"gunzip -c %s", pmifile); fr = popen(cmd,"r"); if ( !fr ) yap_sysquit("Cannot open or zcat pmifile '%s' in report_pmi()\n", pmifile); zcat = 1; free(cmd); } while (fscanf(fr, "%u %u %lg", &t1, &t2, &value)==3 ) { if ( t1>=W || t2>= W ) yap_quit("Illegal word index in report_pmi()\n"); if ( t1!= t2 && ( wuse[t1/32U] & (1U<<(t1%32U)) ) && ( wuse[t2/32U] & (1U<<(t2%32U))) ) { int i1, i2; i1 = findw(t1,wind); i2 = findw(t2,wind); if ( i1==UINT32_MAX || i2==UINT32_MAX ) yap_quit("Could not locate word index in report_pmi()\n"); pmi[i1][i2]=value; pmi[i2][i1]=value; } } if ( zcat ) pclose(fr); else fclose(fr); } /* * compute PMI score for each topic */ fr = fopen(topfile,"r"); if ( !fr ) yap_sysquit("Topic file '%s' not read\n", topfile); line = NULL; n_line = 0; thee = 0; lineno = 0; if ( E>1 ) yap_message("PMI %d:: ", 0); else yap_message("PMI :: "); while ( getline(&line, &n_line, fr)>0 ) { /* * repeat logic above to read topic file again */ char *buf = line; unsigned j; int cnt = 0; int e = 0; double coh = 0; buf += strspn(buf," \t\n"); // skip space if ( (E==1 && sscanf(buf, "%d: ", &k)<1) || (E>1 && sscanf(buf, "%d,%d: ", &e, &k)<2) ) yap_quit("Cannot read topic in topic line %d from file '%s'\n", lineno, topfile); if ( k<0 || k>=T ) continue; if ( e<0 || e>=E ) continue; if ( 
e!=thee ) { thee = e; yap_message("\nPMI %d:: ", e); } for (i = 0; i<topk && *buf; i++) { buf = strpbrk(buf," \t\n"); // skip to next space if ( sscanf(buf, " %u", &j) <1 ) { yap_message("Cannot read word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); break; } if ( j>=W) { yap_quit("Bad word %d in topic line %d from file '%s'\n", i+1, lineno, topfile); } buf += strspn(buf," \t\n"); // skip space topic[i] = findw(j,wind); } if ( i<topk ) topic[i] = W; /* * topics now read */ for (i=0; i<topk && topic[i]<W; i++) { for (j=i+1; j<topk && topic[j]<W; j++) { coh += pmi[topic[i]][topic[j]]; cnt ++; } } if ( cnt>0 ) coh /= cnt; coherency[e] += coh * tp[k]; yap_message(" %d:%.3lf", k, coh); } fclose(fr); yap_message("\nPMI ="); if ( E==1 ) { yap_message(" %.3lf\n", coherency[0]); ave = coherency[0]; } else { int e; for (e=0; e<E; e++) { ave += coherency[e]; yap_message(" %.3lf", coherency[e]); } ave /= E; yap_message(" -> %.3lf\n", ave); } free(wind); free(coherency); free(wuse); free(topic); free(pmi[0]); free(pmi); free(hashtab); hashtab = NULL; hashsize = 0; return ave; }
void hca_displaytopics(char *resstem, int topword, enum ScoreType scoretype) { int w,k; int *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; char *fname = yap_makename(resstem,".top"); int nophi = (ddP.phi==NULL) && (ddS.phi==NULL); FILE *fp; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } fp = fopen(fname,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", fname); /* * first collect counts of each word/term */ if ( scoretype != ST_count && scoretype != ST_phi ) { NwK = u32vec(ddN.W); if ( !NwK ) yap_quit("Out of memory in hca_displaytopics()\n"); for (w=0; w<ddN.W; w++) { NwK[w] = 0; } NWK = 0; for (w=0; w<ddN.W; w++) { for (k=0; k<ddN.T; k++) { NwK[w] += ddS.Nwt[w][k]; // should use CCT_ReadN() } NWK += NwK[w]; } } assert(ddN.tokens); for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk\n"); for (k=0; k<ddN.T; k++) { int cnt; double spw; double spd; tscorek = k; /* * print top words */ cnt=0; if ( ddP.phi==NULL ) { for (w=0; w<ddN.W; w++) { if ( ddS.Nwt[w][k]>0 ) indk[cnt++] = w; } } else { float **phi; if ( ddP.phi ) phi = ddP.phi; else phi = ddS.phi; for (w=0; w<ddN.W; w++) { if ( phi[k][w]>0.5/ddN.W ) indk[cnt++] = w; } } topk(topword, cnt, indk, tscore); spd = ((double)nonzero_Ndt(k))/((double)ddN.DT); sparsitydoc += spd; if ( nophi ) { spw = ((double)nonzero_Nwt(k))/((double)ddN.W); sparsityword += spw; } if ( ddS.NWt[k]*ddN.T*100<Nk_tot ) underused++; yap_message("\nTopic %d (", k); if ( ddP.phi==NULL ) yap_message((ddN.T>200)?"p=%.3lf%%,":"p=%.2lf%%,", 100*((double)ddS.NWt[k])/(double)Nk_tot); if ( nophi ) yap_message("ws=%.1lf%%,", 100*(1-spw)); 
else yap_message("#=%.0lf,", exp(phi_entropy(k))); yap_message("ds=%.1lf%%", 100*(1-spd) ); fprintf(fp,"%d: ", k); yap_message(") words ="); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( ddP.PYbeta && nophi ) { int cnt; /* * print root words */ tscorek = -1; cnt=0; for (w=0; w<ddN.W; w++) { if ( ddS.TwT[w]>0 ) indk[cnt++] = w; } topk(topword, cnt, indk, tscore); yap_message("\nTopic root words ="); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); if ( verbose>2 ) { double score = tscore(indk[w]); yap_message(",%s(%6lf)", ddN.tokens[indk[w]], score); } else yap_message(",%s", ddN.tokens[indk[w]]); } yap_message("\n"); fprintf(fp, "\n"); } if ( nophi ) yap_message("Average topicXword sparsity = %.2lf%%, ", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%, " "underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddP.bdk!=NULL) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("doc PYP report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } fclose(fp); free(fname); free(indk); if ( scoretype != ST_count ) { free(NwK); NwK = NULL; } }
/*
 *  Greedily adjust the per-document values M->Tdt[] (and with them
 *  M->TdT[] and the total M->TDt) for a proposed topic merge.
 *
 *  Every document gets two scores: one for incrementing its Tdt by
 *  one and one for decrementing it.  A score of 0 marks a move that
 *  is not allowed (Tdt already equal to Ndt, or Tdt not above 1).
 *  Two heaps over the documents, ordered by fveccmp() on these
 *  scores, supply the front candidate for each direction.  On each
 *  round the front scores are combined with
 *  merge_alphabasetopicprob();  the larger of the two resulting
 *  values is applied if it exceeds 1, and the loop ends as soon as
 *  neither does.  After a move the document is rescored and put back
 *  into both heaps.
 *
 *  k2 is not used in the body.
 *  Quits if memory runs out.
 *
 *  NOTE(review): the two index arrays given to heap_init() are not
 *  freed here;  presumably heap_free() releases them -- confirm.
 */
static void merge_opt_Tdt(int k1, int k2, merge_alpha_t *M) {
  struct heap_s inc_heap;
  struct heap_s dec_heap;
  /*
   *  document indices, as ordered by the heaps
   */
  uint32_t *inc_docs;
  uint32_t *dec_docs;
  /*
   *  score for incrementing/decrementing each doc's Tdt
   */
  float *inc_score;
  float *dec_score;
  int d;

  inc_docs = u32vec(ddN.DT);
  dec_docs = u32vec(ddN.DT);
  inc_score = fvec(ddN.DT);
  dec_score = fvec(ddN.DT);
  if ( inc_docs==NULL || dec_docs==NULL
       || inc_score==NULL || dec_score==NULL )
    yap_quit("Out of memory in likemerge()\n");

  /*
   *  initial scores for every document
   */
  for (d=0; d<ddN.DT; d++) {
    assert(M->Tdt[d]<=M->Ndt[d]);
    inc_docs[d] = d;
    dec_docs[d] = d;
    /*   0 means this doc cannot move in that direction   */
    inc_score[d] = ( M->Tdt[d]<M->Ndt[d] )
      ? (ddP.bpar + ddP.apar*M->TdT[d])
        * S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1)
      : 0;
    dec_score[d] = ( M->Tdt[d]>1 )
      ? 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d])
        /(ddP.bpar + ddP.apar*(M->TdT[d]-1))
      : 0;
    assert((M->Tdt[d]>1)||dec_score[d]==0);
    assert((M->Tdt[d]<M->Ndt[d])||inc_score[d]==0);
    assert(M->Tdt[d]<=M->Ndt[d]);
  }
  assert(M->TDt>0);

  /*
   *  one heap per direction, ordered by fveccmp() on the scores
   */
  heap_init(&inc_heap, inc_docs, ddN.DT, fveccmp, (void *)inc_score);
  heap_init(&dec_heap, dec_docs, ddN.DT, fveccmp, (void *)dec_score);

  for (;;) {
    float upv = merge_alphabasetopicprob(M->TDTm+M->TDt, M->TDt, k1)
      *inc_score[heap_front(&inc_heap)];
    float downv = ( M->TDt>1 )
      ? dec_score[heap_front(&dec_heap)]
        / merge_alphabasetopicprob(M->TDTm+M->TDt-1, M->TDt-1, k1)
      : 0.0;

    if ( downv>upv && downv>1.0 ) {
      //  decrement the front of the down heap
      d = heap_front(&dec_heap);
      M->TdT[d]--;
      M->Tdt[d]--;
      assert(M->Tdt[d]>0);
      M->TDt--;
      heap_pop(&dec_heap);
      heap_remove(&inc_heap,d);
    } else if ( downv<upv && upv>1.0 ) {
      //  increment the front of the up heap
      d = heap_front(&inc_heap);
      M->TdT[d]++;
      M->Tdt[d]++;
      assert(M->Tdt[d]<=M->Ndt[d]);
      M->TDt++;
      heap_pop(&inc_heap);
      heap_remove(&dec_heap,d);
    } else {
      //  neither move is better, so stop
      break;
    }

    /*
     *  doc d is now out of both heaps;  rescore it
     */
    inc_score[d] = ( M->Tdt[d]<M->Ndt[d] )
      ? (ddP.bpar + ddP.apar*M->TdT[d])
        * S_V(ddC.SX,M->Ndt[d],M->Tdt[d]+1)
      : 0;
    dec_score[d] = ( M->Tdt[d]>1 )
      ? 1.0 / S_V(ddC.SX,M->Ndt[d],M->Tdt[d])
        /(ddP.bpar + ddP.apar*(M->TdT[d]-1))
      : 0;
    assert(M->Tdt[d]>1||dec_score[d]==0);
    assert(M->Tdt[d]<M->Ndt[d] ||inc_score[d]==0);
    assert(M->Tdt[d]<=M->Ndt[d]);
    /*
     *  and put it back with its new values
     */
    heap_push(&dec_heap,d);
    heap_push(&inc_heap,d);
  }

  free(inc_score);
  free(dec_score);
  heap_free(&inc_heap);
  heap_free(&dec_heap);
}