static inline void calc(int argc, char** argv) { StdDeck_CardMask hole, board; StdDeck_CardMask card; StdDeck_CardMask_RESET(hole); StdDeck_CardMask_RESET(board); int i, cardi; for(i = 1; i < 5; ++i) { StdDeck_stringToCard(argv[i], &cardi); card = StdDeck_MASK(cardi); StdDeck_CardMask_OR(hole, hole, card); } po_probs probs; probs = get_probs(hole, board); printf("%.4f", probs.win + probs.draw); if ( argc > 7) { for(i = 5; i < 8; ++i) { StdDeck_stringToCard(argv[i], &cardi); card = StdDeck_MASK(cardi); StdDeck_CardMask_OR(board, board, card); } probs = get_probs(hole, board); printf(" %.4f", probs.win + probs.draw); } if ( argc > 8) { i = 8; StdDeck_stringToCard(argv[i], &cardi); card = StdDeck_MASK(cardi); StdDeck_CardMask_OR(board, board, card); probs = get_probs(hole, board); printf(" %.4f", probs.win + probs.draw); } if ( argc > 9) { i = 9; StdDeck_stringToCard(argv[i], &cardi); card = StdDeck_MASK(cardi); StdDeck_CardMask_OR(board, board, card); probs = get_probs(hole, board); printf(" %.4f", probs.win + probs.draw); } printf("\n"); }
/*
 * Write the vector filled in by get_probs() to fp.
 *
 * Output format:
 *   factor = <ddP.alpha>
 *   probs =  <one entry per topic; "-" where the value is not positive>
 *   # topics: <number of positive entries>
 *
 * On allocation failure a message is written to stderr and nothing is
 * written to fp.
 */
void print_probs(FILE *fp) {
  int t;
  int num = 0;     /* number of entries with positive probability */
  double *vp = malloc(sizeof(*vp)*ddN.T);

  if ( !vp ) {
    fprintf(stderr, "Cannot allocate vp in print_probs()\n");
    return;
  }
  get_probs(vp);
  fprintf(fp, "factor = %lf\nprobs = ", ddP.alpha);
  for (t=0; t<ddN.T; t++) {
    if ( vp[t]>0 ) {
      fprintf(fp, " %lf", vp[t]);
      num++;
    } else {
      fprintf(fp, " -");
    }
  }
  fprintf(fp, "\n");
  fprintf(fp, "# topics: %d\n", num);
  free(vp);
}
/*
 * Report the vector filled in by get_probs() through yap_message().
 *
 * Prints one entry per topic ("-" where the value is not positive),
 * followed by a summary line giving ddP.alpha ("factor"), the number of
 * non-positive entries ("empty") and exp() of the entropy of the positive
 * entries ("ent").
 *
 * On allocation failure a message is reported and the function returns
 * without printing the vector.
 */
void yap_probs() {
  int t;
  int empty = 0;   /* entries that are not positive */
  double ent = 0;  /* running entropy, -sum p*log(p) */
  double factor;
  double *vp = malloc(sizeof(*vp)*ddN.T);

  if ( !vp ) {
    yap_message("Cannot allocate vp in yap_probs()\n");
    return;
  }
  get_probs(vp);
  yap_message("probs = ");
  factor = ddP.alpha;
  for (t=0; t<ddN.T; t++) {
    if ( vp[t]>0 ) {
      yap_message(" %lf", vp[t]);
      ent -= vp[t]*log(vp[t]);
    } else {
      empty++;
      yap_message(" -");
    }
  }
  yap_message("\nfactor = %lf, empty = %d, ent = %lf\n",
              factor, empty, exp(ent));
  free(vp);
}
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *termindk = NULL; uint32_t *indk = NULL; int Nk_tot = 0; double (*termtscore)(int) = NULL; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); //#define XTRA // prints model topic probs after observed #ifdef XTRA double *gtavec = calloc(ddN.T,sizeof(gtavec[0])); #endif float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); #ifdef KL float *dfvec = calloc(ddN.W,sizeof(dfvec[0])); #endif double *ngalpha = NULL; T_stats_t *termstats; #ifdef XTRA get_probs(gtavec); #endif if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phirat ) { tscore = phiratioscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } if ( ddS.TwT==NULL && ddP.phi==NULL && scoretype == ST_phirat ) yap_quit("Cannot use '-orat' option with this model/settings.\n"); if ( ddP.PYalpha==H_NG ) { /* * provide an estimate of alpha */ ngalpha = dvec(ddN.T); get_probs(ngalpha); for (k=0; k<ddN.T; k++) { ddP.alphapr[k] = ngalpha[k]; } } /* * returns null if no relevant data file */ termstats = tstats_init(ddS.z, ddD.NdTcum, ddN.T, ddN.DT, stem); if ( termstats ) { if ( scoretype == ST_idf ) { termtscore = termidfscore; } else termtscore = termcountscore; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); if ( termstats ) build_termNwK(termstats); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += 
gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); if ( termstats ) { termindk = malloc(sizeof(*indk)*termstats->K); if ( !termindk ) yap_quit("Cannot allocate termindk in hca_displaytopics()\n"); } data_df(stem); #ifdef KL for (w=0; w<ddN.W; w++) dfvec[w] = ddD.df[w]; #endif /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; if ( termstats ) { termcnt = buildtermindk(k, termindk, termstats); topk(topword, termcnt, termindk, termtscore); } /* * dump words to file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } if ( termstats ) { for (w=0; w<topword && w<termcnt; w++) { fprintf(fp," %d", (int)termstats->Kmin+termindk[w]); } } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp 
= dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt, termcnt = 0; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); if ( termstats ) { termcnt = buildtermindk(kk, termindk, termstats); topk(topword, termcnt, termindk, termtscore); if ( topword<termcnt ) termcnt = topword; } /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); 
#else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; #define MALLET_EW #ifdef MALLET_EW double ewp = dprop?(1.0/fv_expprob(pvec,ddN.W)):ddN.W; #endif double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); #ifdef XTRA yap_message((ddN.T>200)?"/%.3lf%%":"/%.2lf%%",100*gtavec[kk]); #endif if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); #ifdef KL yap_message(" ew=%lf", ew); #else yap_message(" ew=%.0lf", ew); #endif #ifdef MALLET_EW yap_message(" ewp=%.1lf", ewp); #endif yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddP.NGbeta ) { /* * approx. 
as sqrt(var(lambda_k)/lambda-normaliser */ double ngvar = sqrt(ddP.NGalpha[kk]) * (ngalpha[kk]/ddP.NGalpha[kk]); yap_message(" ng=%.4lf,%.4lf", ngalpha[kk], ngvar/ngalpha[kk]); if ( ddS.sparse ) yap_message(",%.4f", 1-((float)ddS.sparseD[kk])/ddN.DTused); if ( verbose>2 ) yap_message(" ngl=%.4lf,%.4lf, nga=%.4lf,%.4lf", ddP.NGalpha[kk]/ddP.NGbeta[kk], sqrt(ddP.NGalpha[kk]/ddP.NGbeta[kk]/ddP.NGbeta[kk]), ddP.NGalpha[kk], ddP.NGbeta[kk]); } if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); #ifdef KL yap_message(" %lf", ew); #else fprintf(rp," %.2lf", ew); #endif #ifdef MALLET_EW fprintf(rp," %.2lf", ewp); #endif fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(tscore(indk[w])+0.2)); else yap_message("(%6lf)", tscore(indk[w])); } if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( 
ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } if ( termstats ) { yap_message(" terms="); for (w=0; w<termcnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", termstats->tokens[termindk[w]]); else yap_message("%d", termstats->Kmin+termindk[w]); if ( verbose>2 ) { if ( scoretype == ST_count ) yap_message("(%d)", (int)(termtscore(termindk[w])+0.2)); else yap_message("(%6lf)", termtscore(termindk[w])); } if ( fullreport ) { fprintf(rp, "term %d %d %d", kk, termindk[w], w); fprintf(rp, " %d", termstats->Nkt[termindk[w]][kk]); fprintf(rp, " %s", termstats->tokens[termindk[w]]); fprintf(rp, "\n"); } } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); /* this case gives bad results */ // if ( scoretype == ST_phirat ) topk(topword, cnt, indk, phiratioscore); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. 
topics */ yap_message("Topic root words="); if ( fullreport ) { int w; if ( ddP.phi && ddP.PYbeta!=H_PDP ) { for (w=0; w<ddN.W; w++) pvec[w] = ddS.phi[ddN.T][w]; } else { for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); } #ifdef KL double ew = fv_kl(dfvec,pvec,ddN.W); #else double ew = exp(fv_entropy(pvec,ddN.W)); #endif double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 && !ddP.phi ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\nTopical words="); topk(topword, cnt, indk, phiinvratioscore); for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( ddS.sparse && ddP.PYalpha==H_NG ) { double avesp = 0; // correct_docsp(); for (k=0; k<ddN.T; k++) { avesp += gtvec[k]; } // check gtvec[] sums to 1 assert(fabs(avesp-1.0)<0.00001); avesp = 0; for (k=0; k<ddN.T; k++) { avesp += gtvec[k]*((float)ddS.sparseD[k])/ddN.DTused; assert(ddS.sparseD[k]<=ddN.DTused); } assert(avesp<=1.0); assert(avesp>0.0); yap_message("IBP sparsity = %.2lf%%\n", 
100*(1-avesp)); } if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-7 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( ngalpha ) free(ngalpha); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } #ifdef KL free(dfvec); #endif free(pvec); free(gtvec); free(gpvec); tstats_free(termstats); }
void extract_boxes(char *cfgfile, char *weightfile) { network net = parse_network_cfg(cfgfile); if(weightfile){ load_weights(&net, weightfile); } set_batch_network(&net, 1); fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); srand(time(0)); char *val_images = "/home/pjreddie/data/voc/test/train.txt"; list *plist = get_paths(val_images); char **paths = (char **)list_to_array(plist); layer l = net.layers[net.n - 1]; int num_boxes = l.side; int num = l.n; int classes = l.classes; int j; box *boxes = calloc(num_boxes*num_boxes*num, sizeof(box)); float **probs = calloc(num_boxes*num_boxes*num, sizeof(float *)); for(j = 0; j < num_boxes*num_boxes*num; ++j) probs[j] = calloc(classes+1, sizeof(float *)); int N = plist->size; int i=0; int k; int count = 0; float iou_thresh = .3; for (i = 0; i < N; ++i) { fprintf(stderr, "%5d %5d\n", i, count); char *path = paths[i]; image orig = load_image_color(path, 0, 0); image resized = resize_image(orig, net.w, net.h); float *X = resized.data; float *predictions = network_predict(net, X); get_boxes(predictions+1+classes, num, num_boxes, 5+classes, boxes); get_probs(predictions, num*num_boxes*num_boxes, classes, 5+classes, probs); char *labelpath = find_replace(path, "images", "labels"); labelpath = find_replace(labelpath, "JPEGImages", "labels"); labelpath = find_replace(labelpath, ".jpg", ".txt"); labelpath = find_replace(labelpath, ".JPEG", ".txt"); int num_labels = 0; box_label *truth = read_boxes(labelpath, &num_labels); FILE *label = stdin; for(k = 0; k < num_boxes*num_boxes*num; ++k){ int overlaps = 0; for (j = 0; j < num_labels; ++j) { box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h}; float iou = box_iou(boxes[k], t); if (iou > iou_thresh){ if (!overlaps) { char buff[256]; sprintf(buff, "/data/extracted/labels/%d.txt", count); label = fopen(buff, "w"); overlaps = 1; } fprintf(label, "%d %f\n", truth[j].id, iou); } } if (overlaps) { char buff[256]; sprintf(buff, 
"/data/extracted/imgs/%d", count++); int dx = (boxes[k].x - boxes[k].w/2) * orig.w; int dy = (boxes[k].y - boxes[k].h/2) * orig.h; int w = boxes[k].w * orig.w; int h = boxes[k].h * orig.h; image cropped = crop_image(orig, dx, dy, w, h); image sized = resize_image(cropped, 224, 224); #ifdef OPENCV save_image_jpg(sized, buff); #endif free_image(sized); free_image(cropped); fclose(label); } } free(truth); free_image(orig); free_image(resized); } }
void validate_recall(char *cfgfile, char *weightfile) { network net = parse_network_cfg(cfgfile); if(weightfile){ load_weights(&net, weightfile); } set_batch_network(&net, 1); fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); srand(time(0)); char *val_images = "/home/pjreddie/data/voc/test/2007_test.txt"; list *plist = get_paths(val_images); char **paths = (char **)list_to_array(plist); layer l = net.layers[net.n - 1]; int num_boxes = l.side; int num = l.n; int classes = l.classes; int j; box *boxes = calloc(num_boxes*num_boxes*num, sizeof(box)); float **probs = calloc(num_boxes*num_boxes*num, sizeof(float *)); for(j = 0; j < num_boxes*num_boxes*num; ++j) probs[j] = calloc(classes+1, sizeof(float *)); int N = plist->size; int i=0; int k; float iou_thresh = .5; float thresh = .1; int total = 0; int correct = 0; float avg_iou = 0; int nms = 1; int proposals = 0; int save = 1; for (i = 0; i < N; ++i) { char *path = paths[i]; image orig = load_image_color(path, 0, 0); image resized = resize_image(orig, net.w, net.h); float *X = resized.data; float *predictions = network_predict(net, X); get_boxes(predictions+1+classes, num, num_boxes, 5+classes, boxes); get_probs(predictions, num*num_boxes*num_boxes, classes, 5+classes, probs); if (nms) do_nms(boxes, probs, num*num_boxes*num_boxes, (classes>0) ? 
classes : 1, iou_thresh); char *labelpath = find_replace(path, "images", "labels"); labelpath = find_replace(labelpath, "JPEGImages", "labels"); labelpath = find_replace(labelpath, ".jpg", ".txt"); labelpath = find_replace(labelpath, ".JPEG", ".txt"); int num_labels = 0; box_label *truth = read_boxes(labelpath, &num_labels); for(k = 0; k < num_boxes*num_boxes*num; ++k){ if(probs[k][0] > thresh){ ++proposals; if(save){ char buff[256]; sprintf(buff, "/data/extracted/nms_preds/%d", proposals); int dx = (boxes[k].x - boxes[k].w/2) * orig.w; int dy = (boxes[k].y - boxes[k].h/2) * orig.h; int w = boxes[k].w * orig.w; int h = boxes[k].h * orig.h; image cropped = crop_image(orig, dx, dy, w, h); image sized = resize_image(cropped, 224, 224); #ifdef OPENCV save_image_jpg(sized, buff); #endif free_image(sized); free_image(cropped); sprintf(buff, "/data/extracted/nms_pred_boxes/%d.txt", proposals); char *im_id = basecfg(path); FILE *fp = fopen(buff, "w"); fprintf(fp, "%s %d %d %d %d\n", im_id, dx, dy, dx+w, dy+h); fclose(fp); free(im_id); } } } for (j = 0; j < num_labels; ++j) { ++total; box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h}; float best_iou = 0; for(k = 0; k < num_boxes*num_boxes*num; ++k){ float iou = box_iou(boxes[k], t); if(probs[k][0] > thresh && iou > best_iou){ best_iou = iou; } } avg_iou += best_iou; if(best_iou > iou_thresh){ ++correct; } } free(truth); free_image(orig); free_image(resized); fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total); } }
/*---------------------------------------------------------------
 Routine : calculate_probs
 Purpose : Calculate the probabilities for the tree, based upon
 the minimal cut sets, and write them as a text report.

 The report lists the probability of every minimal cut set, the
 series of partial sums for the top level event (one line per
 term, up to prob_n_terms) and, per primary event, the summed
 probability of the cut sets it appears in.

 Returns TRUE when the report was written in full. Returns FALSE
 when the file cannot be opened, the tree has no primary events,
 no cut sets of order <= max_order exist, or the user interrupts
 the calculation. Exits the process if memory cannot be obtained.
---------------------------------------------------------------*/
BOOL
  calculate_probs(
    char *filename,      /* IN - filename to write report to      */
    TREE *tree,          /* IN - tree                             */
    int   max_order,     /* IN - max order of cut sets to use     */
    int   prob_n_terms,  /* IN - number of terms in expansion     */
    float unit_time )    /* IN - unit time factor to be applied   */
{
    BitArray *stop;      /* 1-bit zero; compared against each group below */
    FILE  *file;
    Expr   e;
    Group *g;
    float *probs, *cp, *imp;
    float  p;
    int    num_bas, num_mcs, i, j;
    time_t tp;
    float  one_increment; /* value for one increment of the progress bar */

    if ( (file = fopen(filename, "w")) == NULL) {
        printf("*** calculate_probs : error opening file\n");
        return FALSE;
    }

    /* if necessary, expand tree */
    expand_tree(tree);

    /* set probs in BASLIST from the events database */
    set_bas_prob( unit_time );

    /* get number of primary events */
    if ((num_bas = tree->num_bas) == 0) {
        fclose( file );
        return FALSE;
    }

    if (GenerateNumericalProbabilityCheckForInterrupt()) {
        fclose( file );
        return FALSE;
    }

    /* get mcs list */
    e = tree->mcs_expr;

    /* How many mcs are actually used?  This is counted before anything
     * else is allocated so that the "no cut sets" exit only has the
     * report file to release (it used to leave the file open and leak
     * both arrays). */
    num_mcs = ExprCountOrder(tree->mcs_expr, max_order);

    /* if number of mcs is zero then return FALSE, otherwise make sure
     * that the number of terms does not exceed the number of mcs */
    if (num_mcs == 0) {
        fclose( file );
        return FALSE;
    } else if (prob_n_terms > num_mcs) {
        prob_n_terms = num_mcs;
    }

    /* create array of probabilities of primary events */
    if ( !fNewMemory( (void *)&probs, ( num_bas * sizeof(float) ) ) ) {
        printf("\n*** calculate_probs 1 : malloc failed ***\n");
        exit(1);
    }

    /* create array of per-event contributions */
    if ( !fNewMemory( (void *)&imp, ( num_bas * sizeof(float) ) ) ) {
        printf("\n*** calculate_probs : malloc failed ***\n");
        exit(1);
    }

    /* fill array */
    get_probs( probs );

    /* created last so that none of the early exits above can leak it;
     * from here on every exit goes through CleanUpOperations() */
    stop = BitCreate(1);

    /* initialise Working dialog */
    /* most of the cpu time is taken up in the ExprProb() function */
    /* the working dialog is incremented in the combs() function */
    one_increment = 0.0;
    for (i = 1; i <= prob_n_terms; i++) {
        one_increment += nCr(num_mcs, i);
    }

    /* set up progress bar */
    one_increment /= 100.0;
    set_one_increment(one_increment);
    GenerateNumericalProbabilitySetProgressBarMax(100);

    /* print header */
    fprintf(file,
            "Probabilities Analysis\n"
            "======================\n\n");
    fprintf(file, "Tree : %s\n", tree->name);
    time(&tp);
    fprintf(file, "Time : %s\n", ctime(&tp));

    fprintf(file, "Number of primary events = %d\n", num_bas);
    fprintf(file, "Number of minimal cut sets = %d\n", num_mcs);
    fprintf(file, "Order of minimal cut sets = %d\n", tree->max_order);
    if (max_order < tree->max_order) {
        fprintf(file, " (order <= %d used)\n\n", max_order);
    } else {
        fprintf(file, "\n");
    }

    fprintf(file, "Unit time span = %f\n\n", unit_time);

    /* calculate cut set probabilities - use ALL the cut sets */
    cp = ExprCutsetProbs(e, probs);

    fprintf(file, "Minimal cut set probabilities :\n\n");

    i = 0;
    for (g = e; !BitEquals(g->b, stop); g = g->next) {
        char **fp = BitPara( g->b, 30 );

        if (GenerateNumericalProbabilityCheckForInterrupt()) {
            CleanUpOperations( file, probs, cp, imp, stop);
            ParaDestroy(fp);
            return FALSE;
        }

        /* first line carries the index and probability,
         * any further lines only continue the description */
        fprintf(file, "%3d %-30s %E\n", i+1, fp[0], cp[i]);

        for (j = 1; fp[j] != NULL; j++) {
            fprintf(file, " %-20s\n", fp[j]);
        }

        ParaDestroy(fp);
        i++;
    }

    /* calculate top level probability - use only up to max_order cut sets */
    fprintf(file, "\n\n"
            "Probability of top level event "
            "(minimal cut sets up to order %d used):\n\n", max_order);

    p = 0;
    for (i = 1;
         i <= prob_n_terms && i <= num_mcs
             && !GenerateNumericalProbabilityCheckForInterrupt();
         i++) {
        float term;
        char *sign, *s, *bound;

        p += (term = ExprProb(e, probs, max_order, i));

        /* odd-numbered terms are reported as "+" / "upper" bound,
         * even-numbered terms as "-" / "lower" bound */
        sign  = ((i % 2) ? "+"     : "-"     );
        s     = ((i > 1) ? "s"     : " "     );
        bound = ((i % 2) ? "upper" : "lower" );

        fprintf(file, "%2d term%s %s%E = %E (%s bound)\n",
                i, s, sign, fabs(term), p, bound);
    }

    if (prob_n_terms >= num_mcs) {
        fprintf(file, "\nExact value : %E\n", p);
    }

    if (GenerateNumericalProbabilityCheckForInterrupt()) {
        CleanUpOperations( file, probs, cp, imp, stop);
        return FALSE;
    }

    /* calculate importances of individual events: for every cut set,
     * add its probability to each event whose bit is set in it */
    for (j = 0; j < num_bas; j++) {
        imp[j] = 0;
    }
    i = 0;
    for (g = e; !BitEquals(g->b, stop); g = g->next) {
        for (j = 0; j < g->b->n; j++) {
            /* bits are read from the far end of the array */
            if ( BitGet(g->b, (g->b->n-1) - j) ) {
                imp[j] += cp[i];
            }
        }
        i++;
    }

    if (GenerateNumericalProbabilityCheckForInterrupt()) {
        CleanUpOperations( file, probs, cp, imp, stop);
        return FALSE;
    }

    fprintf(file, "\n\nPrimary Event Analysis:\n\n");

    fprintf(file, " Event "
            "Failure contrib. "
            "Importance\n\n");

    for (i = 0; i < num_bas; i++) {
        char *fs = BasicString(num_bas, i);
        fprintf(file, "%-15s %E %5.2f%%\n",
                fs, imp[i], 100 * imp[i] / p);
        strfree(fs);
    }

    CleanUpOperations( file, probs, cp, imp, stop);

    return ( TRUE );

} /* calculate_probs */
/*
 * Estimate how long the probability calculation will take by timing
 * calc_sub_term() once per term count and scaling each measurement by
 * the number of combinations, nCr(num_mcs, i), for that term.
 *
 * Returns 0 when there are no cut sets of order <= max_order.
 * Exits the process if the working arrays cannot be allocated.
 */
float                    /* RET - time (seconds)                    */
  probs_estimate(
    TREE *tree,          /* IN - tree                               */
    int   max_order,     /* IN - number of cut sets                 */
    int   min_term,      /* IN - min number of terms to evaluate    */
    int   max_term)      /* IN - max number of terms to evaluate    */
{
    int     num_mcs;     /* number of cut sets used */
    float   t = 0;       /* accumulated estimate, in clock ticks */
    int     i, j;
    Group **index;       /* index to the groups */
    int    *z;
    Group  *p;           /* pointer */
    clock_t time1, time2;
    float  *probs;

    /* find out how many cut sets are actually used */
    num_mcs = ExprCountOrder(tree->mcs_expr, max_order);

    /* make sure that max_term does not exceed number of mcs */
    /* if number of mcs is zero then return 0 */
    if (num_mcs == 0) {
        return 0.0f;
    } else if (max_term > num_mcs) {
        max_term = num_mcs;
    }

    /* allocate memory required for testing */
    if ( !fNewMemory( (void *)&probs, ( tree->num_bas * sizeof(float) ) ) ) {
        exit( 1 );
    }

    if ( !fNewMemory( (void *)&index, ( num_mcs * sizeof(Group *) ) ) ) {
        exit( 1 );
    }

    if ( !fNewMemory( (void *)&z, ( num_mcs * sizeof(int) ) ) ) {
        exit( 1 );
    }

    /* populate the arrays with default data */
    for (i = 0, p = tree->mcs_expr; i < num_mcs; i++, p = p->next) {
        index[i] = p;
        z[i] = i;
    }

    /* fill the probs array */
    get_probs( probs );

    /* set the static variables to sensible values */
    set_basic_n(tree->num_bas);
    set_prob_term(0.0);
    set_basic_prob(probs);

    /* The function that takes most of the time is calc_sub_term().
       Run this function for each number of terms required. Run
       it enough times for the CPU clock to change. */
    for (i = min_term; i <= max_term; i++) {
        time1 = clock();
        j = 0;
        do {
            calc_sub_term(z, i, index);
            j++;
            time2 = clock();
        } while (time1 == time2);

        /* ticks per call, times the number of calls the real run needs */
        t += nCr(num_mcs, i) * (time2 - time1) / j;
    }

    FreeMemory(index);
    FreeMemory(z);
    FreeMemory(probs);

    return t/CLOCKS_PER_SEC;

} /* probs_estimate */
void hca_displaytopics(char *stem, char *resstem, int topword, enum ScoreType scoretype, int pmicount, int fullreport) { int w,k; uint32_t *indk = NULL; int Nk_tot = 0; double (*tscore)(int) = NULL; double sparsityword = 0; double sparsitydoc = 0; double underused = 0; uint32_t *top1cnt = NULL; FILE *fp; float *tpmi = NULL; char *topfile; char *repfile; uint32_t *psort; FILE *rp = NULL; float *gtvec = globalprop(); float *gpvec = calloc(ddN.W,sizeof(gpvec[0])); float *pvec = calloc(ddN.W,sizeof(pvec[0])); if ( pmicount>topword ) pmicount = topword; if ( scoretype == ST_idf ) { tscore = idfscore; } else if ( scoretype == ST_phi ) { tscore = phiscore; } else if ( scoretype == ST_count ) { tscore = countscore; } else if ( scoretype == ST_cost ) { tscore = costscore; } else if ( scoretype == ST_Q ) { tscore = Qscore; lowerQ = 1.0/ddN.T; } /* * first collect counts of each word/term, * and build gpvec (mean word probs) */ build_NwK(); { /* * gpvec[] is normalised NwK[] */ double tot = 0; for (w=0; w<ddN.W; w++) tot += gpvec[w] = NwK[w]+0.1; for (w=0; w<ddN.W; w++) gpvec[w] /= tot; } if ( ddS.Nwt ) { for (k=0; k<ddN.T; k++) { Nk_tot += ddS.NWt[k]; } } psort = sorttops(gtvec, ddN.T); top1cnt = hca_top1cnt(); if ( !top1cnt ) yap_quit("Cannot allocate top1cnt in hca_displaytopics()\n"); if ( pmicount ) { tpmi = malloc(sizeof(*tpmi)*(ddN.T+1)); if ( !tpmi ) yap_quit("Cannot allocate tpmi in hca_displaytopics()\n"); } indk = malloc(sizeof(*indk)*ddN.W); if ( !indk ) yap_quit("Cannot allocate indk in hca_displaytopics()\n"); /* * two passes through, * first to build the top words and dump to file */ repfile = yap_makename(resstem,".topset"); topfile = yap_makename(resstem,".toplst"); fp = fopen(topfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", topfile); yap_message("\n"); for (k=0; k<ddN.T; k++) { int cnt; tscorek = k; /* * build sorted word list */ cnt = buildindk(k, indk); topk(topword, cnt, indk, tscore); if ( cnt==0 ) continue; /* * dump words to 
file */ fprintf(fp,"%d: ", k); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } if ( ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; /* * dump root words */ tscorek = -1; cnt = buildindk(-1, indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); fprintf(fp,"-1:"); for (w=0; w<topword && w<cnt; w++) { fprintf(fp," %d", (int)indk[w]); } fprintf(fp, "\n"); } fclose(fp); if ( verbose>1 ) yap_message("\n"); if ( pmicount ) { /* * compute PMI */ char *toppmifile; char *pmifile; double *tp; tp = dvec(ddN.T); pmifile=yap_makename(stem,".pmi"); toppmifile=yap_makename(resstem,".toppmi"); get_probs(tp); report_pmi(topfile, pmifile, toppmifile, ddN.T, ddN.W, 1, pmicount, tp, tpmi); free(toppmifile); free(pmifile); free(tp); } /* * now report words and diagnostics */ //ttop_open(topfile); if ( fullreport ) { rp = fopen(repfile,"w"); if ( !rp ) yap_sysquit("Cannot open file '%s' for write\n", repfile); fprintf(rp, "#topic index rank prop word-sparse doc-sparse eff-words eff-docs docs-bound top-one " "dist-unif dist-unigrm"); if ( PCTL_BURSTY() ) fprintf(rp, " burst-concent"); if ( ddN.tokens ) fprintf(rp, " ave-length"); fprintf(rp, " coher"); if ( pmicount ) fprintf(rp, " pmi"); fprintf(rp, "\n#word topic index rank"); if ( ddS.Nwt ) fprintf(rp, " count"); fprintf(rp, " prop cumm df coher\n"); } for (k=0; k<ddN.T; k++) { int cnt; int kk = psort[k]; uint32_t **dfmtx; if ( ddP.phi==NULL && ddS.NWt[kk]==0 ) continue; /* * grab word prob vec for later use */ if ( ddS.Nwt ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = wordprob(w,kk); } else if ( ddP.phi ) fv_copy(pvec, ddP.phi[kk], ddN.W); else if ( ddS.phi ) fv_copy(pvec, ddS.phi[kk], ddN.W); /* * rebuild word list */ tscorek = kk; cnt = buildindk(kk, indk); topk(topword, cnt, indk, tscore); if ( topword<cnt ) cnt = topword; assert(cnt>0); /* * df stats for topic returned as matrix */ dfmtx = hca_dfmtx(indk, cnt, kk); if ( ddS.Nwt && (ddS.NWt[kk]*ddN.T*100<Nk_tot || 
ddS.NWt[kk]<5 )) underused++; /* * print stats for topic * Mallet: tokens, doc_ent, ave-word-len, coher., * uni-dist, corp-dist, eff-no-words */ yap_message("Topic %d/%d", kk, k); { /* * compute diagnostics */ double prop = gtvec[kk]; float *dprop = docprop(kk); double spw = 0; double spd = ((double)nonzero_Ndt(kk))/((double)ddN.DT); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); double sl = fv_avestrlen(pvec,ddN.tokens,ddN.W); double co = coherence(dfmtx, cnt); double ed = dprop?exp(fv_entropy(dprop,ddN.DT)):ddN.DT; double da = dprop?fv_bound(dprop,ddN.DT,1.0/sqrt((double)ddN.T)):0; sparsitydoc += spd; yap_message((ddN.T>200)?" p=%.3lf%%":" p=%.2lf%%",100*prop); if ( ddS.Nwt ) { spw = ((double)nonzero_Nwt(kk))/((double)ddN.W); sparsityword += spw; yap_message(" ws=%.1lf%%", 100*(1-spw)); } yap_message(" ds=%.1lf%%", 100*(1-spd) ); yap_message(" ew=%.0lf", ew); yap_message(" ed=%.1lf", ed); yap_message(" da=%.0lf", da+0.1); yap_message(" t1=%u", top1cnt[kk]); yap_message(" ud=%.3lf", ud); yap_message(" pd=%.3lf", pd); if ( PCTL_BURSTY() ) yap_message(" bd=%.3lf", ddP.bdk[kk]); if ( ddN.tokens ) yap_message(" sl=%.2lf", sl); yap_message(" co=%.3lf%%", co); if ( pmicount ) yap_message(" pmi=%.3f", tpmi[kk]); if ( fullreport ) { fprintf(rp,"topic %d %d", kk, k); fprintf(rp," %.6lf", prop); if ( ddS.Nwt ) { fprintf(rp," %.6lf", (1-spw)); } else { fprintf(rp," 0"); } fprintf(rp," %.6lf", (1-spd) ); fprintf(rp," %.2lf", ew); fprintf(rp," %.2lf", ed); fprintf(rp," %.0lf", da+0.1); fprintf(rp," %u", top1cnt[kk]); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); if ( PCTL_BURSTY() ) fprintf(rp," %.3lf", ddP.bdk[kk]); fprintf(rp," %.4lf", (ddN.tokens)?sl:0); fprintf(rp," %.6lf", co); if ( pmicount ) fprintf(rp," %.4f", tpmi[kk]); fprintf(rp,"\n"); } if ( dprop) free(dprop); } if ( verbose>1 ) { double pcumm = 0; /* * print top words: * Mallet: rank, count, prob, cumm, docs, coh */ 
yap_message("\ntopic %d/%d", kk, k); yap_message(" words="); for (w=0; w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", tscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", kk, indk[w], w); if ( ddS.Nwt ) fprintf(rp, " %d", ddS.Nwt[indk[w]][kk]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " %d", dfmtx[w][w]); fprintf(rp, " %.6f", coherence_word(dfmtx, cnt, w)); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } } yap_message("\n"); free(dfmtx[0]); free(dfmtx); } if ( verbose>1 && ddP.PYbeta && (ddP.phi==NULL || ddP.betapr) ) { int cnt; double pcumm = 0; /* * print root words */ tscorek = -1; cnt = buildindk(-1,indk); topk(topword, cnt, indk, (ddP.phi==NULL)?countscore:phiscore); /* * cannot build df mtx for root because * it is latent w.r.t. topics */ yap_message("Topic root words="); if ( fullreport ) { int w; for (w=0; w<ddN.W; w++) pvec[w] = betabasewordprob(w); double ew = exp(fv_entropy(pvec,ddN.W)); double ud = fv_helldistunif(pvec,ddN.W); double pd = fv_helldist(pvec,gpvec,ddN.W); fprintf(rp,"topic -1 -1 0 0"); fprintf(rp," %.4lf", ew); fprintf(rp," %.6lf", ud); fprintf(rp," %.6lf", pd); fprintf(rp,"\n"); } for (w=0; w<topword && w<cnt; w++) { if ( w>0 ) yap_message(","); if ( ddN.tokens ) yap_message("%s", ddN.tokens[indk[w]]); else yap_message("%d", indk[w]); if ( verbose>2 ) yap_message("(%6lf)", countscore(indk[w])); if ( fullreport ) { fprintf(rp, "word %d %d %d", -1, indk[w], w); if ( ddS.TwT ) fprintf(rp, " %d", ddS.TwT[w]); pcumm += pvec[indk[w]]; fprintf(rp, " %.6f %.6f", pvec[indk[w]], pcumm); fprintf(rp, " 0 0"); if ( ddN.tokens ) fprintf(rp, " %s", ddN.tokens[indk[w]]); fprintf(rp, "\n"); } } yap_message("\n"); } yap_message("\n"); if ( rp ) fclose(rp); if ( ddS.Nwt ) yap_message("Average topicXword sparsity = %.2lf%%\n", 
100*(1-sparsityword/ddN.T) ); yap_message("Average docXtopic sparsity = %.2lf%%\n" "Underused topics = %.1lf%%\n", 100*(1-sparsitydoc/ddN.T), 100.0*underused/(double)ddN.T); if ( pmicount ) yap_message("Average PMI = %.3f\n", tpmi[ddN.T]); /* * print */ if ( 1 ) { float **cmtx = hca_topmtx(); int t1, t2; int m1, m2; float mval; char *corfile = yap_makename(resstem,".topcor"); fp = fopen(corfile,"w"); if ( !fp ) yap_sysquit("Cannot open file '%s' for write\n", corfile); /* * print file */ for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) if ( cmtx[t1][t2]>1.0e-3 ) fprintf(fp, "%d %d %0.6f\n", t1, t2, cmtx[t1][t2]); } fclose(fp); free(corfile); /* * display maximum */ m1 = 1; m2 = 0; mval = cmtx[1][0]; for (t1=0; t1<ddN.T; t1++) { for (t2=0; t2<t1; t2++) { if ( mval<cmtx[t1][t2] ) { mval = cmtx[t1][t2]; m1 = t1; m2 = t2; } } } yap_message("Maximum correlated topics (%d,%d) = %f\n", m1, m2, mval); free(cmtx[0]); free(cmtx); } /* * print burstiness report */ if ( PCTL_BURSTY() ) { int tottbl = 0; int totmlttbl = 0; int totmlt = 0; int i; for (i=0; i<ddN.NT; i++) { if ( Z_issetr(ddS.z[i]) ) { if ( M_multi(i) ) totmlttbl++; tottbl++; } if ( M_multi(i) ) totmlt++; } yap_message("Burst report: multis=%.2lf%%, tables=%.2lf%%, tbls-in-multis=%.2lf%%\n", 100.0*((double)ddM.dim_multiind)/ddN.N, 100.0*((double)tottbl)/ddN.NT, 100.0*((double)totmlttbl)/totmlt); } yap_message("\n"); free(topfile); if ( repfile ) free(repfile); if ( top1cnt ) free(top1cnt); free(indk); free(psort); if ( pmicount ) free(tpmi); if ( NwK ) { free(NwK); NwK = NULL; } free(pvec); free(gtvec); free(gpvec); }
/* * run regular gibbs cycles on the data with phi used; * the evaluation on each doc, and sample word probs * * if qparts>0, split collection into parts and only search this * * K = number of top results to retain */ void gibbs_query(int K, char *qname, int dots, int this_qpart, int qparts) { /* * mapping from query word posn. to its mi in current doc * >ddN.N = not in current doc * -ve = has no mi since occurs just once, found at * posn (-map[]-1) * non -ve = mi value */ int *mimap = NULL; /* * usual stuff for Gibbs loop over docs */ int i, j; float *fact = fvec(ddN.T*4); D_MiSi_t dD; /* * an index into topk[] which maintains ordering */ int *topind; /* * these store statistics of the results, for printing * these are unordered, ordered by topind[] */ /* document score */ float *topscore; /* document number */ int *topk; /* flags if ord is irrelevant, thus not scored */ char *wordunused; /* * per word stats for top results saved */ int *found; float *topcnt; float *topwordscore; /* * temporary versions for when gibbs running */ int *found_buf; float *topcnt_buf; float *topwordscore_buf; double *logprob; /* * search here */ int startdoc = 0; int enddoc = ddN.DT; /* * setup */ topcnt = malloc(sizeof(topcnt[0])*K*ddP.n_words); topwordscore = malloc(sizeof(topwordscore[0])*K*ddP.n_words); found = malloc(sizeof(found)*ddP.n_words*K); wordunused = malloc(sizeof(wordunused[0])*ddP.n_words); topcnt_buf = malloc(sizeof(topcnt[0])*ddP.n_words); topwordscore_buf = malloc(sizeof(topwordscore[0])*ddP.n_words); found_buf = malloc(sizeof(found)*ddP.n_words); if ( !topcnt || !topwordscore || !found || !topcnt_buf || !topwordscore_buf || !found_buf ) yap_quit("Cannot allocate memory in gibbs_query()\n"); logprob = malloc(sizeof(logprob[0])*ddP.n_query); topscore = malloc(sizeof(topscore[0])*K*ddP.n_query); topind = malloc(sizeof(topind[0])*K*ddP.n_query); topk = malloc(sizeof(topk[0])*K*ddP.n_query); if ( ddP.bdk!=NULL ) mimap = malloc(sizeof(mimap[0])*ddP.n_words); if ( !topk || 
!topscore || !logprob || !topind ) yap_quit("Cannot allocate memory in gibbs_query()\n"); for (i=0; i<ddP.n_words; i++) { wordunused[i] = 0; } for (i=0; i<K*ddP.n_query; i++) { topind[i] = i%K; topk[i] = -1; topscore[i] = INFINITY; } /* * check words to exclude using topics */ if ( ddP.n_excludetopic>0 ) { double *tprob = malloc(sizeof(tprob[0])*ddN.T); get_probs(tprob); yap_probs(); if ( verbose>1 ) yap_message("Excluding words: "); for (i=0; i<ddP.n_words; i++) { int t = besttopic(ddP.qword[i],tprob); if ( Q_excludetopic(t) ) { wordunused[i] = 1; if ( verbose>1 ) yap_message(" %d/%d", (int)ddP.qword[i], t); } } if ( verbose>1 ) yap_message("\n"); free(tprob); } if ( ddP.bdk!=NULL ) misi_init(&ddM,&dD); if ( qparts>0 ) { startdoc = ((double)this_qpart)/qparts * ddN.DT; enddoc = ((double)this_qpart+1.0)/qparts * ddN.DT; } for(i=startdoc; i<enddoc; i++) { int thisw = add_doc(i, GibbsNone); int r; if ( thisw<=1 ) { remove_doc(i, GibbsNone); continue; } if ( ddP.bdk!=NULL ) misi_build(&dD, i, 0); map_query(i, mimap, found_buf); for (j=0; j<ddP.n_words; j++) { topcnt_buf[j] = 0; topwordscore_buf[j] = 0; } for (r=0; r<ddP.queryiter; r++) { gibbs_lda(GibbsNone, ddN.T, i, ddD.NdT[i], fact, &dD, 0, 0); query_docprob(i, mimap, fact, &dD, topcnt_buf, topwordscore_buf); } /* * now adjust stats */ for (j=0; j<ddP.n_query; j++) logprob[j] = 0; for (j=0; j<ddP.n_words; j++) { if ( wordunused[j]>0 ) continue; if ( ddP.query[ddP.qword[j]]==j ) { topcnt_buf[j] /= ddP.queryiter; topwordscore_buf[j] /= ddP.queryiter; } else { /* word in previous query so copy */ int jj = ddP.query[ddP.qword[j]]; topcnt_buf[j] = topcnt_buf[jj]; topwordscore_buf[j] = topwordscore_buf[jj]; found_buf[j] = found_buf[jj]; } if ( wordunused[j]==0 ) logprob[ddP.qid[j]] += topwordscore_buf[j]; } if ( dots>0 && i>0 && (i%dots==0) ) yap_message("."); if ( ddP.bdk!=NULL ) misi_unbuild(&dD,i,0); remove_doc(i, GibbsNone); /* * enter into the arrays */ for (j=0; j<ddP.n_query; j++) { if ( i<K || logprob[j] < 
topscore[j*K+topind[j*K+K-1]] ) { int newind, l; /* * better than current lowest */ newind = bubble((i<K)?(i+1):K, &topind[j*K], &topscore[j*K], logprob[j]); /* * save the current details */ topscore[j*K+newind] = logprob[j]; topk[j*K+newind] = i; for (l=ddP.qposn[j]; l<ddP.qposn[j+1]; l++) { topcnt[newind*ddP.n_words+l] = topcnt_buf[l]; topwordscore[newind*ddP.n_words+l] = topwordscore_buf[l]; found[newind*ddP.n_words+l] = found_buf[l]; } } } } if ( dots>0 ) yap_message("\n"); /* * write result */ { float *ws = fvec(ddP.n_words); FILE *fp = fopen(qname,"w"); int q; if ( !fp ) yap_sysquit("Cannot write query results to '%s'\n", qname); for (q=0; q<ddP.n_query; q++) { int nw = ddP.qposn[q+1]-ddP.qposn[q]; for (i=0; i<K && i<ddN.DT && topk[topind[q*K+i]]>=0; i++) { int l, ind = topind[q*K+i]; double tfidf; tfidf = bm25(topk[q*K+ind],&found[ind*ddP.n_words+ddP.qposn[q]], &ddP.qword[ddP.qposn[q]], nw, ws); assert(ind>=0 && ind<K); fprintf(fp, "%d %d ", q, topk[q*K+ind]); fprintf(fp, "%.4f %.4lf ", topscore[q*K+ind]/nw, tfidf); if ( verbose>1 ) { for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%d ", found[ind*ddP.n_words+l]); for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%f ", topcnt[ind*ddP.n_words+l]); for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%f ", topwordscore[ind*ddP.n_words+l]); for (l=0; l<nw; l++) fprintf(fp, "%lf ", ws[l]); } fprintf(fp, "\n"); } } fclose(fp); free(ws); } /* * clean up */ free(fact); if ( ddP.bdk!=NULL ) misi_free(&dD); if ( mimap ) free(mimap); free(found); free(topwordscore); free(topcnt); free(found_buf); free(topwordscore_buf); free(topcnt_buf); free(topscore); free(topind); free(topk); free(logprob); }