Beispiel #1
0
void *sampling_p(void *pargs)
{
  int i;
  float *p = fvec(ddN.T * 4);
  D_MiSi_t dD;
  D_pargs_p *par =(D_pargs_p *) pargs;
  clock_t t1 = clock();
  
  if ( PCTL_BURSTY() )
    misi_init(&ddM,&dD);
  /*
   *  sampling
   */
  par->thislp = 0;
  par->thisNd = 0;
  while ( (i=atomic_incr(*par->doc)-1)<ddN.DT ) {
    if ( PCTL_BURSTY() )
      misi_build(&dD,i,0);
    par->thislp += gibbs_lda(GibbsNone, i, ddD.N_dT[i], p, &dD);
    par->thisNd += ddD.N_dT[i];
    if ( par->dots>0 && i>0 && (i%par->dots==0) ) 
      yap_message(".");
    if ( PCTL_BURSTY() )
      misi_unbuild(&dD,i,0);
  }
  free(p);
  if ( PCTL_BURSTY() )
    misi_free(&dD);
  par->tot_time = (double)(clock() - t1) / CLOCKS_PER_SEC;
  return NULL;
}
Beispiel #2
0
/*
 *    run regular gibbs cycles on the data with phi used;
 *    the evaluation on each doc, and sample word probs
 *
 *    if qparts>0, split collection into parts and only search this
 *
 *    K = number of top results to retain
 */
void gibbs_query(int K, char *qname, int dots, int this_qpart, int qparts) {
  /*
   *    mapping from query word posn. to its mi in current doc
   *       >ddN.N  = not in current doc
   *       -ve  = has no mi since occurs just once, found at
   *              posn  (-map[]-1)
   *       non -ve = mi value
   */
  int     *mimap = NULL;
  /*
   *     usual stuff for Gibbs loop over docs
   */
  int i, j;
  float *fact = fvec(ddN.T*4);
  D_MiSi_t dD;
  /*
   *   an index into topk[] which maintains ordering
   */
  int     *topind;

  /*
   *    these store statistics of the results, for printing
   *    these are unordered, ordered by topind[]
   */
  /*      document score  */
  float   *topscore;
  /*      document number   */
  int     *topk;
  /*      flags if ord is irrelevant, thus not scored  */
  char    *wordunused;

  /*
   *    per word stats for top results saved
   */
  int     *found;
  float   *topcnt;
  float   *topwordscore;
  /*
   *    temporary versions for when gibbs running
   */
  int     *found_buf;
  float   *topcnt_buf;
  float   *topwordscore_buf;
  double  *logprob;
  /*
   *   search here
   */
  int startdoc = 0;
  int enddoc = ddN.DT;

  /*
   *    setup
   */
  topcnt = malloc(sizeof(topcnt[0])*K*ddP.n_words);
  topwordscore = malloc(sizeof(topwordscore[0])*K*ddP.n_words);
  found = malloc(sizeof(found)*ddP.n_words*K);
  wordunused = malloc(sizeof(wordunused[0])*ddP.n_words);

  topcnt_buf = malloc(sizeof(topcnt[0])*ddP.n_words);
  topwordscore_buf = malloc(sizeof(topwordscore[0])*ddP.n_words);
  found_buf = malloc(sizeof(found)*ddP.n_words);
  if ( !topcnt || !topwordscore || !found || 
       !topcnt_buf || !topwordscore_buf || !found_buf )
    yap_quit("Cannot allocate memory in gibbs_query()\n");

  logprob = malloc(sizeof(logprob[0])*ddP.n_query);
  topscore = malloc(sizeof(topscore[0])*K*ddP.n_query);
  topind = malloc(sizeof(topind[0])*K*ddP.n_query);
  topk = malloc(sizeof(topk[0])*K*ddP.n_query);
  if ( ddP.bdk!=NULL ) 
    mimap = malloc(sizeof(mimap[0])*ddP.n_words);
  if ( !topk || !topscore || !logprob || !topind )
    yap_quit("Cannot allocate memory in gibbs_query()\n");
  for (i=0; i<ddP.n_words; i++) {
    wordunused[i] = 0;
  }
  for (i=0; i<K*ddP.n_query; i++) {
    topind[i] = i%K;
    topk[i] = -1;
    topscore[i] = INFINITY;
  }
  
  /*
   *  check words to exclude using topics
   */
  if ( ddP.n_excludetopic>0 ) {
    double *tprob = malloc(sizeof(tprob[0])*ddN.T);
    get_probs(tprob);
    yap_probs();
    if ( verbose>1 )
      yap_message("Excluding words: ");
    for (i=0; i<ddP.n_words; i++) {
      int t = besttopic(ddP.qword[i],tprob);
      if ( Q_excludetopic(t) ) {
	wordunused[i] = 1;
	if ( verbose>1 )
	  yap_message(" %d/%d", (int)ddP.qword[i], t);
      }
    } 
    if ( verbose>1 )
      yap_message("\n");
    free(tprob);
  }
  
  if ( ddP.bdk!=NULL ) misi_init(&ddM,&dD);

  if ( qparts>0 ) {
    startdoc = ((double)this_qpart)/qparts * ddN.DT;
    enddoc = ((double)this_qpart+1.0)/qparts * ddN.DT;
  }
  for(i=startdoc; i<enddoc; i++) {
    int  thisw =  add_doc(i, GibbsNone);
    int  r;
    if ( thisw<=1 ) {
      remove_doc(i, GibbsNone);
      continue;
    }
    if ( ddP.bdk!=NULL ) 
      misi_build(&dD, i, 0);
    map_query(i, mimap, found_buf);
    for (j=0; j<ddP.n_words; j++) {
      topcnt_buf[j] = 0;
      topwordscore_buf[j] = 0;
    }
    
    for (r=0; r<ddP.queryiter; r++) {
      gibbs_lda(GibbsNone, ddN.T, i, ddD.NdT[i], fact, &dD, 0, 0);
      query_docprob(i, mimap, fact, &dD, topcnt_buf, topwordscore_buf);
    }  
    /*
     *  now adjust stats
     */
    for (j=0; j<ddP.n_query; j++) 
      logprob[j] = 0;
    for (j=0; j<ddP.n_words; j++) {
      if ( wordunused[j]>0 )
	continue;
      if ( ddP.query[ddP.qword[j]]==j ) {
	topcnt_buf[j] /= ddP.queryiter;
	topwordscore_buf[j] /= ddP.queryiter;
      } else {
	/*  word in previous query so copy  */
	int jj =  ddP.query[ddP.qword[j]];
	topcnt_buf[j] = topcnt_buf[jj];
	topwordscore_buf[j] = topwordscore_buf[jj];
	found_buf[j] = found_buf[jj];
      }
      if ( wordunused[j]==0 )
	logprob[ddP.qid[j]] += topwordscore_buf[j];
    }
    if ( dots>0 && i>0 && (i%dots==0) ) 
      yap_message(".");
    if ( ddP.bdk!=NULL ) misi_unbuild(&dD,i,0);
    remove_doc(i, GibbsNone);
    /*
     *   enter into the arrays
     */
    for (j=0; j<ddP.n_query; j++) {
      if ( i<K || logprob[j] < topscore[j*K+topind[j*K+K-1]] ) {
	int newind, l;
	/*
	 *   better than current lowest 
	 */
	newind = bubble((i<K)?(i+1):K, 
			&topind[j*K], &topscore[j*K], logprob[j]);
	/*
	 *   save the current details
	 */
	topscore[j*K+newind] = logprob[j];
	topk[j*K+newind] = i;
	for (l=ddP.qposn[j]; l<ddP.qposn[j+1]; l++) {
	  topcnt[newind*ddP.n_words+l] = topcnt_buf[l]; 
	  topwordscore[newind*ddP.n_words+l] = topwordscore_buf[l]; 
	  found[newind*ddP.n_words+l] = found_buf[l]; 
	}
      }
    }
  }
  if ( dots>0 ) yap_message("\n");
  
  /*
   *  write result
   */
  {
    float *ws = fvec(ddP.n_words);
    FILE *fp = fopen(qname,"w");
    int q;
    if ( !fp )
      yap_sysquit("Cannot write query results to '%s'\n", qname);
    for (q=0; q<ddP.n_query; q++) {
      int nw = ddP.qposn[q+1]-ddP.qposn[q];
      for (i=0; i<K && i<ddN.DT && topk[topind[q*K+i]]>=0; i++) {
	int l, ind = topind[q*K+i];
	double tfidf;
        tfidf = bm25(topk[q*K+ind],&found[ind*ddP.n_words+ddP.qposn[q]],
			    &ddP.qword[ddP.qposn[q]], nw, ws);
	assert(ind>=0 && ind<K);
	fprintf(fp, "%d %d ", q, topk[q*K+ind]);
	fprintf(fp, "%.4f %.4lf ", topscore[q*K+ind]/nw, tfidf);
        if ( verbose>1 ) {
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%d ", found[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topcnt[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topwordscore[ind*ddP.n_words+l]);
          for (l=0; l<nw; l++)
            fprintf(fp, "%lf ", ws[l]);
        }
        fprintf(fp, "\n");
      }
    }
    fclose(fp);
    free(ws);
  }

  /*
   *  clean up
   */
  free(fact);
  if ( ddP.bdk!=NULL ) misi_free(&dD);
  if ( mimap ) free(mimap);
  free(found);
  free(topwordscore);
  free(topcnt);
  free(found_buf);
  free(topwordscore_buf);
  free(topcnt_buf);
  free(topscore);
  free(topind);
  free(topk);
  free(logprob);
}