/*
 * intersection - build the list of documents that appear in both inputs.
 *
 * tmp_list, list_2: document lists to intersect; either may be NULL.
 *
 * Returns a list created with init_list(). For every docID found in
 * both inputs it holds one entry whose frequency is the sum of the two
 * input frequencies. If either input is NULL the result is empty.
 *
 * Both inputs are consumed: every non-NULL input is released with
 * free_list() before returning, including the case where the other
 * input is NULL. Callers must therefore pass lists they own (for
 * example copies), never a list that is still referenced elsewhere.
 */
WordNode * intersection(WordNode * tmp_list, WordNode * list_2)
{
	WordNode * intersect = init_list();

	if (tmp_list && list_2) {
		for (DocNode * doc = tmp_list->head; doc != NULL; doc = doc->next) {
			// look the same document up in the second list
			DocNode * shared = get_index(doc->docID, list_2);
			if (shared) {
				// present in both lists: keep it with the combined frequency
				add_doc(doc->docID, shared->freq + doc->freq, intersect);
			}
		}
	}

	// release whatever was handed in, on every path
	if (tmp_list) {
		free_list(tmp_list);
	}
	if (list_2) {
		free_list(list_2);
	}

	return intersect;
}
Exemple #2
0
/*
 * read_file - feed every line of a text file to add_doc().
 *
 * head:      passed unchanged as the first argument of add_doc()
 * file_name: path of the file to read
 *
 * Lines are read in chunks of at most BUFFER_SIZE-1 characters and the
 * trailing newline, if present, is removed before the call to add_doc().
 * If the file cannot be opened a message is printed to stdout and the
 * process exits with status 1.
 */
void read_file(DOC *head, char *file_name)
{
    FILE *fp;
    char line[BUFFER_SIZE];
    
    if ( (fp = fopen(file_name, "r")) == NULL )
    {
        printf("Failed to open: %s\n", file_name);
        exit(1);
    }
    
    while (fgets(line, sizeof(line), fp) != NULL)
    {
        /*
         * Remove the newline character. strcspn() stops at the
         * terminator when there is no newline (last line of the file,
         * or a line longer than the buffer), so no real character is
         * lost and an empty string is never indexed at -1.
         */
        line[strcspn(line, "\n")] = '\0';
        add_doc(head, line);
    }
    fclose(fp);
}
/*
 * make_copy - duplicate a document list.
 *
 * Used so that later operations can free the copy instead of a list
 * that still lives in the hash table.
 *
 * Returns NULL when list is NULL; otherwise a new list from init_list()
 * that received one add_doc() call per node of the input, in the
 * input's traversal order, with the same docID and freq.
 */
WordNode * make_copy(WordNode * list)
{
	WordNode * duplicate = NULL;

	if (list != NULL) {
		duplicate = init_list();

		for (DocNode * cur = list->head; cur != NULL; cur = cur->next) {
			add_doc(cur->docID, cur->freq, duplicate);
		}
	}

	return duplicate;
}
/*
 * ReadFile - load an index file into a new hash table.
 *
 * file: path of the index file.
 *
 * Each line is parsed as space-separated tokens:
 *     word num_docs docID freq docID freq ...
 * with num_docs (docID, freq) pairs following the count.
 *
 * Returns the table created by init_hash() with one add_hash() call per
 * line, or NULL if the file cannot be opened, a line is malformed
 * (missing tokens, or a document count that is not a positive number),
 * or memory runs out. On failure the table is released with
 * free_table() and the file is closed.
 */
HashTable *ReadFile(char *file)
{
	FILE *fp = fopen(file, "r");

	if (fp == NULL) {
		fprintf(stderr, "ERROR: Failure to open index file %s", file);
		return NULL;
	}

	HashTable *ht = init_hash();

	// (docID, freq) pairs of the current line, stored last pair first
	int *backwards = NULL;
	char line[BUF_MAX];

	/* Read the entire file line by line. */
	while (fgets(line, BUF_MAX, fp) != NULL) {

		char *word = strtok(line, " ");
		char *count_tok = word ? strtok(NULL, " ") : NULL;
		int num_docs = count_tok ? atoi(count_tok) : 0;

		// a missing, zero or negative count is a format error
		if (num_docs <= 0) {
			goto bad_format;
		}

		// heap buffer: the count comes from the file, so it must not
		// size a stack array; calloc also checks the multiplication
		backwards = calloc((size_t)num_docs * 2, sizeof *backwards);
		if (backwards == NULL) {
			fprintf(stderr, "ERROR: out of memory while reading index file.");
			goto fail;
		}

		for (int i = 0; i < num_docs; i++) {
			char *id_tok = strtok(NULL, " ");
			char *freq_tok = id_tok ? strtok(NULL, " ") : NULL;

			// fewer pairs on the line than the count promised
			if (freq_tok == NULL) {
				goto bad_format;
			}

			// pair i goes into the i-th slot counted from the back
			size_t slot = 2 * (size_t)(num_docs - 1 - i);
			backwards[slot] = atoi(id_tok);
			backwards[slot + 1] = atoi(freq_tok);
		}

		// The list is created only after the whole line has parsed,
		// so the error paths never have a half-built list to release.
		WordNode *list = init_list();

		// add the docs in reverse file order (the original code notes
		// they "have to" be loaded backwards -- presumably because of
		// how add_doc() inserts; confirm against add_doc)
		for (size_t i = 0; i < (size_t)num_docs; i++) {
			add_doc(backwards[2 * i], backwards[2 * i + 1], list);
		}

		// add the completed WordNode to the hash table
		add_hash(word, list, ht);

		free(backwards);
		backwards = NULL;
	}

	fclose(fp);

	return ht;

bad_format:
	fprintf(stderr, "ERROR: index file incorrect format.");
fail:
	free(backwards);
	free_table(ht);
	fclose(fp);
	return NULL;
}
/*
 * unionize - build the list of documents that appear in either input.
 *
 * tmp_list, list_2: document lists to merge; either or both may be NULL.
 *
 * Returns a list created with init_list(). Entries are added in this
 * order: first every document of tmp_list (a document that is also in
 * list_2 gets the larger of the two frequencies), then every document
 * of list_2 that is not in tmp_list. If both inputs are NULL the
 * result is empty.
 *
 * Both inputs are consumed: every non-NULL input is released with
 * free_list() before returning, so callers must pass lists they own
 * (copies), never a list still held by the hash table.
 */
WordNode * unionize(WordNode * tmp_list, WordNode * list_2)
{
	WordNode * unioned = init_list();

	// every document of tmp_list goes into the result
	if (tmp_list) {
		for (DocNode * doc = tmp_list->head; doc != NULL; doc = doc->next) {
			DocNode * shared = list_2 ? get_index(doc->docID, list_2) : NULL;
			int freq = doc->freq;

			if (shared) {
				// present in both lists: take max frequency
				freq = MAX((shared->freq), (doc->freq));
			}
			add_doc(doc->docID, freq, unioned);
		}
	}

	// then the documents that only list_2 has
	if (list_2) {
		for (DocNode * doc = list_2->head; doc != NULL; doc = doc->next) {
			// Membership is checked directly against tmp_list. A
			// zero-filled scratch array of seen IDs would wrongly
			// match docID 0 and would also have to be freed.
			if (tmp_list && get_index(doc->docID, tmp_list)) {
				continue;	// already added in the first pass
			}
			add_doc(doc->docID, doc->freq, unioned);
		}
	}

	if (tmp_list) {
		free_list(tmp_list);
	}
	if (list_2) {
		free_list(list_2);
	}

	return unioned;
}
Exemple #6
0
// simple test, intersecting one list into final_link
int TestINTERSECT1() {
  START_TEST_CASE;
  tmp = init_list();
  add_doc(1, 2, tmp);
  final = NULL;
Exemple #7
0
/*
 *    run regular gibbs cycles on the data with phi used;
 *    the evaluation on each doc, and sample word probs
 *
 *    if qparts>0, split collection into qparts parts and only search
 *    part number this_qpart, i.e. documents
 *        [this_qpart/qparts * ddN.DT, (this_qpart+1)/qparts * ddN.DT)
 *
 *    K     = number of top results to retain (per query)
 *    qname = name of the file the results are written to
 *    dots  = if >0, print a "." every `dots` documents as progress
 *
 *    Output, one line per retained result:
 *        query-number doc-number score/nw bm25-value
 *    where nw is the number of words in the query; with verbose>1 the
 *    per-word found[], topcnt[], topwordscore[] and ws[] values follow.
 *
 *    Quits via yap_quit()/yap_sysquit() if memory cannot be allocated
 *    or the result file cannot be opened.
 */
void gibbs_query(int K, char *qname, int dots, int this_qpart, int qparts) {
  /*
   *    mapping from query word posn. to its mi in current doc
   *       >ddN.N  = not in current doc
   *       -ve  = has no mi since occurs just once, found at
   *              posn  (-map[]-1)
   *       non -ve = mi value
   */
  int     *mimap = NULL;
  /*
   *     usual stuff for Gibbs loop over docs
   */
  int i, j;
  float *fact = fvec(ddN.T*4);
  D_MiSi_t dD;
  /*
   *   an index into topk[] which maintains ordering
   */
  int     *topind;

  /*
   *    these store statistics of the results, for printing
   *    these are unordered, ordered by topind[]
   */
  /*      document score  */
  float   *topscore;
  /*      document number   */
  int     *topk;
  /*      flags if word is irrelevant, thus not scored  */
  char    *wordunused;

  /*
   *    per word stats for top results saved
   */
  int     *found;
  float   *topcnt;
  float   *topwordscore;
  /*
   *    temporary versions for when gibbs running
   */
  int     *found_buf;
  float   *topcnt_buf;
  float   *topwordscore_buf;
  double  *logprob;
  /*
   *   search here
   */
  int startdoc = 0;
  int enddoc = ddN.DT;

  /*
   *    setup;
   *    sizes use the element type (sizeof(x[0])), not the pointer type
   */
  topcnt = malloc(sizeof(topcnt[0])*K*ddP.n_words);
  topwordscore = malloc(sizeof(topwordscore[0])*K*ddP.n_words);
  found = malloc(sizeof(found[0])*ddP.n_words*K);
  wordunused = malloc(sizeof(wordunused[0])*ddP.n_words);

  topcnt_buf = malloc(sizeof(topcnt_buf[0])*ddP.n_words);
  topwordscore_buf = malloc(sizeof(topwordscore_buf[0])*ddP.n_words);
  found_buf = malloc(sizeof(found_buf[0])*ddP.n_words);
  if ( !topcnt || !topwordscore || !found || !wordunused ||
       !topcnt_buf || !topwordscore_buf || !found_buf )
    yap_quit("Cannot allocate memory in gibbs_query()\n");

  logprob = malloc(sizeof(logprob[0])*ddP.n_query);
  topscore = malloc(sizeof(topscore[0])*K*ddP.n_query);
  topind = malloc(sizeof(topind[0])*K*ddP.n_query);
  topk = malloc(sizeof(topk[0])*K*ddP.n_query);
  if ( ddP.bdk!=NULL ) {
    mimap = malloc(sizeof(mimap[0])*ddP.n_words);
    if ( !mimap )
      yap_quit("Cannot allocate memory in gibbs_query()\n");
  }
  if ( !topk || !topscore || !logprob || !topind )
    yap_quit("Cannot allocate memory in gibbs_query()\n");
  for (i=0; i<ddP.n_words; i++) {
    wordunused[i] = 0;
  }
  /*
   *   each query owns K consecutive slots; slots start empty
   *   (doc -1, score INFINITY) with the identity ordering
   */
  for (i=0; i<K*ddP.n_query; i++) {
    topind[i] = i%K;
    topk[i] = -1;
    topscore[i] = INFINITY;
  }
  
  /*
   *  check words to exclude using topics
   */
  if ( ddP.n_excludetopic>0 ) {
    double *tprob = malloc(sizeof(tprob[0])*ddN.T);
    if ( !tprob )
      yap_quit("Cannot allocate memory in gibbs_query()\n");
    get_probs(tprob);
    yap_probs();
    if ( verbose>1 )
      yap_message("Excluding words: ");
    for (i=0; i<ddP.n_words; i++) {
      int t = besttopic(ddP.qword[i],tprob);
      if ( Q_excludetopic(t) ) {
        wordunused[i] = 1;
        if ( verbose>1 )
          yap_message(" %d/%d", (int)ddP.qword[i], t);
      }
    } 
    if ( verbose>1 )
      yap_message("\n");
    free(tprob);
  }
  
  if ( ddP.bdk!=NULL ) misi_init(&ddM,&dD);

  if ( qparts>0 ) {
    startdoc = ((double)this_qpart)/qparts * ddN.DT;
    enddoc = ((double)this_qpart+1.0)/qparts * ddN.DT;
  }
  for(i=startdoc; i<enddoc; i++) {
    int  thisw =  add_doc(i, GibbsNone);
    int  r;
    /*  documents for which add_doc() reports at most 1 are skipped  */
    if ( thisw<=1 ) {
      remove_doc(i, GibbsNone);
      continue;
    }
    if ( ddP.bdk!=NULL ) 
      misi_build(&dD, i, 0);
    map_query(i, mimap, found_buf);
    for (j=0; j<ddP.n_words; j++) {
      topcnt_buf[j] = 0;
      topwordscore_buf[j] = 0;
    }
    
    /*  accumulate over queryiter sampling rounds; averaged below  */
    for (r=0; r<ddP.queryiter; r++) {
      gibbs_lda(GibbsNone, ddN.T, i, ddD.NdT[i], fact, &dD, 0, 0);
      query_docprob(i, mimap, fact, &dD, topcnt_buf, topwordscore_buf);
    }  
    /*
     *  now adjust stats
     */
    for (j=0; j<ddP.n_query; j++) 
      logprob[j] = 0;
    for (j=0; j<ddP.n_words; j++) {
      if ( wordunused[j]>0 )
        continue;
      if ( ddP.query[ddP.qword[j]]==j ) {
        topcnt_buf[j] /= ddP.queryiter;
        topwordscore_buf[j] /= ddP.queryiter;
      } else {
        /*  word in previous query so copy  */
        int jj =  ddP.query[ddP.qword[j]];
        topcnt_buf[j] = topcnt_buf[jj];
        topwordscore_buf[j] = topwordscore_buf[jj];
        found_buf[j] = found_buf[jj];
      }
      if ( wordunused[j]==0 )
        logprob[ddP.qid[j]] += topwordscore_buf[j];
    }
    if ( dots>0 && i>0 && (i%dots==0) ) 
      yap_message(".");
    if ( ddP.bdk!=NULL ) misi_unbuild(&dD,i,0);
    remove_doc(i, GibbsNone);
    /*
     *   enter into the arrays
     */
    for (j=0; j<ddP.n_query; j++) {
      if ( i<K || logprob[j] < topscore[j*K+topind[j*K+K-1]] ) {
        int newind, l;
        /*
         *   better than current lowest 
         */
        newind = bubble((i<K)?(i+1):K, 
                        &topind[j*K], &topscore[j*K], logprob[j]);
        /*
         *   save the current details
         */
        topscore[j*K+newind] = logprob[j];
        topk[j*K+newind] = i;
        for (l=ddP.qposn[j]; l<ddP.qposn[j+1]; l++) {
          topcnt[newind*ddP.n_words+l] = topcnt_buf[l]; 
          topwordscore[newind*ddP.n_words+l] = topwordscore_buf[l]; 
          found[newind*ddP.n_words+l] = found_buf[l]; 
        }
      }
    }
  }
  if ( dots>0 ) yap_message("\n");
  
  /*
   *  write result
   */
  {
    float *ws = fvec(ddP.n_words);
    FILE *fp = fopen(qname,"w");
    int q;
    if ( !fp )
      yap_sysquit("Cannot write query results to '%s'\n", qname);
    for (q=0; q<ddP.n_query; q++) {
      int nw = ddP.qposn[q+1]-ddP.qposn[q];
      /*
       *  stop at the first empty slot of THIS query: topk[] must be
       *  offset by q*K just like topind[], otherwise every query
       *  after the first tests the slots of query 0
       */
      for (i=0; i<K && i<ddN.DT && topk[q*K+topind[q*K+i]]>=0; i++) {
        int l, ind = topind[q*K+i];
        double tfidf;
        /*  validate the slot index before it is used for indexing  */
        assert(ind>=0 && ind<K);
        tfidf = bm25(topk[q*K+ind],&found[ind*ddP.n_words+ddP.qposn[q]],
                            &ddP.qword[ddP.qposn[q]], nw, ws);
        fprintf(fp, "%d %d ", q, topk[q*K+ind]);
        fprintf(fp, "%.4f %.4lf ", topscore[q*K+ind]/nw, tfidf);
        if ( verbose>1 ) {
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%d ", found[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topcnt[ind*ddP.n_words+l]);
          for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++)
            fprintf(fp, "%f ", topwordscore[ind*ddP.n_words+l]);
          for (l=0; l<nw; l++)
            fprintf(fp, "%lf ", ws[l]);
        }
        fprintf(fp, "\n");
      }
    }
    fclose(fp);
    free(ws);
  }

  /*
   *  clean up
   */
  free(fact);
  if ( ddP.bdk!=NULL ) misi_free(&dD);
  free(mimap);
  free(wordunused);
  free(found);
  free(topwordscore);
  free(topcnt);
  free(found_buf);
  free(topwordscore_buf);
  free(topcnt_buf);
  free(topscore);
  free(topind);
  free(topk);
  free(logprob);
}