/*
 * intersection - build a new list holding the documents found in both inputs.
 *
 * Every DocNode of tmp_list whose docID is also located in list_2 (via
 * get_index) is added to the result with the two frequencies summed.
 *
 * Ownership: both arguments are consumed. Every non-NULL input is released
 * with free_list() before returning -- including on the early-return paths,
 * which previously leaked the surviving list. Callers must therefore pass
 * lists they own (e.g. copies), never lists still stored elsewhere.
 *
 * Returns a list obtained from init_list(); it is empty when either input is
 * NULL or when no docID is shared.
 */
WordNode *intersection(WordNode *tmp_list, WordNode *list_2)
{
    WordNode *intersect = init_list();

    /* No results for one of the operands: the intersection is empty. */
    if (!tmp_list || !list_2) {
        if (tmp_list) {
            free_list(tmp_list);
        }
        if (list_2) {
            free_list(list_2);
        }
        return intersect;
    }

    for (DocNode *doc = tmp_list->head; doc != NULL; doc = doc->next) {
        /* Is this document also present in the second list? */
        DocNode *shared = get_index(doc->docID, list_2);
        if (shared) {
            add_doc(doc->docID, shared->freq + doc->freq, intersect);
        }
    }

    free_list(tmp_list);
    free_list(list_2);
    return intersect;
}
/*
 * read_file - pass every line of a text file to add_doc().
 *
 * head:      forwarded unchanged to add_doc() for each line.
 * file_name: path of the file to read.
 *
 * Each line is read with fgets() into a BUFFER_SIZE buffer and its trailing
 * newline, if any, is stripped before the call. A line longer than the buffer
 * is delivered in several consecutive chunks.
 *
 * If the file cannot be opened, a message is printed and the process exits
 * with status 1.
 */
void read_file(DOC *head, char *file_name)
{
    FILE *fp;
    char line[BUFFER_SIZE];

    if ((fp = fopen(file_name, "r")) == NULL) {
        printf("Failed to open: %s\n", file_name);
        exit(1);
    }

    while (fgets(line, sizeof(line), fp) != NULL) {
        /*
         * Remove the newline character -- but only when it is really there.
         * The last line of a file, or a chunk of an over-long line, has no
         * '\n'; blindly cutting the final byte would drop real data, and an
         * empty string would be indexed at -1.
         */
        size_t len = strlen(line);
        if (len > 0 && line[len - 1] == '\n') {
            line[len - 1] = '\0';
        }
        add_doc(head, line);
    }

    fclose(fp);
}
//make list copy - should not free a list in the hash table. WordNode * make_copy(WordNode * list) { if (!list){ return NULL; } WordNode * copy = init_list(); DocNode * doc = list->head; while (doc != NULL) { // just add doc add_doc(doc->docID , doc->freq , copy); // fprintf(stdout, "%d %d ", doc->docID, doc->freq); doc = doc->next; } return copy; }
HashTable *ReadFile(char *file) { FILE * fp; if ((fp=fopen(file, "r")) == NULL) { fprintf(stderr, "ERROR: Failure to open index file %s", file); return NULL; } HashTable *ht = init_hash(); char line[BUF_MAX]; /* Read the entire file line by line. */ while (fgets(line, BUF_MAX, fp) != NULL) { //initialize word, numdocs, and wordnode char *word; int num_docs = 0; word = strtok(line, " "); num_docs = atoi(strtok(NULL, " ")); if( num_docs == 0){ fprintf(stderr, "ERROR: index file incorrect format."); free_table(ht); return NULL; } WordNode * list = init_list(); //have to load the docs in backwards int backwards[num_docs*2]; for(int i=0; i<num_docs; i++){ int doc_id = atoi(strtok(NULL, " ")); int freq = atoi(strtok(NULL, " ")); // get position of docid to be loaded in backwards int here = (num_docs * 2 - 2) - (2*i); int next = (num_docs * 2 - 1) - (2*i); //load from the back backwards[here] = doc_id; backwards[next] = freq; } //add in the docs using backwards array for(int i = 0; i < num_docs; i++) { add_doc(backwards[2*i], backwards[(2*i)+1], list); } //add the completed WordNode to the hash table add_hash(word, list, ht); } fclose(fp); return ht; }
/* Add a copy of every document of src to dst, in src's traversal order. */
static void unionize_copy_docs(WordNode *src, WordNode *dst)
{
    for (DocNode *doc = src->head; doc != NULL; doc = doc->next) {
        add_doc(doc->docID, doc->freq, dst);
    }
}

/*
 * unionize - build a new list holding every document found in either input.
 *
 * Documents present in both lists appear once, with the larger of the two
 * frequencies (MAX). Documents of tmp_list are added first, followed by the
 * documents that appear only in list_2.
 *
 * Ownership: every non-NULL input is released with free_list() before
 * returning, so callers must pass lists they own (copies), never lists still
 * stored elsewhere.
 *
 * Returns a list obtained from init_list(); it is empty when both inputs are
 * NULL.
 */
WordNode *unionize(WordNode *tmp_list, WordNode *list_2)
{
    WordNode *unioned = init_list();

    if (!tmp_list && !list_2) {
        return unioned; /* nothing to merge: empty list */
    }

    /* Only one operand exists: the union is a copy of it. */
    if (!tmp_list || !list_2) {
        WordNode *only = tmp_list ? tmp_list : list_2;
        unionize_copy_docs(only, unioned);
        free_list(only);
        return unioned;
    }

    /* Every document of tmp_list goes in; shared ones take the max frequency. */
    for (DocNode *doc = tmp_list->head; doc != NULL; doc = doc->next) {
        DocNode *shared = get_index(doc->docID, list_2);
        if (shared) {
            int newfreq = MAX((shared->freq), (doc->freq));
            add_doc(doc->docID, newfreq, unioned);
        } else {
            add_doc(doc->docID, doc->freq, unioned);
        }
    }

    /*
     * Then the documents that exist only in list_2. A document was already
     * added above exactly when its docID occurs in tmp_list, so membership is
     * tested directly instead of through a side array of seen IDs (which was
     * never freed and mistook its zero-filled spare slots for docID 0).
     */
    for (DocNode *doc = list_2->head; doc != NULL; doc = doc->next) {
        if (get_index(doc->docID, tmp_list) == NULL) {
            add_doc(doc->docID, doc->freq, unioned);
        }
    }

    free_list(tmp_list);
    free_list(list_2);
    return unioned;
}
// simple test, intersecting one list into final_link int TestINTERSECT1() { START_TEST_CASE; tmp = init_list(); add_doc(1, 2, tmp); final = NULL;
/* * run regular gibbs cycles on the data with phi used; * the evaluation on each doc, and sample word probs * * if qparts>0, split collection into parts and only search this * * K = number of top results to retain */ void gibbs_query(int K, char *qname, int dots, int this_qpart, int qparts) { /* * mapping from query word posn. to its mi in current doc * >ddN.N = not in current doc * -ve = has no mi since occurs just once, found at * posn (-map[]-1) * non -ve = mi value */ int *mimap = NULL; /* * usual stuff for Gibbs loop over docs */ int i, j; float *fact = fvec(ddN.T*4); D_MiSi_t dD; /* * an index into topk[] which maintains ordering */ int *topind; /* * these store statistics of the results, for printing * these are unordered, ordered by topind[] */ /* document score */ float *topscore; /* document number */ int *topk; /* flags if ord is irrelevant, thus not scored */ char *wordunused; /* * per word stats for top results saved */ int *found; float *topcnt; float *topwordscore; /* * temporary versions for when gibbs running */ int *found_buf; float *topcnt_buf; float *topwordscore_buf; double *logprob; /* * search here */ int startdoc = 0; int enddoc = ddN.DT; /* * setup */ topcnt = malloc(sizeof(topcnt[0])*K*ddP.n_words); topwordscore = malloc(sizeof(topwordscore[0])*K*ddP.n_words); found = malloc(sizeof(found)*ddP.n_words*K); wordunused = malloc(sizeof(wordunused[0])*ddP.n_words); topcnt_buf = malloc(sizeof(topcnt[0])*ddP.n_words); topwordscore_buf = malloc(sizeof(topwordscore[0])*ddP.n_words); found_buf = malloc(sizeof(found)*ddP.n_words); if ( !topcnt || !topwordscore || !found || !topcnt_buf || !topwordscore_buf || !found_buf ) yap_quit("Cannot allocate memory in gibbs_query()\n"); logprob = malloc(sizeof(logprob[0])*ddP.n_query); topscore = malloc(sizeof(topscore[0])*K*ddP.n_query); topind = malloc(sizeof(topind[0])*K*ddP.n_query); topk = malloc(sizeof(topk[0])*K*ddP.n_query); if ( ddP.bdk!=NULL ) mimap = malloc(sizeof(mimap[0])*ddP.n_words); if ( !topk || 
!topscore || !logprob || !topind ) yap_quit("Cannot allocate memory in gibbs_query()\n"); for (i=0; i<ddP.n_words; i++) { wordunused[i] = 0; } for (i=0; i<K*ddP.n_query; i++) { topind[i] = i%K; topk[i] = -1; topscore[i] = INFINITY; } /* * check words to exclude using topics */ if ( ddP.n_excludetopic>0 ) { double *tprob = malloc(sizeof(tprob[0])*ddN.T); get_probs(tprob); yap_probs(); if ( verbose>1 ) yap_message("Excluding words: "); for (i=0; i<ddP.n_words; i++) { int t = besttopic(ddP.qword[i],tprob); if ( Q_excludetopic(t) ) { wordunused[i] = 1; if ( verbose>1 ) yap_message(" %d/%d", (int)ddP.qword[i], t); } } if ( verbose>1 ) yap_message("\n"); free(tprob); } if ( ddP.bdk!=NULL ) misi_init(&ddM,&dD); if ( qparts>0 ) { startdoc = ((double)this_qpart)/qparts * ddN.DT; enddoc = ((double)this_qpart+1.0)/qparts * ddN.DT; } for(i=startdoc; i<enddoc; i++) { int thisw = add_doc(i, GibbsNone); int r; if ( thisw<=1 ) { remove_doc(i, GibbsNone); continue; } if ( ddP.bdk!=NULL ) misi_build(&dD, i, 0); map_query(i, mimap, found_buf); for (j=0; j<ddP.n_words; j++) { topcnt_buf[j] = 0; topwordscore_buf[j] = 0; } for (r=0; r<ddP.queryiter; r++) { gibbs_lda(GibbsNone, ddN.T, i, ddD.NdT[i], fact, &dD, 0, 0); query_docprob(i, mimap, fact, &dD, topcnt_buf, topwordscore_buf); } /* * now adjust stats */ for (j=0; j<ddP.n_query; j++) logprob[j] = 0; for (j=0; j<ddP.n_words; j++) { if ( wordunused[j]>0 ) continue; if ( ddP.query[ddP.qword[j]]==j ) { topcnt_buf[j] /= ddP.queryiter; topwordscore_buf[j] /= ddP.queryiter; } else { /* word in previous query so copy */ int jj = ddP.query[ddP.qword[j]]; topcnt_buf[j] = topcnt_buf[jj]; topwordscore_buf[j] = topwordscore_buf[jj]; found_buf[j] = found_buf[jj]; } if ( wordunused[j]==0 ) logprob[ddP.qid[j]] += topwordscore_buf[j]; } if ( dots>0 && i>0 && (i%dots==0) ) yap_message("."); if ( ddP.bdk!=NULL ) misi_unbuild(&dD,i,0); remove_doc(i, GibbsNone); /* * enter into the arrays */ for (j=0; j<ddP.n_query; j++) { if ( i<K || logprob[j] < 
topscore[j*K+topind[j*K+K-1]] ) { int newind, l; /* * better than current lowest */ newind = bubble((i<K)?(i+1):K, &topind[j*K], &topscore[j*K], logprob[j]); /* * save the current details */ topscore[j*K+newind] = logprob[j]; topk[j*K+newind] = i; for (l=ddP.qposn[j]; l<ddP.qposn[j+1]; l++) { topcnt[newind*ddP.n_words+l] = topcnt_buf[l]; topwordscore[newind*ddP.n_words+l] = topwordscore_buf[l]; found[newind*ddP.n_words+l] = found_buf[l]; } } } } if ( dots>0 ) yap_message("\n"); /* * write result */ { float *ws = fvec(ddP.n_words); FILE *fp = fopen(qname,"w"); int q; if ( !fp ) yap_sysquit("Cannot write query results to '%s'\n", qname); for (q=0; q<ddP.n_query; q++) { int nw = ddP.qposn[q+1]-ddP.qposn[q]; for (i=0; i<K && i<ddN.DT && topk[topind[q*K+i]]>=0; i++) { int l, ind = topind[q*K+i]; double tfidf; tfidf = bm25(topk[q*K+ind],&found[ind*ddP.n_words+ddP.qposn[q]], &ddP.qword[ddP.qposn[q]], nw, ws); assert(ind>=0 && ind<K); fprintf(fp, "%d %d ", q, topk[q*K+ind]); fprintf(fp, "%.4f %.4lf ", topscore[q*K+ind]/nw, tfidf); if ( verbose>1 ) { for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%d ", found[ind*ddP.n_words+l]); for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%f ", topcnt[ind*ddP.n_words+l]); for (l=ddP.qposn[q]; l<ddP.qposn[q+1]; l++) fprintf(fp, "%f ", topwordscore[ind*ddP.n_words+l]); for (l=0; l<nw; l++) fprintf(fp, "%lf ", ws[l]); } fprintf(fp, "\n"); } } fclose(fp); free(ws); } /* * clean up */ free(fact); if ( ddP.bdk!=NULL ) misi_free(&dD); if ( mimap ) free(mimap); free(found); free(topwordscore); free(topcnt); free(found_buf); free(topwordscore_buf); free(topcnt_buf); free(topscore); free(topind); free(topk); free(logprob); }