/*
 * free_lexleaf
 *
 * Takes an untyped pointer to a Leaf and hands the leaf's `lxtree'
 * member to LT_free.  The Leaf object itself is not released here.
 */
static void
free_lexleaf (void *ptr)
{
  LT_free (((Leaf *) ptr)->lxtree);
}
/*
 * CosineGet
 *
 * Given a list of query terms, build the list of best-matching
 * documents/paragraphs ranked by cosine weight.
 *
 *   qd    - query state; its hops_taken, num_of_ptrs and num_of_accum
 *           counters are reset here and num_of_accum is updated while
 *           the accumulators are scanned.
 *   Terms - the query terms, passed through to the decode routine.
 *   rqi   - ranked query options.  AccumMethod selects the accumulator
 *           structure: 'S' splay tree, 'A' array, 'H' hash table,
 *           'L' list.  MaxParasToRetrieve and MaxDocsToRetrieve limit
 *           the result size; a value of -1 means "no limit".
 *
 * Returns NULL if the decode step for the selected accumulator method
 * fails, otherwise a DocList created with MakeDocList.  The accumulator
 * structure and the working heap are released before returning.
 *
 * Outline:
 *   1. Decode the inverted file into accumulators.
 *   2. Load the first MaxParas accumulators into a heap built with
 *      Heap_Lesser, dividing each sum by the approximate document weight.
 *   3. Offer every remaining accumulator to the heap; it replaces the
 *      root entry whenever its weight is greater than the root's.
 *   4. When exact ranking is requested (and InvfLevel != 3) recompute
 *      the weights with the value returned by FetchDocStart and offer
 *      the accumulators again.
 *   5. Rebuild the heap with Heap_Greater and pop entries into the
 *      result list.
 */
static DocList *
CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi)
{
  DocList *Docs;
  float *AccumulatedWeights = NULL;
  Splay_Tree *ST = NULL;
  Splay_Tree *Para_ST = NULL;
  Hash_Table *HT = NULL;
  List_Table *LT = NULL;
  Heap *H;
  HeapEntry *he;
  float *fptr = NULL;
  Invf_Doc_Entry *ide = NULL;
  Invf_Doc_EntryH *ideh = NULL;
  int BackEnd, NumExact, MaxExact, NumParas;
  int MaxDocs = 0, MaxParas = 0;
  int i;
  Invf_Doc_Entry_Pool ide_pool;

  ide_pool.pool = NULL;

  qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;

  /* Step 1: decode into the accumulator structure chosen by the caller. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
      if (!ST)
        return NULL;
      break;
    case 'A':
      AccumulatedWeights = CosineDecode (qd, Terms, rqi);
      if (!AccumulatedWeights)
        return NULL;
      break;
    case 'H':
      HT = CosineDecodeHash (qd, Terms, rqi);
      if (!HT)
        return NULL;
      break;
    case 'L':
      LT = CosineDecodeList (qd, Terms, rqi);
      if (!LT)
        return NULL;
      break;
    default:
      /* Unknown method: no accumulators, so an empty list is returned. */
      break;
    }

  /* Work out how many accumulators there are. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      MaxParas = ST->no_of_items;
      break;
    case 'A':
      {
	/* count the number of non-zero document weights */
	int remaining = qd->sd->sdh.num_of_docs;
	float *d;
	MaxParas = 0;
	for (d = AccumulatedWeights; remaining; remaining--, d++)
	  if (*d)
	    MaxParas++;
      }
      break;
    case 'H':
      MaxParas = HT->num + HT->Suplimentary_Num;
      break;
    case 'L':
      MaxParas = LT->num;
      break;
    default:
      break;
    }

  if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
    MaxParas = rqi->MaxParasToRetrieve;
  /* A negative limit other than -1 is invalid; treat it as "nothing". */
  if (MaxParas < 0)
    MaxParas = 0;
  MaxDocs = MaxParas;

  /* Allocate memory for the result list and the heap */
  Docs = MakeDocList (MaxDocs);
  ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);

  H = Heap_Make (MaxDocs, Heap_Lesser);

  /* Get the sums from the accumulators, divide the sums by the
     document weights which we retrieve from the ".idx.wgt" file and
     put the resulting data into a heap */

  /* Step 2a: point the heap entries at the first MaxDocs accumulators. */
  he = H->HE;
  H->NumItems = MaxDocs;
  switch (rqi->AccumMethod)
    {
    case 'S':
      {
	ide = SP_get_first (ST);
	for (i = 0; i < H->NumItems; i++, ide = SP_get_next (ST), he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'A':
      {
	fptr = AccumulatedWeights;
	for (i = 0; i < H->NumItems; i++, fptr++, he++)
	  {
	    he->DocNum = i + 1;
	    he->OrgWeight = fptr;
	    if (*fptr)
	      qd->num_of_accum++;
	  }
      }
      break;
    case 'H':
      {
	ideh = HT->Head;
	for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
	  {
	    he->DocNum = ideh->IDE.DocNum + 1;
	    he->OrgWeight = &ideh->IDE.Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'L':
      {
	ide = LT->IDE;
	for (i = 0; i < H->NumItems; i++, ide++, he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    default:
      break;
    }

  /* Step 2b: normalise the weights of the entries now in the heap.  The
     accumulator behind each heap entry is zeroed; a zero accumulator is
     how the later passes recognise "already in the heap". */
  he = H->HE;
  for (i = 0; i < H->NumItems; i++, he++)
    {
      *he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
      he->Weight = *he->OrgWeight;
      *he->OrgWeight = 0;
      he->SeekPos = he->Len = 0;
    }

  Heap_Build (H);

  /* Step 3: offer the remaining accumulators to the heap.  `he' stays on
     the root entry.  The iterators (ide / fptr / ideh) carry on from
     where step 2a stopped.

     This pass is skipped when the heap has no entries: the root would
     never have been initialised, so comparing against it and writing
     through its OrgWeight pointer would be undefined.  In that case the
     remaining accumulators are not added to qd->num_of_accum. */
  he = H->HE;
  if (MaxDocs > 0)
    {
      switch (rqi->AccumMethod)
	{
	case 'S':
	  {
	    for (i = MaxDocs; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	      {
		ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
		qd->num_of_accum++;
		if (ide->Sum <= he->Weight)
		  continue;
		/* Evict the root: give its weight back to its accumulator. */
		*he->OrgWeight = he->Weight;
		he->DocNum = ide->DocNum + 1;
		he->Weight = ide->Sum;
		he->OrgWeight = &ide->Sum;
		*he->OrgWeight = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'A':
	  {
	    for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	      {
		if (!*fptr)
		  continue;
		qd->num_of_accum++;
		*fptr /= GetLowerApproxDocWeight (qd->awd, i);
		if (*fptr <= he->Weight)
		  continue;
		*he->OrgWeight = he->Weight;
		he->DocNum = i + 1;
		he->Weight = *fptr;
		he->OrgWeight = fptr;
		*he->OrgWeight = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'H':
	  {
	    for (; ideh; ideh = ideh->next)
	      {
		qd->num_of_accum++;
		ideh->IDE.Sum /=
		  GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum);
		if (ideh->IDE.Sum <= he->Weight)
		  continue;
		*he->OrgWeight = he->Weight;
		he->DocNum = ideh->IDE.DocNum + 1;
		he->Weight = ideh->IDE.Sum;
		he->OrgWeight = &ideh->IDE.Sum;
		*he->OrgWeight = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'L':
	  {
	    for (i = MaxDocs; i < LT->num; i++, ide++)
	      {
		qd->num_of_accum++;
		ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
		if (ide->Sum <= he->Weight)
		  continue;
		*he->OrgWeight = he->Weight;
		he->DocNum = ide->DocNum + 1;
		he->Weight = ide->Sum;
		he->OrgWeight = &ide->Sum;
		*he->OrgWeight = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	default:
	  break;
	}
    }

  /* Step 4: exact pass.  Replace the approximate divisor by the value
     returned from FetchDocStart, then rescan every accumulator that is
     not already in the heap (non-zero sum).  Skipped for an empty heap
     for the same reason as step 3. */
  if (MaxDocs > 0 && rqi->Exact && qd->id->ifh.InvfLevel != 3)
    {
      HeapEntry *entry = H->HE;
      HeapEntry *root;

      for (i = 0; i < H->NumItems; i++, entry++)
	{
	  entry->Weight = entry->Weight *
	    GetLowerApproxDocWeight (qd->awd, entry->DocNum - 1) /
	    FetchDocStart (qd, entry->DocNum, &entry->SeekPos, &entry->Len);
	}

      Heap_Build (H);

      root = H->HE;

      switch (rqi->AccumMethod)
	{
	case 'S':
	  {
	    ide = SP_get_first (ST);
	    for (i = 0; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		/* Cheap rejection before paying for FetchDocStart. */
		if (ide->Sum <= root->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= root->Weight)
		  continue;
		root->DocNum = ide->DocNum + 1;
		root->OrgWeight = &ide->Sum;
		root->Weight = Weight;
		root->SeekPos = SeekPos;
		root->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'A':
	  {
	    fptr = AccumulatedWeights;
	    for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!*fptr)
		  continue;
		if (*fptr <= root->Weight)
		  continue;
		Weight = *fptr *
		  GetLowerApproxDocWeight (qd->awd, i) /
		  FetchDocStart (qd, i + 1, &SeekPos, &Len);
		if (Weight <= root->Weight)
		  continue;
		root->DocNum = i + 1;
		root->OrgWeight = fptr;
		root->Weight = Weight;
		root->SeekPos = SeekPos;
		root->Len = Len;
		*fptr = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'H':
	  {
	    for (ideh = HT->Head; ideh; ideh = ideh->next)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ideh->IDE.Sum)
		  continue;
		if (ideh->IDE.Sum <= root->Weight)
		  continue;
		Weight = ideh->IDE.Sum *
		  GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum) /
		  FetchDocStart (qd, ideh->IDE.DocNum + 1, &SeekPos, &Len);
		if (Weight <= root->Weight)
		  continue;
		root->DocNum = ideh->IDE.DocNum + 1;
		root->OrgWeight = &ideh->IDE.Sum;
		root->Weight = Weight;
		root->SeekPos = SeekPos;
		root->Len = Len;
		ideh->IDE.Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'L':
	  {
	    ide = LT->IDE;
	    for (i = 0; i < LT->num; i++, ide++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		if (ide->Sum <= root->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= root->Weight)
		  continue;
		root->DocNum = ide->DocNum + 1;
		root->OrgWeight = &ide->Sum;
		root->Weight = Weight;
		root->SeekPos = SeekPos;
		root->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	default:
	  break;
	}
    }

  /* Step 5: switch the heap to Heap_Greater ordering and pop entries
     into the result list. */
  H->HC = Heap_Greater;
  Heap_Build (H);

  MaxDocs = H->NumItems;
  if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
    MaxDocs = rqi->MaxDocsToRetrieve;

  he = H->HE;
  BackEnd = H->NumItems - 1;
  NumExact = 0;
  MaxExact = H->NumItems;
  NumParas = 0;
  Para_ST = SP_createset (DE_comp);
  while (H->NumItems && Docs->num < MaxDocs)
    {
      DocEntry DocEnt;
      DocEntry *mem;

      if (rqi->Exact)
	{
	  /* A zero SeekPos marks a root whose exact weight has not been
	     computed yet. */
	  if (H->HE[0].SeekPos == 0)
	    NumExact += Make_Exact_Root (qd, H);
	}
      else
	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);

      NumParas++;

      DocEnt.DocNum = he->DocNum;
      DocEnt.Weight = he->Weight;
      DocEnt.Len = he->Len;
      DocEnt.SeekPos = he->SeekPos;
      DocEnt.CompTextBuffer = NULL;
      DocEnt.Next = NULL;

      Heap_DeleteHead (H);

      if (!(mem = SP_member (&DocEnt, Para_ST)))
	{
	  /* First entry that compares equal under DE_comp: it takes the
	     next slot at the front of the list. */
	  Docs->DE[Docs->num] = DocEnt;
	  SP_insert (&Docs->DE[Docs->num], Para_ST);
	  Docs->num++;
	}
      else
	{
	  /* An equal entry is already listed: store this one at the back
	     of the array and chain it onto the existing entry. */
	  DocEnt.Next = mem->Next;
	  Docs->DE[BackEnd] = DocEnt;
	  mem->Next = &Docs->DE[BackEnd--];
	}
    }
  SP_freeset (Para_ST);

  if (qd->id->ifh.InvfLevel == 3)
    {
      Message ("%d Paragraphs were required to get %d documents",
	       NumParas, Docs->num);
      if (NumExact == MaxExact)
	{
	  Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
	  Message ("to obtain %d paragraphs. This may mean that the the documents", NumParas);
	  Message ("returned do not necessarly represent an exact cosine ranking.");
	  Message ("This problem may be corrected by increasing \'maxparas\'.");
	}
    }

  /* The heap is only referenced through the local H, so it must be
     released here.  NOTE(review): assumes Heap_Make returns a single
     block obtained from the Xmalloc family - confirm against Heap_Make. */
  Xfree (H);

  if (AccumulatedWeights)
    {
      Xfree (AccumulatedWeights);
      /* Compute the negative delta in signed arithmetic; negating a
         size_t would rely on unsigned wrap-around. */
      ChangeMemInUse (qd,
		      -(long) (sizeof (float) * qd->sd->sdh.num_of_docs));
    }
  if (ST)
    {
      int mem = ST->mem_in_use;
      SP_freeset (ST);
      ChangeMemInUse (qd, -mem);
      free_ide_pool (qd, &ide_pool);
    }
  if (HT)
    HT_free (qd, HT);

  if (LT)
    LT_free (qd, LT);

  return (Docs);
}