Esempio n. 1
0
File: match.c Progetto: jff/mathspad
/*
 * Destructor-style callback: PTR is treated as a Leaf and its lxtree
 * member is passed to LT_free.  The Leaf itself is not released here.
 */
static void free_lexleaf(void *ptr)
{
    LT_free(((Leaf *) ptr)->lxtree);
}
Esempio n. 2
0
/*
 * CosineGet: rank the documents matching a list of query terms.
 *
 * The per-document sums are first accumulated by one of four decoders,
 * selected by rqi->AccumMethod:
 *   'S'  splay tree   (CosineDecodeSplay)
 *   'A'  flat array   (CosineDecode), one float per document
 *   'H'  hash table   (CosineDecodeHash)
 *   'L'  list table   (CosineDecodeList)
 * Any other value leaves every accumulator NULL and yields an empty
 * result list.
 *
 * The sums are then divided by GetLowerApproxDocWeight() and the best
 * candidates are selected with a heap.  At most rqi->MaxParasToRetrieve
 * candidates are kept in the heap and at most rqi->MaxDocsToRetrieve
 * entries are placed at the front of the result; a value of -1 for either
 * limit means "no limit".
 *
 * When rqi->Exact is set and the inverted file level is not 3, the heap
 * weights are recomputed using the value returned by FetchDocStart()
 * (presumably the exact document weight -- confirm against its
 * definition) before the final ordering.
 *
 * Returns the DocList built by MakeDocList(), or NULL when the selected
 * decoder returns NULL.
 *
 * NOTE(review): the Heap returned by Heap_Make() is never released in this
 * function, and neither it nor the DocList is checked for NULL -- confirm
 * whether that is intended.
 */
static DocList *
CosineGet (query_data * qd, TermList * Terms,
	   RankedQueryInfo * rqi)
{
  DocList *Docs;
  /* Exactly one of the next four accumulators is set, depending on
     rqi->AccumMethod; the others stay NULL. */
  float *AccumulatedWeights = NULL;
  Splay_Tree *ST = NULL;
  Splay_Tree *Para_ST = NULL;
  Hash_Table *HT = NULL;
  List_Table *LT = NULL;
  Heap *H;
  HeapEntry *he;
  /* Cursors into the accumulators.  They are deliberately carried over
     from the heap seeding loops to the "remaining entries" loops below. */
  register float *fptr = NULL;
  register Invf_Doc_Entry *ide = NULL;
  register Invf_Doc_EntryH *ideh = NULL;
  int BackEnd, NumExact, MaxExact, NumParas;
  int MaxDocs = 0, MaxParas = 0;
  int i;
  Invf_Doc_Entry_Pool ide_pool;
  ide_pool.pool = NULL;

  /* Reset the per-query statistics kept in qd. */
  qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;

  /* Step 1: accumulate the per-document sums with the chosen method. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
      if (!ST)
	return NULL;
      break;
    case 'A':
      AccumulatedWeights = CosineDecode (qd, Terms, rqi);
      if (!AccumulatedWeights)
	return NULL;
      break;
    case 'H':
      HT = CosineDecodeHash (qd, Terms, rqi);
      if (!HT)
	return NULL;
      break;
    case 'L':
      LT = CosineDecodeList (qd, Terms, rqi);
      if (!LT)
	return NULL;
      break;
    }

  /* Disabled debugging aid: cross-checks the splay tree sums against the
     array accumulator. */
#if 0
  if (rqi->UseSplayTree)
    {

      AccumulatedWeights = CosineDecode (qd, Terms, rqi);
      fptr = AccumulatedWeights;
      ide = SP_get_first (ST);
      for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
	{
	  if (AccumulatedWeights[i] != 0)
	    {
	      if (i != ide->DocNum)
		fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
			 AccumulatedWeights[i], ide->DocNum + 1, ide->Sum);
	      ide = SP_get_next (ST);
	    }
	}
    }
#endif

  /* Step 2: count how many candidate entries were accumulated. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      MaxParas = ST->no_of_items;
      break;
    case 'A':
      {				/* count the number of non-zero document weights */
	/* Note: this `i' shadows the function-level `i'. */
	register int i = qd->sd->sdh.num_of_docs;
	register float *d;
	MaxParas = 0;
	for (d = AccumulatedWeights; i; i--, d++)
	  if (*d)
	    MaxParas++;
      }
      break;
    case 'H':
      MaxParas = HT->num + HT->Suplimentary_Num;
      break;
    case 'L':
      MaxParas = LT->num;
      break;
    }

  /* Apply the caller's limit on candidates; -1 means no limit. */
  if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
    MaxParas = rqi->MaxParasToRetrieve;
  MaxDocs = MaxParas;

  /* Allocate memory for the result list and the heap */
  /* NOTE(review): neither Docs nor H is checked for NULL before use. */
  Docs = MakeDocList (MaxDocs);
  ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);

  H = Heap_Make (MaxDocs, Heap_Lesser);


  /* Get the sums from the accumulator, divide the sums by the
     document weights which we retrieve from the ".idx.wgt" file and put
     the resulting data into a heap */


  /* Step 3: seed the heap with the first MaxDocs accumulator entries.
     Each heap entry stores the 1-based document number and a pointer
     (OrgWeight) back to the accumulated sum it was taken from. */
  he = H->HE;
  H->NumItems = MaxDocs;
  switch (rqi->AccumMethod)
    {
    case 'S':
      {
	ide = SP_get_first (ST);
	for (i = 0; i < H->NumItems; i++, ide = SP_get_next (ST), he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'A':
      {
	/* The array is seeded positionally: the first MaxDocs documents
	   are used whether or not their sum is zero, but only non-zero
	   sums are counted as accumulators. */
	fptr = AccumulatedWeights;
	for (i = 0; i < H->NumItems; i++, fptr++, he++)
	  {
	    he->DocNum = i + 1;
	    he->OrgWeight = fptr;
	    if (*fptr)
	      qd->num_of_accum++;
	  }
      }
      break;
    case 'H':
      {
	ideh = HT->Head;
	for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
	  {
	    he->DocNum = ideh->IDE.DocNum + 1;
	    he->OrgWeight = &ideh->IDE.Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'L':
      {
	ide = LT->IDE;
	for (i = 0; i < H->NumItems; i++, ide++, he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    }

  /* Normalise every seeded entry.  The normalised value is kept in the
     heap entry and the stored sum is zeroed; a zero sum is what the exact
     pass further down uses to skip entries that are already in the heap. */
  he = H->HE;
  for (i = 0; i < H->NumItems; i++, he++)
    {
      *he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
      he->Weight = *he->OrgWeight;
      *he->OrgWeight = 0;
      he->SeekPos = he->Len = 0;
    }

  Heap_Build (H);

  /* Step 4: run the remaining accumulator entries past the heap root.
     The heap was built with Heap_Lesser (presumably keeping the smallest
     weight at the root -- confirm); an entry that beats the root replaces
     it, and the displaced entry gets its weight written back to its
     stored sum.  The cursors ide / fptr / ideh continue from where the
     seeding loops above stopped. */
  he = H->HE;
  switch (rqi->AccumMethod)
    {
    case 'S':
      {
	for (i = MaxDocs; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	  {
	    ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	    qd->num_of_accum++;
	    if (ide->Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ide->DocNum + 1;
	    he->Weight = ide->Sum;
	    he->OrgWeight = &ide->Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'A':
      {
	for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	  {
	    if (!*fptr)
	      continue;
	    qd->num_of_accum++;
	    *fptr /= GetLowerApproxDocWeight (qd->awd, i);
	    if (*fptr <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = i + 1;
	    he->Weight = *fptr;
	    he->OrgWeight = fptr;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'H':
      {
	for (; ideh; ideh = ideh->next)
	  {
	    qd->num_of_accum++;
	    ideh->IDE.Sum /=
	      GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum);
	    if (ideh->IDE.Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ideh->IDE.DocNum + 1;
	    he->Weight = ideh->IDE.Sum;
	    he->OrgWeight = &ideh->IDE.Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'L':
      {
	for (i = MaxDocs; i < LT->num; i++, ide++)
	  {
	    qd->num_of_accum++;
	    ide->Sum /=
	      GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	    if (ide->Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ide->DocNum + 1;
	    he->Weight = ide->Sum;
	    he->OrgWeight = &ide->Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    }


  /* Step 5 (exact ranking only, skipped for inverted file level 3):
     rescale the heap weights by lower-approx-weight / FetchDocStart(),
     then re-examine every accumulator entry that is not in the heap. */
  if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
    {
      /* Note: this `he' shadows the function-level `he'. */
      HeapEntry *he = H->HE;

      /* FetchDocStart also fills in SeekPos and Len for each entry. */
      for (i = 0; i < H->NumItems; i++, he++)
	{
	  he->Weight = he->Weight *
	    GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	    FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
	}

      Heap_Build (H);

      he = H->HE;

      /* For each remaining entry: skip it if its sum is zero or its
         approximate weight cannot beat the root; otherwise compute the
         rescaled weight and replace the root if that is still larger. */
      switch (rqi->AccumMethod)
	{
	case 'S':
	  {
	    ide = SP_get_first (ST);
	    for (i = 0; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		if (ide->Sum <= he->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ide->DocNum + 1;
		he->OrgWeight = &ide->Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;

	  /* up to here */

	case 'A':
	  {
	    fptr = AccumulatedWeights;
	    for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!*fptr)
		  continue;
		if (*fptr <= he->Weight)
		  continue;
		Weight = *fptr *
		  GetLowerApproxDocWeight (qd->awd, i) /
		  FetchDocStart (qd, i + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = i + 1;
		he->OrgWeight = fptr;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		*fptr = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'H':
	  {
	    ideh = HT->Head;
	    for (ideh = HT->Head; ideh; ideh = ideh->next)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ideh->IDE.Sum)
		  continue;
		if (ideh->IDE.Sum <= he->Weight)
		  continue;
		Weight = ideh->IDE.Sum *
		  GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum) /
		  FetchDocStart (qd, ideh->IDE.DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ideh->IDE.DocNum + 1;
		he->OrgWeight = &ideh->IDE.Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ideh->IDE.Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'L':
	  {
	    ide = LT->IDE;
	    for (i = 0; i < LT->num; i++, ide++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		if (ide->Sum <= he->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ide->DocNum + 1;
		he->OrgWeight = &ide->Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	}
    }



  /* Step 6: switch the heap comparator to Heap_Greater and rebuild, so
     that entries can be removed from the head one at a time below. */
  H->HC = Heap_Greater;
  Heap_Build (H);


  /* Apply the caller's limit on returned documents; -1 means no limit. */
  MaxDocs = H->NumItems;
  if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
    MaxDocs = rqi->MaxDocsToRetrieve;

  /* Alarm */

  /* Step 7: drain the heap into Docs.  New entries fill Docs->DE from
     the front; an entry that SP_member already finds in Para_ST (compared
     with DE_comp) is stored from the back of Docs->DE and linked into the
     Next chain of the entry that was found. */
  he = H->HE;
  BackEnd = H->NumItems - 1;
  NumExact = 0;
  MaxExact = H->NumItems;
  NumParas = 0;
  Para_ST = SP_createset (DE_comp);
  while (H->NumItems && Docs->num < MaxDocs)
    {
      DocEntry DocEnt;
      DocEntry *mem;

      if (rqi->Exact)
	{
	  /* SeekPos is still 0 when the root was never passed through
	     FetchDocStart; Make_Exact_Root's return value is tallied. */
	  if (H->HE[0].SeekPos == 0)
	    NumExact += Make_Exact_Root (qd, H);
	}
      else
	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);

      NumParas++;

      /* Copy the head of the heap before removing it. */
      DocEnt.DocNum = he->DocNum;
      DocEnt.Weight = he->Weight;
      DocEnt.Len = he->Len;
      DocEnt.SeekPos = he->SeekPos;
      DocEnt.CompTextBuffer = NULL;
      DocEnt.Next = NULL;

      Heap_DeleteHead (H);

      if (!(mem = SP_member (&DocEnt, Para_ST)))
	{
	  Docs->DE[Docs->num] = DocEnt;
	  SP_insert (&Docs->DE[Docs->num], Para_ST);
	  Docs->num++;
	}
      else
	{
	  DocEnt.Next = mem->Next;
	  Docs->DE[BackEnd] = DocEnt;
	  mem->Next = &Docs->DE[BackEnd--];
	}
    }
  SP_freeset (Para_ST);

  /* For inverted file level 3, report how many paragraphs were consumed
     and warn when every candidate went through Make_Exact_Root. */
  if (qd->id->ifh.InvfLevel == 3)
    {
      Message ("%d Paragraphs were required to get %d documents",
	       NumParas, Docs->num);
      if (NumExact == MaxExact)
	{
	  Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
	  Message ("to obtain %d paragraphs. This may mean that the the documents", NumParas);
	  Message ("returned do not necessarly represent an exact cosine ranking.");
	  Message ("This problem may be corrected by increasing \'maxparas\'.");
	}
    }
  /* Disabled debugging aid: dumps the top paragraphs to "top.paras". */
#if 0
  {
    int i;
    FILE *f = fopen ("top.paras", "w");
    fprintf (f, "=========================\nTop Paragraphs\n");
    for (i = 0; i < Docs->num; i++)
      {
	DocEntry *d;
	fprintf (f, "<%d(%f)>  ", Heap[i].DocNum, Heap[i].Weight);
	for (d = Heap[i].Next; d; d = d->Next)
	  fprintf (f, "%d(%f)  ", d->DocNum, d->Weight);
	fprintf (f, "\n");
      }
    fprintf (f, "=========================\n");
    fclose (f);
  }
#endif

  /* Step 8: release whichever accumulator was used and adjust the
     memory accounting.
     NOTE(review): H is not released here -- confirm ownership. */
  if (AccumulatedWeights)
    {
      Xfree (AccumulatedWeights);
      /* NOTE(review): -sizeof (float) is evaluated in unsigned arithmetic;
         the negative delta relies on the conversion to ChangeMemInUse's
         parameter type -- confirm. */
      ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
    }
  if (ST)
    {
      /* Read the size before the tree is freed. */
      int mem = ST->mem_in_use;
      SP_freeset (ST);
      ChangeMemInUse (qd, -mem);
      free_ide_pool (qd, &ide_pool);
    }
  if (HT)
    HT_free (qd, HT);

  if (LT)
    LT_free (qd, LT);


  return (Docs);
}