static void HandleFile(char *fpath, DocTable *doctable, MemIndex *index) {
  HashTable tab = NULL;
  DocID_t docID;
  HTIter it;

  // STEP 4.
  // Invoke the BuildWordHT() function in fileparser.h/c to
  // build the word hashtable out of the file.
  tab = BuildWordHT(fpath);
  if (tab == NULL)
    return;

  // STEP 5.
  // Invoke the DTRegisterDocumentName() function in
  // doctable.h/c to register the new file with the
  // doctable.
  docID = DTRegisterDocumentName(*doctable, fpath);
  Verify333(docID != 0);

  // Loop through the hash table.
  it = HashTableMakeIterator(tab);
  while (NumElementsInHashTable(tab) > 0) {
    WordPositions *wp;
    HTKeyValue kv;
    int res;

    // STEP 6.
    // Use HTIteratorDelete() to extract the next WordPositions structure out
    // of the hashtable.  Then, use MIAddPostingList() (defined in memindex.h)
    // to add the word, document ID, and positions linked list into the
    // inverted index.
    // Extract the WordPositions structure.
    res = HTIteratorDelete(it, &kv);
    Verify333(res != 0);
    wp = kv.value;

    // Add the word, docID, and positions list into the index.
    res = MIAddPostingList(*index, wp->word, docID, wp->positions);
    Verify333(res == 1);
    // Since we've transferred ownership of the memory associated with both
    // the "word" and "positions" field of this WordPositions structure, and
    // since we've removed it from the table, we can now free the
    // WordPositions structure!
    free(wp);
  }
  HTIteratorFree(it);

  // We're all done with the word HT for this file, since we've added all of
  // its contents to the inverted index. So, free the word HT and return.
  FreeWordHT(tab);
}
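
For reference, this is roughly what the WordPositions record consumed in the loop above might look like. It is a sketch inferred from the wp->word and wp->positions accesses; the field names and the LinkedList type are assumptions, not the project's actual header.

// Assumed layout of a WordPositions record (sketch, not the real definition).
typedef struct {
  char       *word;       // heap-allocated, '\0'-terminated copy of the word
  LinkedList  positions;  // list of byte offsets where the word occurs
} WordPositions;
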
HashTable BuildWordHT(char *filename) {
  char *filecontent;
  HashTable tab;
  HWSize_t filelen, i;

  if (filename == NULL)
    return NULL;

  // STEP 6.
  // Use ReadFile() to slurp in the file contents.  If the
  // file turns out to be empty (i.e., its length is 0),
  // or you couldn't read the file at all, return NULL to indicate
  // failure.
  filecontent = ReadFile(filename, &filelen);
  if (filecontent == NULL || filelen == 0)
    return NULL;

  // Verify that the file contains only ASCII text.  We won't try to index any
  // files that contain non-ASCII text; unfortunately, this means we aren't
  // Unicode friendly.
  for (i = 0; i < filelen; i++) {
    if ((filecontent[i] == '\0') ||
        ((unsigned char) filecontent[i] > ASCII_UPPER_BOUND)) {
      free(filecontent);
      return NULL;
    }
  }

  // Great!  Let's split the file up into words.  We'll allocate the hash
  // table that will store the WordPositions structures associated with each
  // word.  Since our hash table dynamically grows, we'll start with a small
  // number of buckets.
  tab = AllocateHashTable(64);

  // Loop through the file, splitting it into words and inserting a record for
  // each word.
  LoopAndInsert(tab, filecontent);

  // If we found no words, return NULL instead of a
  // zero-sized hashtable.
  if (NumElementsInHashTable(tab) == 0) {
    FreeHashTable(tab, &WordHTFree);
    tab = NULL;
  }

  // Now that we've finished parsing the document, we can free up the
  // filecontent buffer and return our built-up table.
  free(filecontent);
  filecontent = NULL;
  return tab;
}
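
The tokenization itself is done by LoopAndInsert(), which this excerpt does not show. Below is a rough sketch of one way it could work, assuming the buffer is '\0'-terminated by ReadFile(); the helper AddWordToHashTable() and the use of HWSize_t for offsets are assumptions, not the assignment's actual code.

#include <ctype.h>

// Sketch: walk the buffer, lowercase alphabetic runs in place, and hand each
// word plus its starting byte offset to a (hypothetical) insertion helper.
static void LoopAndInsert(HashTable tab, char *content) {
  char *curptr = content;
  char *wordstart = content;

  while (*curptr != '\0') {
    if (isalpha((unsigned char) *curptr)) {
      // Inside a word: normalize to lowercase and keep scanning.
      *curptr = (char) tolower((unsigned char) *curptr);
      curptr++;
    } else {
      // Hit a delimiter: terminate the current word (if any), record its
      // offset, then resume scanning after the delimiter.
      *curptr = '\0';
      if (curptr > wordstart) {
        AddWordToHashTable(tab, wordstart, (HWSize_t) (wordstart - content));
      }
      curptr++;
      wordstart = curptr;
    }
  }

  // Handle a final word that runs to the end of the buffer.
  if (curptr > wordstart) {
    AddWordToHashTable(tab, wordstart, (HWSize_t) (wordstart - content));
  }
}
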
HWSize_t DTNumDocsInDocTable(DocTable table) {
  Verify333(table != NULL);
  return NumElementsInHashTable(table->docid_to_docname);
}
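
A brief usage sketch for this accessor: after registering a document, the count goes up by one. AllocateDocTable() and FreeDocTable() are assumed by analogy with the AllocateHashTable()/FreeHashTable() API used below; they do not appear in this excerpt, and the file name is made up.

// Usage sketch (the allocator/free names are assumptions).
DocTable dt = AllocateDocTable();
Verify333(dt != NULL);
Verify333(DTNumDocsInDocTable(dt) == 0);

DocID_t id = DTRegisterDocumentName(dt, "somefile.txt");
Verify333(id != 0);
Verify333(DTNumDocsInDocTable(dt) == 1);

FreeDocTable(dt);
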
// our main function; here, we demonstrate how to use some
// of the hash table functions
int main(int argc, char **argv) {
  ExampleValuePtr evp;
  HashTable ht;
  HTIter iter;
  HTKeyValue kv, old_kv;
  HTKey_t i;

  // allocate a hash table with 10,000 initial buckets
  ht = AllocateHashTable(10000);
  Verify333(ht != NULL);

  // insert 20,000 elements (load factor = 2.0)
  for (i = 0; i < 20000; i++) {
    evp = (ExampleValuePtr) malloc(sizeof(ExampleValue));
    Verify333(evp != NULL);
    evp->num = i;

    // make sure HT has the right # of elements in it to start
    Verify333(NumElementsInHashTable(ht) == (HWSize_t) i);

    // insert a new element
    kv.key = FNVHashInt64((HTValue_t)i);
    kv.value = (HTValue_t)evp;
    Verify333(InsertHashTable(ht, kv, &old_kv) == 1);

    // make sure hash table has right # of elements post-insert
    Verify333(NumElementsInHashTable(ht) == (HWSize_t) (i+1));
  }

  // look up a few values
  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 1);
  Verify333(kv.key == FNVHashInt64((HTValue_t)100));
  Verify333(((ExampleValuePtr) kv.value)->num == 100);

  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)18583), &kv) == 1);
  Verify333(kv.key == FNVHashInt64((HTValue_t)18583));
  Verify333(((ExampleValuePtr) kv.value)->num == 18583);

  // make sure non-existent value cannot be found
  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)20000), &kv) == 0);

  // delete a value
  Verify333(RemoveFromHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 1);
  Verify333(kv.key == FNVHashInt64((HTValue_t)100));
  Verify333(((ExampleValuePtr) kv.value)->num == 100);
  ExampleValueFree(kv.value);   // since we malloc'ed it, we must free it

  // make sure it's deleted
  Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 0);
  Verify333(NumElementsInHashTable(ht) == (HWSize_t) 19999);

  // loop through using an iterator
  i = 0;
  iter = HashTableMakeIterator(ht);
  Verify333(iter != NULL);

  while (HTIteratorPastEnd(iter) == 0) {
    Verify333(HTIteratorGet(iter, &kv) == 1);
    Verify333(kv.key != FNVHashInt64((HTValue_t)100));   // we deleted it

    // advance the iterator
    HTIteratorNext(iter);
    i++;
  }
  Verify333(i == 19999);

  // free the iterator
  HTIteratorFree(iter);

  // free the hash table
  FreeHashTable(ht, &ExampleValueFree);
  return 0;
}
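
The demo above relies on an ExampleValue payload type and its ExampleValueFree() callback, which this excerpt does not show. A minimal sketch consistent with how main() uses them follows; only the num field is implied by the code, so anything beyond that is an assumption.

// Sketch of the payload type the demo stores in the table.
typedef struct example_value_st {
  HTKey_t num;   // the integer this element wraps
} ExampleValue, *ExampleValuePtr;

// Value-free callback passed to FreeHashTable(); since each element was
// malloc'ed in main(), freeing the pointer is all that's needed here.
void ExampleValueFree(HTValue_t value) {
  Verify333(value != NULL);
  free(value);
}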