// Parses the file at "fpath" into a per-file word hashtable, registers the
// file with "doctable", and moves every (word, docID, positions) posting
// into the inverted "index".  Ownership of each WordPositions' "word" and
// "positions" fields is transferred to the index; the WordPositions shells
// and the word hashtable itself are freed here.
// NOTE(review): Verify333 presumably aborts on failure — confirm against
// the project's assertion macro.
static void HandleFile(char *fpath, DocTable *doctable, MemIndex *index) {
  HashTable tab = NULL;
  DocID_t docID;
  HTIter it;

  // STEP 4.
  // Invoke the BuildWordHT() function in fileparser.h/c to
  // build the word hashtable out of the file.
  tab = BuildWordHT(fpath);
  if (tab == NULL)
    return;

  // STEP 5.
  // Invoke the DTRegisterDocumentName() function in
  // doctable.h/c to register the new file with the
  // doctable.
  docID = DTRegisterDocumentName(*doctable, fpath);
  Verify333(docID != 0);

  // Loop through the hash table.  We drain it by deleting through the
  // iterator until the table is empty, rather than advancing the iterator
  // ourselves: HTIteratorDelete both removes the current element and moves
  // the iterator along.
  it = HashTableMakeIterator(tab);
  while (NumElementsInHashTable(tab) > 0) {
    WordPositions *wp;
    HTKeyValue kv;
    int res;

    // STEP 6.
    // Use HTIteratorDelete() to extract the next WordPositions structure out
    // of the hashtable. Then, use MIAddPostingList() (defined in memindex.h)
    // to add the word, document ID, and positions linked list into the
    // inverted index.

    // Extract the next WordPositions structure.
    res = HTIteratorDelete(it, &kv);
    Verify333(res != 0);
    wp = kv.value;

    // Add word, docID, and positions into the index.
    res = MIAddPostingList(*index, wp->word, docID, wp->positions);
    Verify333(res == 1);

    // Since we've transferred ownership of the memory associated with both
    // the "word" and "positions" field of this WordPositions structure, and
    // since we've removed it from the table, we can now free the
    // WordPositions structure!
    free(wp);
  }
  HTIteratorFree(it);

  // We're all done with the word HT for this file, since we've added all of
  // its contents to the inverted index. So, free the word HT and return.
  FreeWordHT(tab);
}
HashTable BuildWordHT(char *filename) { char *filecontent; HashTable tab; HWSize_t filelen, i; if (filename == NULL) return NULL; // STEP 6. // Use ReadFile() to slurp in the file contents. If the // file turns out to be empty (i.e., its length is 0), // or you couldn't read the file at all, return NULL to indicate // failure. filecontent = ReadFile(filename, &filelen); if (filecontent == NULL || filelen == 0) return NULL; // Verify that the file contains only ASCII text. We won't try to index any // files that contain non-ASCII text; unfortunately, this means we aren't // Unicode friendly. for (i = 0; i < filelen; i++) { if ((filecontent[i] == '\0') || ((unsigned char) filecontent[i] > ASCII_UPPER_BOUND)) { free(filecontent); return NULL; } } // Great! Let's split the file up into words. We'll allocate the hash // table that will store the WordPositions structures associated with each // word. Since our hash table dynamically grows, we'll start with a small // number of buckets. tab = AllocateHashTable(64); // Loop through the file, splitting it into words and inserting a record for // each word. LoopAndInsert(tab, filecontent); // If we found no words, return NULL instead of a // zero-sized hashtable. if (NumElementsInHashTable(tab) == 0) { FreeHashTable(tab, &WordHTFree); tab = NULL; } // Now that we've finished parsing the document, we can free up the // filecontent buffer and return our built-up table. free(filecontent); filecontent = NULL; return tab; }
// Returns the number of documents currently registered in "table".
HWSize_t DTNumDocsInDocTable(DocTable table) {
  Verify333(table != NULL);

  // The docid-to-docname map holds exactly one entry per registered
  // document, so its element count is the document count.
  HWSize_t num_docs = NumElementsInHashTable(table->docid_to_docname);
  return num_docs;
}
// our main function; here, we demonstrate how to use some // of the hash table functions int main(int argc, char **argv) { ExampleValuePtr evp; HashTable ht; HTIter iter; HTKeyValue kv, old_kv; HTKey_t i; // allocate a hash table with 10,000 initial buckets ht = AllocateHashTable(10000); Verify333(ht != NULL); // insert 20,000 elements (load factor = 2.0) for (i = 0; i < 20000; i++) { evp = (ExampleValuePtr) malloc(sizeof(ExampleValue)); Verify333(evp != NULL); evp->num = i; // make sure HT has the right # of elements in it to start Verify333(NumElementsInHashTable(ht) == (HWSize_t) i); // insert a new element kv.key = FNVHashInt64((HTValue_t)i); kv.value = (HTValue_t)evp; Verify333(InsertHashTable(ht, kv, &old_kv) == 1); // make sure hash table has right # of elements post-insert Verify333(NumElementsInHashTable(ht) == (HWSize_t) (i+1)); } // look up a few values Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 1); Verify333(kv.key == FNVHashInt64((HTValue_t)100)); Verify333(((ExampleValuePtr) kv.value)->num == 100); Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)18583), &kv) == 1); Verify333(kv.key == FNVHashInt64((HTValue_t)18583)); Verify333(((ExampleValuePtr) kv.value)->num == 18583); // make sure non-existent value cannot be found Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)20000), &kv) == 0); // delete a value Verify333(RemoveFromHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 1); Verify333(kv.key == FNVHashInt64((HTValue_t)100)); Verify333(((ExampleValuePtr) kv.value)->num == 100); ExampleValueFree(kv.value); // since we malloc'ed it, we must free it // make sure it's deleted Verify333(LookupHashTable(ht, FNVHashInt64((HTValue_t)100), &kv) == 0); Verify333(NumElementsInHashTable(ht) == (HWSize_t) 19999); // loop through using an iterator i = 0; iter = HashTableMakeIterator(ht); Verify333(iter != NULL); while (HTIteratorPastEnd(iter) == 0) { Verify333(HTIteratorGet(iter, &kv) == 1); Verify333(kv.key != 
FNVHashInt64((HTValue_t)100)); // we deleted it // advance the iterator HTIteratorNext(iter); i++; } Verify333(i == 19999); // free the iterator HTIteratorFree(iter); // free the hash table FreeHashTable(ht, &ExampleValueFree); return 0; }