/*
 * addToHashTable - add a url to a hash table
 *
 * Assumptions:
 *   1. ht has been allocated, and every slot of ht->table already points at a
 *      HashTableNode (a slot whose url is NULL counts as "empty"); this
 *      function dereferences ht->table[hash] without a NULL check.
 *
 * Pseudocode:
 *   1. find the Jenkins hash code for the given url
 *   2. check whether anything has been hashed with that code yet
 *   3. store a private copy of the url, either in the empty head node or in a
 *      new node appended to the end of that slot's chain
 *
 * The copy is sized to the url (strlen + 1) rather than a fixed 1000 bytes, so
 * long urls can no longer overflow the buffer.  If memory cannot be obtained
 * the table is left unchanged.
 */
void addToHashTable(HashTable *ht, const char *url){
    unsigned long hashVal = JenkinsHash(url, MAX_HASH_SLOT);
    size_t urlSize = strlen(url) + 1;
    HashTableNode *head = ht->table[hashVal];

    /* nothing hashed to this index yet: fill in the existing head node */
    if (head->url == NULL) {
        head->url = malloc(urlSize);
        if (head->url != NULL) {
            memcpy(head->url, url, urlSize);
        }
        return;
    }

    /* build the new node completely before touching the chain */
    HashTableNode *newNode = calloc(1, sizeof *newNode);
    if (newNode == NULL) {
        return;
    }
    newNode->url = malloc(urlSize);
    if (newNode->url == NULL) {
        free(newNode);
        return;
    }
    memcpy(newNode->url, url, urlSize);
    newNode->next = NULL;

    /* walk to the last node hashed to this index and append */
    HashTableNode *tail = head;
    while (tail->next != NULL) {
        tail = tail->next;
    }
    tail->next = newNode;
}
/* Append a string to the hash table.
 * @str: string to store; the pointer itself is kept in the new node (no copy)
 * @hashTab: hash table that receives the string
 * @result: 1 when the node was added, 0 when the node could not be allocated
 */
int HashAdd(char *str, HashTable *hashTab){
    HashTableNode *fresh = calloc(1, sizeof(HashTableNode));
    if (fresh == NULL) {
        return 0;
    }

    fresh->url = str;
    fresh->next = NULL;

    unsigned long slot = JenkinsHash(str, MAX_HASH_SLOT);
    HashTableNode *tail = hashTab->table[slot];

    /* empty slot: the new node becomes the head of the chain */
    if (tail == NULL) {
        hashTab->table[slot] = fresh;
        return 1;
    }

    /* occupied slot: walk to the last node and hang the new one off it */
    for (; tail->next != NULL; tail = tail->next) {
        /* just advancing */
    }
    tail->next = fresh;
    return 1;
}
/*
 * Check whether the word firstWord is recorded as occurring in document
 * DocToMatch.
 *
 * Returns 0 when firstWord is in the table and one of its document nodes has
 * doc_id == DocToMatch; returns 1 otherwise.  If the slot firstWord hashes to
 * is empty, a message is printed and the program exits with status 1.
 */
int findDocMatch(int DocToMatch, HashTable* Table, char* firstWord){
    int slot = JenkinsHash(firstWord, MAX_HASH_SLOT);

    if (Table->table[slot] == NULL){
        printf("%s does not exist in hashTable\n", firstWord );
        exit(1);
    }

    /* scan the words chained in this slot for firstWord */
    for (WordNode *candidate = Table->table[slot]; candidate != NULL; candidate = candidate->next){
        if (strcmp(candidate->word, firstWord) != 0){
            continue;
        }

        /* right word: look through its documents for the requested id */
        for (DocumentNode *doc = candidate->page; doc != NULL; doc = doc->next){
            if (doc->doc_id == DocToMatch){
                return 0;   /* document matched */
            }
        }
        return 1;           /* word found, but not in that document */
    }

    return 1;               /* word not present in this slot */
}
/*
 * lookUpURL - report whether a url is already stored in a hash table
 *
 * Assumptions:
 *   1. ht has been allocated and the slot the url hashes to holds a node
 *      (the slot is dereferenced without a NULL check)
 *
 * Steps:
 *   1. compute the Jenkins hash code of the url
 *   2. a head node with neither a url nor a successor means nothing is stored
 *   3. otherwise compare the url against every node chained in the slot
 *
 * Returns 1 on a match and 0 otherwise.
 */
int lookUpURL(HashTable *ht, const char *url){
    unsigned long hashVal = JenkinsHash(url, MAX_HASH_SLOT);
    HashTableNode *node = ht->table[hashVal];

    /* empty slot: nothing to compare against */
    if (node->url == NULL && node->next == NULL){
        return 0;
    }

    /* compare the head first, then every node after it */
    do {
        if (strcmp(url, node->url) == 0){
            return 1;
        }
        node = node->next;
    } while (node != NULL);

    return 0;
}
/* Look a word up in the hash table.
 * @str: word to search for
 * @hashTab: hash table to search
 * @result: the hash table node whose WordNode holds a word equal to str,
 *          or NULL when no node in the slot matches.
 */
GenHashTableNode *HashContains(char *str, HashTable *hashTab){
    unsigned long slot = JenkinsHash(str, MAX_HASH_SLOT);
    GenHashTableNode *cursor = hashTab->table[slot];

    /* walk the chain stored in the slot the string hashed to */
    while (cursor != NULL) {
        WordNode *entry = (WordNode *)cursor->hashKey;
        if (strcmp(str, entry->word) == 0) {
            return cursor;
        }
        cursor = cursor->next;
    }

    return NULL;
}
/*
 * Searches the slot that str hashes to and returns the hashtable node whose
 * WordNode holds a word equal to str, or NULL when there is no such node.
 */
Hashtablenode *In_Hashtable(char *str, Hashtable *hash_table){
    unsigned long hash_number = JenkinsHash(str, MAX_HASH_SLOT);
    Hashtablenode *cursor = hash_table->table[hash_number];

    while (cursor != NULL) {
        WordNode *entry = (WordNode *)cursor->hash_key;
        if (strcmp(str, entry->word) == 0) {
            /* found: hand back the node it lives in */
            return cursor;
        }
        cursor = cursor->next;
    }

    /* chain exhausted without a match */
    return NULL;
}
// Seaches the hashtable to see if the word has already been added to the hashtable int searchIndexHash(HashTable *hash, char *targetword){ WordNode *findwordnode; int jenkins = JenkinsHash(targetword,MAX_HASH_SLOT); if (hash->table[jenkins] != NULL ){ findwordnode=hash->table[jenkins]->data; while (findwordnode != NULL ){ // If we find that the word has already been added to our hashtable, return success if( strcmp(findwordnode->word,targetword) == 0 ){ return 0; } findwordnode=findwordnode->next; } } return -1; }
/*
 * Helper function: records one occurrence of word in document doc_id if the
 * word is already in the hashtable.
 *
 * Returns:
 *    1  the word was found; either the matching DocumentNode's freq was
 *       incremented, or a new DocumentNode (freq 1) was appended and the
 *       word's num_docs was incremented
 *    0  the word is not in the hashtable (nothing was changed)
 *   -1  the word was found but a new DocumentNode could not be allocated
 */
int inHashTable(HashTable *hashtable, char *word, int doc_id)
{
    // slot the word hashes to
    int index = JenkinsHash(word, MAX_HASH_SLOT);

    for (HashTableNode *current = hashtable->table[index]; current != NULL; current = current->next) {
        if (strcmp(current->key->word, word) != 0) {
            continue;
        }

        // Word matches: look for doc_id, remembering the last node visited so
        // a new DocumentNode can be appended after it.
        DocumentNode *last_doc = NULL;
        for (DocumentNode *doc = current->key->page; doc != NULL; doc = doc->next) {
            if (doc->doc_id == doc_id) {
                (doc->freq)++;
                return 1;
            }
            last_doc = doc;
        }

        // doc_id not present for this word: create its DocumentNode
        DocumentNode *doc = calloc(1, sizeof(DocumentNode));
        if (!doc) {
            fprintf(stderr, "Calloc failed: Not enough memory for a DocumentNode\n");
            return -1;
        }
        doc->doc_id = doc_id;
        doc->freq = 1;
        doc->next = NULL;
        (current->key->num_docs)++;

        // An empty document list has no last node; the new node becomes the
        // head instead of being written through an uninitialised pointer.
        if (last_doc != NULL) {
            last_doc->next = doc;
        } else {
            current->key->page = doc;
        }
        return 1;
    }

    // no node in this slot holds the word
    return 0;
}
//Checks if a specific word is inside the hashTable. int InHashTable(HashTable* hTable, char* Word){ unsigned long slot = JenkinsHash(Word, MAX_HASH_SLOT); //Check for the URL at the slot. If slot is empty, then URL is not in hashtable. if(hTable->table[slot]->word == NULL){ return 0; } else{ HashTableNode* currNode = hTable->table[slot]; if(strcmp(currNode->word, Word) == 0){ return 1; } while((currNode->next) != NULL){ currNode = currNode->next; if(strcmp(currNode->word, Word) == 0){ return 1; } } return 0; } }
/*
 * Looks word up in Index.
 * Returns 0 when a node in the slot word hashes to holds an equal word,
 * and 1 when the slot is empty or no node matches.
 */
int hashLookUp(char *word, HashTable *Index){
    unsigned long key = JenkinsHash(word, MAX_HASH_SLOT);

    /* an empty slot simply yields zero iterations */
    for (wordNode *cursor = Index->table[key]; cursor != NULL; cursor = cursor->next) {
        if (strcmp(word, cursor->word) == 0) {
            return 0;
        }
    }

    return 1;
}
/*
 * Adds URL to table unless an equal URL is already stored in its bucket.
 *
 * The URL pointer itself is stored in the new node (no copy is made).
 *
 * Returns 1 when a new node was added, and 0 when the URL was already present
 * or the node could not be allocated (the table is unchanged in both cases).
 */
int add(HashTable *table, char *URL ) {
    unsigned long bucket = JenkinsHash(URL, MAX_HASH_SLOT);
    HashTableNode *tail = NULL;

    /* One pass: reject duplicates and remember the last node of the chain. */
    for (HashTableNode *node_ptr = table->table[bucket]; node_ptr != NULL; node_ptr = node_ptr->next) {
        if (strcmp(URL, node_ptr->url) == 0) {
            return 0; // FAILURE: already present
        }
        tail = node_ptr;
    }

    HashTableNode *new_node = calloc(1, sizeof *new_node);
    if (new_node == NULL) {
        return 0; // FAILURE: out of memory
    }
    new_node->url = URL;

    if (tail == NULL) {
        table->table[bucket] = new_node;   /* bucket was empty */
    } else {
        tail->next = new_node;             /* append to the chain */
    }
    return 1; // SUCCESS
}
/* Record one occurrence of a word in a document.
 * @str: the word; on the "new word" path the pointer is stored in the new
 *       WordNode, on the "word already present" path it is freed
 * @hashTab: hash table to update
 * @curDocID: id of the document the word was seen in
 * @result: 1 on success, 0 when an allocation failed.  On failure for a new
 *          word nothing is added and str is left untouched; on failure for an
 *          already-present word str has already been freed.
 */
int HashAdd(char *str, HashTable *hashTab, int curDocID){
    GenHashTableNode *addNode = HashContains(str, hashTab);

    if (!addNode) {
        // New word: allocate the hashtable node, its WordNode and the first
        // DocNode, each with its own type's size.
        addNode = calloc(1, sizeof *addNode);
        WordNode *addWord = calloc(1, sizeof *addWord);
        DocNode *addDoc = calloc(1, sizeof *addDoc);
        if (!addNode || !addWord || !addDoc) {
            // release whatever was obtained so a partial failure does not leak
            free(addDoc);
            free(addWord);
            free(addNode);
            return 0;
        }

        // link DocNode -> WordNode -> hashtable node
        addDoc->documentID = curDocID;
        addDoc->occurrences = 1;        // first sighting in this document
        addWord->word = str;
        addWord->docs = addDoc;
        addNode->hashKey = addWord;

        // append the node to the chain in the slot the word hashes to
        unsigned long hashValue = JenkinsHash(str, MAX_HASH_SLOT);
        GenHashTableNode *presentNode = hashTab->table[hashValue];
        if (presentNode == NULL) {
            hashTab->table[hashValue] = addNode;
        } else {
            while (presentNode->next != NULL) {
                presentNode = presentNode->next;
            }
            presentNode->next = addNode;
        }
        return 1;
    }

    // The word is already stored, so this duplicate string is not needed.
    free(str);

    // Look for a DocNode for this document, remembering the last node seen so
    // a new one can be appended after it.
    WordNode *wordEntry = (WordNode *)addNode->hashKey;
    DocNode *prevDoc = NULL;
    for (DocNode *curDoc = wordEntry->docs; curDoc != NULL; curDoc = curDoc->nextDoc) {
        if (curDoc->documentID == curDocID) {
            curDoc->occurrences = (curDoc->occurrences + 1);
            return 1;
        }
        prevDoc = curDoc;
    }

    // First time the word is seen in this document.
    DocNode *addDoc = calloc(1, sizeof *addDoc);
    if (!addDoc) {
        return 0;
    }
    addDoc->documentID = curDocID;
    addDoc->occurrences = 1;

    // An empty chain has no previous node; start the chain instead of writing
    // through an uninitialised pointer.
    if (prevDoc != NULL) {
        prevDoc->nextDoc = addDoc;
    } else {
        wordEntry->docs = addDoc;
    }
    return 1;
}
/*
 * Intersects the global temp_list with the documents of word.
 *
 * If word is not in the index (InHashTable returns 0) and temp_list is
 * non-empty, FreeList(0) is called and nothing else happens.  Otherwise every
 * node of temp_list whose doc_id also appears in the word's document list has
 * that document's freq added to its own, and every other node is unlinked
 * from temp_list and freed.
 */
void And(char *word, HashTable *Index) {
    unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
    int num = InHashTable(word, Index);

    // Word is not in the InvertedIndex.
    if (num == 0) {
        if (temp_list != NULL) {
            FreeList(0);
        }
        return;
    }

    // InHashTable's non-zero result is used as the 1-based position of the
    // matching WordNode within the slot's chain.
    WordNode *match = Index->table[index]->data;
    for (int i = 1; i < num; i++) {
        match = match->next;
    }

    DocumentNode *kept = NULL;          // last temp_list node that survived
    DocumentNode *cursor = temp_list;

    while (cursor != NULL) {
        // Search the word's documents for the cursor's doc_id.
        DocumentNode *hit = match->page;
        while (hit != NULL && hit->doc_id != cursor->doc_id) {
            hit = hit->next;
        }

        if (hit != NULL) {
            // Match: add the frequencies and keep the node.
            cursor->freq += hit->freq;
            kept = cursor;
            cursor = cursor->next;
            continue;
        }

        // No match: unlink the node (head or interior) and free it.
        DocumentNode *doomed = cursor;
        if (cursor == temp_list) {
            temp_list = temp_list->next;
        } else {
            kept->next = kept->next->next;
        }
        cursor = cursor->next;
        doomed->next = NULL;
        free(doomed);
    }
}
/*
 * Parses one query line and builds the result lists.
 *
 * The line is read word by word.  Words equal to operator1 are skipped, a
 * word equal to operator2 ends the current block of words (Or() is called if
 * temp_list is non-empty), the first word of a block seeds the global
 * temp_list with copies of its DocumentNodes, and every later word of the
 * block is combined into temp_list with And().  Both temp_list and final_list
 * are reset to NULL on entry.
 *
 * Returns 1 on success and 0 when the line (or a block) starts with an
 * operator, the line ends with an operator, or memory runs out while copying
 * document nodes.  NOTE(review): on a 0 return temp_list may still hold nodes;
 * releasing them is presumably the caller's job - confirm.
 */
int GetLinks(char *line, HashTable *Index) {
    char word[MAX];
    char scan_fmt[32];
    char *buf = line;
    int flag = 1;   // 1: intersection, 2: union pending
    int count = 0;  // position of the current word within its block

    // An empty line never fills word; start with "" so the trailing-operator
    // check below reads defined data.
    word[0] = '\0';

    // Bound the scan to the buffer: "%<MAX-1>s" instead of a bare "%s".
    snprintf(scan_fmt, sizeof scan_fmt, "%%%zus", sizeof word - 1);

    temp_list = NULL;
    final_list = NULL;

    while (sscanf(buf, scan_fmt, word) == 1) {
        count++;

        // Where scanning resumes.  Computed before NormalizeWord() may alter
        // word, so no copy of the original token is needed.
        char *next = strstr(buf, word) + strlen(word);

        // operator1: nothing to do except reject it at the start of a block.
        if (strcmp(word, operator1) == 0) {
            if (count == 1) {
                return 0;
            }
            buf = next;
            continue;
        }

        if (strcmp(word, operator2) == 0) {
            flag = 2;
            if (count == 1) {
                return 0;
            }
        } else {
            NormalizeWord(word); // operators are left as typed
        }

        if (count == 1) {
            // First word of a block: copy its document list into temp_list.
            int num = InHashTable(word, Index);
            if (num) {
                unsigned long index = JenkinsHash(word, MAX_HASH_SLOT);
                WordNode *current = Index->table[index]->data;

                // num is used as the 1-based position of the matching WordNode.
                for (int i = 1; i < num; i++) {
                    current = current->next;
                }

                // Append after whatever temp_list already holds.
                DocumentNode *tail = temp_list;
                while (tail != NULL && tail->next != NULL) {
                    tail = tail->next;
                }

                for (DocumentNode *ptr = current->page; ptr != NULL; ptr = ptr->next) {
                    DocumentNode *dn = calloc(1, sizeof *dn);
                    if (dn == NULL) {
                        return 0;
                    }
                    dn->doc_id = ptr->doc_id;
                    dn->freq = ptr->freq;

                    if (tail == NULL) {
                        temp_list = dn;
                    } else {
                        tail->next = dn;
                    }
                    tail = dn;
                }
            }
        } else {
            // Not the first word of the block: apply the pending operation.
            if (flag == 1) {
                And(word, Index);
            }
            if (flag == 2) {
                if (temp_list != NULL) {
                    Or();
                }
                flag = 1;   // back to intersection
                count = 0;  // the next word starts a new block
            }
        }

        buf = next;
    }

    // If the last word of the query line is an operator, report an error.
    if (strcmp(word, operator1) == 0 || strcmp(word, operator2) == 0) {
        return 0;
    }

    // If nonempty, flush out temp_list to final_list.
    if (temp_list != NULL) {
        Or();
    }

    return 1;
}
int main(int argc, char* argv[]){ //call the check args function to check the input arguments checkArgs(argc, argv); //init the HashTable HashTable* Table = ReadFile(argv[1]); //init the array to hold all of the input words char wordArray[MAX_ROWS][MAX_ROWS][MAX_WORD_LENGTH + 1]; //init keyboard input string char line[MAX_WORD_LENGTH+1]; while (1){ //main loop printf("\nEnter your string (enter \"QUIT\" to exit the function) \n"); //accept user input. Deal with user input longer than the max line if (fgets(line, MAX_LINE, stdin)){ if (NULL == strchr(line, '\n')){ printf("Query only accepts 1000 characters\n"); eat_extra(); //"eats" characters after 1000 characters are input then exits exit(1); } } //handle when the user quits the program if (strcmp(line, "QUIT\n") == 0){ printf("Exit command reached, Cleaning memory and quitting\n"); CleanHashMemory(Table); exit(0); } // size_t length = strlen(line); // printf("length of input is %zu\n", length ); //check if the inputted line ends with AND or OR EndsWithAND(line); EndsWithOR(line); char* argv2 = argv[2]; //make sure the wordArray is cleared out between queries memset(wordArray, 0, sizeof(wordArray[0][0][0]) * 500 * MAX_ROWS * MAX_WORD_LENGTH + 1); int FinalDocMatchArray[1705] = {0}; //keep the documents ids that have matched all the criteria int FinalArrayIndex = 0; int scoreArray[1705] = {0}; //keep the scores of the FinalDocMatchArray in parallel positions int index = 0; //init variables for GetNextWord int pos = 0; int counter = 0; int andPos = 0; int andFlag = 0; int orFlag = 0; int orPos = 0; char* word; while((pos = GetNextWord(line, pos, &word)) > 0){ //go through the words in the query //if the word exists, add it to the hash table if (word != NULL && strlen(word) < MAX_WORD_LENGTH) { //check if it starts with AND or OR if (counter == 0 && (strcmp(word, "AND") == 0 || strcmp(word, "OR") == 0)){ printf("Input cannot start or end with AND or OR\n"); exit(1); } else if (strcmp(word, "AND") == 0){ // 
printf("AND detected\n"); if (andFlag == 1) { printf("Two ANDs in a row. Invalid input.\n"); exit(1); } andFlag = 1; } //detect ORs and increment position in wordArray else if (strcmp(word, "OR") == 0){ // printf("OR detected\n"); if (orFlag == 1) { printf("Two ORs in a row. Invalid input.\n"); exit(1); } orPos++; andPos = 0; orFlag = 1; } else{ NormalizeWord(word); // printf("Word is %s %i\n", word, counter); andFlag = 0; orFlag = 0; //put the word in the wordArray at the appropriate place int len = strlen(word+1); char wordCpy[len+1]; strcpy(wordCpy,word); strcpy(wordArray[andPos][orPos], wordCpy); // printf("Adding %s to array at %i %i \n",word, andPos, orPos ); andPos++; } counter++; } free(word); word = NULL; } //k is incremented every time an OR is processed int k = 0; while (strcmp(wordArray[0][k], "") != 0){ int docMatchArray[1705] = {0}; //temporary array of matching documents int docMatchArrayIndex = 0; char* firstWord = wordArray[0][k]; // printf("Word is: %s\n", firstWord); //compute jenkins hash int hashResult = JenkinsHash(firstWord, MAX_HASH_SLOT); if (Table->table[hashResult] == NULL){ printf("%s does not exist in hashTable database\n", firstWord ); exit(1); } //go through the hashtable until you find the appropriate word and documents //put it into a temporary array to be matched against else{ WordNode* node2 = Table->table[hashResult]; WordNode* dummyWord = node2; while (dummyWord != NULL){ //go through all the linked words DocumentNode *dummy_doc = dummyWord->page; if (strcmp(dummyWord->word, firstWord) == 0){//if they are the same word, go through the document nodes //go through the document nodes while (dummy_doc != NULL) { //put all of the first words docs into the temp list docMatchArray[docMatchArrayIndex] = dummy_doc->doc_id; docMatchArrayIndex++; //advance dummy_doc = dummy_doc->next; } break; //you've found the word, no need to continue to other words } else{ // printf("Did not find %s\n", firstWord ); } dummyWord = dummyWord->next; // 
printf("Advancing\n"); } } //if there's only 1 word to examine, no need to compare other words if (strcmp(wordArray[1][k], "") == 0){ //add everything in the doc match array to the FinalDocMatchArray for (int i = 0; i < docMatchArrayIndex; i ++ ){ if (docMatchArray[i] != '\0'){ int dupIndex = 0; int dupFlag = 0; while (FinalDocMatchArray[dupIndex] != '\0'){ //check if they're the same if(docMatchArray[i] == FinalDocMatchArray[dupIndex]){ // printf("FOUND A DUPLICATE for %i\n", docMatchArray[i] ); dupFlag = 1; //a duplicate was found, compute the final score and increment that element int finalScore = 0; int index3=0; // printf("docNum is %i\n",FinalDocMatchArray[index]); while(strcmp(wordArray[index3][k],"") != 0){ //for every word //go through all the words and compute the final score finalScore += ComputeScore(FinalDocMatchArray[dupIndex], Table, wordArray[index3][k]); index3++; } //put it in the score array scoreArray[dupIndex] += finalScore; finalScore = 0; break; } dupIndex++; } //if the duplicate was not found and there's only 1 word, then put everything into the final array if (dupFlag != 1) { //if a duplicate was not found in the list FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i]; FinalArrayIndex++; } } } } //if there's more than one word between the OR statements, compute the final scores for them all else{ for (int i = 0; i < 1705; i ++){ if (docMatchArray[i] != 0){ int result = 1; int m = 0;//make sure to adjust based on current position in masterList //for every doc in the docMatchArray, test if all other words contain that doc while (strcmp(wordArray[m][k], "") != 0) { //increment word //check if this word's documents and see if there's a match result = findDocMatch(docMatchArray[i], Table, wordArray[m][k]); if (result != 0){ break; //the document had no matches, skip the rest } m++; } if (result == 0){ //before you add it to the final array, check if you've already added it int dupIndex2 = 0; int dupFlag2 = 0; while 
(FinalDocMatchArray[dupIndex2] != '\0'){ //if it's already in the list, then only increment the score if(docMatchArray[i] == FinalDocMatchArray[dupIndex2]){ dupFlag2 = 1; int finalScore2 = 0; int index4 = 0; while(strcmp(wordArray[index4][k],"") != 0){//for every word // printf("Word is %s\n",wordArray[index4][k]); finalScore2 += ComputeScore(FinalDocMatchArray[dupIndex2], Table, wordArray[index4][k]); index4++; } scoreArray[dupIndex2] += finalScore2; //increment the appropriate score finalScore2 = 0; break; } dupIndex2++; } //otherwise, add it to end of the Final Array if (dupFlag2 != 1){ FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i]; FinalArrayIndex++; } } } } } //compute the scores for all the non-duplicates toward the end of the array int finalScore = 0; while (FinalDocMatchArray[index] != '\0'){ //for every doc that matches all AND words int index2=0; // printf("docNum is %i\n",FinalDocMatchArray[index]); while(strcmp(wordArray[index2][k],"") != 0){//for every word // printf("Word is %s\n",wordArray[index2][k]); finalScore += ComputeScore(FinalDocMatchArray[index], Table, wordArray[index2][k]); index2++; } // printf("Score for %i is %i\n",FinalDocMatchArray[index], finalScore); //put it in the score array scoreArray[index] = finalScore; finalScore = 0; index++; } k++; //increment OR position } //sort the Final Array BubbleSort(FinalDocMatchArray, scoreArray, argv2); }//loop back to string entry } //end main
// returns 1 if successful, 0 otherwise int addToHashTable(HashTable *hashtable, char *word, int doc_id) { //If it is already in hash table you can leave, because inHashTable handles this case if (inHashTable(hashtable, word, doc_id)) return 1; //otherwise make new node, wordnode, and docnode HashTableNode *node = malloc(sizeof(HashTableNode)); if (!node) { fprintf(stderr, "Malloc failed: not enough memory to allocate a new HashNode in the HashTable.\n"); return 0; } node->next = NULL; WordNode *word_node = malloc(sizeof(WordNode)); if(!word_node) { fprintf(stderr, "Malloc failed: not enough memory to allocate a new WordNode in the HashTable.\n"); return 0; } word_node->word = calloc(1+strlen(word), sizeof(char)); if (word_node->word == NULL) { fprintf(stderr, "Calloc failed: not enough memory to allocate a new word in the HashTable.\n"); return 0; } strcpy(word_node->word, word); //copy the value from the word to our node DocumentNode *add_doc = calloc(1, sizeof(DocumentNode)); if (!add_doc) { fprintf(stderr, "Calloc failed: not enough memory to allocate a new DocumentNode in the HashTable.\n"); return 0; } add_doc->doc_id = doc_id; add_doc->freq = 1; add_doc->next = NULL; word_node->page = add_doc; word_node->next = NULL; word_node->num_docs = 1; node->key = word_node; // get the index we want to insert the has into int index = JenkinsHash(word, MAX_HASH_SLOT); if (hashtable->table[index] == NULL) { //Nothing in the slot so add the node hashtable->table[index] = node; } else { HashTableNode *current = hashtable->table[index]; while(current->next != NULL) { current = current->next; } current->next = node; } return 1; }
// Adds a hashtablenode, wordnode, and documentnode for the word found int insertIndexHash(HashTable *hash, char *targetword, int targetid, int frequency){ int jenkins = JenkinsHash(targetword, MAX_HASH_SLOT); WordNode *findwordnode; //If the word already exists in the hashtable if(searchIndexHash(hash,targetword) == 0 ){ findwordnode=hash->table[jenkins]->data; while(findwordnode != NULL ){ if( strcmp(findwordnode->word,targetword) == 0 ){ DocumentNode *finddoc=findwordnode->page; DocumentNode *dummydoc=finddoc; while(finddoc != NULL ){ dummydoc=finddoc; //If the docnode already exists w/ the same doc_id, just increase freq by one if ( finddoc->doc_id == targetid ){ (finddoc->freq)++; free(targetword); return 0; } finddoc=finddoc->next; } // If no doc_id's match, make a new docnode for the given word and target id DocumentNode *newDocNode=(DocumentNode *)malloc(sizeof(DocumentNode)); newDocNode->next=NULL; newDocNode->doc_id=targetid; newDocNode->freq=frequency; dummydoc->next=newDocNode; free(targetword); return 0; } findwordnode=findwordnode->next; } } else { // If the word is not in the hashtable, make a wordnode and docnode for it DocumentNode *newDocNode=(DocumentNode *)malloc(sizeof(DocumentNode)); newDocNode->next=NULL; newDocNode->doc_id=targetid; newDocNode->freq=frequency; WordNode *newWordNode=(WordNode *)malloc(sizeof(WordNode)); newWordNode->next=NULL; newWordNode->word=targetword; newWordNode->page=newDocNode; HashTableNode *newHashNode=(HashTableNode *)malloc(sizeof(HashTableNode)); newHashNode->data=newWordNode; if (hash->table[jenkins] != NULL ){ findwordnode=hash->table[jenkins]->data; WordNode *holdernode=findwordnode; while (findwordnode != NULL ){ holdernode=findwordnode; findwordnode=findwordnode->next; } holdernode->next=newWordNode; free(newHashNode); return 0; } else { hash->table[jenkins]=newHashNode; return 0; } } return 0; }
int AddToHashTable(HashTable* hTable, char* Word, int docId){ //Find the slot of the word in the hashtable using JenkinsHash unsigned long slot = JenkinsHash(Word, MAX_HASH_SLOT); //unsigned long slot = 0; //If the slot is empty, simply add the word. if(hTable->table[slot]->word == NULL){ hTable->table[slot]->word = strdup(Word); DocumentNode* tempNode = (DocumentNode*)calloc(1, sizeof(DocumentNode)); hTable->table[slot]->page = tempNode; hTable->table[slot]->page->doc_id = docId; hTable->table[slot]->page->freq = 1; hTable->table[slot]->page->next = NULL; //printf("First node at this place\n"); return 0; } else{ HashTableNode* currNode = hTable->table[slot]; //The current node while((currNode->next)!=NULL){ //If the word of the node and the given word are the same, then add a docNode or increase frequency. if(strcmp(currNode->word, Word) == 0){ DocumentNode* docNode = currNode->page; while((docNode->next)!=NULL){ //if the document node id and the given id are the same, increment the frequency. if(docNode->doc_id == docId){ docNode->freq += 1; //printf("increased freuency\n"); return 0; } docNode = docNode->next; } //check for the last node. if(docNode->doc_id == docId){ docNode->freq += 1; //printf("increased frequency2\n"); return 0; } //Add a document Node at the end since we didn't find a match. DocumentNode* tempNode = (DocumentNode*)calloc(1, sizeof(DocumentNode)); tempNode->next = NULL; tempNode->doc_id = docId; tempNode->freq = 1; docNode->next = tempNode; //printf("added a document node\n"); return 0; } currNode = currNode->next; } //Check if there is a match at the last Word node. if(strcmp(currNode->word, Word) == 0){ DocumentNode* docNode = currNode->page; while((docNode->next)!=NULL){ //if the document node id and the given id are the same, increment the frequency. if(docNode->doc_id == docId){ docNode->freq += 1; //printf("increased frequency 3\n"); return 0; } docNode = docNode->next; } //check for the last node. 
if(docNode->doc_id == docId){ docNode->freq += 1; //printf("increased frequency 4\n"); return 0; } //Add a document Node at the end since we didn't find a match. DocumentNode* tempNode = (DocumentNode*)calloc(1, sizeof(DocumentNode)); tempNode->next = NULL; tempNode->doc_id = docId; tempNode->freq = 1; docNode->next = tempNode; //printf("added a document node\n"); return 0; } //Couldn't find a match. Add a new word node at the end of the list. HashTableNode* tempWord = (HashTableNode*) calloc(1, sizeof(HashTableNode)); tempWord->word = strdup(Word); tempWord->next = NULL; //Create a document node for the newly created wordNode. DocumentNode* docNode = (DocumentNode*) calloc(1, sizeof(DocumentNode)); tempWord->page = docNode; tempWord->page->doc_id = docId; tempWord->page->freq = 1; tempWord->page->next = NULL; currNode->next = tempWord; //printf("Create a new Wordnode\n"); return 0; } return 1; }
/*
 * Reads the run of alphabetic characters that starts with `first` (already
 * consumed from input) and returns it as a newly allocated string.  The
 * first non-alphabetic character is pushed back onto the stream.
 * Returns NULL when memory cannot be allocated.
 */
static char *read_file_word(FILE *input, int first)
{
    size_t length = 0;
    size_t capacity = 32;
    int ch = first;
    char *word = malloc(capacity);

    if (word == NULL) {
        return NULL;
    }

    while (ch != EOF && isalpha((unsigned char)ch)) {
        /* keep one byte free for the terminator */
        if (length + 1 == capacity) {
            char *grown = realloc(word, capacity * 2);
            if (grown == NULL) {
                free(word);
                return NULL;
            }
            word = grown;
            capacity *= 2;
        }
        word[length++] = (char)ch;
        ch = fgetc(input);
    }

    if (ch != EOF) {
        ungetc(ch, input);
    }
    word[length] = '\0';
    return word;
}

/*
 * Reads an index file and rebuilds the index in hash_table.
 *
 * Each record is read as: an alphabetic word, the number of documents that
 * contain it, then that many "doc_id frequency" pairs.  For every record a
 * hashtable node and WordNode are appended to the slot the word hashes to,
 * with one DocumentNode per pair in file order.  Non-alphabetic characters
 * found where a word is expected are skipped.
 *
 * Returns 1 when the whole file was read, and 0 when the file cannot be
 * opened, a record is malformed, or memory runs out.  Records read before the
 * failure stay in hash_table.
 */
int read_file(char *fileName, Hashtable *hash_table){
    FILE *input = fopen(fileName, "r");
    int next_character;   /* int, so that EOF is distinguishable from data */

    if (!input) return 0;

    while ((next_character = fgetc(input)) != EOF) {
        /* Not the start of a word: skip it.  Re-reading it as an empty word
         * would never advance the stream. */
        if (!isalpha((unsigned char)next_character)) {
            continue;
        }

        char *word = read_file_word(input, next_character);
        WordNode *wordnode = calloc(1, sizeof *wordnode);
        Hashtablenode *newHashNode = calloc(1, sizeof *newHashNode);
        if (!word || !wordnode || !newHashNode) {
            free(newHashNode);
            free(wordnode);
            free(word);
            fclose(input);
            return 0;
        }

        /* store wordnode and found word */
        newHashNode->hash_key = wordnode;
        wordnode->word = word;

        /* append the hashnode to the chain in the slot the word hashes to */
        unsigned long hash_number = JenkinsHash(word, MAX_HASH_SLOT);
        Hashtablenode *current_node = hash_table->table[hash_number];
        if (current_node == NULL) {
            hash_table->table[hash_number] = newHashNode;
        } else {
            while (current_node->next) current_node = current_node->next;
            current_node->next = newHashNode;
        }

        /* number of documents containing the current word */
        int file_count;
        if (fscanf(input, " %d ", &file_count) != 1) {
            fclose(input);
            return 0;
        }

        /* read each "doc_id frequency" pair and append it to the word */
        DocumentNode *end_doc = NULL;
        for (int i = 0; i < file_count; i++) {
            int docID, frequency;
            if (fscanf(input, "%d %d ", &docID, &frequency) != 2) {
                fclose(input);
                return 0;
            }

            DocumentNode *docNode = calloc(1, sizeof *docNode);
            if (!docNode) {
                fclose(input);
                return 0;
            }
            docNode->doc_id = docID;
            docNode->frequency = frequency;

            if (end_doc == NULL) {
                wordnode->page = docNode;
            } else {
                end_doc->next = docNode;
            }
            end_doc = docNode;
        }
    }

    fclose(input);
    return 1;
}
/*
 * Records one occurrence of str in document document_id.
 *
 * If str is not yet in the table, a hashtable node, WordNode and DocumentNode
 * (frequency 1) are created and appended to the slot str hashes to; the str
 * pointer itself is stored in the WordNode.  If str is already present, the
 * frequency of its DocumentNode for document_id is incremented, or a new
 * DocumentNode with frequency 1 is appended when there is none.
 *
 * Returns 1 on success and 0 when memory could not be allocated (nothing is
 * added in that case).
 */
int add_to_hashtable(char *str, Hashtable *hash_table, int document_id){
    Hashtablenode *hashnode = In_Hashtable(str, hash_table);

    if (hashnode == NULL) {
        /* word not found: create all three nodes, each with its own size */
        hashnode = calloc(1, sizeof *hashnode);
        WordNode *wordnode = calloc(1, sizeof *wordnode);
        DocumentNode *docNode = calloc(1, sizeof *docNode);
        if (!hashnode || !wordnode || !docNode) {
            free(docNode);
            free(wordnode);
            free(hashnode);
            return 0;
        }

        /* link hashnode with word node and doc node */
        wordnode->word = str;
        wordnode->page = docNode;
        hashnode->hash_key = wordnode;

        /* first occurrence of the word in this document */
        docNode->doc_id = document_id;
        docNode->frequency = 1;

        /* append to the chain in the slot the word hashes to */
        unsigned long hash_number = JenkinsHash(str, MAX_HASH_SLOT);
        Hashtablenode *current_node = hash_table->table[hash_number];
        if (current_node != NULL) {
            while (current_node->next != NULL) {
                current_node = current_node->next;
            }
            current_node->next = hashnode;
        } else {
            hash_table->table[hash_number] = hashnode;
        }
        return 1;
    }

    /* word found: look for a document node with a matching id, remembering
     * the last node so a new one can be appended after it */
    WordNode *found_word = (WordNode *)hashnode->hash_key;
    DocumentNode *end_doc = NULL;
    for (DocumentNode *temp_doc = found_word->page; temp_doc != NULL; temp_doc = temp_doc->next) {
        if (temp_doc->doc_id == document_id) {
            temp_doc->frequency = (temp_doc->frequency + 1);
            return 1;
        }
        end_doc = temp_doc;
    }

    /* no matching document id: add a node with frequency 1 */
    DocumentNode *docNode = calloc(1, sizeof *docNode);
    if (!docNode) {
        return 0;
    }
    docNode->frequency = 1;
    docNode->doc_id = document_id;

    /* an empty list has no last node; the new node becomes the head */
    if (end_doc != NULL) {
        end_doc->next = docNode;
    } else {
        found_word->page = docNode;
    }
    return 1;
}
// main crawler function int main(int argc, char* argv[]) { // local variables FILE *fp; // file pointer for html files char *nextURL; // pointer to the next URL found on the seed page char *newURL; // pointer to the next URL in the while loop // check command line arguments if (argc != 4) { printf("Incorrect number of arguments provided."); exit(1); } // check that the second argument is a directory stat(argv[2],&statbuffer); if S_ISDIR(statbuffer.st_mode) { } else { printf("Error, you did not supply a valid directory"); exit(1); } // get arguments char *seedURL = argv[1]; int filename_len = strlen(argv[2])+21; // get the directory char*filename = calloc(filename_len,sizeof(char)); // check the maxDepth int value = is_numeric(argv[3]); if (value != 0) { sscanf(argv[3],"%i",&maxDepth); } else { printf("Error! maxDepth must be a number"); exit(1); } // init curl curl_global_init(CURL_GLOBAL_ALL); // initialize data structures/variables // initialize hashtable HashTable *table = malloc(sizeof(HashTable)); memset(table,0,MAX_HASH_SLOT); // initialize linked list List *WebPageList; WebPageList = createList(); // setup seed page // get seed webpage // if it fails, report and exit if (NormalizeURL(seedURL) == 0) { printf("Error, bad URL"); exit(1); } // write seed file // create WebPage object by allocating memory WebPage *seedPage = malloc(sizeof(WebPage)); // assign values to each part of the struct seedPage->url = seedURL; seedPage->html = NULL; seedPage->html_len = 0; seedPage->depth = 0; // try to get the webpage up to MAX_TRY times if (!GetWebPage(seedPage)) { for (tries = 0; tries < MAX_TRY; tries++) { if (GetWebPage(seedPage)) { break; } } } // write html contents to a file "1" in the given directory sprintf(filename,"%s/%d",argv[2],1); fp = fopen(filename,"w"); fputs(seedURL,fp); fputs("\n",fp); fprintf(fp,"%d\n",seedPage->depth); fputs(seedPage->html,fp); // close the file and wipe the filename fclose(fp); memset(filename,'\0',filename_len); // add seed page to 
hashtable add(table,seedURL); // extract urls from seed page // while there are still URLs in the seed page's html while ((pos = GetNextURL(seedPage->html,pos,seedPage->url,&nextURL)) > 0) { // only visiting them if it wouldn't exceed maxDepth if ((seedPage->depth+1) > maxDepth) { free(seedPage); exit(1); } // ensure it's a valid url if (NormalizeURL(nextURL) != 0) { // also check if its in the right domain if (strncmp(URL_PREFIX,nextURL,strlen(URL_PREFIX)) == 0) { // if it is added to the hashtable it is a unique URL that // hasn't been visited before, add it to the linked list // of URLs to visit if (add(table,nextURL)) { // create a new webpage object WebPage *pages = malloc(sizeof(WebPage)); pages->url = nextURL; pages->html = NULL; pages->html_len = 0; pages->depth = 1; // try to get the webpage up until the MAX_TRY tries = 0; if (!GetWebPage(pages)) { for (tries = 0; tries < MAX_TRY; tries++) { if (GetWebPage(pages)) { break; } } } // add it to the linked list addToEnd(WebPageList,pages); } } } } // while there are urls to crawl while (WebPageList->head != NULL) { // get next url from list WebPage *nextPage = malloc(sizeof(WebPage)); nextPage = removeFromFront(WebPageList); // try to get the webpage up until the MAX_TRY tries = 0; if (!GetWebPage(nextPage)) { for (tries = 0; tries < MAX_TRY; tries++) { if (GetWebPage(nextPage)) { break; } } } // write page file sprintf(filename,"%s/%d",argv[2],docNum); fp = fopen(filename,"w"); fputs(nextPage->url,fp); fputs("\n",fp); fprintf(fp,"%d\n",nextPage->depth); fputs(nextPage->html,fp); // close the file and wipe the filename (to be used next time) fclose(fp); memset(filename,'\0',filename_len); // increment the doc num docNum++; // check if visiting the URLs on this page will exceed maxDepth if ((nextPage->depth+1) > maxDepth) { free(nextPage); continue; } pos = 0; // iterate through all the URLs on the page while ((pos = GetNextURL(nextPage->html,pos,nextPage->url,&newURL))>0) { // check to ensure that the URLs are 
the proper format if (NormalizeURL(newURL) != 0 ) { // check to ensure that they are in the right domain if (strncmp(URL_PREFIX,newURL,strlen(URL_PREFIX)) == 0) { // making sure to only add new ones to the list if (add(table,newURL) != 0) { // create a new WebPage object WebPage *page = malloc(sizeof(WebPage)); page->url = newURL; page->html = NULL; page->html_len = 0; page->depth = nextPage->depth + 1; GetWebPage(page); // try to get the webpage up until the MAX_TRY tries = 0; if (!GetWebPage(page)) { for (tries = 0; tries < MAX_TRY; tries++) { if (GetWebPage(page)) { break; } } } // add the page to the linked list addToEnd(WebPageList,page); } } } } // Sleep for a bit to avoid annoying the target sleep(INTERVAL_PER_FETCH); // Free resources free(nextPage); } // cleanup curl curl_global_cleanup(); // free resources // free hashtable hash = JenkinsHash(seedURL,MAX_HASH_SLOT); HashTableNode *freer = table->table[hash]; HashTableNode *tempHash = NULL; while (freer != NULL) { tempHash = freer; freer = freer->next; free(tempHash); } free(table); // free linked list free(WebPageList); // free WebPage and filename pointer free(seedPage); free(filename); return 0; }
/* ==========================================================================
 * Take a word, a document ID with its frequency, and a hashtable, and put
 * that information into the index.
 *
 * *** Content ***
 * Case 1: The slot the word hashes to holds no word yet. A new word node
 * with one document node is stored in the slot's head node (the head node
 * itself is allocated first if the slot pointer is NULL).
 * Case 2: The slot holds a linked list of word nodes. The list is searched
 * for the word.
 *   - Word found: its document nodes are searched for docID. If one is
 *     found, its freq is incremented by one; otherwise a new document node
 *     (docID, freq) is appended to the end of the document list.
 *   - Word not found (hash collision): a new word node with one document
 *     node is appended to the end of the slot's list.
 *
 * Returns 1 on success, 0 on invalid arguments or allocation failure.
 * ========================================================================== */
int reloadIndexHash(char *word, int docID, int freq, HashTable *index)
{
    if (!word || !index) {
        return 0;
    }

    unsigned long hashNumber = JenkinsHash(word, MAX_HASH_SLOT);
    GenHashTableNode *loopNode = index->table[hashNumber];

    // make sure the slot has a head node to work with
    if (!loopNode) {
        loopNode = calloc(1, sizeof *loopNode);
        if (!loopNode) {
            return 0;
        }
        index->table[hashNumber] = loopNode;
    }

    // walk the bucket looking for the word; stop at a node with no word, at
    // a matching word, or at the last node of the list
    WordNode *relevantWordNode = NULL;
    while (loopNode->hashKey) {
        WordNode *candidate = loopNode->hashKey;
        if (!strcmp(candidate->word, word)) {
            relevantWordNode = candidate;
            break;
        }
        if (!loopNode->next) {
            break;
        }
        loopNode = loopNode->next;
    }

    if (relevantWordNode) {
        DocumentNode *docNode = relevantWordNode->page;

        // word node without any document yet: start its list
        if (!docNode) {
            relevantWordNode->page = DNode(docID, freq);
            return relevantWordNode->page ? 1 : 0;
        }

        // loop through that WordNode's list of document nodes
        for (;;) {
            if (docNode->docID == docID) {
                // NOTE(review): a repeated (word, docID) pair adds 1 rather
                // than freq — confirm this is what reloading should do.
                docNode->freq++;
                return 1;
            }
            if (!docNode->next) {
                // didn't find. make new at the end of the list.
                docNode->next = DNode(docID, freq);
                return docNode->next ? 1 : 0;
            }
            docNode = docNode->next;
        }
    }

    // word not present: it goes either into the word-less node the loop
    // stopped on, or into a new node appended after the last one
    GenHashTableNode *target = loopNode;
    GenHashTableNode *appended = NULL;
    if (loopNode->hashKey) {
        appended = calloc(1, sizeof *appended);
        if (!appended) {
            return 0;
        }
        target = appended;
    }

    // NOTE(review): if WNode fails the document node is not released here,
    // because how DNode allocates is not visible from this function.
    DocumentNode *newDoc = DNode(docID, freq);
    WordNode *newWord = newDoc ? WNode(newDoc, word) : NULL;
    if (!newWord) {
        free(appended);
        return 0;
    }

    target->hashKey = newWord;
    if (appended) {
        appended->next = NULL;
        loopNode->next = appended;
    }
    return 1;
}
/*
 * Makes an indexer hashtable from a file with formatting:
 *     word numFiles fileID occurrences fileID occurrences ... \n
 * One line should be written per word that occurred in at least one of
 * the htmls from the crawler indexed. Words are read as runs of alphabetic
 * characters.
 * @fileName: fileName of the file to be made into a hashtable
 * @hashTab: hashTable to create from a file of the above format
 * @result: the modified hashTab created from the file, fileName. Returns 1
 *      if the whole file was read successfully and 0 if not (file cannot be
 *      opened, allocation failure, or malformed line). Entries linked into
 *      hashTab before a failure remain there.
 */
int readFile(char *fileName, HashTable *hashTab){
    FILE *input;            // file to read from
    int result = 0;

    if(!fileName || !hashTab)
        return 0;

    input = fopen(fileName, "r");
    if(!input)
        return 0;

    // one iteration per line (word record) of the file
    for(;;){
        long wordStart;             // file position of the word's first letter
        size_t letCount = 0;        // counts length of read word
        int tempc;                  // int so that EOF is not confused with data
        int i;
        int docID, occurrences, numFiles;   // ints read from the file
        char *word;                 // stores the read word
        unsigned long hashValue;
        WordNode *addWord;
        DocNode *addDoc, *prevDoc = NULL;
        GenHashTableNode *curHashNode, *newHashNode;

        wordStart = ftell(input);
        if(wordStart < 0)
            break;

        // count the letters up to the next non alphabet character
        while((tempc = fgetc(input)) != EOF && isalpha((unsigned char)tempc))
            letCount++;

        if(letCount == 0){
            // a clean end of file means we are done; otherwise the line
            // does not start with a word and cannot be parsed
            if(tempc == EOF && !ferror(input))
                result = 1;
            break;
        }

        // rewind to the word and save it into allocated memory
        if(fseek(input, wordStart, SEEK_SET) != 0)
            break;
        word = calloc(letCount + 1, sizeof(char));
        if(!word)
            break;
        if(fread(word, sizeof(char), letCount, input) != letCount){
            free(word);
            break;
        }

        // allocate memory for the new GenHashTableNode and WordNode
        newHashNode = calloc(1, sizeof(*newHashNode));
        addWord = calloc(1, sizeof(*addWord));
        if(!newHashNode || !addWord){
            free(word);
            free(newHashNode);
            free(addWord);
            break;
        }

        // store the found word into the new WordNode and store this
        // WordNode into the newly created GenHashTableNode
        addWord->word = word;
        newHashNode->hashKey = addWord;

        // connect this newHashNode at the end of the slot the word hashes to
        hashValue = JenkinsHash(word, MAX_HASH_SLOT);
        curHashNode = hashTab->table[hashValue];
        if(!curHashNode){
            hashTab->table[hashValue] = newHashNode;
        }
        else{
            while(curHashNode->next)
                curHashNode = curHashNode->next;
            curHashNode->next = newHashNode;
        }

        // find the number of the files containing the current word
        // (this is the number in the line right after the word)
        if(fscanf(input, " %d ", &numFiles) != 1)
            break;

        // read numFiles pairs of documentID and occurrences, appending a
        // DocNode for each to the word's list in file order
        for(i = 0; i < numFiles; i++){
            if(fscanf(input, "%d %d ", &docID, &occurrences) != 2)
                break;
            addDoc = calloc(1, sizeof(*addDoc));
            if(!addDoc)
                break;
            addDoc->documentID = docID;
            addDoc->occurrences = occurrences;

            // the first DocNode hangs off the WordNode, every later one off
            // the DocNode before it
            if(prevDoc == NULL)
                addWord->docs = addDoc;
            else
                prevDoc->nextDoc = addDoc;
            prevDoc = addDoc;
        }
        if(i < numFiles)
            break;          // line ended early or allocation failed
    }

    fclose(input);
    return result;
}
//Inserting the words found into a temporary docList that will be used later in order to compose the master docList docList *insertDocList(HashTable *currHash, char *currWord, docList *currDocs){ WordNode *findwordnode; int jenkins = JenkinsHash(currWord,MAX_HASH_SLOT); //If the word does occupy a slot on the hashtable... if (currHash->table[jenkins] != NULL ){ findwordnode=currHash->table[jenkins]->data; //while there are still possible words in the hashtable slot... while(findwordnode != NULL ){ //If the word matches our target word if( strcmp(findwordnode->word,currWord) == 0 ){ DocumentNode *findDocumentNode = findwordnode->page; //If there are already entries in the docList if ( currDocs->head != NULL ){ //While there are already DocumentNodes in the docList we are adding to if( empty != 2 ){ docList *tempDoclist = malloc(sizeof(docList)); docNode *edocNode = malloc(sizeof(docNode)); edocNode->next=NULL; tempDoclist->head=edocNode; docNode *tempDocNode = tempDoclist->head; int count = 0; int change = 0; while( findDocumentNode != NULL ){ docNode *newDocNode = malloc(sizeof(docNode)); newDocNode->doc_id=findDocumentNode->doc_id; newDocNode->freq=findDocumentNode->freq; newDocNode->next=NULL; //If the docList is already populated, simply add on the new docNode to the end of the docList docNode *findnode = currDocs->head; while(findnode != NULL){ if(newDocNode->doc_id == findnode->doc_id){ change=1; if ( count == 0 ){ tempDoclist->head->doc_id=newDocNode->doc_id; tempDoclist->head->freq=newDocNode->freq+findnode->freq; count=1; } else { newDocNode->freq+=findnode->freq; tempDocNode->next=newDocNode; tempDocNode=tempDocNode->next; } } findnode=findnode->next; } //If the doc_id doesn't match with any on the current list, free the list and set to null if( change == 0 ){ freeDocList(currDocs); docList *failedDocs = malloc(sizeof(docList)); failedDocs->head=NULL; empty=2; return failedDocs; } findDocumentNode = findDocumentNode->next; } return tempDoclist; } } else { 
//If the docList has not been populated once yet create an initial docNode and set it as the head of the docList if ( empty != 2){ docNode *emptydocNode = malloc(sizeof(docNode)); emptydocNode->next=NULL; currDocs->head=emptydocNode; int flag = 0; while ( findDocumentNode != NULL ){ docNode *newDocNode = malloc(sizeof(docNode)); newDocNode->doc_id=findDocumentNode->doc_id; newDocNode->freq=findDocumentNode->freq; newDocNode->next=NULL; docNode *findDocNode=currDocs->head; while(findDocNode->next != NULL ){ findDocNode=findDocNode->next; } if( flag == 0 ){ currDocs->head->doc_id=newDocNode->doc_id; currDocs->head->freq=newDocNode->freq; flag = 1; } else { findDocNode->next=newDocNode; } findDocumentNode = findDocumentNode->next; } } } return currDocs; } findwordnode=findwordnode->next; } } //If the hashtable search came up empty, meaning that the word cannot be found in any url, set the docList so it reads NULL if(empty == 0 || empty == 2 ) { freeDocList(currDocs); docList *failedDocs = malloc(sizeof(docList)); failedDocs->head=NULL; empty=2; return failedDocs; } return currDocs; }
// updates the index with a given word // either adds it or increments the count of an existing document // or adds a new document to an existing word int updateIndex(char *wordIn, int idIn, IndexTable *tableIn) { // null word, invalid doc id, or nonexistent index if (!wordIn || idIn < 0 || !tableIn) { if (1 == DEBUG) { printf("updateIndex received invalid params\n"); } return 1; // failed } int location = JenkinsHash(wordIn, MAX_HASH_SLOT); WordNode *node = tableIn->table[location]; // word not in index if (!tableIn->table[location]->word) { if (1 == DEBUG) { printf("Word %s not found in index, adding it\n", wordIn); } tableIn->table[location] = newWordNode(wordIn, idIn); return 0; } // word already in index, search through list to find its node while (1) { if (strcmp(node->word, wordIn) == 0) { // found the WordNode containing wordIn if (1 == DEBUG) { printf("Found WordNode for \"%s\"\n", wordIn); } DocumentNode *doc = node->page; while (1) { if (doc->docId == idIn) { // found the DocumentNode for this doc/word combo if (1 == DEBUG) { printf("Found DocumentNode for id=%d for \"%s\"\n", idIn, wordIn); } doc->freq++; return 0; } if (!doc->next) { break; } doc = doc->next; } // made it to the end without finding a matching DocumentNode if (1 == DEBUG) { printf("No DocumentNode for id=%d for \"%s\"\n", idIn, wordIn); } DocumentNode *newDoc = newDocumentNode(idIn); doc->next = newDoc; return 0; } if (!node->next) { break; } node = node->next; } WordNode *newWord = newWordNode(wordIn, idIn); node->next = newWord; return 0; }
/*
 * addToHash - record that `word` occurs in the document `fileName`.
 *
 * If hashLookUp() reports the word as present (returns 0), the matching
 * wordNode in the word's bucket is passed to addDocs(). Otherwise a new
 * wordNode holding a private copy of the word is created, placed in the
 * bucket (as its first node, or appended to the end), and passed to
 * addDocs().
 *
 * Returns:
 *   0 - the word was already indexed (or hashLookUp returned a value other
 *       than 0/1 for a non-empty bucket, in which case nothing is added)
 *   1 - a new wordNode was added
 *   3 - memory for the new wordNode could not be allocated
 */
int addToHash(char *word, char *fileName, HashTable *Index){
    // get key
    unsigned long key = JenkinsHash(word, MAX_HASH_SLOT);
    int lookup = hashLookUp(word, Index);

    // if word is in hashtable, add doc and return
    if(lookup == 0){
        for(wordNode *tmp = Index->table[key]; tmp; tmp = tmp->next){
            if(strcmp(tmp->word, word) == 0){
                addDocs(tmp, fileName, Index);
                break;
            }
        }
        return 0;
    }

    // create node, checking each allocation before it is used
    wordNode *node = malloc(sizeof *node);
    if(!node){
        return 3;
    }
    node->word = malloc(strlen(word)+1);
    if(!node->word){
        free(node);
        return 3;
    }
    strcpy(node->word, word);
    node->next = NULL;
    node->doc = NULL;

    // empty bucket: the new node becomes its first entry
    wordNode *tmpNode = Index->table[key];
    if(tmpNode == NULL){
        Index->table[key] = node;
        addDocs(node, fileName, Index);
        return 1;
    }

    // new word in an occupied bucket: append to the end of the list
    if(lookup == 1){
        while(tmpNode->next != NULL){
            tmpNode = tmpNode->next;
        }
        tmpNode->next = node;
        addDocs(node, fileName, Index);
        return 1;
    }

    // the node was not linked anywhere, so release it
    free(node->word);
    free(node);
    return 0;
}