Esempio n. 1
0
/*
 * addToHashTable - add a url to a hash table
 *
 * Parameters:
 *      ht  - hash table to insert into
 *      url - NUL-terminated string; the table stores its own copy
 *
 * Assumptions:
 *      1. ht has been allocated
 *      2. every slot of ht->table already points to a HashTableNode whose
 *         url is NULL while the slot is unused (the slot is dereferenced
 *         without a NULL check)
 *
 * Pseudocode:
 *      1. find the Jenkins hash code for the given url
 *      2. check whether anything has been hashed with that code yet
 *      3. add the url to the hash table in a HashTableNode structure
 *
 * If memory cannot be allocated the table is left unchanged.
 */
void addToHashTable(HashTable *ht, const char *url){
    unsigned long hashVal = JenkinsHash(url, MAX_HASH_SLOT);
    HashTableNode *head = ht->table[hashVal];

    /* size the copy from the string itself so long urls cannot overflow */
    size_t len = strlen(url) + 1;
    char *copy = malloc(len);
    if ( copy == NULL ){
        return;
    }
    memcpy(copy, url, len);

    /* if nothing has been hashed to that index yet, the pre-allocated
     * head node takes the url */
    if ( head->url == NULL ){
        head->url = copy;
        return;
    }

    /* otherwise build the new node completely before linking it, so a
     * failed allocation never leaves a half-initialised node in the list */
    HashTableNode *newNode = malloc(sizeof *newNode);
    if ( newNode == NULL ){
        free(copy);
        return;
    }
    newNode->url = copy;
    newNode->next = NULL;

    /* traverse to the last node of the chain and append */
    HashTableNode *current = head;
    while ( current->next != NULL ){
        current = current->next;
    }
    current->next = newNode;
}
Esempio n. 2
0
/* Append a string to the chain of the bucket it hashes to.
 * @str: char buffer to add; the pointer itself is stored, no copy is made
 * @hashTab: hash table to add the string to
 * @result: 1 on success, 0 if the new node could not be allocated
 */
int HashAdd(char *str, HashTable *hashTab){
	// build the node first; nothing is touched in the table on failure
	HashTableNode *fresh = calloc(1, sizeof(HashTableNode));
	if(fresh == NULL){
		return 0;
	}
	fresh->url = str;
	fresh->next = NULL;

	unsigned long slot = JenkinsHash(str, MAX_HASH_SLOT);
	HashTableNode *tail = hashTab->table[slot];

	// empty bucket: the new node becomes the head
	if(tail == NULL){
		hashTab->table[slot] = fresh;
		return 1;
	}

	// occupied bucket: walk to the last node and hang the new one after it
	while(tail->next != NULL){
		tail = tail->next;
	}
	tail->next = fresh;
	return 1;
}
Esempio n. 3
0
/* Checks whether the document list of firstWord contains DocToMatch.
 * Returns 0 when the document is listed for the word, 1 otherwise.
 * Terminates the program with exit(1) when the bucket firstWord hashes to
 * is empty.
 */
int findDocMatch(int DocToMatch, HashTable* Table, char* firstWord){
	int slot = JenkinsHash(firstWord, MAX_HASH_SLOT);

	if (Table->table[slot] == NULL){
		printf("%s does not exist in hashTable\n", firstWord );
		exit(1);
	}

	/* skip over other words that collided into the same bucket */
	WordNode *entry = Table->table[slot];
	while (entry != NULL && strcmp(entry->word, firstWord) != 0){
		entry = entry->next;
	}
	if (entry == NULL){
		return 1; /* word is not in the chain */
	}

	/* the word was found: scan its documents for the requested id */
	for (DocumentNode *doc = entry->page; doc != NULL; doc = doc->next){
		if (doc->doc_id == DocToMatch){
			return 0; /* document matched */
		}
	}
	return 1; /* word exists but not in that document */
}
Esempio n. 4
0
/* lookUpURL - check if a url is already in a hash table
 *
 * Parameters:
 *      ht  - hash table to search
 *      url - NUL-terminated string to look for
 *
 * Assumptions:
 *      1. ht has been allocated
 *
 * Pseudocode:
 *      1. get the Jenkins hash code for the provided url
 *      2. walk the chain of nodes stored at that index
 *      3. compare the url to every node that actually holds a url
 *      4. return 1 if there is a match, and 0 otherwise
 */
int lookUpURL(HashTable *ht, const char *url){
    unsigned long hashVal = JenkinsHash(url, MAX_HASH_SLOT);
    HashTableNode *current;

    for (current = ht->table[hashVal]; current != NULL; current = current->next){
        /* a node may exist without a url (e.g. an unused head node);
         * skip it instead of handing NULL to strcmp */
        if (current->url != NULL && strcmp(url, current->url) == 0){
            return 1;
        }
    }
    return 0;
}
Esempio n. 5
0
/* Looks a string up in the hash table.
 * @str: char buffer to look up in hash table
 * @hashTab: hash table to look up the string.
 * @result: the node whose WordNode key holds the same word as str,
 *          or NULL if no node in the bucket matches.
 */
GenHashTableNode *HashContains(char *str, HashTable *hashTab){
	unsigned long slot = JenkinsHash(str, MAX_HASH_SLOT);
	GenHashTableNode *node = hashTab->table[slot];

	// only the chain stored at the string's own slot can contain it
	while(node != NULL){
		WordNode *entry = (WordNode *)node->hashKey;
		if(strcmp(str, entry->word) == 0){
			return node;
		}
		node = node->next;
	}
	return NULL;
}
Esempio n. 6
0
/* Searches the bucket that str hashes to for a node whose WordNode key
 * holds the same word, and returns that node.
 * Returns NULL when the chain ends without a match.
 */
Hashtablenode *In_Hashtable(char *str, Hashtable *hash_table){
	unsigned long bucket = JenkinsHash(str, MAX_HASH_SLOT);
	Hashtablenode *candidate = hash_table->table[bucket];

	//stop at the first matching node, or at the end of the chain
	while (candidate != NULL){
		WordNode *entry = (WordNode *)candidate->hash_key;
		if (strcmp(str, entry->word) == 0){
			break;
		}
		candidate = candidate->next;
	}
	//candidate is the match, or NULL if the loop ran off the chain
	return candidate;
}
Esempio n. 7
0
// Searches the hashtable for targetword.
// Returns 0 if a WordNode with that word is chained off the bucket's data
// pointer, -1 if the bucket is empty or no WordNode matches.
int searchIndexHash(HashTable *hash, char *targetword){
  int slot = JenkinsHash(targetword,MAX_HASH_SLOT);

  // nothing stored at this slot, so the word cannot be present
  if (hash->table[slot] == NULL){
    return -1;
  }

  for (WordNode *entry = hash->table[slot]->data; entry != NULL; entry = entry->next){
    if (strcmp(entry->word, targetword) == 0){
      return 0;
    }
  }
  return -1;
}
Esempio n. 8
0
/* Helper function: records one occurrence of word in document doc_id if the
    word is already in the hashtable.

    Returns 0 if the word is not in the hashtable (nothing is changed).
    Returns 1 if the word was found; in that case either the freq of the
    matching DocumentNode was increased by one, or a new DocumentNode with
    freq 1 was appended to the word's document list and num_docs increased.
    Returns -1 if the word was found but the new DocumentNode could not be
    allocated (nothing is changed). */
int inHashTable(HashTable *hashtable, char *word, int doc_id)
{
    // get the index the word hashes to
    int index = JenkinsHash(word, MAX_HASH_SLOT);

    for (HashTableNode *current = hashtable->table[index]; current != NULL; current = current->next) {
        if (strcmp(current->key->word, word) != 0)
            continue;

        // word matches: look for doc_id, remembering the last node visited
        // so a new node can be appended after it
        DocumentNode *last_doc = NULL;
        for (DocumentNode *seen = current->key->page; seen != NULL; seen = seen->next) {
            if (seen->doc_id == doc_id) {
                //found the doc id so update the frequency and leave function
                (seen->freq)++;
                return 1;
            }
            last_doc = seen;
        }

        //doc id not present, so make a new DocumentNode and attach it
        DocumentNode *doc = calloc(1, sizeof *doc);
        if (!doc) {
            fprintf(stderr, "Calloc failed: Not enough memory for a DocumentNode\n");
            return -1;
        }

        doc->doc_id = doc_id;
        doc->freq = 1;
        doc->next = NULL;

        // an empty document list has no last node; the new node is the head
        if (last_doc == NULL)
            current->key->page = doc;
        else
            last_doc->next = doc;

        (current->key->num_docs)++;

        return 1;
    }

    //if we get here the word is not in the table
    return 0;
}
Esempio n. 9
0
//Reports whether Word is stored in hTable.
//Returns 1 if a node in the word's slot holds the same string, 0 otherwise.
//The slot's head node is dereferenced directly, so it must already exist.
int InHashTable(HashTable* hTable, char* Word){
	unsigned long slot = JenkinsHash(Word, MAX_HASH_SLOT);
	HashTableNode* node = hTable->table[slot];

	//A head node without a word marks an unused slot.
	if(node->word == NULL){
		return 0;
	}

	//Compare the head and then every node chained behind it.
	do{
		if(strcmp(node->word, Word) == 0){
			return 1;
		}
		node = node->next;
	}while(node != NULL);

	return 0;
}
Esempio n. 10
0
/* Looks word up in Index.
 * Returns 0 if a node in the word's bucket holds the same string,
 * and 1 if the bucket is empty or no node matches.
 */
int hashLookUp(char *word, HashTable *Index){
    unsigned long slot = JenkinsHash(word, MAX_HASH_SLOT);

    /* an empty bucket simply yields zero iterations */
    for(wordNode *entry = Index->table[slot]; entry != NULL; entry = entry->next){
        if(strcmp(word, entry->word) == 0){
            return 0;
        }
    }

    return 1;
}
/* Inserts URL into table unless an equal string is already stored.
 *
 * The pointer URL itself is kept in the new node; no copy is made.
 *
 * Returns 1 if a new node was added.
 * Returns 0 if URL is already present, or if the node could not be
 * allocated (the table is unchanged in both cases).
 */
int add(HashTable *table, char *URL ) {
	unsigned long bucket = JenkinsHash(URL,MAX_HASH_SLOT);
	HashTableNode *tail = NULL;

	/* one pass: reject duplicates and remember the last node of the chain */
	for(HashTableNode *node_ptr = table->table[bucket]; node_ptr != NULL; node_ptr = node_ptr->next) {
		if(strcmp(URL, node_ptr->url) == 0) {
			return 0; // FAILURE: already stored
		}
		tail = node_ptr;
	}

	HashTableNode *new_node = calloc(1, sizeof *new_node);
	if (new_node == NULL) {
		return 0; // FAILURE: out of memory, nothing was inserted
	}
	new_node->url = URL;

	if (tail == NULL) {
		table->table[bucket] = new_node; /* empty bucket: new head */
	} else {
		tail->next = new_node;           /* append behind the last node */
	}

	return 1;  // SUCCESS
}
Esempio n. 12
0
/* Record one occurrence of a word in a document.
 * @str: char buffer holding the word
 * @hashTab: hash table to add the word to
 * @curDocID: id of the document the word was read from
 * @result: 1 on success, 0 if memory could not be allocated
 *
 * Ownership of str:
 *  - word not yet in the table: str is stored in the new WordNode on
 *    success; on failure it is left untouched.
 *  - word already in the table: str is freed before anything else, so it
 *    is gone even when 0 is returned.
 */
int HashAdd(char *str, HashTable *hashTab, int curDocID){

	// see if the word is already in the hashtable; the two cases
	// are processed very differently
	GenHashTableNode *addNode = HashContains(str, hashTab);

	if(!addNode){ // the word is not contained in the hashtable.

		// allocate the hashtable node, WordNode and DocNode together,
		// each sized from its own type, and release all of them if any
		// allocation failed
		addNode = calloc(1, sizeof *addNode);
		WordNode *addWord = calloc(1, sizeof *addWord);
		DocNode *addDoc = calloc(1, sizeof *addDoc);
		if(!addNode || !addWord || !addDoc){
			free(addDoc);
			free(addWord);
			free(addNode);
			return 0;
		}

		// link the DocNode, WordNode, and hashtable node correctly.
		addWord->word = str;
		addDoc->documentID = curDocID;
		addDoc->occurrences = 1; // count of occurrences starts at 1
		addWord->docs = addDoc;
		addNode->hashKey = addWord;

		// find the index that the word hashes to
		unsigned long hashValue = JenkinsHash(str, MAX_HASH_SLOT);
		GenHashTableNode *presentNode = hashTab->table[hashValue];

		if(presentNode == NULL){
			// empty slot: the new node becomes the head
			hashTab->table[hashValue] = addNode;
		}
		else{
			// occupied slot: append behind the last element
			while(presentNode->next != NULL){
				presentNode = presentNode->next;
			}
			presentNode->next = addNode;
		}
		return 1;
	}

	// the word was already in the hashtable
	free(str); // the table keeps its own copy of the word

	// walk the DocNode chain of the word, keeping the last node visited
	// so a new DocNode can be appended after it
	WordNode *word = (WordNode*)addNode->hashKey;
	DocNode *prevDoc = NULL;
	for(DocNode *curDoc = word->docs; curDoc != NULL; curDoc = curDoc->nextDoc){
		if(curDoc->documentID == curDocID){
			// same document seen again: bump its occurrence count
			curDoc->occurrences = (curDoc->occurrences + 1);
			return 1;
		}
		prevDoc = curDoc;
	}

	// no DocNode for this document yet: create one with a count of 1
	DocNode *addDoc = calloc(1, sizeof *addDoc);
	if(!addDoc) return 0;

	addDoc->documentID = curDocID;
	addDoc->occurrences = 1;

	// an empty chain has no last node; the new DocNode is then the head
	if(prevDoc == NULL){
		word->docs = addDoc;
	}
	else{
		prevDoc->nextDoc = addDoc;
	}

	return 1;
}
Esempio n. 13
0
/*
 * And - intersect the global temp_list with the documents of word.
 *
 * Every node of temp_list whose doc_id also appears in the word's document
 * list has that document's freq added to its own freq and is kept; every
 * other node is unlinked from temp_list and freed.
 *
 * If InHashTable() reports that word is not in Index (returns 0), the walk
 * stops immediately and FreeList(0) is called.
 * NOTE(review): FreeList is defined elsewhere; presumably it releases
 * temp_list - confirm what the argument 0 selects.
 *
 * The non-zero return value of InHashTable() is used as the 1-based position
 * of the matching WordNode in the chain at Index->table[index]->data.
 */
void And(char *word, HashTable *Index) {

	unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
	
	// Declare variables for traversal.
	// ptr walks temp_list, ptr2 walks the word's documents, runner is the
	// most recent temp_list node that was kept, no_need is the node being freed.
	WordNode *current;
	DocumentNode *ptr, *ptr2, *runner, *no_need;
	int num;
	
	// Get matching WordNode of word if it is in the InvertedIndex.
	if ((num = InHashTable(word, Index))) {
		current = Index->table[index]->data;
		// Step num-1 links forward to reach the matching WordNode.
		for (int i=1; i < num; i++) {
			current = current->next;
		}
		ptr2 = current->page; // Set to start of the list of document nodes for the current word.
	}
	else {
		ptr2 = NULL; // current stays unset; it is only read below when num is non-zero.
	}
	
	// Initialize variables. 
	ptr = temp_list;
	
	while (ptr != NULL) {
	
		// Check that the word is in the InvertedIndex.
		if (num) {
			ptr2 = current->page; // Restart at the head of the word's document list for each temp_list node.
			
			// Scan the word's documents for the doc_id of the current temp_list node.
			while (ptr2 != NULL) {
				// Check for a match in doc_id.
				if (ptr->doc_id == ptr2->doc_id) {
					ptr->freq += ptr2->freq; // Add the frequencies.
					break;
				}
				ptr2 = ptr2->next;
			}
			
			// Case of no match: ptr2 ran off the end of the list.
			if (ptr2 == NULL) {
			
				// Check if we need to delete the first node of temp_list.
				if (ptr == temp_list) { 
					temp_list = temp_list->next;
				}
				else { // All other cases.
					// ptr is not the head, so an earlier node was kept and runner
					// points at it; its successor is ptr, which is bypassed here.
					runner->next = runner->next->next;
				}
				no_need = ptr;
				ptr = ptr->next; // Advance before the node is released.
				
				// Free the node to be deleted.
				no_need->next = NULL;
				free(no_need);
				no_need = NULL;
				
			}
			else { // Case of match.
				runner = ptr; // Remember the last kept node for later unlinking.
				ptr = ptr->next;
			}
		}
		else { // Word is not in the InvertedIndex.
			ptr = NULL; // Ends the loop.
			FreeList(0);
		}
	}
}
Esempio n. 14
0
/*
 * GetLinks - evaluate one query line against the index.
 *
 * Parameters:
 *      line  - NUL-terminated query; words separated by whitespace
 *      Index - the inverted index to search
 *
 * The words are read left to right. A word equal to operator1 (AND) is
 * skipped; a word equal to operator2 (OR) ends the current block of words.
 * The first word of a block copies its document list into the global
 * temp_list; every further word of the block is applied with And(). When a
 * block ends, Or() is called if temp_list is non-empty.
 * NOTE(review): Or() is defined elsewhere; presumably it merges temp_list
 * into final_list and empties temp_list - confirm.
 *
 * Both global lists are reset to NULL on entry.
 *
 * Returns 1 on success. Returns 0 if the line starts a block with, or ends
 * with, an operator, or if memory could not be allocated; in those cases
 * temp_list and final_list may still hold nodes.
 */
int GetLinks(char *line, HashTable *Index) {
	
	// Declare variables.
	char *buf;
	char word[MAX];
	char scan_fmt[32]; // "%<width>s" with width = sizeof word - 1
	int flag; // flag to do union or intersection operations.
	int count; // variable to count the position of a word in the block.
	
	// Initialize variables.
	buf = line;
	flag = 1;
	count = 0;
	temp_list = NULL;
	final_list = NULL;
	word[0] = '\0'; // word is inspected after the loop even if nothing was read.
	
	// Bound the scan so an over-long token cannot overflow word; the rest
	// of such a token is simply read as the next word.
	snprintf(scan_fmt, sizeof scan_fmt, "%%%ds", (int)(sizeof word - 1));
	
	// Loop through the line and do the appropriate operations.
	while (sscanf(buf, scan_fmt, word) == 1) {
		
		count++;
		
		// If word is AND, then ignore and read in new word.
		if (strcmp(word, operator1) == 0) {
			if (count == 1) { // If there is no previous word, then throw an error.
				return 0;
			}
			
			// Increment position in line.
			buf = strstr(buf, word) + strlen(word);
			continue;
		}
		
		// If word is OR, then tell the program to do OR operation.
		int is_or = (strcmp(word, operator2) == 0);
		if (is_or) {
			flag = 2; // Set flag to union operation.
			if (count == 1) { // If there is no previous word, then throw an error.
				return 0;
			}
		}
		
		// Hold onto original copy of word in case NormalizeWord() changes its content;
		// it is needed to locate the word in the line afterwards.
		char *word_old = calloc(1, strlen(word) + 1);
		if (word_old == NULL) {
			return 0;
		}
		strcpy(word_old, word);
		
		// Normalize if word is not an operator (AND was already handled above).
		if (!is_or) {
			NormalizeWord(word);
		}
		
		// Add list of docs to temp_list.
		// Case when it is the first word of the block.
		if (count == 1) {
			
			int num = InHashTable(word, Index);
			
			// Case when the word is in the InvertedIndex.
			if (num) {
				unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
				WordNode *current = Index->table[index]->data;
				
				// num is the 1-based position of the matching WordNode in the chain.
				for (int i=1; i < num; i++) {
					current = current->next;
				}
				
				// Find the current end of temp_list (NULL when it is empty).
				DocumentNode *tail = temp_list;
				while (tail != NULL && tail->next != NULL) {
					tail = tail->next;
				}
				
				// Copy each DocumentNode of the word onto the end of temp_list.
				for (DocumentNode *ptr = current->page; ptr != NULL; ptr = ptr->next) {
					
					DocumentNode *dn = calloc(1, sizeof *dn);
					if (dn == NULL) {
						free(word_old);
						return 0;
					}
					dn->doc_id = ptr->doc_id;
					dn->freq = ptr->freq;
					
					if (tail == NULL) { // Case when temp_list is empty.
						temp_list = dn;
					}
					else { // Case when temp_list is nonempty.
						tail->next = dn;
					}
					tail = dn;
				}
			}
		}
		else { // If not first word of the block, then do the operation.
		
			// Check if the current operation is "AND".
			if (flag == 1) {
				And(word, Index);
			}
			
			// Check if the current operation is "OR".
			if (flag == 2) {
				if (temp_list != NULL) {
					Or();
				}
				flag = 1; // Set flag back to "AND" operation.
				count = 0; // Set word count to 0 to signal the start of a new block of words.
			}
		}
		
		// Increment position in the query line to read in next word.
		buf = strstr(buf, word_old) + strlen(word_old);
		
		free(word_old); // Cleanup.
	}
	
	// If the last word of the query line is an operator, throw an error.
	if (strcmp(word, operator1) == 0 || strcmp(word, operator2) == 0) {
		return 0;
	
	}
	
	// If nonempty, flush out temp_list to final_list.
	if (temp_list != NULL) { 
		Or();
	}
	
	return 1; // Return 1 if successful.
}	
Esempio n. 15
0
/*
 * main - interactive query loop.
 *
 * Loads the index with ReadFile(argv[1]), then repeatedly reads a query
 * line from stdin. The words of the query are stored in wordArray, grouped
 * by the OR operators; for every group the documents that contain all words
 * of the group are collected in FinalDocMatchArray with a parallel score in
 * scoreArray, and the result is handed to BubbleSort() together with argv[2].
 *
 * The loop ends when the user enters "QUIT" or stdin reaches end of file;
 * several kinds of invalid input terminate the program with exit(1).
 *
 * A doc id of 0 marks the end of the match arrays, so a document with id 0
 * cannot be reported.
 */
int main(int argc, char* argv[]){

	//call the check args function to check the input arguments
	checkArgs(argc, argv);

	//init the HashTable
	HashTable* Table = ReadFile(argv[1]);

	//query words: first index is the position inside an AND group, second
	//index is the OR group; an empty string terminates a row or column
	char wordArray[MAX_ROWS][MAX_ROWS][MAX_WORD_LENGTH + 1];

	//init keyboard input string
	char line[MAX_WORD_LENGTH+1];

	//never let fgets write more than line can hold, even if MAX_LINE is larger
	int lineLimit = (int)sizeof(line);
	if (MAX_LINE < lineLimit){
		lineLimit = MAX_LINE;
	}

	while (1){	//main loop

		printf("\nEnter your string (enter \"QUIT\" to exit the function) \n");

		//end of input (or a read error): there is no query to process, and
		//line would otherwise keep its previous or indeterminate contents
		if (fgets(line, lineLimit, stdin) == NULL){
			CleanHashMemory(Table);
			exit(0);
		}

		//deal with user input longer than the max line
		if (NULL == strchr(line, '\n')){
			printf("Query only accepts 1000 characters\n");
			eat_extra(); //"eats" characters after 1000 characters are input then exits
			exit(1);
		}

		//handle when the user quits the program
		if (strcmp(line, "QUIT\n") == 0){
			printf("Exit command reached, Cleaning memory and quitting\n");
			CleanHashMemory(Table);
			exit(0);
		}

		//check if the inputted line ends with AND or OR
		EndsWithAND(line);
		EndsWithOR(line);

		char* argv2 = argv[2];

		//make sure the whole wordArray is cleared out between queries
		memset(wordArray, 0, sizeof(wordArray));

		int FinalDocMatchArray[1705] = {0}; //keep the documents ids that have matched all the criteria
		int FinalArrayIndex = 0;
		int scoreArray[1705] = {0}; //keep the scores of the FinalDocMatchArray in parallel positions
		int index = 0;

		//the last element stays 0 so the zero-terminated scans below always stop
		int finalCap = (int)(sizeof(FinalDocMatchArray) / sizeof(FinalDocMatchArray[0])) - 1;

		//init variables for GetNextWord
		int pos = 0;
		int counter = 0;
		int andPos = 0;
		int andFlag = 0;
		int orFlag = 0;
		int orPos = 0;
		char* word;
		while((pos = GetNextWord(line, pos, &word)) > 0){ //go through the words in the query
			//only words that fit into a wordArray cell are considered
			if (word != NULL && strlen(word) < MAX_WORD_LENGTH) {
				//check if it starts with AND or OR
				if (counter == 0 && (strcmp(word, "AND") == 0 || strcmp(word, "OR") == 0)){
					printf("Input cannot start or end with AND or OR\n");
					exit(1);
				}
				else if (strcmp(word, "AND") == 0){
					if (andFlag == 1)
					{
						printf("Two ANDs in a row. Invalid input.\n");
						exit(1);
					}
					andFlag = 1;
				}
				//detect ORs and move to the next OR group in wordArray
				else if (strcmp(word, "OR") == 0){
					if (orFlag == 1)
					{
						printf("Two ORs in a row. Invalid input.\n");
						exit(1);
					}
					orPos++;
					andPos = 0;
					orFlag = 1;
				}
				else{
					NormalizeWord(word);
					andFlag = 0;
					orFlag = 0;

					//keep one empty row and column free as terminator so that
					//every scan of wordArray below stays inside the array
					if (andPos >= MAX_ROWS - 1 || orPos >= MAX_ROWS - 1){
						printf("Too many words in query.\n");
						exit(1);
					}

					//put the word in the wordArray at the appropriate place;
					//it fits because its length was checked above
					strcpy(wordArray[andPos][orPos], word);
					andPos++;
				}

				counter++;
			}
			free(word);
			word = NULL;
		}

		//k is incremented every time an OR group has been processed
		int k = 0;
		while (strcmp(wordArray[0][k], "") != 0){

			int docMatchArray[1705] = {0}; //temporary array of matching documents
			int docMatchArrayIndex = 0;
			int docMatchCap = (int)(sizeof(docMatchArray) / sizeof(docMatchArray[0]));

			char* firstWord = wordArray[0][k];

			//compute jenkins hash
			int hashResult = JenkinsHash(firstWord, MAX_HASH_SLOT);

			if (Table->table[hashResult] == NULL){
				printf("%s does not exist in hashTable database\n", firstWord );
				exit(1);
			}

			//go through the hashtable until you find the appropriate word and documents
			//put it into a temporary array to be matched against
			else{
				WordNode* dummyWord = Table->table[hashResult];
				while (dummyWord != NULL){ //go through all the linked words
					DocumentNode *dummy_doc = dummyWord->page;
					if (strcmp(dummyWord->word, firstWord) == 0){
						//put the first word's docs into the temp list, as many as fit
						while (dummy_doc != NULL && docMatchArrayIndex < docMatchCap) {
							docMatchArray[docMatchArrayIndex] = dummy_doc->doc_id;
							docMatchArrayIndex++;
							dummy_doc = dummy_doc->next;
						}
						break; //you've found the word, no need to continue to other words
					}

					dummyWord = dummyWord->next;
				}
			}

			//if there's only 1 word to examine, no need to compare other words
			if (strcmp(wordArray[1][k], "") == 0){
				//add everything in the doc match array to the FinalDocMatchArray
				for (int i = 0; i < docMatchArrayIndex; i ++ ){
					if (docMatchArray[i] != '\0'){
						int dupIndex = 0;
						int dupFlag = 0;
						while (FinalDocMatchArray[dupIndex] != '\0'){
							//check if they're the same
							if(docMatchArray[i] == FinalDocMatchArray[dupIndex]){
								dupFlag = 1;
								//a duplicate was found, compute the score of this group and add it to that element
								int finalScore = 0;
								int index3=0;
								while(strcmp(wordArray[index3][k],"") != 0){ //for every word
									finalScore += ComputeScore(FinalDocMatchArray[dupIndex], Table, wordArray[index3][k]);
									index3++;
								}
								//put it in the score array
								scoreArray[dupIndex] += finalScore;
								finalScore = 0;
								break;
							}
							dupIndex++;
						}
						//not seen before: append it, as long as there is room
						if (dupFlag != 1 && FinalArrayIndex < finalCap) {
							FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i];
							FinalArrayIndex++;
						}
					}
				}
			}

			//if there's more than one word between the OR statements, compute the final scores for them all
			else{
				for (int i = 0; i < 1705; i ++){
					if (docMatchArray[i] != 0){
						int result = 1;
						int m = 0;

						//for every doc in the docMatchArray, test if all other words contain that doc
						while (strcmp(wordArray[m][k], "") != 0) {  //increment word

							//check this word's documents and see if there's a match
							result = findDocMatch(docMatchArray[i], Table, wordArray[m][k]);

							if (result != 0){
								break; //the document had no matches, skip the rest
							}
							m++;
						}
						if (result == 0){
							//before you add it to the final array, check if you've already added it
							int dupIndex2 = 0;
							int dupFlag2 = 0;
							while (FinalDocMatchArray[dupIndex2] != '\0'){
								//if it's already in the list, then only increment the score
								if(docMatchArray[i] == FinalDocMatchArray[dupIndex2]){
									dupFlag2 = 1;
									int finalScore2 = 0;
									int index4 = 0;
									while(strcmp(wordArray[index4][k],"") != 0){//for every word
										finalScore2 += ComputeScore(FinalDocMatchArray[dupIndex2], Table, wordArray[index4][k]);
										index4++;
									}
									scoreArray[dupIndex2] += finalScore2; //increment the appropriate score
									finalScore2 = 0;
									break;
								}
								dupIndex2++;
							}
							//otherwise, add it to end of the Final Array, as long as there is room
							if (dupFlag2 != 1 && FinalArrayIndex < finalCap){
								FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i];
								FinalArrayIndex++;
							}
						}
					}
				}
			}

			//compute the scores for the entries appended by this group; index
			//carries over between groups so earlier entries are not rescored
			int finalScore = 0;
			while (FinalDocMatchArray[index] != '\0'){
				int index2=0;
				while(strcmp(wordArray[index2][k],"") != 0){//for every word
					finalScore += ComputeScore(FinalDocMatchArray[index], Table, wordArray[index2][k]);
					index2++;
				}

				//put it in the score array
				scoreArray[index] = finalScore;
				finalScore = 0;
				index++;
			}
			k++; //increment OR position
		}

		//sort the Final Array
		BubbleSort(FinalDocMatchArray, scoreArray, argv2);

	}//loop back to string entry

} //end main
Esempio n. 16
0
// Records one occurrence of word in document doc_id.
// If the word is already in the table, inHashTable() does the update.
// Otherwise a new HashTableNode with its WordNode (holding a private copy of
// word) and a first DocumentNode is appended to the word's bucket.
// returns 1 if successful, 0 if memory could not be allocated (the table is
// then left unchanged)
int addToHashTable(HashTable *hashtable, char *word, int doc_id)
{
    //inHashTable handles the "already present" case and reports an
    //allocation failure of its own as a negative value
    int found = inHashTable(hashtable, word, doc_id);
    if (found < 0)
        return 0;
    if (found)
        return 1;

    //otherwise make new node, wordnode, and docnode
    HashTableNode *node = NULL;
    WordNode *word_node = NULL;
    DocumentNode *add_doc = NULL;

    node = malloc(sizeof *node);
    if (!node) {
        fprintf(stderr, "Malloc failed: not enough memory to allocate a new HashNode in the HashTable.\n");
        goto fail;
    }
    node->next = NULL;

    word_node = malloc(sizeof *word_node);
    if (!word_node) {
        fprintf(stderr, "Malloc failed: not enough memory to allocate a new WordNode in the HashTable.\n");
        goto fail;
    }

    word_node->word = calloc(1+strlen(word), sizeof(char));
    if (word_node->word == NULL) {
        fprintf(stderr, "Calloc failed: not enough memory to allocate a new word in the HashTable.\n");
        goto fail;
    }
    strcpy(word_node->word, word); //copy the value from the word to our node

    add_doc = calloc(1, sizeof *add_doc);
    if (!add_doc) {
        fprintf(stderr, "Calloc failed: not enough memory to allocate a new DocumentNode in the HashTable.\n");
        goto fail;
    }

    add_doc->doc_id = doc_id;
    add_doc->freq = 1;
    add_doc->next = NULL;

    word_node->page = add_doc;
    word_node->next = NULL;
    word_node->num_docs = 1;

    node->key = word_node;

    // get the index we want to insert the node into
    int index = JenkinsHash(word, MAX_HASH_SLOT);

    if (hashtable->table[index] == NULL) {
        //Nothing in the slot so add the node
        hashtable->table[index] = node;
    } else {
        //append behind the last node of the chain
        HashTableNode *current = hashtable->table[index];
        while(current->next != NULL) {
            current = current->next;
        }

        current->next = node;
    }
    return 1;

fail:
    //release whatever was allocated before the failure; word_node->word has
    //always been assigned by the time word_node is non-NULL here
    if (word_node)
        free(word_node->word);
    free(word_node);
    free(node);
    return 0;
}
Esempio n. 17
0
// Records that targetword occurs in document targetid.
//
// - Word and document already known: the document's freq is increased by one.
// - Word known, document new: a DocumentNode with freq = frequency is appended
//   to the word's document list.
// - Word new: a WordNode (and, for an empty bucket, a HashTableNode) is
//   created, with one DocumentNode holding freq = frequency.
//
// targetword is always consumed: it is stored in the new WordNode when the
// word is new and inserted, and freed in every other case (including failure).
//
// Returns 0 on success, -1 if memory could not be allocated (the table is
// then unchanged).
int insertIndexHash(HashTable *hash, char *targetword, int targetid, int frequency){
  int jenkins = JenkinsHash(targetword, MAX_HASH_SLOT);
  WordNode *wordnode = NULL;  // the matching word, if any
  WordNode *lastword = NULL;  // last WordNode of the bucket's chain

  // One pass over the bucket: find the word, or the end of the chain
  if (hash->table[jenkins] != NULL) {
    for (wordnode = hash->table[jenkins]->data; wordnode != NULL; wordnode = wordnode->next) {
      if (strcmp(wordnode->word, targetword) == 0) {
        break;
      }
      lastword = wordnode;
    }
  }

  // If the word already exists in the hashtable
  if (wordnode != NULL) {
    DocumentNode *lastdoc = NULL;
    for (DocumentNode *doc = wordnode->page; doc != NULL; doc = doc->next) {
      // If the docnode already exists w/ the same doc_id, just increase freq by one
      if (doc->doc_id == targetid) {
        (doc->freq)++;
        free(targetword);
        return 0;
      }
      lastdoc = doc;
    }

    // If no doc_id's match, make a new docnode for the given word and target id
    DocumentNode *newDocNode = calloc(1, sizeof *newDocNode);
    if (newDocNode == NULL) {
      free(targetword);
      return -1;
    }
    newDocNode->doc_id = targetid;
    newDocNode->freq = frequency;

    // An empty document list has no last node; the new node is then the head
    if (lastdoc == NULL) {
      wordnode->page = newDocNode;
    } else {
      lastdoc->next = newDocNode;
    }
    free(targetword);
    return 0;
  }

  // The word is not in the hashtable: make a wordnode and docnode for it
  DocumentNode *newDocNode = calloc(1, sizeof *newDocNode);
  WordNode *newWordNode = calloc(1, sizeof *newWordNode);
  if (newDocNode == NULL || newWordNode == NULL) {
    free(newDocNode);
    free(newWordNode);
    free(targetword);
    return -1;
  }
  newDocNode->doc_id = targetid;
  newDocNode->freq = frequency;
  newWordNode->word = targetword;
  newWordNode->page = newDocNode;

  // Occupied bucket: chain the WordNode behind the existing ones
  if (hash->table[jenkins] != NULL) {
    if (lastword == NULL) {
      hash->table[jenkins]->data = newWordNode;
    } else {
      lastword->next = newWordNode;
    }
    return 0;
  }

  // Empty bucket: only now is a HashTableNode needed; calloc leaves every
  // field other than data zeroed
  HashTableNode *newHashNode = calloc(1, sizeof *newHashNode);
  if (newHashNode == NULL) {
    free(newDocNode);
    free(newWordNode);
    free(targetword);
    return -1;
  }
  newHashNode->data = newWordNode;
  hash->table[jenkins] = newHashNode;
  return 0;
}
Esempio n. 18
0
int AddToHashTable(HashTable* hTable, char* Word, int docId){

	//Find the slot of the word in the hashtable using JenkinsHash
	unsigned long slot = JenkinsHash(Word, MAX_HASH_SLOT);
	//unsigned long slot = 0;
	//If the slot is empty, simply add the word.
	if(hTable->table[slot]->word == NULL){
		
		hTable->table[slot]->word = strdup(Word);
		DocumentNode* tempNode = (DocumentNode*)calloc(1, sizeof(DocumentNode));
		hTable->table[slot]->page = tempNode;	
		hTable->table[slot]->page->doc_id = docId;
		hTable->table[slot]->page->freq = 1;
		hTable->table[slot]->page->next = NULL;
		//printf("First node at this place\n");	
		return 0;
	}
	
	else{
		HashTableNode* currNode = hTable->table[slot];
		//The current node 
		while((currNode->next)!=NULL){
			//If the word of the node and the given word are the same, then add a docNode or increase frequency.
			if(strcmp(currNode->word, Word) == 0){
				DocumentNode* docNode = currNode->page;
				while((docNode->next)!=NULL){
					//if the document node id and the given id are the same, increment the frequency.
					if(docNode->doc_id == docId){
						docNode->freq += 1;
						//printf("increased freuency\n");
						return 0;
					}
					docNode = docNode->next;
				}
				//check for the last node.
				if(docNode->doc_id == docId){
					docNode->freq += 1;
					//printf("increased frequency2\n");
					return 0;
				}
				//Add a document Node at the end since we didn't find a match.
				DocumentNode* tempNode = (DocumentNode*)calloc(1, sizeof(DocumentNode));
				tempNode->next = NULL;
				tempNode->doc_id = docId;
				tempNode->freq = 1;
				docNode->next = tempNode;
				//printf("added a document node\n");
				return 0;
			}
			currNode = currNode->next;
		}
		//Check if there is a match at the last Word node.
		if(strcmp(currNode->word, Word) == 0){
			DocumentNode* docNode = currNode->page;
				while((docNode->next)!=NULL){
					//if the document node id and the given id are the same, increment the frequency.
					if(docNode->doc_id == docId){
						docNode->freq += 1;
						//printf("increased frequency 3\n");
						return 0;
					}
					docNode = docNode->next;
				}
				//check for the last node.
				if(docNode->doc_id == docId){
					docNode->freq += 1;
					//printf("increased frequency 4\n");
					return 0;
				}
				//Add a document Node at the end since we didn't find a match.
				DocumentNode* tempNode = (DocumentNode*)calloc(1, sizeof(DocumentNode));
				tempNode->next = NULL;
				tempNode->doc_id = docId;
				tempNode->freq = 1;
				docNode->next = tempNode;
				//printf("added a document node\n");
				return 0;
		}
		//Couldn't find a match. Add a new word node at the end of the list.
		HashTableNode* tempWord = (HashTableNode*) calloc(1, sizeof(HashTableNode));
		tempWord->word = strdup(Word);
		tempWord->next = NULL;
		//Create a document node for the newly created wordNode.
		DocumentNode* docNode = (DocumentNode*) calloc(1, sizeof(DocumentNode));
		tempWord->page = docNode;
		tempWord->page->doc_id = docId;
		tempWord->page->freq = 1;
		tempWord->page->next = NULL;
		currNode->next = tempWord;
		//printf("Create a new Wordnode\n");
		return 0;
	}
	return 1;
}
Esempio n. 19
0
/*
 * read_file - rebuild an in-memory index from a saved index file.
 *
 * Each record is read as: a run of alphabetic characters (the word), the
 * number of documents that follow, then that many "docID frequency" pairs.
 * For every record a new Hashtablenode/WordNode pair is appended to the slot
 * chosen by JenkinsHash and the pairs become its DocumentNode list, in file
 * order. Characters that cannot start a word are skipped.
 *
 * Returns 1 on success. Returns 0 if the file cannot be opened, a record is
 * malformed, or memory runs out; nodes linked into hash_table before the
 * failure are left in place.
 */
int read_file(char *fileName, Hashtable *hash_table){
	FILE *input = fopen(fileName, "r");
	int i;
	int counter;
	int docID, frequency, file_count;
	int temp_character;	/* int, not char: fgetc's EOF must stay distinguishable */
	long rewind_by;
	char *word;
	unsigned long hash_number;
	WordNode *wordnode;
	DocumentNode *docNode, *end_doc;
	Hashtablenode *current_node, *newHashNode;

	if(!input) return 0;

	while((temp_character = fgetc(input)) != EOF){
		// read until next non alphabetical character
		counter = 0;
		while(temp_character != EOF && isalpha(temp_character)){
			temp_character = fgetc(input);
			counter++;
		}

		// not the start of a word: drop the character and look again
		if(counter == 0) continue;

		// put pointer back to start of the word; the terminating character
		// was consumed as well unless the word ran into end of file
		rewind_by = counter + (temp_character != EOF ? 1 : 0);
		if(fseek(input, -rewind_by, SEEK_CUR) != 0) goto fail;
		word = calloc(counter + 1, sizeof(char));
		if(!word) goto fail;
		if(fread(word, sizeof(char), counter, input) != (size_t)counter){
			free(word);
			goto fail;
		}

		//find where the word hashes to
		hash_number = JenkinsHash(word, MAX_HASH_SLOT);
		current_node = hash_table->table[hash_number];

		//allocate memory for new hashtable node and word node
		wordnode = calloc(1, sizeof(WordNode));
		newHashNode = calloc(1, sizeof(Hashtablenode));
		if(!wordnode || !newHashNode){
			free(wordnode);
			free(newHashNode);
			free(word);
			goto fail;
		}

		//store wordnode and found word
		newHashNode->hash_key = wordnode;
		wordnode->word = word;

		//add the hashnode in
		if(current_node==NULL){
			hash_table->table[hash_number] = newHashNode;
		}
		else{
			while(current_node->next) current_node = current_node->next;
			current_node->next = newHashNode;
		}

		//find number of file containing current word
		if(fscanf(input ," %d ", &file_count) != 1) goto fail;

		//iterate through rest of line and store frequency and document id
		end_doc = NULL;
		for(i = 0; i < file_count; i++){

			if(fscanf(input, "%d %d ", &docID, &frequency) != 2) goto fail;
			docNode = calloc(1, sizeof(DocumentNode));
			if(!docNode) goto fail;
			docNode->doc_id = docID;
			docNode->frequency = frequency;
			//the first node hangs off the wordnode, later ones are appended
			if(end_doc == NULL){
				wordnode->page = docNode;
			}
			else{
				end_doc->next = docNode;
			}
			end_doc = docNode;
		}
	}
	fclose(input);
	return 1;

fail:
	fclose(input);
	return 0;
}
Esempio n. 20
0
/* Adds one occurrence of a given string to the hashtable.
 *
 * If In_Hashtable() finds the word, the DocumentNode for document_id has its
 * frequency incremented, or a new DocumentNode with frequency 1 is appended
 * when the word has not been seen in that document yet.
 * Otherwise a new Hashtablenode -> WordNode -> DocumentNode chain is created
 * and appended to the slot chosen by JenkinsHash. In that case str itself is
 * stored in the WordNode (not copied), so it must outlive the table.
 *
 * Returns 1 on success and 0 if memory could not be allocated (the table is
 * left unchanged).
 */
int add_to_hashtable(char *str, Hashtable *hash_table, int document_id){

	//check if word is already in table
	Hashtablenode *hashnode = In_Hashtable(str, hash_table);

	if(hashnode != NULL){
		//str was found: look for a document node with a matching id
		WordNode *found = (WordNode*)hashnode->hash_key;
		DocumentNode *temp_doc;
		DocumentNode *end_doc = NULL;	//stays NULL while the list is empty

		for(temp_doc = found->page; temp_doc != NULL; temp_doc = temp_doc->next){
			if(temp_doc->doc_id == document_id){
				temp_doc->frequency = (temp_doc->frequency + 1);
				return 1;
			}
			end_doc = temp_doc;
		}

		//no matching document id, so start a new count at 1
		DocumentNode *docNode = calloc(1, sizeof(DocumentNode));
		if(docNode == NULL){
			return 0;
		}
		docNode->frequency = 1;
		docNode->doc_id = document_id;

		//add after the last document node, or as the first one
		if(end_doc != NULL){
			end_doc->next = docNode;
		}
		else{
			found->page = docNode;
		}
		return 1;
	}

	//word is not in the table: build hash node, word node and doc node.
	//The word node is sized as a WordNode (it was sized as a Hashtablenode).
	hashnode = calloc(1, sizeof(Hashtablenode));
	WordNode *wordnode = calloc(1, sizeof(WordNode));
	DocumentNode *docNode = calloc(1, sizeof(DocumentNode));
	if(hashnode == NULL || wordnode == NULL || docNode == NULL){
		free(hashnode);
		free(wordnode);
		free(docNode);
		return 0;
	}

	wordnode->word = str; // add current word into the WordNode

	//link hashnode with word node and doc node
	hashnode->hash_key = wordnode;
	wordnode->page = docNode;
	//store the document id in the doc node
	docNode->doc_id = document_id;
	//start the count at 1
	docNode->frequency = 1;

	// find the index that the word hashes to
	unsigned long hash_number = JenkinsHash(str, MAX_HASH_SLOT);
	Hashtablenode *current_node = hash_table->table[hash_number];
	//if the slot is occupied, append to the end of its chain
	if(current_node != NULL){
		while(current_node->next != NULL){
			current_node = current_node->next;
		}
		current_node->next = hashnode;
	}
	//else, the new node becomes the head of the slot
	else{
		hash_table->table[hash_number] = hashnode;
	}
	return 1;
}
// Fetch a page with GetWebPage, retrying up to MAX_TRY more times after a
// failed first attempt. Uses the shared `tries` counter.
// Returns nonzero as soon as a fetch succeeds, 0 if every attempt failed.
static int fetchWithRetries(WebPage *page) {
    tries = 0;
    if (GetWebPage(page)) {
        return 1;
    }
    for (tries = 0; tries < MAX_TRY; tries++) {
        if (GetWebPage(page)) {
            return 1;
        }
    }
    return 0;
}

// Write "<url>\n<depth>\n<html>" to the file "<dir>/<docId>".
// filename is a scratch buffer of filename_len bytes; it is wiped afterwards
// so it can be reused. Returns 1 on success, 0 if the file cannot be opened.
static int writePageFile(char *filename, int filename_len, const char *dir,
                         int docId, const WebPage *page) {
    FILE *fp;

    snprintf(filename,(size_t)filename_len,"%s/%d",dir,docId);
    fp = fopen(filename,"w");
    if (fp == NULL) {
        memset(filename,'\0',filename_len);
        return 0;
    }
    fputs(page->url,fp);
    fputs("\n",fp);
    fprintf(fp,"%d\n",page->depth);
    if (page->html != NULL) {
        fputs(page->html,fp);
    }

    // close the file and wipe the filename (to be used next time)
    fclose(fp);
    memset(filename,'\0',filename_len);
    return 1;
}

// Create a WebPage for url at the given depth, fetch it, and append it to
// the list of pages still to be crawled.
static void enqueuePage(List *list, char *url, int depth) {
    WebPage *page = malloc(sizeof(WebPage));
    if (page == NULL) {
        printf("Error, out of memory; skipping %s\n", url);
        return;
    }
    page->url = url;
    page->html = NULL;
    page->html_len = 0;
    page->depth = depth;

    // try to get the webpage up until the MAX_TRY
    fetchWithRetries(page);

    addToEnd(list,page);
}

// main crawler function
//
// Arguments: argv[1] seed URL, argv[2] existing target directory,
// argv[3] numeric maximum crawl depth.
// The seed page is saved as "<directory>/1"; every further page taken from
// the work list is saved under the current value of docNum. Links are only
// followed from pages whose depth is below maxDepth and whose URL starts
// with URL_PREFIX. Exits with status 1 on any setup or write error.
int main(int argc, char* argv[]) {

    // local variables
    char *nextURL; // receives each URL found on the seed page
    char *newURL; // receives each URL found on a crawled page
    unsigned long slot; // bucket index used when freeing the hashtable

    // check command line arguments
    if (argc != 4) {
        printf("Incorrect number of arguments provided.");
        exit(1);
    }
    // check that the second argument is a directory; when stat() fails the
    // buffer holds nothing meaningful, so that case is rejected as well
    if (stat(argv[2],&statbuffer) != 0 || !S_ISDIR(statbuffer.st_mode)) {
        printf("Error, you did not supply a valid directory");
        exit(1);
    }

    // get arguments
    char *seedURL = argv[1];
    int filename_len = strlen(argv[2])+21;

    // scratch buffer for "<directory>/<document number>"
    char*filename = calloc(filename_len,sizeof(char));
    if (filename == NULL) {
        printf("Error, out of memory\n");
        exit(1);
    }

    // check the maxDepth
    int value = is_numeric(argv[3]);
    if (value != 0) {
        sscanf(argv[3],"%i",&maxDepth);
    }
    else {
        printf("Error! maxDepth must be a number");
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // initialize data structures/variables

    // initialize hashtable with every byte zeroed (clearing only
    // MAX_HASH_SLOT bytes would leave most bucket pointers indeterminate)
    HashTable *table = calloc(1,sizeof(HashTable));
    if (table == NULL) {
        printf("Error, out of memory\n");
        exit(1);
    }

    // initialize linked list
    List *WebPageList;
    WebPageList = createList();

    // setup seed page

    // get seed webpage
    // if it fails, report and exit
    if (NormalizeURL(seedURL) == 0) {
        printf("Error, bad URL");
        exit(1);
    }

    // create WebPage object by allocating memory
    WebPage *seedPage = malloc(sizeof(WebPage));
    if (seedPage == NULL) {
        printf("Error, out of memory\n");
        exit(1);
    }

    // assign values to each part of the struct
    seedPage->url = seedURL;
    seedPage->html = NULL;
    seedPage->html_len = 0;
    seedPage->depth = 0;

    // try to get the webpage up to MAX_TRY times
    fetchWithRetries(seedPage);
    if (seedPage->html == NULL) {
        printf("Error, could not retrieve the seed page\n");
        exit(1);
    }

    // write html contents to a file "1" in the given directory
    if (!writePageFile(filename,filename_len,argv[2],1,seedPage)) {
        printf("Error, could not write to directory %s\n",argv[2]);
        exit(1);
    }

    // add seed page to hashtable
    add(table,seedURL);

    // extract urls from seed page, but only if visiting them wouldn't
    // exceed maxDepth (a depth-0 crawl is a normal, successful run)
    if ((seedPage->depth+1) <= maxDepth) {
        pos = 0;
        // while there are still URLs in the seed page's html
        while ((pos = GetNextURL(seedPage->html,pos,seedPage->url,&nextURL)) > 0) {

            // ensure it's a valid url
            if (NormalizeURL(nextURL) != 0) {

                // also check if its in the right domain
                if (strncmp(URL_PREFIX,nextURL,strlen(URL_PREFIX)) == 0) {

                    // if it is added to the hashtable it is a unique URL that
                    // hasn't been visited before, add it to the linked list
                    // of URLs to visit
                    if (add(table,nextURL)) {
                        enqueuePage(WebPageList,nextURL,1);
                    }
                }
            }
        }
    }

    // while there are urls to crawl
    while (WebPageList->head != NULL) {
        // get next url from list
        WebPage *nextPage = removeFromFront(WebPageList);

        // try to get the webpage up until the MAX_TRY
        fetchWithRetries(nextPage);

        // nothing was ever retrieved for this page: there is nothing to
        // save and no links to follow
        if (nextPage->html == NULL) {
            free(nextPage);
            continue;
        }

        // write page file
        if (!writePageFile(filename,filename_len,argv[2],docNum,nextPage)) {
            printf("Error, could not write to directory %s\n",argv[2]);
            exit(1);
        }

        // increment the doc num
        docNum++;

        // check if visiting the URLs on this page will exceed maxDepth
        if ((nextPage->depth+1) > maxDepth) {
            free(nextPage);
            continue;
        }
        pos = 0;
        // iterate through all the URLs on the page
        while ((pos = GetNextURL(nextPage->html,pos,nextPage->url,&newURL))>0) {
            // check to ensure that the URLs are the proper format
            if (NormalizeURL(newURL) != 0 ) {
                // check to ensure that they are in the right domain
                if (strncmp(URL_PREFIX,newURL,strlen(URL_PREFIX)) == 0) {
                    // making sure to only add new ones to the list
                    if (add(table,newURL) != 0) {
                        enqueuePage(WebPageList,newURL,nextPage->depth + 1);
                    }
                }
            }
        }
        // Sleep for a bit to avoid annoying the target
        sleep(INTERVAL_PER_FETCH);

        // Free resources
        free(nextPage);

    }

    // cleanup curl
    curl_global_cleanup();

    // free resources
    // free every bucket chain of the hashtable, not just the seed's bucket
    for (slot = 0; slot < MAX_HASH_SLOT; slot++) {
        HashTableNode *freer = table->table[slot];
        HashTableNode *tempHash = NULL;
        while (freer != NULL) {
            tempHash = freer;
            freer = freer->next;
            free(tempHash);
        }
    }
    free(table);

    // free linked list
    free(WebPageList);

    // free WebPage and filename pointer
    free(seedPage);
    free(filename);
    return 0;
}
Esempio n. 22
0
/* ==========================================================================
 * reloadIndexHash: place one (word, docID, freq) triple into the index.
 *
 * The slot for the word is chosen with JenkinsHash; the slot's head node is
 * expected to exist already.
 *
 *  - Head node has no word yet: a new word node (with one document node)
 *    is stored in the head node.
 *  - Otherwise the chain is walked until a node for this word is found or
 *    the last node is reached.
 *      * Word found: its document list is scanned. A node with the same
 *        docID has its freq incremented; if the end of the list is reached
 *        first, a new document node is appended.
 *      * Word not found: a new chain node holding a new word node is
 *        appended after the last node.
 *
 * Always returns 1.
 * ========================================================================== */
int reloadIndexHash(char *word, int docID, int freq, HashTable *index) {
    unsigned long slot = JenkinsHash(word, MAX_HASH_SLOT);
    GenHashTableNode *cursor = index->table[slot];

    // Unused slot: the head node simply receives the new word.
    if (cursor->hashKey == NULL) {
        DocumentNode *firstDoc = DNode(docID, freq);
        cursor->hashKey = WNode(firstDoc, word);
        return 1;
    }

    // Walk the chain. Leaves `match` set when the word is present, or
    // `atTail` set with `cursor` resting on the final chain node.
    WordNode *match = NULL;
    int atTail = 0;
    for (; cursor->hashKey; cursor = cursor->next) {
        WordNode *candidate = cursor->hashKey;
        if (strcmp(candidate->word, word) == 0) {
            match = candidate;
            break;
        }
        if (cursor->next == NULL) {
            atTail = 1;
            break;
        }
    }

    if (match != NULL) {
        DocumentNode *doc;
        for (doc = match->page; doc != NULL; doc = doc->next) {
            if (doc->docID == docID) {          // already listed: bump the count
                doc->freq++;
                return 1;
            }
            if (doc->next == NULL) {            // end of list: append a new entry
                doc->next = DNode(docID, freq);
                return 1;
            }
        }
        return 1;                               // word node without any documents
    }

    if (atTail) {
        // Word collided with others in this slot: extend the chain.
        GenHashTableNode *tail = (GenHashTableNode*)calloc(1, sizeof(GenHashTableNode));
        DocumentNode *firstDoc = DNode(docID, freq);
        WordNode *fresh = WNode(firstDoc, word);

        cursor->next = tail;
        tail->hashKey = fresh;
        tail->next = NULL;
        return 1;
    }

    return 1;
}
Esempio n. 23
0
/*
 * Makes a indexer hashtable from a file with formatting:
 *   word numFiles fileID occurrences fileID occurrences ... \n
 * where numFiles is the number of "fileID occurrences" pairs that follow.
 * One line should be written per a word that occurred in at least one of
 * the htmls from the crawler indexed. Characters that cannot start a word
 * are skipped.
 * @fileName: fileName of the file to be made into a hashtable
 * @hashTab: hashTable to create from a file of the above format
 * @result: the modified hashTab created from the file, fileName. Returns 1
 * if successful and 0 if the file cannot be opened, a line is malformed or
 * memory runs out. The file is closed on every path; nodes linked into
 * hashTab before a failure are left in place.
 */
int readFile(char *fileName, HashTable *hashTab){
	FILE *input = fopen(fileName, "r");		// file to read from
	int i;
	int letCount;					// counts length of read word
	int docID, occurrences, numFiles;		// ints read from the file
	long rewindBy;					// bytes to step back to the word start
	char *word;					// stores the read word
	int tempc;					// int, not char: EOF must stay distinguishable
	unsigned long hashValue;
	WordNode *addWord;				// New WordNode, DocNode, and hash table node
	DocNode *addDoc, *prevDoc;			// in reconstruction of hashtable as well as
	GenHashTableNode *curHashNode, *newHashNode;	// previous pointers to some constructs

	if(!input) return 0;

	// read until the end of a file
	while((tempc = fgetc(input)) != EOF){
		// read until next space (or non alphabet), counting the letters
		letCount = 0;
		while(tempc != EOF && isalpha(tempc)){
			tempc = fgetc(input);
			letCount++;
		}

		// not the start of a word: drop the character and look again
		if(letCount == 0) continue;

		// rewind pointer, and save the word into an allocated memory. The
		// character after the word was consumed too unless it was EOF.
		rewindBy = letCount + (tempc != EOF ? 1 : 0);
		if(fseek(input, -rewindBy, SEEK_CUR) != 0) goto fail;
		word = calloc(letCount + 1, sizeof(char));
		if(!word) goto fail;
		if(fread(word, sizeof(char), letCount, input) != (size_t)letCount){
			free(word);
			goto fail;
		}

		// find where the word naturally hashes to.
		hashValue = JenkinsHash(word, MAX_HASH_SLOT);
		curHashNode = hashTab->table[hashValue];

		// allocate memory for the new GenHashTableNode and WordNode
		newHashNode = calloc(1, sizeof(GenHashTableNode));
		addWord = calloc(1, sizeof(WordNode));
		if(!newHashNode || !addWord){
			free(newHashNode);
			free(addWord);
			free(word);
			goto fail;
		}

		// store the found word into the new WordNode and store the
		// this WordNode into the newly created GenHashTableNode
		addWord->word = word;
		newHashNode->hashKey = addWord;

		// connect this newHashNode in the right place of the hashtable.
		if(!curHashNode){
			hashTab->table[hashValue] = newHashNode;
		}
		else{
			while(curHashNode->next) curHashNode = curHashNode->next;
			curHashNode->next = newHashNode;
		}

		// find the number of the files containing the current word
		// (this is the word in the line of a file right after the word)
		if(fscanf(input ," %d ", &numFiles) != 1) goto fail;

		// iterate through the current line for the rest of the numbers
		// encoding the documentID and occurrences of the word. We iterate
		// numFiles times.
		prevDoc = NULL;
		for(i = 0; i < numFiles; i++){

			// find the int encoding documentID and occurrences.
			if(fscanf(input, "%d %d ", &docID, &occurrences) != 2) goto fail;

			// Need to make a new DocNode, so allocate memory for that
			// and store the newly found documentID and occurrences
			// into this new DocNode
			addDoc = calloc(1, sizeof(DocNode));
			if(!addDoc) goto fail;
			addDoc->documentID = docID;
			addDoc->occurrences = occurrences;

			// the first DocNode is linked to the WordNode, every later
			// one to the DocNode created just before it
			if(prevDoc == NULL){
				addWord->docs = addDoc;
			}
			else{
				prevDoc->nextDoc = addDoc;
			}
			prevDoc = addDoc;
		}
	}
	fclose(input);
	return 1;

fail:
	fclose(input);
	return 0;
}
Esempio n. 24
0
/*
 * insertDocList - combine the documents indexed for currWord with currDocs.
 *
 * Looks for a WordNode whose word equals currWord in the slot of currHash
 * selected by JenkinsHash. The outcome depends on currDocs and on `empty`,
 * which is not declared in this function (presumably a file-scope flag; the
 * value 2 is the one this function stores whenever it returns a failed list).
 *
 *  - Word found, currDocs non-empty, empty != 2: returns a newly allocated
 *    list built from the word's documents whose doc_id also appears in
 *    currDocs, each with the two freq values added together. currDocs is not
 *    freed on this path.
 *  - Word found, currDocs empty, empty != 2: copies the word's documents
 *    into currDocs and returns currDocs.
 *  - Word found, empty == 2: returns currDocs unchanged.
 *  - Word not found: if empty is 0 or 2, frees currDocs, sets empty to 2 and
 *    returns a new list whose head is NULL; otherwise returns currDocs.
 *
 * NOTE(review): malloc results are never checked.
 */
docList *insertDocList(HashTable *currHash, char *currWord, docList *currDocs){
  WordNode *findwordnode;
  int jenkins = JenkinsHash(currWord,MAX_HASH_SLOT);

  //If the word does occupy a slot on the hashtable...
  if (currHash->table[jenkins] != NULL ){
    findwordnode=currHash->table[jenkins]->data;
    //while there are still possible words in the hashtable slot...
    while(findwordnode != NULL ){
      //If the word matches our target word
      if( strcmp(findwordnode->word,currWord) == 0 ){
	DocumentNode *findDocumentNode = findwordnode->page;
	//If there are already entries in the docList
	if ( currDocs->head != NULL ){
	  //Only combine while no earlier call has flagged the result as failed
	  if( empty != 2 ){
	    //Result list starts with one placeholder head node; its doc_id and
	    //freq are filled in by the first match below
	    docList *tempDoclist = malloc(sizeof(docList));
	    docNode *edocNode = malloc(sizeof(docNode));
	    edocNode->next=NULL;
	    tempDoclist->head=edocNode;

	    docNode *tempDocNode = tempDoclist->head; //tail of the result list
	    int count = 0;  //becomes 1 once the placeholder head has been filled
	    int change = 0; //becomes 1 at the first match and is never reset
	    //For every document the word appears in...
	    while( findDocumentNode != NULL ){
	      //Copy the document's id and frequency into a fresh docNode
	      docNode *newDocNode = malloc(sizeof(docNode));
	      newDocNode->doc_id=findDocumentNode->doc_id;
	      newDocNode->freq=findDocumentNode->freq;
	      newDocNode->next=NULL;
	      
	      //Compare the copy against every entry already in currDocs
	      docNode *findnode = currDocs->head;
	      while(findnode != NULL){
		if(newDocNode->doc_id == findnode->doc_id){
		  change=1;
		  if ( count == 0 ){
		    //First match overall: store it in the placeholder head.
		    //newDocNode itself is not linked anywhere on this path.
		    tempDoclist->head->doc_id=newDocNode->doc_id;
		    tempDoclist->head->freq=newDocNode->freq+findnode->freq;
		    count=1;
		  } else {
		    //Later matches: add the frequencies and append the copy
		    newDocNode->freq+=findnode->freq;
		    tempDocNode->next=newDocNode;
		    tempDocNode=tempDocNode->next;
		  }
		}
		findnode=findnode->next;
	      }
	      //If no document processed so far matched anything on the current
	      //list, free that list and return an empty one. Because change is
	      //never reset, this can only trigger for the first document.
	      //NOTE(review): tempDoclist and newDocNode are not freed here.
	      if( change == 0 ){
		freeDocList(currDocs);
		docList *failedDocs = malloc(sizeof(docList));
		failedDocs->head=NULL;
		empty=2;
		return failedDocs;
		}
	      findDocumentNode = findDocumentNode->next;
	    }
	  return tempDoclist;
	  }
	} else { //If the docList has not been populated once yet create an initial docNode and set it as the head of the docList
	  if ( empty != 2){
	    docNode *emptydocNode = malloc(sizeof(docNode));
	    emptydocNode->next=NULL;
	    currDocs->head=emptydocNode;
	    int flag = 0; //becomes 1 once the head node has been filled
	    while ( findDocumentNode != NULL ){
	      //Copy the document's id and frequency into a fresh docNode
	      docNode *newDocNode = malloc(sizeof(docNode));
	      newDocNode->doc_id=findDocumentNode->doc_id;
	      newDocNode->freq=findDocumentNode->freq;
	      newDocNode->next=NULL;
	      //Walk to the current tail of currDocs
	      docNode *findDocNode=currDocs->head;
	      while(findDocNode->next != NULL ){
		findDocNode=findDocNode->next;
	      }
	      if( flag == 0 ){
		//First document: its values go into the head node; newDocNode
		//itself is not linked anywhere on this path
		currDocs->head->doc_id=newDocNode->doc_id;
		currDocs->head->freq=newDocNode->freq;
		flag = 1;
	      } else {
		//Later documents are appended after the tail
		findDocNode->next=newDocNode;
	      }
	      findDocumentNode = findDocumentNode->next;
	    }
	  }
	} return currDocs; //also reached, with currDocs untouched, when empty == 2
      }
      findwordnode=findwordnode->next;
    }
  } 

  //If the hashtable search came up empty, meaning that the word cannot be found in any url, set the docList so it reads NULL
  if(empty == 0 || empty == 2 ) {
    freeDocList(currDocs);
    docList *failedDocs = malloc(sizeof(docList));
    failedDocs->head=NULL;
    empty=2;
    return failedDocs;
    }
return currDocs;
}
// updates the index with a given word
// either adds it or increments the count of an existing document
// or adds a new document to an existing word
//
// wordIn:  word to record (must not be NULL)
// idIn:    id of the document the word was found in (must be >= 0)
// tableIn: index to update (must not be NULL)
//
// returns 0 on success, 1 on invalid parameters or when newWordNode /
// newDocumentNode hands back NULL (the index is left unchanged in that case)
int updateIndex(char *wordIn, int idIn, IndexTable *tableIn) {
    // null word, invalid doc id, or nonexistent index
    if (!wordIn || idIn < 0 || !tableIn) {
        if (1 == DEBUG) {
            printf("updateIndex received invalid params\n");
        }
        return 1; // failed
    }

    int location = JenkinsHash(wordIn, MAX_HASH_SLOT);
    WordNode *node = tableIn->table[location];

    // nothing stored in this slot yet: either the slot pointer itself is
    // NULL or it holds a node without a word
    if (!node || !node->word) {
        if (1 == DEBUG) {
            printf("Word %s not found in index, adding it\n", wordIn);
        }
        WordNode *first = newWordNode(wordIn, idIn);
        if (!first) {
            return 1;
        }
        tableIn->table[location] = first;
        return 0;
    }

    // slot in use, search through its list for the word's node; the loop
    // leaves `node` on the last node when the word is not present
    for (;;) {
        if (strcmp(node->word, wordIn) == 0) {
            // found the WordNode containing wordIn
            if (1 == DEBUG) {
                printf("Found WordNode for \"%s\"\n", wordIn);
            }
            DocumentNode *doc = node->page;
            DocumentNode *lastDoc = NULL; // stays NULL if the list is empty
            while (doc) {
                if (doc->docId == idIn) {
                    // found the DocumentNode for this doc/word combo

                    if (1 == DEBUG) {
                        printf("Found DocumentNode for id=%d for \"%s\"\n",
                                idIn, wordIn);
                    }
                    doc->freq++;
                    return 0;
                }
                lastDoc = doc;
                doc = doc->next;
            }
            // made it to the end without finding a matching DocumentNode

            if (1 == DEBUG) {
                printf("No DocumentNode for id=%d for \"%s\"\n",
                        idIn, wordIn);
            }
            DocumentNode *newDoc = newDocumentNode(idIn);
            if (!newDoc) {
                return 1;
            }
            if (lastDoc) {
                lastDoc->next = newDoc;
            } else {
                node->page = newDoc;
            }
            return 0;
        }
        if (!node->next) {
            break;
        }
        node = node->next;
    }

    // word not in the list: append a new WordNode after the last one
    WordNode *newWord = newWordNode(wordIn, idIn);
    if (!newWord) {
        return 1;
    }
    node->next = newWord;
    return 0;
}
Esempio n. 26
0
/*
 * addToHash - record that word occurs in the file fileName.
 *
 * If hashLookUp() reports the word as present (returns 0), addDocs() is
 * called on the node(s) holding it. Otherwise a new wordNode with its own
 * copy of word is created, linked into the slot chosen by JenkinsHash (as
 * the head of an empty slot, or after the last node of an occupied one) and
 * handed to addDocs().
 *
 * Returns 0 if the word was already present (or nothing could be linked),
 * 1 if a new word node was added, 3 if memory could not be allocated.
 */
int addToHash(char *word, char *fileName, HashTable *Index){
    // get key 
    unsigned long key = JenkinsHash(word, MAX_HASH_SLOT);

    // if word is in hashtable, add doc and return 
    if(hashLookUp(word, Index) == 0){

        // the head of the slot holds the word
        if(strcmp(Index->table[key]->word, word) == 0){
            addDocs(Index->table[key], fileName, Index);
            return 0;
        }

        // otherwise look further down the chain
        wordNode *tmp = Index->table[key];
        while(tmp){
            if(strcmp(tmp->word, word) == 0){
                addDocs(tmp, fileName, Index);
            }
            tmp= tmp->next;
        }
       return 0;
    }

    // create node; the allocation is checked before the node is touched
    wordNode *node = malloc(sizeof(wordNode));
    if(!node){
        return 3;
    }
    node->word = malloc(strlen(word)+1);
    if(!node->word){
        free(node);
        return 3;
    }
    strcpy(node->word, word);
    node->next = NULL;
    node->doc = NULL;

    // empty slot: the new node becomes its head
    if(Index->table[key] == NULL){
        
        Index->table[key] = node;
        addDocs(Index->table[key], fileName, Index);
        return 1;
    }

    // occupied slot and the word is new: append after the last node
    if(hashLookUp(word, Index) == 1){
        wordNode *tmpNode = Index->table[key];
        while(tmpNode->next != NULL){
            tmpNode=tmpNode->next;
        }

        tmpNode->next = node;
        addDocs(tmpNode->next, fileName, Index);
        return 1;
    }

    // the node was not linked anywhere, so release it instead of leaking it
    free(node->word);
    free(node);
    return 0;

}