Exemple #1
0
/*
 * Reads one complete line from fp into *buf, doubling *cap whenever the line
 * does not fit in the current buffer.
 *
 * Returns 1 when at least one character was read, and 0 at end of input or
 * when the buffer could not be grown (the partial line is then dropped).
 */
static int ReadWholeLine(FILE *fp, char **buf, size_t *cap) {
	size_t len = 0;

	for (;;) {
		size_t room = *cap - len;
		int chunk = room > (size_t)INT_MAX ? INT_MAX : (int)room;

		if (fgets(*buf + len, chunk, fp) == NULL) {
			(*buf)[len] = '\0';				//keep what was read so far terminated
			return len > 0;
		}

		size_t got = strlen(*buf + len);
		len += got;
		if (len > 0 && (*buf)[len - 1] == '\n') {
			return 1;					//a full line is in the buffer
		}
		if (got + 1 < (size_t)chunk) {
			return 1;					//short read without newline: last line of the file
		}

		//the buffer is full and the line continues: make room for the rest
		if (*cap - len <= 1) {
			size_t bigger = *cap * 2;
			char *grown = realloc(*buf, bigger);
			if (grown == NULL) {
				return 0;
			}
			*buf = grown;
			*cap = bigger;
		}
	}
}

/*
 * Reads an inverted index file and recreates the data structure it represents.
 *
 * Each line is parsed as space-separated tokens:
 *     <word> <doc count> <doc id> <freq> [<doc id> <freq> ...]
 * The doc count token is skipped. For every (doc id, freq) pair the word is
 * added to the index if needed and UpdateHashTable() is called freq times.
 * Blank lines are ignored and a doc id with no frequency after it ends the line.
 *
 * file: path of the index file to read.
 *
 * Returns a newly allocated index that the caller must release, or NULL when
 * file is NULL, the file cannot be opened, or the initial allocations fail.
 */
HashTable *ReadFile(char *file) {
	if (file == NULL) {
		return NULL;
	}

	FILE *fp = fopen(file, "r");							//open the input file
	if (fp == NULL) {
		return NULL;
	}

	size_t cap = 100000;								//initial line capacity, grown on demand
	char *line = calloc(cap, sizeof(char));
	HashTable *reloadedIndex = calloc(1, sizeof(HashTable));			//allocate new index
	if (line == NULL || reloadedIndex == NULL) {
		free(line);
		free(reloadedIndex);
		fclose(fp);
		return NULL;
	}

	//read the file line by line, parsing each line for the word, docids and freqs
	while (ReadWholeLine(fp, &line, &cap)) {
		char *token = strtok(line, " \r\n");
		if (token == NULL) {
			continue;							//blank line
		}

		char *word = calloc(strlen(token) + 1, sizeof(char));
		if (word == NULL) {
			break;
		}
		strcpy(word, token);

		(void)strtok(NULL, " \r\n");						//skip the doccount

		//main() frees a word only when it was NOT added, so mirror that here
		int added = 0;

		//read the docids and freqs until the end of the line
		char *id_token;
		while ((id_token = strtok(NULL, " \r\n")) != NULL) {
			char *freq_token = strtok(NULL, " \r\n");
			if (freq_token == NULL) {
				break;							//docid without a freq: malformed tail
			}

			int doc_id = atoi(id_token);
			int freq = atoi(freq_token);

			if (freq > 0 && InHashTable(word, reloadedIndex) == 0) {
				AddToHashTable(word, reloadedIndex);
				added = 1;
			}

			//increment the frequency as many times as needed
			for (int i = 0; i < freq; i++) {
				UpdateHashTable(word, doc_id, reloadedIndex);
			}
		}

		if (!added) {
			free(word);
		}
	}

	free(line);
	fclose(fp);
	return reloadedIndex;
}
Exemple #2
0
/*
 * Intersects the global temp_list with the documents that contain word.
 *
 * Every node of temp_list whose doc_id also appears in the word's document
 * list has that document's freq added to its own; every other node is
 * unlinked from temp_list and freed. When word is not in the index and
 * temp_list is non-empty, FreeList(0) is called instead.
 *
 * word:  the word to look up.
 * Index: the inverted index to search.
 */
void And(char *word, HashTable *Index) {
	unsigned long slot = JenkinsHash(word, MAX_HASH_SLOT); // bucket of the word
	int position = InHashTable(word, Index);               // 1-based place in the bucket chain, 0 if absent

	// Unknown word: nothing can match.
	if (!position) {
		if (temp_list != NULL) {
			FreeList(0);
		}
		return;
	}

	// Walk the bucket chain to the WordNode of the word.
	WordNode *entry = Index->table[slot]->data;
	for (int step = 1; step < position; step++) {
		entry = entry->next;
	}

	DocumentNode *kept = NULL;      // last node that stayed in temp_list
	DocumentNode *node = temp_list; // node being examined

	while (node != NULL) {
		// Look for the same document in the word's document list.
		DocumentNode *hit = entry->page;
		while (hit != NULL && hit->doc_id != node->doc_id) {
			hit = hit->next;
		}

		if (hit != NULL) {
			// Match: accumulate the frequency and keep the node.
			node->freq += hit->freq;
			kept = node;
			node = node->next;
			continue;
		}

		// No match: unlink the node, then release it.
		DocumentNode *doomed = node;
		node = node->next;
		if (doomed == temp_list) {
			temp_list = node;
		}
		else {
			kept->next = node;
		}
		doomed->next = NULL;
		free(doomed);
	}
}
Exemple #3
0
/*
 * Appends a copy of every DocumentNode of word to the global temp_list.
 *
 * Returns 1 on success (including when word is not in the index, in which
 * case temp_list is left untouched) and 0 when a node cannot be allocated.
 */
static int SeedTempList(char *word, HashTable *Index) {
	int num = InHashTable(word, Index);
	if (!num) {
		return 1;
	}

	unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
	WordNode *current = Index->table[index]->data;

	// Loop until we get the matching WordNode.
	for (int i = 1; i < num; i++) {
		current = current->next;
	}

	// Find the current tail so new nodes are always appended after it.
	DocumentNode *tail = temp_list;
	while (tail != NULL && tail->next != NULL) {
		tail = tail->next;
	}

	// Copy each DocumentNode of the word onto the end of temp_list.
	for (DocumentNode *src = current->page; src != NULL; src = src->next) {
		DocumentNode *dn = calloc(1, sizeof *dn);
		if (dn == NULL) {
			return 0;
		}
		dn->doc_id = src->doc_id;
		dn->freq = src->freq;

		if (tail == NULL) { // Case when temp_list is empty.
			temp_list = dn;
		}
		else { // Case when temp_list is nonempty.
			tail->next = dn;
		}
		tail = dn;
	}
	return 1;
}

/*
 * Evaluates one query line against the index.
 *
 * The line is split on whitespace. operator1 tokens are skipped, because
 * intersection is the default between adjacent words. An operator2 token
 * closes the current block of words: Or() is called if temp_list is
 * non-empty and a new block starts. The first word of a block seeds
 * temp_list with its documents; each following word is combined with And().
 * After the last token, Or() is called once more if temp_list is non-empty.
 *
 * The globals temp_list and final_list are reset to NULL on entry.
 *
 * line:  the query text.
 * Index: the inverted index to search.
 *
 * Returns 1 on success. Returns 0 when line is NULL, when an operator starts
 * the line or a block, when the line ends with an operator, when a token does
 * not fit in MAX - 1 characters, or when memory cannot be allocated.
 */
int GetLinks(char *line, HashTable *Index) {
	// Same separators that sscanf("%s") treats as whitespace.
	static const char whitespace[] = " \t\n\v\f\r";

	char word[MAX];
	char *buf = line;
	int count = 0; // position of the current token inside its block.

	// Initialize so the trailing-operator check is defined for an empty line.
	word[0] = '\0';
	temp_list = NULL;
	final_list = NULL;

	if (line == NULL) {
		return 0;
	}

	// Loop through the line and do the appropriate operations.
	for (;;) {
		// Locate the next token without copying more than word can hold.
		buf += strspn(buf, whitespace);
		size_t len = strcspn(buf, whitespace);
		if (len == 0) {
			break; // end of the line.
		}
		if (len >= sizeof word) {
			return 0; // token too long for the word buffer.
		}
		memcpy(word, buf, len);
		word[len] = '\0';
		buf += len; // Increment position in line.

		count++;

		// If word is AND, then ignore and read in new word.
		if (strcmp(word, operator1) == 0) {
			if (count == 1) { // If there is no previous word, then throw an error.
				return 0;
			}
			continue;
		}

		// If word is OR, it must follow at least one word of the block.
		int is_or = (strcmp(word, operator2) == 0);
		if (is_or && count == 1) {
			return 0;
		}

		// Normalize if word is not an operator.
		if (!is_or) {
			NormalizeWord(word);
		}

		if (count == 1) {
			// First word of the block: start temp_list from its documents.
			if (!SeedTempList(word, Index)) {
				return 0;
			}
		}
		else if (is_or) {
			// End of the block: hand its results over, then start a new block.
			if (temp_list != NULL) {
				Or();
			}
			count = 0;
		}
		else {
			// Any later word of the block narrows the current results.
			And(word, Index);
		}
	}

	// If the last word of the query line is an operator, throw an error.
	if (strcmp(word, operator1) == 0 || strcmp(word, operator2) == 0) {
		return 0;
	}

	// If nonempty, flush out temp_list to final_list.
	if (temp_list != NULL) {
		Or();
	}

	return 1; // Return 1 if successful.
}
Exemple #4
0
/*
 * Function to crawl a given webpage for links.
 *
 * Every URL returned by GetNextURL() for wp->html is kept only if it starts
 * with URL_PREFIX, passes NormalizeURL() and is not yet in the hash table.
 * A kept URL is added to the hash table, fetched with GetWebPage() into a new
 * WebPage one level deeper than wp, and handed to AppendList().
 *
 * wp: the page to scan; its html, url and depth fields are read.
 *
 * Returns 0 when wp->depth has reached the global depth limit or when
 * AppendList() fails, and 1 once all links of the page were processed.
 * Pages that cannot be allocated or fetched are skipped.
 */
int CrawlPage(WebPage *wp) {

	char *result; // variable to hold the url; freed on every path of the loop.
	int pos = 0; // position in each html page.

	// Check that the depth does not exceed the depth passed.
	if (wp->depth >= depth) {
		return 0;
	}

	printf("\n");
	printf("[crawler]: Crawling - %s\n", wp->url); // Print the url being curled.
	printf("\n");

	// Loop through each html page to get all its urls.
	while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {

		// Check that the url has proper domain (old-www).
		if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
			free(result);
			continue;
		}

		// Normalize the url.
		if (!NormalizeURL(result)) {
			free(result);
			continue;
		}

		// Check that the url isn't already in the hash table.
		if (!InHashTable(result)) {
			AddToHashTable(result); // Add the url to the hash table.

			// Setup new page for each url.
			WebPage *newPage = calloc(1, sizeof *newPage);
			if (!newPage) { // Check that memory was allocated.
				free(result);
				continue;
			}
			newPage->depth = wp->depth + 1;
			newPage->url = malloc(strlen(result) + 1);
			if (!newPage->url) { // Check that memory was allocated.
				free(newPage); // nothing else is attached to the page yet.
				free(result);
				continue;
			}
			strcpy(newPage->url, result);

			// Get html for each url.
			if (!GetWebPage(newPage)) {
				FreeWebMemory(newPage);
				free(result);
				continue;
			}

			printf("[crawler]: Parser found link - %s\n", result);

			// Add to the list of webpages to be visited.
			// NOTE(review): newPage is not released here; confirm whether
			// AppendList() frees it on failure.
			if (!AppendList(newPage)) {
				free(result);
				return 0;
			}
		}
		free(result);
	}
	return 1;
}
Exemple #5
0
/* Returns 1 when name is non-empty and made only of decimal digits, else 0. */
static int IsNumericName(const char *name) {
	if (*name == '\0') {
		return 0;
	}
	for (const char *p = name; *p != '\0'; p++) {
		// <ctype.h> functions require a value representable as unsigned char
		if (!isdigit((unsigned char)*p)) {
			return 0;
		}
	}
	return 1;
}

/*
 * Builds an inverted index from the numerically named files of a directory.
 *
 * Usage: <program> <directory> <index file> [<reloaded index file>]
 *
 * Every word of every accepted file is normalized and recorded in the index,
 * which is then written to argv[2]. When argv[3] is given, the index is read
 * back from argv[2] with ReadFile() and written again to argv[3].
 *
 * The original argument and directory checks still print their message and
 * exit with status 0. Failures that previously led to a crash (allocation
 * failure, unreadable index file) now return EXIT_FAILURE.
 */
int main(int argc, char* argv[]) {
	//check argument number
	if (argc < 3 || argc > 4) {
		printf("too many or too little arguments, please try again");
		exit(0);
	}

	//check directory validity
	if (!IsDir(argv[1])) {
		printf("invalid directory, please try again");
		exit(0);
	}

	//Initialize variables and index
	HashTable *WordsFound = calloc(1, sizeof(HashTable));
	if (WordsFound == NULL) {
		fprintf(stderr, "out of memory\n");
		return EXIT_FAILURE;
	}

	char **filenames = NULL;
	int num_files = GetFilenamesInDir(argv[1], &filenames);

	//check whether the folder has files
	if (num_files < 0) {
		printf("failed to get any filenames");
		exit(0);
	}

	//insert a separator only when the directory argument does not end with one
	size_t dir_len = strlen(argv[1]);
	const char *sep = (dir_len > 0 && argv[1][dir_len - 1] != '/') ? "/" : "";

	//iterate through each file in the directory
	for (int i = 0; i < num_files; i++) {
		char *name = filenames[i];

		//check that the file is in the correct format (title is a number)
		if (!IsNumericName(name)) {
			free(name);
			continue;
		}

		//build the path in a buffer sized for it, so long paths cannot overflow
		size_t path_len = dir_len + strlen(sep) + strlen(name) + 1;
		char *path = malloc(path_len);
		if (path == NULL) {
			free(name);
			continue;
		}
		snprintf(path, path_len, "%s%s%s", argv[1], sep, name);

		//Load the document
		char *doc = LoadDocument(path);
		int docId = GetDocumentId(name);
		free(path);
		free(name);
		if (doc == NULL) {
			continue;							//file could not be loaded, skip it
		}

		//Iterate through each word in the html file (doc)
		char *word;
		int pos = 0;
		while ((pos = GetNextWord(doc, pos, &word)) > 0) {
			NormalizeWord(word);

			int is_new = (InHashTable(word, WordsFound) == 0);
			if (is_new) {
				AddToHashTable(word, WordsFound);
			}
			UpdateHashTable(word, docId, WordsFound);
			if (!is_new) {
				free(word);						//a word is released only when it was not added
			}
		}
		free(doc);
	}
	free(filenames);
	SaveIndexToFile(argv[2], WordsFound);				//Save the index to the file specified
	FreeHashTable(WordsFound);

	//only proceed if there was a third argument specified. If so, reload the index from the file you just created
	if (argc == 4) {
		HashTable *ReloadedIndex = ReadFile(argv[2]);
		if (ReloadedIndex == NULL) {
			fprintf(stderr, "failed to reload the index from %s\n", argv[2]);
			return EXIT_FAILURE;
		}
		SaveIndexToFile(argv[3], ReloadedIndex);
		FreeHashTable(ReloadedIndex);
	}
	return 0;
}