コード例 #1
ファイル: indexer.c プロジェクト: jlee9595/TinySearchEngine
/*
 * ReadFile - read an inverted index file and recreate the HashTable it
 * represents.
 *
 * Each line is parsed as space-separated tokens in this order:
 *     <word> <doc_count> <doc_id> <freq> [<doc_id> <freq> ...]
 * The doc_count token is skipped. For every (doc_id, freq) pair the word's
 * entry for doc_id is updated freq times, adding the word to the table first
 * if it is not there yet.
 *
 * file: path of the index file to read.
 *
 * Returns a newly allocated index, or NULL if the index or the line buffer
 * cannot be allocated or the file cannot be opened. If a later allocation
 * fails mid-file, reading stops and the entries loaded so far are returned.
 *
 * Lines longer than LINE_CAPACITY - 1 characters are read in pieces; the
 * index format is assumed to stay below that limit.
 */
HashTable *ReadFile(char *file) {
	enum { LINE_CAPACITY = 100000 };

	HashTable *reloadedIndex = calloc(1, sizeof *reloadedIndex);	//allocate new index
	if (reloadedIndex == NULL) {
		return NULL;
	}

	FILE *fp = fopen(file, "r");					//open the input file
	if (fp == NULL) {
		free(reloadedIndex);	//table is still empty, so a plain free is enough
		return NULL;
	}

	char *line = calloc(LINE_CAPACITY, sizeof *line);
	if (line == NULL) {
		fclose(fp);
		free(reloadedIndex);
		return NULL;
	}

	//read the file line by line, parsing each line for the word, docids and freqs
	//(the size passed to fgets must be the real buffer size, never INT_MAX)
	while (fgets(line, LINE_CAPACITY, fp) != NULL) {
		//strip the newline in place; the buffer pointer itself must stay intact
		line[strcspn(line, "\n")] = '\0';

		char *token = strtok(line, " ");
		if (token == NULL) {
			continue;	//blank line: nothing to parse
		}

		//size the copy to the token so long words cannot overflow it
		//NOTE(review): the copy is not freed here because AddToHashTable may
		//keep the pointer - confirm ownership before adding a free().
		char *word = calloc(strlen(token) + 1, sizeof *word);
		if (word == NULL) {
			break;
		}
		strcpy(word, token);

		token = strtok(NULL, " ");	//doc count, skipped
		token = strtok(NULL, " ");	//first doc id, if any

		//once word has been parsed and doccount has been skipped, read the
		//docids and freqs until the end of the line
		while (token != NULL) {
			int doc_id = atoi(token);

			token = strtok(NULL, " ");
			if (token == NULL) {
				break;	//doc id without a frequency: ignore the dangling value
			}
			int freq = atoi(token);

			//increment the frequency as many times as needed
			for (int i = 0; i < freq; i++) {
				if (InHashTable(word, reloadedIndex) == 0) {
					AddToHashTable(word, reloadedIndex);
				}
				UpdateHashTable(word, doc_id, reloadedIndex);
			}

			token = strtok(NULL, " ");
		}
	}

	free(line);
	fclose(fp);
	return reloadedIndex;
}
コード例 #2
ファイル: query.c プロジェクト: cjunmokim/Search-Engine
/*
 * And - narrow the running result list `temp_list` to the documents that
 * also contain `word`.
 *
 * word:  term to look up in the index.
 * Index: hash table holding the inverted index.
 *
 * For every node of temp_list the word's document list is scanned for a node
 * with the same doc_id; on a match the frequencies are summed into the
 * temp_list node. Nodes for which the scan reaches the end of the list are
 * unlinked from temp_list. `temp_list` is not declared inside this block.
 *
 * NOTE(review): this excerpt has no closing braces and appears to have lost
 * some statements. Nothing visible leaves the inner scan after a match, and
 * no free() follows the "Free the node" comment. Confirm against the full
 * query.c before relying on the control flow shown here.
 */
void And(char *word, HashTable *Index) {

	unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
	// Declare variables for traversal.
	WordNode *current;
	DocumentNode *ptr, *ptr2, *runner, *no_need;
	int num;
	// Get matching WordNode of word if it is in the InvertedIndex.
	// num doubles as the word's 1-based position in the bucket chain: the loop
	// below advances num - 1 times from the head of the bucket.
	if ((num = InHashTable(word, Index))) {
		current = Index->table[index]->data;
		// Loop until we get the matching WordNode.
		for (int i=1; i < num; i++) {
			current = current->next;
		ptr2 = current->page; // Set to start of the list of document nodes for the current word.
	else {
		ptr2 = NULL;
	// Initialize variables. 
	// ptr walks temp_list; runner is set to the most recently kept node so a
	// later node can be unlinked through runner->next.
	ptr = temp_list;
	while (ptr != NULL) {
		// Check that the word is in the InvertedIndex.
		if (num) {
			ptr2 = current->page; // Set to start of the list of document nodes for the current word.
			// Loop until the end of the new list of matching DocumentNodes.
			while (ptr2 != NULL) {
				// Check for a match in doc_id.
				if (ptr->doc_id == ptr2->doc_id) {
					ptr->freq += ptr2->freq; // Add the frequencies.
				ptr2 = ptr2->next;
			// Case of no match.
			// (ptr2 == NULL means the scan above ran off the end of the list.)
			if (ptr2 == NULL) {
				// Check if we need to delete the first node of temp_list.
				if (ptr == temp_list) { 
					temp_list = temp_list->next;
				else { // All other cases.
					runner->next = runner->next->next;
				no_need = ptr;
				ptr = ptr->next;
				// Free the node to be deleted.
				// NOTE(review): only the pointers are cleared in the lines shown here.
				no_need->next = NULL;
				no_need = NULL;
			else { // Case of match.
				runner = ptr;
				ptr = ptr->next;
		else { // Word is not in the InvertedIndex.
			// Ends the traversal; temp_list itself is not modified in the lines shown.
			ptr = NULL;
コード例 #3
ファイル: query.c プロジェクト: cjunmokim/Search-Engine
/*
 * GetLinks - evaluate one query line against the inverted index.
 *
 * line:  query text; it is read one whitespace-delimited word at a time with
 *        sscanf("%s").
 * Index: hash table holding the inverted index.
 *
 * Words equal to operator1 / operator2 (described as AND / OR by the comments
 * below) are treated as operators. Every other word is normalized; it either
 * seeds temp_list with copies of its DocumentNodes (when count == 1) or is
 * combined with temp_list through And(). temp_list and final_list are reset
 * to NULL on entry; neither they nor operator1 / operator2 / MAX are declared
 * inside this block.
 *
 * Returns 0 when an operator is read while count == 1 or when the last word
 * read is an operator, and 1 otherwise.
 *
 * NOTE(review): this excerpt has no closing braces and appears to have lost
 * statements. No increment of count is visible, and the bodies of the two
 * `if (temp_list != NULL)` checks are empty here. Confirm against the full
 * query.c.
 */
int GetLinks(char *line, HashTable *Index) {
	// Declare variables.
	char *buf;
	char word[MAX];
	int flag; // flag to do union or intersection operations.
	int count; // variable to count the position of a word in the line.
	// Initialize variables.
	buf = line;
	flag = 1;
	count = 0;
	temp_list = NULL;
	final_list = NULL;
	// Loop through the line and do the appropriate operations.
	// NOTE(review): "%s" puts no bound on the word length written into word[MAX].
	while (sscanf(buf, "%s", word) == 1) {
		// If word is AND, then ignore and read in new word.
		if (strcmp(word, operator1) == 0) {
			if (count == 1) { // If there is no previous word, then throw an error.
				return 0;
			// Increment position in line.
			buf = strstr(buf, word) + strlen(word);
		// If word is OR, then tell the program to do OR operation.
		if (strcmp(word, operator2) == 0) {
			flag = 2; // Set flag to union operation.
			if (count == 1) { // If there is no previous word, then throw an error.
				return 0;
		// Hold onto original copy of word in case NormalizeWord() changes its content.
		// The copy is what is searched for in buf at the end of the iteration.
		char *word_old = (char *)calloc(1, strlen(word) + 1);
		strcpy(word_old, word);
		// Change word to lowercase.
		if (strcmp(word, operator1) != 0 && strcmp(word, operator2) != 0) {
			NormalizeWord(word); // Normalize if word is not an operator.
		// Add list of docs to temp_list.
		// Case when it is the first word of the block.
		if (count == 1) {
			// Declare variables.
			WordNode *current; // variable for traversal.
			DocumentNode *ptr, *ptr2; // variables for traversal.
			int num;
			// Case when the word is in the InvertedIndex.
			// num is used as the word's 1-based position in the bucket chain.
			if ((num = InHashTable(word, Index))) {
				unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
				current = Index->table[index]->data;
				// Loop until we get the matching WordNode.
				for (int i=1; i < num; i++) {
					current = current->next;
				// Loop through each DocumentNode and add to temp_list.
				for (ptr = current->page; ptr != NULL; ptr = ptr->next) {
					// Declare and initialize a DocumentNode with the same values as ptr.
					// calloc zero-fills the node, so dn->next starts out NULL.
					DocumentNode *dn;
					dn = (DocumentNode *)calloc(1, sizeof(DocumentNode));
					dn->doc_id = ptr->doc_id;
					dn->freq = ptr->freq;
					// Add the new DocumentNode to temp_list.
					// ptr2 tracks the tail of temp_list so appends need no re-walk.
					if (temp_list == NULL) { // Case when temp_list is empty.
						temp_list = dn;
						ptr2 = temp_list;
					else { // Case when temp_list is nonempty.
						ptr2->next = dn;
						ptr2 = ptr2->next;
		else { // If not first word of the block, then do the operation.
			// Check if the current operation is "AND".
			if (flag == 1) {
				And(word, Index);
			// Check if the current operation is "OR".
			if (flag == 2) {
				// NOTE(review): the body of this check is not visible in this excerpt.
				if (temp_list != NULL) {
				flag = 1; // Set flag back to "AND" operation.
				count = 0; // Set word count to 0 to signal the start of a new block of words.
		// Increment position in the query line to read in next word.
		buf = strstr(buf, word_old) + strlen(word_old);
		free(word_old); // Cleanup.
	// If the last word of the query line is an operator, throw an error.
	if (strcmp(word, operator1) == 0 || strcmp(word, operator2) == 0) {
		return 0;
	// If nonempty, flush out temp_list to final_list.
	// NOTE(review): the flush itself is not visible in this excerpt.
	if (temp_list != NULL) { 
	return 1; // Return 1 if successful.
コード例 #4
ファイル: crawler.c プロジェクト: cjunmokim/Search-Engine
/*
 * CrawlPage - extract the links of one fetched page and queue the new ones.
 *
 * wp: page to crawl; its html, url and depth fields are read.
 *
 * Each URL produced by GetNextURL() is compared against URL_PREFIX, passed to
 * NormalizeURL() and looked up with InHashTable(). A URL not yet in the table
 * is added to it, wrapped in a new WebPage one level deeper than wp, fetched
 * with GetWebPage() and handed to AppendList().
 *
 * Returns 0 when wp->depth >= depth (depth is not declared inside this block)
 * or when AppendList() fails, and 1 otherwise.
 *
 * NOTE(review): this excerpt has no closing braces, and the bodies of the
 * prefix, normalize, allocation and GetWebPage checks are not visible.
 * Confirm how rejected URLs and failures are handled in the full crawler.c.
 */
int CrawlPage(WebPage *wp) {
	char *result; // variable to hold the url.
    	int pos = 0; // position in each html page.
    	WebPage *newPage; // New webpage.
    	// Check that the depth does not exceed the depth passed.
    	if (wp->depth >= depth) {
    		return 0;
    	printf("[crawler]: Crawling - %s\n", wp->url); // Print the url being curled.
    	// Loop through each html page to get all its urls.
    	// The loop continues while GetNextURL() returns a non-negative position.
    	while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {
    		// Check that the url has proper domain (old-www).
		if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
		// Normalize the url.
    		if (!NormalizeURL(result)) {
    		// Check that the url isn't already in the hash table.
    		if (!InHashTable(result)) {
    			AddToHashTable(result); // Add the url to the hash table.
    			// Setup new page for each url.
			newPage = calloc(1, sizeof(WebPage));
			newPage->depth = wp->depth + 1;
			// The new page gets its own copy of the url string.
			newPage->url = (char *)malloc(strlen(result) + 1);
			if (!newPage->url) { // Check that memory was allocated.
			strcpy(newPage->url, result);

			// Get html for each url.
			if (!GetWebPage(newPage)) {
			printf("[crawler]: Parser found link - %s\n", result);
			// Add to the list of webpages to be visited.
			if (!AppendList(newPage)) {
				return 0;
	return 1;
コード例 #5
ファイル: indexer.c プロジェクト: jlee9595/TinySearchEngine
/*
 * main - build an inverted index from the documents in a directory.
 *
 * Usage: <program> <directory> <index file> [<reloaded index file>]
 *
 *   argv[1]  directory holding the documents; only files whose names consist
 *            solely of digits are indexed. The file name is appended directly
 *            to this string, so it is expected to end with a path separator.
 *   argv[2]  file the index is saved to.
 *   argv[3]  optional; when given, the index is read back from argv[2] and
 *            saved again to this file.
 *
 * Returns 0 on success and 1 on a usage, directory, allocation or reload
 * error.
 *
 * NOTE(review): the documents, words and filename array obtained from the
 * project helpers are not released here because their ownership is not
 * visible from this function - confirm and add the matching cleanup.
 */
int main(int argc, char* argv[]) {
	//check argument number; bail out so argv is never read past argc
	if (argc < 3 || argc > 4) {
		printf("too many or too little arguments, please try again");
		return 1;
	}

	//check directory validity
	if (!IsDir(argv[1])) {
		printf("invalid directory, please try again");
		return 1;
	}

	//Initialize the index
	HashTable *WordsFound = calloc(1, sizeof *WordsFound);
	if (WordsFound == NULL) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}

	char **filenames = NULL;
	int num_files = GetFilenamesInDir(argv[1], &filenames);

	//check whether the folder listing succeeded
	if (num_files < 0) {
		printf("failed to get any filenames");
		free(WordsFound);	//table is still empty, so a plain free is enough
		return 1;
	}

	size_t dir_len = strlen(argv[1]);

	//iterate through each file in the directory
	for (int i = 0; i < num_files; i++) {
		char *name = filenames[i];
		size_t name_len = strlen(name);

		//check that the file is in the correct format (title is a number)
		int numeric = 1;
		for (size_t c = 0; c < name_len; c++) {
			//isdigit needs an unsigned char value; plain char may be negative
			if (!isdigit((unsigned char)name[c])) {
				numeric = 0;
				break;
			}
		}
		if (!numeric) {
			continue;
		}

		//Build the path in a buffer sized for it instead of a fixed char[100]
		size_t path_size = dir_len + name_len + 1;
		char *path = malloc(path_size);
		if (path == NULL) {
			fprintf(stderr, "out of memory\n");
			return 1;
		}
		snprintf(path, path_size, "%s%s", argv[1], name);

		//Load the document
		char *doc = LoadDocument(path);
		free(path);
		if (doc == NULL) {
			continue;	//unreadable document: skip it rather than parse NULL
		}

		int docId = GetDocumentId(name);
		int pos = 0;
		char *word;

		//Iterate through each word in the html file (doc)
		while ((pos = GetNextWord(doc, pos, &word)) > 0) {
			if (InHashTable(word, WordsFound) == 0) {
				AddToHashTable(word, WordsFound);
			}
			UpdateHashTable(word, docId, WordsFound);
		}
	}

	SaveIndexToFile(argv[2], WordsFound);				//Save the index to the file specified

	//only proceed if there was a third argument specified. If so, reload the index from the file you just created
	if (argc == 4) {
		HashTable *ReloadedIndex = ReadFile(argv[2]);
		if (ReloadedIndex == NULL) {
			fprintf(stderr, "failed to reload index from %s\n", argv[2]);
			return 1;
		}
		SaveIndexToFile(argv[3], ReloadedIndex);
	}

	return 0;
}