Example #1
0
/*  -----------------------------------------------------------------------
    Function Name: initial_index()
    Functionality: Scans the crawler data directory and inserts every word of
    every document into the Hashtable Index.
    Documents are addressed by number: for each doc_id from 1 up to and
    including the number of entries reported for the directory, the file
    "./<argv[1]>/<doc_id>" is parsed if it exists as a regular file.
    Input <--- argv (argv[1] is the directory where the crawler data lives,
        i.e. html from crawled pages) and the Hashtable Index
    Outputs ---> void (updated index)
    ----------------------------------------------------------------------- */
void initial_index(char *argv[], HashTable *Index)
{
    char **results_filenames = NULL;
    const char *dir = argv[1];
    size_t dir_len = strlen(dir);

    // Only the entry count is used below; the names themselves are released at the end.
    int num_files = GetFilenamesInDir(argv[1], &results_filenames);

    // Documents are numbered starting at 1, so the last one is number num_files
    // (the bound is inclusive). A number with no matching file is skipped by IsFile().
    for (int doc_id = 1; doc_id <= num_files; doc_id++) {

        // Large enough for any int rendered in decimal.
        char file_name[32];
        int name_len = snprintf(file_name, sizeof file_name, "%d", doc_id);
        if (name_len < 0 || (size_t)name_len >= sizeof file_name) {
            continue;
        }

        // "./" + dir + "/" + name + terminating NUL
        size_t path_size = dir_len + (size_t)name_len + 4;
        char *file_path = calloc(1, path_size);
        if (file_path == NULL) {
            fprintf(stderr, "initial_index: out of memory while building a file path\n");
            break;
        }
        snprintf(file_path, path_size, "./%s/%s", dir, file_name);

        if (IsFile(file_path)) {

            // file_parsing drops the first two lines of the file, so only html is left over
            char *content = file_parsing(file_path);

            // normalize each word of the document and record it in the index
            if (content != NULL) {
                int pos = 0;
                char *word;
                while ((pos = GetNextWord(content, pos, &word)) > 0) {
                    NormalizeWord(word);
                    // NOTE(review): word is not freed here; confirm that
                    // insert_to_index takes ownership of it.
                    insert_to_index(doc_id, word, Index);
                }
            }
            free(content);
        }
        free(file_path);
    }

    // Release every name as well as the array that holds them.
    // NOTE(review): assumes GetFilenamesInDir heap-allocates each entry - confirm.
    if (results_filenames != NULL) {
        for (int i = 0; i < num_files; i++) {
            free(results_filenames[i]);
        }
        free(results_filenames);
    }
}
Example #2
0
/*
 * Indexer entry point (only the argument-validation part is visible here).
 *
 * Validates, in order:
 *   argv[1] - must name an existing directory whose entries all have
 *             integer filenames;
 *   argv[2] - output file; if it has an extension, it must be ".dat".
 * Every failure prints a message to stderr, releases what was allocated so
 * far and returns 1.
 *
 * webPageDirectory, filenames, num_files and outputFile are not declared in
 * this function; presumably they are file-scope variables - TODO confirm.
 */
int main(int argc, char *argv[])
{

	/* Check command line arguments */
	
	// Must have at least 3 arguments (one ./indexer, two command line) 
	if (argc < 3) {
		fprintf(stderr, "Indexer.c program requires at least two input arguments\n");
		if (DISPLAY_HELP) {displayhelp();}
		return 1;
	}
	
	// Check validity of arguments
	else {
	
		// Check argv[1] - webPageDirectory
		// (heap copy of argv[1]; freed on every error path below)
		webPageDirectory = strdup(argv[1]);
	
		// Check if webPageDirectory is an existing directory
		if (!IsDir(webPageDirectory)) {
			fprintf(stderr, "Please provide an existing directory for the file database\n");
			if (DISPLAY_HELP) {displayhelp();}
		
			free(webPageDirectory);
			return 1;
		}
	
		// Check if webPageDirectory contains regular files
		num_files = GetFilenamesInDir(webPageDirectory, &filenames);
	
		// A negative count is treated as failure.
		// NOTE(review): filenames is not released on this path; confirm that
		// GetFilenamesInDir allocates nothing when it returns a negative value.
		if(num_files < 0) {
			fprintf(stderr, "Please provide a directory containing regular files for the file database\n");
			if (DISPLAY_HELP) {displayhelp();}
		
			free(webPageDirectory);
			return 1;
		}
		
		// Check if webPageDirectory contains valid docId's
		for(int index = 0; index < num_files; index++) {
			
			char posschar;
			int possint;	
			
			// sscanf must convert exactly one item: the integer, with no
			// character left over after it. Anything else is rejected.
			if (sscanf(filenames[index], "%d%c", &possint, &posschar) != 1) {
				fprintf(stderr, "Please provide int's for filenames in file database\n \
The file %s is not a valid filename\n", filenames[index]);
				if (DISPLAY_HELP) {displayhelp();}
				
				// Free every filename in the list (all num_files entries)
				for (int index2 = 0; index2 < num_files; index2++) {
					free(filenames[index2]);	
				}
				
				free(filenames);
				free(webPageDirectory);
				return 1;
			}	
		}
		
		// Check argv[2] - outputFile
		outputFile = strdup(argv[2]);
		
		// Check if outputFile contains .dat extension
		// (point is the last '.' in the name, or NULL when there is none)
		char *point = strrchr(outputFile,'.');
		
		// A name without any '.' skips this check and is accepted as is.
		if (point != NULL ) {
		
			if(strcmp(point,".dat") != 0) {
				fprintf(stderr, "Please provide a .dat file for the outputFile to be written into\n");
				
				// Free every filename in the list
				for (int index = 0; index < num_files; index++) {
					free(filenames[index]);
				}
				
				free(filenames);
				free(webPageDirectory);
				free(outputFile);
				return 1;
			}
	    }
		
		
	}
Example #3
0
/*
 * Indexer entry point.
 *
 * Usage: <program> <pageDirectory> <indexFile> [<reloadedIndexFile>]
 *
 * Every file in pageDirectory whose name consists only of digits is loaded,
 * split into words, and each normalized word is recorded in a hashtable
 * together with the document id. The table is saved to indexFile. When a
 * third argument is given, indexFile is read back and written to
 * reloadedIndexFile.
 *
 * NOTE(review): the original argument/directory error paths print to stdout
 * and exit with status 0; they are kept unchanged here for compatibility.
 */
int main(int argc, char* argv[]) {
	//check argument number
	if (argc < 3 || argc > 4) {
		printf("too many or too little arguments, please try again");
		exit(0);
	}
	
	//check directory validity
	if (!IsDir(argv[1])) {
		printf("invalid directory, please try again");
		exit(0);
	}
	
	//Initialize the index
	HashTable *WordsFound = calloc(1, sizeof(HashTable));
	if (WordsFound == NULL) {
		fprintf(stderr, "out of memory while creating the index\n");
		exit(1);
	}

	//check whether the folder has files
	char **filenames = NULL;
	int num_files = GetFilenamesInDir(argv[1], &filenames);
	if (num_files < 0) {
		printf("failed to get any filenames");
		exit(0);
	}

	//a separator is inserted only when the directory argument lacks one
	size_t dir_len = strlen(argv[1]);
	const char *separator = (dir_len > 0 && argv[1][dir_len - 1] != '/') ? "/" : "";

	//iterate through each file in the directory
	for (int i = 0; i < num_files; i++) {
		
		//check that the file is in the correct format (title is a number)
		int all_digits = 1;
		for (const char *c = filenames[i]; *c != '\0'; c++) {
			//isdigit requires a value representable as unsigned char
			if (!isdigit((unsigned char)*c)) {
				all_digits = 0;
				break;
			}
		}
		if (!all_digits) {
			free(filenames[i]);		//skipped names must be released too
			continue;
		}

		//Build the path on the heap so that long names cannot overflow a fixed buffer
		size_t path_size = dir_len + strlen(separator) + strlen(filenames[i]) + 1;
		char *file = malloc(path_size);
		if (file == NULL) {
			fprintf(stderr, "out of memory while building a file path\n");
			exit(1);
		}
		snprintf(file, path_size, "%s%s%s", argv[1], separator, filenames[i]);

		//Load the document
		char *doc = LoadDocument(file);
		int docId = GetDocumentId(filenames[i]);
		free(file);
		free(filenames[i]);
		if (doc == NULL) {
			continue;			//nothing to index for this file
		}
		
		//Iterate through each word in the html file (doc)
		int pos = 0;
		char *word;
		while ((pos = GetNextWord(doc, pos, &word)) > 0) {
			NormalizeWord(word);
			if (InHashTable(word, WordsFound) == 0) {
				//NOTE(review): word is not freed on this branch, presumably
				//because the table keeps the pointer - confirm
				AddToHashTable(word, WordsFound);
				UpdateHashTable(word, docId, WordsFound);
			}
			else {
				UpdateHashTable(word, docId, WordsFound);
				free(word);
			}
		}
		free(doc);
	}	
	free(filenames);
	SaveIndexToFile(argv[2], WordsFound);				//Save the index to the file specified
	FreeHashTable(WordsFound);

	//only proceed if there was a third argument specified. If so, reload the index from the file just created
	if (argc == 4) {
		HashTable *ReloadedIndex = ReadFile(argv[2]);
		if (ReloadedIndex == NULL) {
			fprintf(stderr, "failed to reload the index from %s\n", argv[2]);
			return 1;
		}
		SaveIndexToFile(argv[3], ReloadedIndex);
		FreeHashTable(ReloadedIndex);
	}
	return 0;
}
Example #4
0
/*
 * Indexer entry point.
 *
 * argv[1] is the page directory and argv[2] the index file to write. When
 * further arguments are given, argv[3] is an index file to read back and
 * argv[4] the file the re-read index is saved to.
 *
 * The page directory is prepended to each filename without a separator, so
 * it is presumably expected to end with '/' - TODO confirm against
 * CheckArguments.
 *
 * Returns 0 on success; fatal errors terminate the process with exit(-1).
 */
int main (int argc, char **argv) {

	/* Check Arguments */
	if (!CheckArguments(argc, argv)) {
		exit(-1);
	}

	char *page_directory = argv[1];
	char *index_filename = argv[2];
	size_t directory_len = strlen(page_directory);

	// Initialize hashtable
	HashTable *index_hashtable = calloc(1, sizeof(HashTable));
	if (index_hashtable == NULL) {
		fprintf(stderr, "Could not allocate index hashtable. Exiting Now.\n");
		exit(-1);
	}

	/* Make array to hold filenames (just document numbers) and use GetFilenamesInDir to grab all names */
	char **filename_array = NULL;
	int number_of_files = GetFilenamesInDir(page_directory, &filename_array);
	if (number_of_files < 0) {
		fprintf(stderr, "Could not get filenames in page directory. Exiting Now.\n");
		exit(-1);
	}

	/* Add page_directory to the front of the filenames */
	for (int i = 0; i < number_of_files; i++) {
		char *bare_name = filename_array[i];
		size_t full_len = directory_len + strlen(bare_name) + 1;
		char *full_path = calloc(full_len, sizeof(char));
		if (full_path == NULL) {
			fprintf(stderr, "Could not allocate a file path. Exiting Now.\n");
			exit(-1);
		}
		strcpy(full_path, page_directory);
		strcat(full_path, bare_name);

		// The bare name is replaced by the full path in the array
		free(bare_name);
		filename_array[i] = full_path;
	}

	/* Populate the index data structure from the words on each doc
	 * Then Save to an index file
	 */
	for (int i = 0; i < number_of_files; i++) {
		char *file_name = filename_array[i];

		/* Check that the filenames are digits */
		int has_non_digit = 0;
		// The part after the directory prefix is the bare filename
		const char *digit_string = file_name + directory_len;
		for (const char *c = digit_string; *c != '\0'; c++) {
			// isdigit requires a value representable as unsigned char
			if (!isdigit((unsigned char)*c)) {
				fprintf(stderr, "This file %s contains something other than a digit \n", file_name);
				has_non_digit = 1;
			}
		}
		if (has_non_digit)
			continue;

		// Check that each file in the filename array is a good file
		if (!IsFile(file_name)) {
			fprintf(stderr, "not file\n");
			continue;
		}

		// Get contents of file into a string
		char *document = LoadDocument(file_name);
		if (document == NULL) {
			continue;
		}

		// Get DocumentID of file (check if bad)
		int document_id = GetDocumentId(file_name, page_directory);
		if (document_id < 0) {
			fprintf(stderr, "Error when converting document id char to integer\n");
			free(document);		// the loaded document must not leak on this path
			continue;
		}

		// Walk the document word by word; each word is normalized, added to
		// the index and then released
		int pos = 0;
		char *word_buffer;
		while ((pos = GetNextWord(document, pos, &word_buffer)) > 0) {
			NormalizeWord(word_buffer);
			UpdateIndex(word_buffer, document_id, index_hashtable);
			free(word_buffer);
		}
		// free the string containing the html
		free(document);
	}

	/* Save to index file, and check that it actually went well */
	if (!SaveIndexToFile(index_hashtable, index_filename)) {
		fprintf(stderr, "Could not save index hashtable to file\n");
		exit(-1);
	}

	for (int i = 0; i < number_of_files; i++) {
		free(filename_array[i]);
	}
	free(filename_array);
	FreeHashTable(index_hashtable);

	/* Read index file into data structures and save to new index file */
	if (argc != 3) {
		// Both filenames are required; argv[4] is NULL when argc is 4
		if (argc < 5) {
			fprintf(stderr, "Reading back an index requires both an input and an output filename.\n");
			exit(-1);
		}
		char *read_index_filename = argv[3];
		char *new_index_filename = argv[4];

		// Read index file into data structures
		HashTable *read_index = ReadFile(read_index_filename);
		if (read_index == NULL) {
			fprintf(stderr, "Error when reading index file into data structures.\n");
			exit(-1);
		}
		// Save index data structures into new file
		if (!SaveIndexToFile(read_index, new_index_filename)) {
			fprintf(stderr, "Could not save read index file into new index file\n");
			exit(-1);
		}

		FreeHashTable(read_index);
	}

	return 0;
}
Example #5
0
//Reads the file at path, skips its first two lines (the url line and the
//depth line) and returns everything after them as a NUL-terminated heap
//string that the caller must free. Returns NULL if the file cannot be
//opened, measured or buffered.
static char *load_html_body(const char *path)
{
	char *html = NULL;
	long file_len;
	long body_start;

	FILE *file = fopen(path, "rb");
	if (file == NULL)
		return NULL;

	//measure the whole file, then rewind
	if (fseek(file, 0, SEEK_END) != 0)
		goto done;
	file_len = ftell(file);
	if (file_len < 0 || fseek(file, 0, SEEK_SET) != 0)
		goto done;

	//get rid of the first 2 lines; fgetc returns int so EOF stays distinguishable
	for (int line = 0; line < 2; line++) {
		int character;
		do {
			character = fgetc(file);
		} while (character != '\n' && character != EOF);
	}

	body_start = ftell(file);
	if (body_start < 0 || body_start > file_len)
		goto done;

	//one extra byte for the terminator the word scanner relies on
	size_t body_len = (size_t)(file_len - body_start);
	html = malloc(body_len + 1);
	if (html == NULL)
		goto done;
	size_t bytes_read = fread(html, sizeof(char), body_len, file);
	html[bytes_read] = '\0';

done:
	fclose(file);
	return html;
}

//Adds the words of every crawler file in dir to hashtable.
//Each file is named by its document id; its first two lines are skipped and
//every remaining word is normalized and added under that id. Files that
//cannot be opened are skipped silently; files whose name does not parse to a
//non-zero id are reported on stderr and skipped.
//return 1 if successful, 0 otherwise
int buildIndexFromDirectory(char *dir, HashTable *hashtable)
{
	char **filenames = NULL;

	//get the file names
	int num_files = GetFilenamesInDir(dir, &filenames);
	if (num_files < 0) {
		fprintf(stderr, "Error: Unable to obtain files in directory\n");
		free(filenames);
		return 0;
	}

	//a slash is inserted only when the directory name does not already end with one
	size_t dir_len = strlen(dir);
	const char *separator = (dir_len > 0 && dir[dir_len - 1] != '/') ? "/" : "";
	int succeeded = 1;

	//for each of the files in the directory, read and add to the indexer
	for (int i = 0; i < num_files; i++) {

		//directoryname + optional slash + filename + terminating NUL
		size_t path_size = dir_len + strlen(filenames[i]) + 2;
		char *file_name = malloc(path_size);
		if (file_name == NULL) {
			fprintf(stderr, "Error: Out of memory while building a file path\n");
			//release the names that have not been processed yet
			for (int j = i; j < num_files; j++)
				free(filenames[j]);
			succeeded = 0;
			break;
		}
		snprintf(file_name, path_size, "%s%s%s", dir, separator, filenames[i]);

		char *html = load_html_body(file_name);
		if (html != NULL) {
			//the filename is the document id; 0 means it did not parse,
			//and a value that does not survive the round trip does not fit an int
			long parsed_id = strtol(filenames[i], NULL, 10);
			int doc_id = (int)parsed_id;

			if (doc_id == 0 || (long)doc_id != parsed_id) {
				fprintf(stderr, "Error: %s is an invalid crawler filename.\n", filenames[i]);
			} else {
				int pos = 0;
				char *word = NULL;

				//get the words
				while ((pos = GetNextWord(html, pos, &word)) > 0) {
					NormalizeWord(word);
					addToHashTable(hashtable, word, doc_id);		//add word to indexer
					free(word);
					word = NULL;
				}
			}
			free(html);
		}

		//released on every path, including the invalid-filename one
		free(file_name);
		free(filenames[i]);
	}
	free(filenames);
	return succeeded;
}