/* -----------------------------------------------------------------------
 Function Name: initial_index()
 Functionality: Scans the crawler data directory and inserts every word of
                every crawled page into the Hashtable Index.
 Input  <--- argv:  program argument vector; only argv[1] (the directory
                    holding the crawler output) is read.
             Index: hashtable that receives the words.
 Outputs ---> void (Index is updated in place)

 Crawler files are looked up by document id: for every id from 1 to the
 number of directory entries, "./<argv[1]>/<id>" is indexed if it is a
 regular file. Ids that do not correspond to a file are skipped.
 ----------------------------------------------------------------------- */
void initial_index(char *argv[], HashTable *Index)
{
    const char *dir = argv[1];
    size_t dir_len = strlen(dir);

    char **results_filenames = NULL;
    int num_files = GetFilenamesInDir(dir, &results_filenames);

    /*
     * Ids run from 1 to num_files inclusive: with N files named 1..N the
     * last one must not be skipped. IsFile() below guards ids that have no
     * matching file. A negative num_files simply yields zero iterations.
     */
    for (int doc_id = 1; doc_id <= num_files; doc_id++) {
        /* 3 characters per byte is an upper bound for a decimal int. */
        char id_text[3 * sizeof(int) + 2];
        int id_len = snprintf(id_text, sizeof id_text, "%d", doc_id);
        if (id_len < 0 || (size_t)id_len >= sizeof id_text) {
            continue;
        }

        /* "./" + dir + "/" + id + terminating NUL */
        size_t path_size = dir_len + (size_t)id_len + 4;
        char *file_path = malloc(path_size);
        if (file_path == NULL) {
            fprintf(stderr, "initial_index: out of memory\n");
            break;
        }
        snprintf(file_path, path_size, "./%s/%s", dir, id_text);

        if (IsFile(file_path)) {
            /* file_parsing() drops the two crawler header lines, so only
             * the html is left over. */
            char *content = file_parsing(file_path);

            if (content != NULL) {
                int pos = 0;
                char *word;

                /* NOTE(review): word is not freed here; presumably
                 * insert_to_index() takes ownership -- TODO confirm. */
                while ((pos = GetNextWord(content, pos, &word)) > 0) {
                    NormalizeWord(word);
                    insert_to_index(doc_id, word, Index);
                }
            }
            free(content);
        }
        free(file_path);
    }

    /* Every entry returned by GetFilenamesInDir() is heap-allocated, so the
     * strings must be released as well as the array itself. */
    for (int i = 0; i < num_files; i++) {
        free(results_filenames[i]);
    }
    free(results_filenames);
}
/*
 * main - validates the indexer's command-line arguments.
 *
 * argv[1] is treated as the directory holding the crawled pages and argv[2]
 * as the output index file. On every validation failure an error is written
 * to stderr, displayhelp() is called when DISPLAY_HELP is non-zero (not on
 * the .dat check), everything allocated so far is released and 1 is returned.
 *
 * webPageDirectory, num_files, filenames and outputFile are not declared in
 * this block; presumably they are file-scope variables -- TODO confirm.
 *
 * NOTE(review): only the argument-checking part of the function is visible
 * here; the closing brace of main is not part of this excerpt.
 */
int main(int argc, char *argv[]) {
    /* Check command line arguments */
    /* Must have at least 3 arguments (the program name plus two operands) */
    if (argc < 3) {
        fprintf(stderr, "Indexer.c program requires at least two input arguments\n");
        if (DISPLAY_HELP) {displayhelp();}
        return 1;
    }
    /* Check validity of arguments */
    else {
        /* Check argv[1] - webPageDirectory */
        /* NOTE(review): the strdup() result is used without a NULL check. */
        webPageDirectory = strdup(argv[1]);

        /* Check if webPageDirectory is an existing directory */
        if (!IsDir(webPageDirectory)) {
            fprintf(stderr, "Please provide an existing directory for the file database\n");
            if (DISPLAY_HELP) {displayhelp();}
            free(webPageDirectory);
            return 1;
        }

        /* Collect the names of the files in webPageDirectory; a negative
         * count is treated as failure. */
        num_files = GetFilenamesInDir(webPageDirectory, &filenames);
        if(num_files < 0) {
            fprintf(stderr, "Please provide a directory containing regular files for the file database\n");
            if (DISPLAY_HELP) {displayhelp();}
            free(webPageDirectory);
            return 1;
        }

        /* Check that every filename is a valid docId (a plain integer) */
        for(int index = 0; index < num_files; index++) {
            char posschar;
            int possint;

            /* "%d%c" converts exactly one item only when the name starts
             * with an integer and nothing follows it; any other result
             * (0 = no integer, 2 = trailing character) is rejected. */
            /* NOTE(review): the "\ " inside the message below is not a
             * standard escape sequence; it looks like the remains of a
             * backslash-newline line continuation -- confirm. */
            if (sscanf(filenames[index], "%d%c", &possint, &posschar) != 1) {
                fprintf(stderr, "Please provide int's for filenames in file database\n \ The file %s is not a valid filename\n", filenames[index]);
                if (DISPLAY_HELP) {displayhelp();}

                /* Free every filename (the loop runs over all num_files
                 * entries), then the array and the directory string */
                for (int index2 = 0; index2 < num_files; index2++) {
                    free(filenames[index2]);
                }
                free(filenames);
                free(webPageDirectory);
                return 1;
            }
        }

        /* Check argv[2] - outputFile */
        /* NOTE(review): the strdup() result is used without a NULL check. */
        outputFile = strdup(argv[2]);

        /* If outputFile has an extension (text from the last '.'), it must
         * be exactly ".dat"; a name without any '.' is accepted as is. */
        char *point = strrchr(outputFile,'.');
        if (point != NULL ) {
            if(strcmp(point,".dat") != 0) {
                fprintf(stderr, "Please provide a .dat file for the outputFile to be written into\n");

                /* Free every filename, the array and both duplicated
                 * argument strings before bailing out */
                for (int index = 0; index < num_files; index++) {
                    free(filenames[index]);
                }
                free(filenames);
                free(webPageDirectory);
                free(outputFile);
                return 1;
            }
        }
    }
int main(int argc, char* argv[]) { //check argument number if (argc < 3 || argc > 4) { printf("too many or too little arguments, please try again"); exit(0); } //check directory validity if (!IsDir(argv[1])) { printf("invalid directory, please try again"); exit(0); } //Initialize variables and index int docId; int pos; char *doc; char **filenames = NULL; int num_files = 0; HashTable *WordsFound = calloc(1, sizeof(HashTable)); num_files = GetFilenamesInDir(argv[1], &filenames); //check whether the folder has files if (num_files < 0) { printf("failed to get any filenames"); exit(0); } //iterate through each file in the directory for (int i = 0; i < num_files; i++) { //check that the file is in the correct format (title is a number) int filechecker = 0; for (int c = 0; c < strlen(filenames[i]); c++) { if (!isdigit(filenames[i][c])) { filechecker = 1; } } if (filechecker == 1) { continue; } //Load the document char *word; char file[100]; strcpy(file, argv[1]); strcat(file, filenames[i]); doc = LoadDocument(file); docId = GetDocumentId(filenames[i]); free(filenames[i]); pos = 0; //Iterate through each word in the html file (doc) while ((pos = GetNextWord(doc, pos, &word)) > 0) { NormalizeWord(word); if (InHashTable(word, WordsFound) == 0) { AddToHashTable(word, WordsFound); UpdateHashTable(word, docId, WordsFound); } else { UpdateHashTable(word, docId, WordsFound); free(word); } } free(doc); } free(filenames); SaveIndexToFile(argv[2], WordsFound); //Save the index to the file specified FreeHashTable(WordsFound); //only proceed if there was a third argument specified. If so, reload the index form the file you just created if (argc == 4) { HashTable *ReloadedIndex = ReadFile(argv[2]); SaveIndexToFile(argv[3], ReloadedIndex); FreeHashTable(ReloadedIndex); } return 0; }
int main (int argc, char **argv) { /* Check Arguments */ if (!CheckArguments(argc, argv)) { exit(-1); } /* Make variables for all things needed for indexer and indexer testing */ char *page_directory; char *index_filename; char *read_index_filename; char *new_index_filename; // If argument count is 3 initialize only 2 variables else initialize all page_directory = argv[1]; index_filename = argv[2]; // Initialize hashtable, word node, and document node HashTable *index_hashtable = calloc(1, sizeof(HashTable)); /*Make array to hold filenames (just document numbers) and use GetFilenamesInDir to grab all names */ char **filename_array; int number_of_files; if ((number_of_files = GetFilenamesInDir(page_directory, &filename_array)) < 0) { fprintf(stderr, "Could not get filenames in page directory. Exiting Now.\n"); exit(-1); } /* Add page_directory to the front of the filenames */ for (int i = 0; i < number_of_files; i++) { // Make pointe to current string in filename_array char *previous_string = filename_array[i]; // Get length of full string and initialize element of filename_array to that size int len = strlen(page_directory) + strlen(previous_string) + 1; char *new_string = calloc(len, sizeof(char)); // Make new string and free previous string strcpy(new_string, page_directory); strcat(new_string, previous_string); if (previous_string) free(previous_string); filename_array[i] = new_string; } /* Populate the index data structure from the words on each doc * Then Save to an index file */ for (int i = 0; i < number_of_files; i++) { /* Check that the filenames are digits */ int continue_flag = 0; char *digit_string = filename_array[i] + strlen(page_directory); // Check that every character in the filename is a digit for (int j = 0; j < strlen(digit_string); j++) { if (!isdigit(digit_string[j])) { fprintf(stderr, "This file %s contains something other than a digit \n", filename_array[i]); continue_flag = 1; } } if (continue_flag ==1) continue; // Check that each file in 
the filename array is a good file char *file_name = filename_array[i]; if (!IsFile(file_name)) { fprintf(stderr, "not file\n"); continue; } // Get contents of file into a string char *document = LoadDocument(file_name); if (document == NULL) { continue; } // Get DocumentID of file (check if bad) int document_id = GetDocumentId(file_name, page_directory); if (document_id < 0) { fprintf(stderr, "Error when converting document id char to integer\n"); continue; } // Use GetNext word, with pos variable and buffer, to get every word and add the word to the data structure int pos = 0; char *word_buffer; while ((pos = GetNextWord(document, pos, &word_buffer)) > 0) { // Update the index for each word // Normalize word then update index with that word NormalizeWord(word_buffer); UpdateIndex(word_buffer, document_id, index_hashtable); free(word_buffer); } // free the string containing the html and the word in filenamearray free(document); } /* Save to index file, and check that it actually went well */ if (!SaveIndexToFile(index_hashtable, index_filename)) { fprintf(stderr, "Could not save index hashtable to file\n"); exit(-1); } for (int i = 0; i < number_of_files; i++) { free(filename_array[i]); } free(filename_array); FreeHashTable(index_hashtable); if (argc == 3) { ; } /* Read index file into data strucutres and save to new index file */ else { // Assign 2 filenames read_index_filename = argv[3]; new_index_filename = argv[4]; // Read index file into data structures HashTable *read_index = ReadFile(read_index_filename); if (read_index == NULL) { fprintf(stderr, "Error when reading index file into data structures.\n"); exit(-1); } // Save index data structures into new file if (!SaveIndexToFile(read_index, new_index_filename)) { fprintf(stderr, "Could not save read index file into new index file\n"); exit(-1); } FreeHashTable(read_index); } return 0; }
//return 1 if successful, 0 otherwise int buildIndexFromDirectory(char *dir, HashTable *hashtable) { char **filenames = NULL; int num_files = 0; //get the file names num_files = GetFilenamesInDir(dir, &filenames); if (num_files < 0) { fprintf(stderr, "Error: Unable to obtain files in directory\n"); free(filenames); return 0; } //for each of the files in the directory, read and add to the indexer for(int i = 0; i < num_files; i++){ //allocate a char array, directoryname/filename char *file_name = malloc((strlen(dir) + strlen(filenames[i])) * (sizeof(char))); //makes filename based on whether the directoryname has a slash at the end or not if (dir[strlen(dir)-1] == '/') sprintf(file_name, "%s%s", dir, filenames[i]); else sprintf(file_name, "%s/%s", dir, filenames[i]); FILE *file = fopen(file_name, "rb"); if (file) { fseek(file, 0, SEEK_END); long html_len = ftell(file); fseek(file, 0, SEEK_SET); //Get rid of the first 2 lines //get rid of url int offset = 0; char character; do{ character = fgetc(file); offset++; } while(character != '\n' && character != EOF); fseek(file, offset, SEEK_SET); //get rid of depth do{ character = fgetc(file); offset++; } while(character != '\n' && character != EOF); fseek(file, offset, SEEK_SET); //read in the file char *html = malloc(html_len * (sizeof(char))); fread(html, sizeof(char), html_len, file); fclose(file); int pos = 0; char *word = NULL; //errstring will hold error message from function, char **errstring = &errstring, it is the mailbox int doc_id = strtol(filenames[i], NULL, 10); if (doc_id == 0) { fprintf(stderr, "Error: %s is an invalid crawler filename.\n", filenames[i]); continue; } //get the words while((pos = GetNextWord(html, pos, &word)) > 0){ NormalizeWord(word); addToHashTable(hashtable, word, doc_id); //add word to indexer free(word); word = NULL; } free(html); } free(file_name); if (filenames[i]) free(filenames[i]); } free(filenames); return 1; }