//Reads an inverted index file and recreates the data structure it represents
HashTable *ReadFile(char *file) {
    HashTable *reloadedIndex = calloc(1, sizeof(HashTable)); //allocate new index
    FILE *fp = fopen(file, "r"); //open the input file
    if (fp == NULL) { //bail out if the file cannot be opened
        free(reloadedIndex);
        return NULL;
    }
    char *line = calloc(100000, sizeof(char));

    //read the file line by line, parsing each line for the word, docids and freqs
    while (fgets(line, 100000, fp) != NULL) {
        line[strcspn(line, "\n")] = '\0'; //strip the trailing newline
        char *word = calloc(100, sizeof(char));
        char *token = strtok(line, " ");
        if (token == NULL) { //skip blank lines
            free(word);
            continue;
        }
        strcpy(word, token);
        token = strtok(NULL, " "); //skip the doccount field
        token = strtok(NULL, " ");

        //once the word has been parsed and the doccount skipped, read docid/freq pairs until the end of the line
        while (token != NULL) {
            int doc_id = atoi(token);
            token = strtok(NULL, " ");
            if (token == NULL) { //malformed line: docid without a freq
                break;
            }
            int freq = atoi(token);
            int i = 0;
            //increment the frequency as many times as needed
            while (i < freq) {
                if (InHashTable(word, reloadedIndex) == 0) {
                    AddToHashTable(word, reloadedIndex);
                    UpdateHashTable(word, doc_id, reloadedIndex);
                } else {
                    UpdateHashTable(word, doc_id, reloadedIndex);
                }
                i++;
            }
            token = strtok(NULL, " ");
        }
    }
    free(line);
    fclose(fp);
    return reloadedIndex;
}
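/*
 * Example of the on-disk index line format that ReadFile() assumes
 * (inferred from the parsing above; the exact format is defined by
 * SaveIndexToFile(), which is not shown here):
 *
 *   word doccount [docid freq] [docid freq] ...
 *   e.g. "dog 2 3 4 7 1" -> "dog" appears in 2 documents:
 *        doc 3 with frequency 4, and doc 7 with frequency 1.
 *
 * The doccount token is read and discarded; each (docid, freq) pair is
 * replayed into the hash table by calling UpdateHashTable() freq times.
 */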
void And(char *word, HashTable *Index) {
    unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.

    // Declare variables for traversal.
    WordNode *current;
    DocumentNode *ptr, *ptr2, *runner, *no_need;
    int num;

    // Get matching WordNode of word if it is in the InvertedIndex.
    if ((num = InHashTable(word, Index))) {
        current = Index->table[index]->data;
        // Loop until we get the matching WordNode.
        for (int i = 1; i < num; i++) {
            current = current->next;
        }
        ptr2 = current->page; // Set to start of the list of document nodes for the current word.
    } else {
        ptr2 = NULL;
    }

    // Initialize variables.
    ptr = temp_list;
    while (ptr != NULL) {
        // Check that the word is in the InvertedIndex.
        if (num) {
            ptr2 = current->page; // Set to start of the list of document nodes for the current word.
            // Loop until the end of the new list of matching DocumentNodes.
            while (ptr2 != NULL) {
                // Check for a match in doc_id.
                if (ptr->doc_id == ptr2->doc_id) {
                    ptr->freq += ptr2->freq; // Add the frequencies.
                    break;
                }
                ptr2 = ptr2->next;
            }
            // Case of no match.
            if (ptr2 == NULL) {
                // Check if we need to delete the first node of temp_list.
                if (ptr == temp_list) {
                    temp_list = temp_list->next;
                } else {
                    // All other cases.
                    runner->next = runner->next->next;
                }
                no_need = ptr;
                ptr = ptr->next;
                // Free the node to be deleted.
                no_need->next = NULL;
                free(no_need);
                no_need = NULL;
            } else {
                // Case of match.
                runner = ptr;
                ptr = ptr->next;
            }
        } else {
            // Word is not in the InvertedIndex.
            ptr = NULL;
            FreeList(0);
        }
    }
}
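/*
 * Sketch of the node shapes And() traverses, inferred from the field
 * accesses above (the real definitions live in the project headers and
 * may differ):
 *
 *   HashTable->table[i]->data  : head WordNode in hash slot i
 *   WordNode->next             : next WordNode in the same slot
 *   WordNode->page             : head DocumentNode for that word
 *   DocumentNode->doc_id       : document identifier
 *   DocumentNode->freq         : occurrences of the word in that document
 *   DocumentNode->next         : next DocumentNode
 *
 * And() intersects temp_list with the current word's DocumentNode list:
 * nodes whose doc_id appears in both lists survive (with frequencies
 * summed); all other temp_list nodes are unlinked and freed. If the word
 * is not in the index at all, FreeList(0) releases the whole temp_list.
 */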
int GetLinks(char *line, HashTable *Index) {
    // Declare variables.
    char *buf;
    char word[MAX] = ""; // initialized so the post-loop operator check is safe on an empty line.
    int flag;  // flag to do union or intersection operations.
    int count; // variable to count the position of a word in the line.

    // Initialize variables.
    buf = line;
    flag = 1;
    count = 0;
    temp_list = NULL;
    final_list = NULL;

    // Loop through the line and do the appropriate operations.
    while (sscanf(buf, "%s", word) == 1) {
        count++;
        // If word is AND, then ignore and read in new word.
        if (strcmp(word, operator1) == 0) {
            if (count == 1) {
                // If there is no previous word, then throw an error.
                return 0;
            }
            // Increment position in line.
            buf = strstr(buf, word) + strlen(word);
            continue;
        }
        // If word is OR, then tell the program to do OR operation.
        if (strcmp(word, operator2) == 0) {
            flag = 2; // Set flag to union operation.
            if (count == 1) {
                // If there is no previous word, then throw an error.
                return 0;
            }
        }
        // Hold onto original copy of word in case NormalizeWord() changes its content.
        char *word_old = (char *)calloc(1, strlen(word) + 1);
        strcpy(word_old, word);
        // Change word to lowercase.
        if (strcmp(word, operator1) != 0 && strcmp(word, operator2) != 0) {
            NormalizeWord(word); // Normalize if word is not an operator.
        }
        // Add list of docs to temp_list.
        // Case when it is the first word of the block.
        if (count == 1) {
            // Declare variables.
            WordNode *current;        // variable for traversal.
            DocumentNode *ptr, *ptr2; // variables for traversal.
            int num;
            // Case when the word is in the InvertedIndex.
            if ((num = InHashTable(word, Index))) {
                unsigned long index = JenkinsHash(word, MAX_HASH_SLOT); // Get the hash code.
                current = Index->table[index]->data;
                // Loop until we get the matching WordNode.
                for (int i = 1; i < num; i++) {
                    current = current->next;
                }
                // Loop through each DocumentNode and add to temp_list.
                for (ptr = current->page; ptr != NULL; ptr = ptr->next) {
                    // Declare and initialize a DocumentNode with the same values as ptr.
                    DocumentNode *dn;
                    dn = (DocumentNode *)calloc(1, sizeof(DocumentNode));
                    dn->doc_id = ptr->doc_id;
                    dn->freq = ptr->freq;
                    // Add the new DocumentNode to temp_list.
                    if (temp_list == NULL) {
                        // Case when temp_list is empty.
                        temp_list = dn;
                        ptr2 = temp_list;
                    } else {
                        // Case when temp_list is nonempty.
                        ptr2->next = dn;
                        ptr2 = ptr2->next;
                    }
                }
            }
        } else {
            // If not first word of the block, then do the operation.
            // Check if the current operation is "AND".
            if (flag == 1) {
                And(word, Index);
            }
            // Check if the current operation is "OR".
            if (flag == 2) {
                if (temp_list != NULL) {
                    Or();
                }
                flag = 1;  // Set flag back to "AND" operation.
                count = 0; // Set word count to 0 to signal the start of a new block of words.
            }
        }
        // Increment position in the query line to read in next word.
        buf = strstr(buf, word_old) + strlen(word_old);
        free(word_old); // Cleanup.
    }
    // If the last word of the query line is an operator, throw an error.
    if (strcmp(word, operator1) == 0 || strcmp(word, operator2) == 0) {
        return 0;
    }
    // If nonempty, flush out temp_list to final_list.
    if (temp_list != NULL) {
        Or();
    }
    return 1; // Return 1 if successful.
}
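/*
 * Worked example of how GetLinks() processes a query line (assuming
 * operator1 == "AND" and operator2 == "OR", as the comments above
 * suggest):
 *
 *   "dog cat OR bird"
 *     - "dog"  : first word of a block -> its DocumentNodes seed temp_list
 *     - "cat"  : flag == 1 (AND)       -> And("cat", Index) intersects temp_list
 *     - "OR"   : flag set to 2         -> Or() merges temp_list into final_list,
 *                                         count resets to start a new block
 *     - "bird" : first word of the new block -> seeds a fresh temp_list
 *
 * After the loop, any remaining temp_list is flushed into final_list.
 */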
// Function to crawl a given webpage for links.
int CrawlPage(WebPage *wp) {
    char *result;     // variable to hold the url.
    int pos = 0;      // position in each html page.
    WebPage *newPage; // New webpage.

    // Check that the depth does not exceed the depth passed.
    if (wp->depth >= depth) {
        return 0;
    }
    printf("\n");
    printf("[crawler]: Crawling - %s\n", wp->url); // Print the url being crawled.
    printf("\n");

    // Loop through each html page to get all its urls.
    while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {
        // Check that the url has proper domain (old-www).
        if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
            free(result);
            continue;
        }
        // Normalize the url.
        if (!NormalizeURL(result)) {
            free(result);
            continue;
        }
        // Check that the url isn't already in the hash table.
        if (!InHashTable(result)) {
            AddToHashTable(result); // Add the url to the hash table.

            // Setup new page for each url.
            newPage = calloc(1, sizeof(WebPage));
            newPage->depth = wp->depth + 1;
            newPage->url = (char *)malloc(strlen(result) + 1);
            if (!newPage->url) { // Check that memory was allocated.
                free(newPage);
                free(result);
                continue;
            }
            strcpy(newPage->url, result);

            // Get html for each url.
            if (!GetWebPage(newPage)) {
                FreeWebMemory(newPage);
                free(result);
                continue;
            }
            printf("[crawler]: Parser found link - %s\n", result);

            // Add to the list of webpages to be visited.
            if (!AppendList(newPage)) {
                free(result);
                return 0;
            }
        }
        free(result);
    }
    return 1;
}
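/*
 * Minimal sketch of how CrawlPage() might be driven (assumed; the real
 * driver lives in the crawler's main(), which is not shown). SEED_URL and
 * PopList() are hypothetical names used only for illustration:
 *
 *   WebPage *seed = calloc(1, sizeof(WebPage));
 *   seed->url = strdup(SEED_URL);   // seed page at depth 0
 *   seed->depth = 0;
 *   AddToHashTable(seed->url);      // mark the seed as visited
 *   if (GetWebPage(seed)) {         // fetch seed->html
 *       CrawlPage(seed);            // enqueue its links via AppendList()
 *       // ...then repeatedly take pages off the list (e.g. a PopList())
 *       // and call CrawlPage() on each until the list is empty.
 *   }
 */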
int main(int argc, char* argv[]) {
    //check argument number
    if (argc < 3 || argc > 4) {
        printf("too many or too few arguments, please try again\n");
        exit(0);
    }
    //check directory validity
    if (!IsDir(argv[1])) {
        printf("invalid directory, please try again\n");
        exit(0);
    }
    //Initialize variables and index
    int docId;
    int pos;
    char *doc;
    char **filenames = NULL;
    int num_files = 0;
    HashTable *WordsFound = calloc(1, sizeof(HashTable));
    num_files = GetFilenamesInDir(argv[1], &filenames);
    //check whether the directory listing succeeded
    if (num_files < 0) {
        printf("failed to get any filenames\n");
        exit(0);
    }
    //iterate through each file in the directory
    for (int i = 0; i < num_files; i++) {
        //check that the file is in the correct format (title is a number)
        int filechecker = 0;
        for (size_t c = 0; c < strlen(filenames[i]); c++) {
            if (!isdigit(filenames[i][c])) {
                filechecker = 1;
            }
        }
        if (filechecker == 1) {
            free(filenames[i]); //skip files whose names are not purely numeric
            continue;
        }
        //Load the document
        char *word;
        char file[100];
        strcpy(file, argv[1]);
        strcat(file, filenames[i]);
        doc = LoadDocument(file);
        docId = GetDocumentId(filenames[i]);
        free(filenames[i]);
        pos = 0;
        //Iterate through each word in the html file (doc)
        while ((pos = GetNextWord(doc, pos, &word)) > 0) {
            NormalizeWord(word);
            if (InHashTable(word, WordsFound) == 0) {
                AddToHashTable(word, WordsFound);
                UpdateHashTable(word, docId, WordsFound);
            } else {
                UpdateHashTable(word, docId, WordsFound);
                free(word);
            }
        }
        free(doc);
    }
    free(filenames);
    SaveIndexToFile(argv[2], WordsFound); //Save the index to the file specified
    FreeHashTable(WordsFound);
    //only proceed if there was a third argument specified. If so, reload the index from the file just created
    if (argc == 4) {
        HashTable *ReloadedIndex = ReadFile(argv[2]);
        SaveIndexToFile(argv[3], ReloadedIndex);
        FreeHashTable(ReloadedIndex);
    }
    return 0;
}
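/*
 * Example invocation of this indexer (binary name assumed):
 *
 *   ./indexer ./data/ index.dat
 *       builds the inverted index from the files in ./data/ and writes it
 *       to index.dat
 *
 *   ./indexer ./data/ index.dat index_new.dat
 *       additionally reloads index.dat via ReadFile() and rewrites it to
 *       index_new.dat, which should match index.dat (a round-trip test)
 *
 * Note: the directory argument is concatenated directly with each
 * filename, so it should end with a slash.
 */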