/* -----------------------------------------------------------------------
   Function Name: index_update()
   Functionality: Rebuilds the index from one line of a saved index file;
                  each line has the form "word N doc_id1 freq1 ... doc_idN freqN"
   Input  <--- the Hashtable Index and one line of the saved index file
   Output ---> void (updated index)
 ----------------------------------------------------------------------- */
void index_update(HashTable *index, char *line_content) {
    // delimiting each line by word
    const char delim[2] = " ";
    char *word = strtok(line_content, delim);

    // getting the total number of documents the word appears in
    char *file_count_converted_to_string = strtok(NULL, delim);
    char *tmp;
    int file_count = (int)strtol(file_count_converted_to_string, &tmp, 10);

    int doc_id;
    int freq;
    for (int i = 0; i < file_count; i++) {
        // iterating through the documents that contain the word;
        // each document contributes a (doc_id, frequency) pair
        char *doc_id_converted_to_string = strtok(NULL, delim);
        doc_id = (int)strtol(doc_id_converted_to_string, &tmp, 10);

        char *freq_converted_to_string = strtok(NULL, delim);
        freq = (int)strtol(freq_converted_to_string, &tmp, 10);

        // re-inserting the word once per occurrence to rebuild the index
        for (int j = 0; j < freq; j++) {
            insert_to_index(doc_id, word, index);
        }
    }
}
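/* A minimal, hypothetical driver sketch for index_update(): it assumes a
 * saved index file where each line looks like "word N doc1 freq1 ... docN freqN",
 * matching the parsing order above. The function name, the file-path
 * parameter, and MAX_LINE_LEN are illustrative assumptions, not part of
 * this codebase. <stdio.h> and <string.h> would normally sit at the top
 * of the file. */
#include <stdio.h>
#include <string.h>

#define MAX_LINE_LEN 1000

void rebuild_index_from_file(const char *index_file_path, HashTable *index) {
    FILE *fp = fopen(index_file_path, "r");
    if (fp == NULL) {
        return; // nothing to rebuild
    }
    char line[MAX_LINE_LEN];
    while (fgets(line, sizeof(line), fp) != NULL) {
        line[strcspn(line, "\n")] = '\0'; // strip the trailing newline before tokenizing
        index_update(index, line);        // strtok() in index_update mutates this buffer
    }
    fclose(fp);
}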
/* -----------------------------------------------------------------------
   Function Name: initial_index()
   Functionality: Crawler data in the specified directory is scanned and
                  each word is inserted into the Hashtable Index
   Input  <--- directory where the crawler data lives (html from crawled
               pages) and the Hashtable Index
   Output ---> void (updated index)
 ----------------------------------------------------------------------- */
void initial_index(char *argv[], HashTable *Index) {
    int doc_id;
    char **results_filenames = NULL;
    int num_files = GetFilenamesInDir(argv[1], &results_filenames);

    // going through each file, allocating memory, parsing the html, and
    // updating the index with each word found
    for (doc_id = 1; doc_id < num_files; doc_id++) {
        // enough room for the decimal digits of doc_id plus the '\0'
        char *file_name = calloc(1, floor(log10(abs(doc_id))) + 2);
        snprintf(file_name, floor(log10(abs(doc_id))) + 2, "%d", doc_id);

        char *file_path = calloc(1, strlen(argv[1]) + strlen(file_name) + 4);
        snprintf(file_path, strlen(argv[1]) + strlen(file_name) + 4, "./%s/%s", argv[1], file_name);

        if (IsFile(file_path)) {
            // parsing out the first two lines of each file, so only html is left over
            char *content = file_parsing(file_path);

            // normalizing each word in the file and updating the index hashtable with it
            if (content != NULL) {
                int pos = 0;
                char *word;
                while ((pos = GetNextWord(content, pos, &word)) > 0) {
                    NormalizeWord(word);
                    insert_to_index(doc_id, word, Index);
                }
            }
            free(content);
        }
        free(file_name);
        free(file_path);
    }
    free(results_filenames);
}
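/* A sketch of an alternative to the floor(log10(...)) buffer sizing above:
 * snprintf() called with a NULL buffer and size 0 returns the number of
 * characters the formatted integer needs, so the buffer can be sized
 * exactly without <math.h>. doc_id_to_string() is an illustrative helper,
 * not part of this codebase. */
#include <stdio.h>
#include <stdlib.h>

char *doc_id_to_string(int doc_id) {
    int len = snprintf(NULL, 0, "%d", doc_id);  // digits needed, excluding '\0'
    char *file_name = calloc(1, len + 1);
    if (file_name != NULL) {
        snprintf(file_name, len + 1, "%d", doc_id);
    }
    return file_name;  // caller frees
}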
// page_node::insert() dispatches to the leaf or index insertion routine,
// depending on where the node sits in the B-tree.
void page_node::insert(bt_key* key, RID rid, bt_key* itr) {
    if (is_leaf_node())
        insert_to_leaf(key, rid, itr);
    else
        insert_to_index(key, rid, itr);
}