Exemplo n.º 1
0
/*
 * Rebuilds index entries from one line of a previously saved index file.
 *
 * The line is split on single spaces and read as:
 *     <word> <file_count> <doc_id> <freq> [<doc_id> <freq> ...]
 * For every (doc_id, freq) pair, insert_to_index() is called `freq` times
 * with that doc_id and word.
 *
 * Parameters:
 *     index        - hashtable that receives the entries (passed through to
 *                    insert_to_index()).
 *     line_content - writable, NUL-terminated line; it is modified in place
 *                    because it is tokenized with strtok().
 *
 * A NULL or truncated line, or a field that does not start with a decimal
 * number, ends processing early; pairs parsed before that point are kept.
 * Uses strtok(), so it is not reentrant and must not be interleaved with
 * another strtok() tokenization.
 */
void index_update(HashTable *index, char *line_content)
{
    const char delim[] = " ";
    char *end = NULL;

    /* strtok(NULL, ...) would resume some earlier, unrelated tokenization. */
    if (line_content == NULL) {
        return;
    }

    char *word = strtok(line_content, delim);
    char *file_count_token = strtok(NULL, delim);
    if (word == NULL || file_count_token == NULL) {
        return; /* blank line or a word with no document count */
    }

    long file_count = strtol(file_count_token, &end, 10);
    if (end == file_count_token) {
        return; /* count field is not a number */
    }

    /* one (doc_id, freq) pair per document that contains the word */
    for (long i = 0; i < file_count; i++) {
        char *doc_id_token = strtok(NULL, delim);
        char *freq_token = strtok(NULL, delim);
        if (doc_id_token == NULL || freq_token == NULL) {
            break; /* line ended before all announced pairs were read */
        }

        int doc_id = (int)strtol(doc_id_token, &end, 10);
        if (end == doc_id_token) {
            break;
        }

        int freq = (int)strtol(freq_token, &end, 10);
        if (end == freq_token) {
            break;
        }

        /* insert_to_index() is invoked once per occurrence, so replay it
           freq times to restore the stored count */
        for (int j = 0; j < freq; j++) {
            insert_to_index(doc_id, word, index);
        }
    }
}
Exemplo n.º 2
0
/*  -----------------------------------------------------------------------
    Function Name: initial_index()
    Functionality: Walks the crawler data directory named by argv[1] and
    inserts every word of every numbered page file into the Hashtable Index,
    using the file number as the document id.
    Input <--- argv (argv[1] is the directory where the crawler data lives)
        and the Hashtable Index to populate
    Outputs ---> void (Index is updated in place)
    Notes: page files are looked up as "./<argv[1]>/<doc_id>" for doc_id in
        1..num_files inclusive; ids for which IsFile() fails are skipped.
        Does nothing if argv or argv[1] is NULL.
    ----------------------------------------------------------------------- */
void initial_index(char *argv[], HashTable *Index)
{
    if (argv == NULL || argv[1] == NULL) {
        return;
    }

    char **results_filenames = NULL;
    int num_files = GetFilenamesInDir(argv[1], &results_filenames);

    /* Inclusive upper bound: with files numbered from 1, the last one is
       number num_files. A missing file is harmless because of the IsFile()
       check below. */
    for (int doc_id = 1; doc_id <= num_files; doc_id++) {

        /* Let snprintf measure the path instead of estimating the digit
           count with floating-point math. */
        int path_len = snprintf(NULL, 0, "./%s/%d", argv[1], doc_id);
        if (path_len < 0) {
            break;
        }
        size_t path_size = (size_t)path_len + 1; /* +1 for the terminator */

        char *file_path = (char *)calloc(1, path_size);
        if (file_path == NULL) {
            break; /* out of memory: stop and fall through to cleanup */
        }
        snprintf(file_path, path_size, "./%s/%d", argv[1], doc_id);

        if (IsFile(file_path)) {
            /* file_parsing is expected to drop the leading header lines and
               return only the html - confirm against its definition */
            char *content = file_parsing(file_path);

            if (content != NULL) {
                int pos = 0;
                char *word;
                /* normalize each word and record it under this document */
                while ((pos = GetNextWord(content, pos, &word)) > 0) {
                    NormalizeWord(word);
                    insert_to_index(doc_id, word, Index);
                    /* NOTE(review): if GetNextWord allocates `word` and
                       insert_to_index copies it, `word` should be freed
                       here - verify ownership before changing. */
                }
            }
            free(content); /* free(NULL) is a no-op */
        }
        free(file_path);
    }

    /* NOTE(review): only the array is released; if GetFilenamesInDir also
       allocates each entry, those need freeing too - confirm its contract. */
    free(results_filenames);
}
Exemplo n.º 3
0
// Dispatches an insertion according to this node's kind: when
// is_leaf_node() reports true the arguments go to insert_to_leaf(),
// otherwise they go to insert_to_index(). All three arguments are
// forwarded unchanged.
void page_node::insert(bt_key* key, RID rid, bt_key* itr) {
	if (!is_leaf_node()) {
		insert_to_index(key, rid, itr);
		return;
	}
	insert_to_leaf(key, rid, itr);
}