// OK // Ex : hashmap_put(map, 'student.rate', 56); // hashmap_traverse(map, 'student.rate') 56 void hashmap_put(t_hashmap* map, char* path, void* value, Type type) { printf("\n---------------MAP_PUT-------------------"); printf("\nmap->size: %d\tmap->slots*load_factor: %1.1f\tpath: %s\tvalue: %s\ttype: %s", map->size, map->slots * map->load_factor, path, (char*)value,printType(type)); if(map->size >= (map->slots * map->load_factor)){ hashmap_resize(map); printf("\nDEBUG: after resize map->slots: %d\tmap->size: %d\n", map->slots, map->size); } // printf("DEBUG: not resizing\n"); int slot = hashmap_hashcode(path, map->slots); printf("\thashcode: %d\n", slot); t_hashmap_entry** entries = &(map->entries[slot]); // printf("DEBUG: before while\nvalue=%s", ); while ((*entries) != NULL) { if (strcmp((*entries)->key, path) == 0) { (*entries)->value = value; (*entries)->type = type; return; } entries = &((*entries)->next); } (*entries) = hashmap_entry_create(path, value,type); map->size++; }
error_t run_sequential_search( sequential_search_state_t* state, int64_t doc_len, const unsigned char* doc_contents, uint64_t doc_num ) { error_t err; int got; int j; size_t chunk = 1*1024*1024; // 1 MB chunk. size_t overlap = 1024; // handle extra length of up to 1024. result_type_t type = state->type; int keepmatches = state->keepmatches; memset(state->docmatched, 0, state->n_regexps); // start results writers for all of our regexps. for( j = 0; j < state->n_regexps; j++ ) { if( type != 0 ) { err = results_writer_create(&state->regexps[j]->results_writer, type); if( err ) return err; } } for( int64_t i = 0; i < doc_len; i += chunk ) { // scan chunk+overlap bytes with each regular expression. // Run all of the regexps on the input. for( j = 0; j < state->n_regexps; j++ ) { sequential_regexp_query_t* s = state->regexps[j]; size_t cur = 0; size_t end = chunk + overlap; size_t end_no_overlap = chunk; size_t match_start, match_len; if( i + end > doc_len ) end = doc_len - i; if( i + end_no_overlap > doc_len ) end_no_overlap = doc_len - i; if( type == RESULT_TYPE_DOCUMENTS && state->docmatched[j] ) { break; // no matching necessary. } while( cur < end ) { const unsigned char* base = doc_contents + i + cur; size_t len = end - cur; size_t len_no_overlap = end_no_overlap - cur; got = seq_match_regexp(s->matcher, base, len, &match_start, &match_len); if( got && match_start < len_no_overlap ) { // report the result. s->query.count++; if( type == RESULT_TYPE_DOCUMENTS ) { state->docmatched[j] = 1; err = results_writer_append(&s->results_writer, doc_num, 0); if( err ) return err; // once the 1st match is found for a document, we're done! return ERR_NOERR; } else if( type == RESULT_TYPE_DOC_OFFSETS ) { err = results_writer_append(&s->results_writer, doc_num, i + cur + match_start); if( err ) return err; } // if we're doing the matches, add to the matches. if( keepmatches ) { // look for the match in our hashmap. seq_search_match_key_t search_key; hm_entry_t entry; search_key.data = base + match_start; search_key.len = match_len; entry.key = &search_key; entry.value = NULL; if( hashmap_retrieve(& s->matches, &entry) ) { seq_search_match_value_t* v = (seq_search_match_value_t*) entry.value; v->num_matches++; } else { seq_search_match_key_t* key = malloc(sizeof(seq_search_match_key_t)); seq_search_match_value_t* v = malloc(sizeof(seq_search_match_value_t)); unsigned char* data = malloc(search_key.len); if( ! key ) return ERR_MEM; if( ! v ) return ERR_MEM; if( ! data ) return ERR_MEM; v->num_matches = 1; memcpy(data, search_key.data, search_key.len); key->data = data; key->len = search_key.len; // make room for more entries.. err = hashmap_resize(& s->matches); if( err ) return err; entry.key = key; entry.value = v; err = hashmap_insert(& s->matches, &entry); if( err ) return err; } } // advance by one. cur += match_start + 1; } else { cur = end; // no more matches for this pattern. break; } } } } // Boolean query processing will happen in finish query. return ERR_NOERR; }
static error_t do_dedup_file(const char* path) { struct stat st; error_t err; int rc; void* data; int fd; MYHASH_key h; hm_entry_t entry; if( NULL != strchr(path, GLOM_CHAR) ) return ERR_IO_STR_OBJ("Path contains glom character ", path); // Otherwise, get the document length, etc. rc = stat(path, &st); if( rc != 0 ) { return ERR_IO_STR_OBJ("Could not stat", path); } if( ! S_ISREG(st.st_mode) ) { return ERR_IO_STR_OBJ("Not regular file", path); } fd = open(path, O_RDONLY); if( fd < 0 ) { return ERR_IO_STR_OBJ("Could not open", path); } if( st.st_size > 0 ) { data = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); if( data == NULL || data == MAP_FAILED ) { return ERR_IO_STR_OBJ("Could not mmap", path); } // madvise sequential. err = advise_sequential_pages(data, st.st_size); warn_if_err(err); // failed madvise does not cause total failure. } else { // A size 0 file! data = NULL; } // Deduplicate this file! memset(&h, 0, sizeof(MYHASH_key)); SHA1(data, st.st_size, &h.h[0]); //printf("chk "); MYHASH_print(&h, path); // Populate the hashtable. entry.key = &h; entry.value = NULL; if( hashmap_retrieve(&dedup_table, &entry) ) { MYHASH_key* k = (MYHASH_key*) entry.key; MYHASH_value* v = (MYHASH_value*) entry.value; // Got an entry // Glom path into the existing hash entry. // foo\0 -> foo|bar\0 err = append_array(&v->npaths, &v->paths, sizeof(char*), &path); if( err ) return err; // No need to reinsert since we just updated the value. // Add this path to the dups hashtable. entry.key = (void*) path; entry.value = NULL; err = hashmap_resize(&dups); if( err ) return err; err = hashmap_insert(&dups, &entry); if( err ) return err; printf("dup "); MYHASH_print(k, path); entry.key = (void*) path; entry.value = NULL; assert( hashmap_retrieve(&dups, &entry) ); assert(entry.value == NULL); } else { MYHASH_key* k = malloc(sizeof(MYHASH_key)); MYHASH_value* v = malloc(sizeof(MYHASH_value)); if( !k ) return ERR_MEM; if( !v ) return ERR_MEM; *k = h; v->npaths = 0; v->paths = NULL; err = append_array(&v->npaths, &v->paths, sizeof(char*), &path); if( err ) return err; // Add this hash to the dedup table. entry.key = k; entry.value = v; err = hashmap_resize(&dedup_table); if( err ) return err; err = hashmap_insert(&dedup_table, &entry); if( err ) return err; entry.key = k; entry.value = NULL; assert( hashmap_retrieve(&dedup_table, &entry) ); assert(entry.value == v); // Add this path to the dups hashtable. entry.key = (void*) path; entry.value = v; err = hashmap_resize(&dups); if( err ) return err; err = hashmap_insert(&dups, &entry); if( err ) return err; printf("new "); MYHASH_print(k, path); entry.key = (void*) path; entry.value = NULL; assert( hashmap_retrieve(&dups, &entry) ); assert(entry.value == v); } if( data ) { rc = munmap(data, st.st_size); if( rc ) { return ERR_IO_STR("Could not munmap"); } } rc = close(fd); if( rc ) { return ERR_IO_STR_OBJ("Could not close", path); } return ERR_NOERR; }