/**
 * Release the buffers associated with one document.
 *
 * doc_info is released with free(); doc_contents is released with
 * munmap() using doc_len as the mapping length. This function does not
 * handle header arrays: doc_header_lens and doc_headers are asserted to
 * be NULL. doc_num, doc_info_len and num_doc_headers are not used here.
 *
 * @return 0 on success, -1 if munmap() fails (a warning is issued through
 *         warn_if_err in that case).
 */
int its_free_doc(int64_t doc_num, int64_t doc_len, int64_t doc_info_len, unsigned char* doc_info, int64_t num_doc_headers, int64_t* doc_header_lens, unsigned char** doc_headers, unsigned char* doc_contents)
{
  assert(doc_header_lens == NULL);
  assert(doc_headers == NULL);

  // free(NULL) is a no-op, so no guard is needed.
  free(doc_info);

  // Nothing mapped: done.
  if( doc_contents == NULL ) return 0;

  if( munmap(doc_contents, doc_len) != 0 ) {
    error_t unmap_err = ERR_IO_STR("Could not munmap");
    warn_if_err(unmap_err);
    return -1;
  }

  return 0;
}
error_t read_file(char* path, void* state) { FILE* f; bwt_state_t* s = (bwt_state_t*) state; prepared_text_t* p = &s->p; error_t err; // First thing, open the file f = fopen(path, "r"); if( ! f ) return ERR_IO_STR("Could not open file"); printf("%s\n", path); // now append the file. err = append_file(p, f, 0, NULL, NULL, strlen(path), (unsigned char*) path); fclose(f); if( err ) return err; return ERR_NOERR; }
/**
 * Unmap the currently mapped data (if any) and reset the file-level
 * mapping state: path, data, data_len and cur_offset.
 *
 * @return 0 on success; the non-zero munmap() result on failure, in which
 *         case a warning is issued and the state is left untouched.
 */
int do_munmap(void)
{
  if( data != NULL ) {
    int rc = munmap(data, data_len);
    if( rc != 0 ) {
      error_t unmap_err = ERR_IO_STR("Could not munmap");
      warn_if_err(unmap_err);
      return rc;
    }
  }

  // free(NULL) is a no-op, so no guard is needed.
  free(path);

  path = NULL;
  data = NULL;
  data_len = 0;
  cur_offset = 0;

  return 0;
}
static error_t go_down(file_find_state_t* s) { error_t err; struct stat stats; while( 1 ) { err = set_curpath(s); if( err ) return err; // Just starting out with root cur_root.. err = stat(s->cur_path, &stats); if( err ) { fprintf(stderr, "Cannot stat file at path '%s'\n", s->cur_path); return ERR_IO_STR("Could not stat file"); } if( ! S_ISDIR(stats.st_mode) ) { // OK! Not a directory. return ERR_NOERR; } else { // It's a directory. char** names = NULL; int names_count = 0; char* name = NULL; DIR* dir; struct dirent* ent; int idx; // it's a directory! dir = opendir(s->cur_path); if( ! dir ) return ERR_IO_UNK; while( (ent = readdir(dir)) ) { if( 0 == strcmp(ent->d_name, ".") ) continue; else if( 0 == strcmp(ent->d_name, "..") ) continue; name = strdup(ent->d_name); if( ! name ) return ERR_MEM; err = append_array(&names_count, &names, sizeof(char*), &name); if( err ) return err; } closedir(dir); // If we have no names, go up. if( names_count == 0 ) { // Advance to the next one there, if we're not already at the end. if( s->names_stack[s->names_depth][s->names_i[s->names_depth]] ) { s->names_i[s->names_depth]++; } err = go_up(s); if( err ) return err; if( ! s->names_stack[s->names_depth][s->names_i[s->names_depth]] ) { // We're at the end! return ERR_NOERR; } } else { // Sort the names qsort(names, names_count, sizeof(char*), cmpstringp); // Always append a NULL. name = NULL; err = append_array(&names_count, &names, sizeof(char*), &name); if( err ) return err; idx = 0; s->names_depth++; // put names at the end of the names stack, and set index=0. if( s->names_depth < s->names_stack_size ) { s->names_stack[s->names_depth] = names; s->names_i[s->names_depth] = idx; } else { err = append_array(&s->names_stack_size, &s->names_stack, sizeof(char**), &names); if( err ) return err; err = append_array(&s->names_i_size, &s->names_i, sizeof(int), &idx); if( err ) return err; } assert( s->names_stack_size == s->names_i_size ); } } } }
static error_t do_dedup_file(const char* path) { struct stat st; error_t err; int rc; void* data; int fd; MYHASH_key h; hm_entry_t entry; if( NULL != strchr(path, GLOM_CHAR) ) return ERR_IO_STR_OBJ("Path contains glom character ", path); // Otherwise, get the document length, etc. rc = stat(path, &st); if( rc != 0 ) { return ERR_IO_STR_OBJ("Could not stat", path); } if( ! S_ISREG(st.st_mode) ) { return ERR_IO_STR_OBJ("Not regular file", path); } fd = open(path, O_RDONLY); if( fd < 0 ) { return ERR_IO_STR_OBJ("Could not open", path); } if( st.st_size > 0 ) { data = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); if( data == NULL || data == MAP_FAILED ) { return ERR_IO_STR_OBJ("Could not mmap", path); } // madvise sequential. err = advise_sequential_pages(data, st.st_size); warn_if_err(err); // failed madvise does not cause total failure. } else { // A size 0 file! data = NULL; } // Deduplicate this file! memset(&h, 0, sizeof(MYHASH_key)); SHA1(data, st.st_size, &h.h[0]); //printf("chk "); MYHASH_print(&h, path); // Populate the hashtable. entry.key = &h; entry.value = NULL; if( hashmap_retrieve(&dedup_table, &entry) ) { MYHASH_key* k = (MYHASH_key*) entry.key; MYHASH_value* v = (MYHASH_value*) entry.value; // Got an entry // Glom path into the existing hash entry. // foo\0 -> foo|bar\0 err = append_array(&v->npaths, &v->paths, sizeof(char*), &path); if( err ) return err; // No need to reinsert since we just updated the value. // Add this path to the dups hashtable. 
entry.key = (void*) path; entry.value = NULL; err = hashmap_resize(&dups); if( err ) return err; err = hashmap_insert(&dups, &entry); if( err ) return err; printf("dup "); MYHASH_print(k, path); entry.key = (void*) path; entry.value = NULL; assert( hashmap_retrieve(&dups, &entry) ); assert(entry.value == NULL); } else { MYHASH_key* k = malloc(sizeof(MYHASH_key)); MYHASH_value* v = malloc(sizeof(MYHASH_value)); if( !k ) return ERR_MEM; if( !v ) return ERR_MEM; *k = h; v->npaths = 0; v->paths = NULL; err = append_array(&v->npaths, &v->paths, sizeof(char*), &path); if( err ) return err; // Add this hash to the dedup table. entry.key = k; entry.value = v; err = hashmap_resize(&dedup_table); if( err ) return err; err = hashmap_insert(&dedup_table, &entry); if( err ) return err; entry.key = k; entry.value = NULL; assert( hashmap_retrieve(&dedup_table, &entry) ); assert(entry.value == v); // Add this path to the dups hashtable. entry.key = (void*) path; entry.value = v; err = hashmap_resize(&dups); if( err ) return err; err = hashmap_insert(&dups, &entry); if( err ) return err; printf("new "); MYHASH_print(k, path); entry.key = (void*) path; entry.value = NULL; assert( hashmap_retrieve(&dups, &entry) ); assert(entry.value == v); } if( data ) { rc = munmap(data, st.st_size); if( rc ) { return ERR_IO_STR("Could not munmap"); } } rc = close(fd); if( rc ) { return ERR_IO_STR_OBJ("Could not close", path); } return ERR_NOERR; }