/*
 * Fetch the next path from file_find_state that should be processed.
 *
 * When the `dedup` flag (defined outside this function) is off, the first
 * path returned by file_find_get_next() is handed back unchanged.
 * When it is on, each path is looked up in the `dups` hashmap:
 *   - found with a non-NULL value: treated as the canonical copy; the
 *     path is returned together with that value;
 *   - found with a NULL value: the path is freed and skipped;
 *   - not found: a diagnostic is printed, the path is freed and skipped.
 *
 * path_out  receives the selected path, or whatever file_find_get_next()
 *           left in it (NULL when nothing more was produced). A returned
 *           path is not freed here, so the caller is responsible for it.
 * dups_out  receives the hashmap value for the selected path, or NULL.
 *
 * Returns the error reported by file_find_get_next(), which is falsy
 * when the call succeeded.
 */
static error_t get_next_non_dup_path(char** path_out, MYHASH_value** dups_out)
{
  error_t status = ERR_NOERR;
  char* candidate = NULL;
  MYHASH_value* canonical = NULL;
  hm_entry_t lookup;

  for( ;; ) {
    candidate = NULL;
    canonical = NULL;

    status = file_find_get_next(&file_find_state, &candidate);

    // Stop on error, on end of input, or when no dedup filtering is
    // requested (the first path found is good enough in that case).
    if( status || !candidate || !dedup ) break;

    lookup.key = candidate;
    lookup.value = NULL;

    if( hashmap_retrieve(&dups, &lookup) ) {
      if( lookup.value ) {
        // Canonical version!
        canonical = (MYHASH_value*) lookup.value;
        break;
      }
      // A NULL value means this path is skipped; keep looping.
    } else {
      printf("Missing from dups hashmap? Data added? %s\n", candidate);
    }

    // This path is being skipped, so release it before trying the next.
    free(candidate);
  }

  *path_out = candidate;
  *dups_out = canonical;
  return status;
}
/*
 * Run the dedup pass: pull every path out of `s` and hand each one to
 * do_dedup_file().
 *
 * s  file-find state to iterate over.
 *
 * Returns 1 immediately when the `dedup` flag (defined outside this
 * function) is off, the first error reported by file_find_get_next() or
 * do_dedup_file(), or ERR_NOERR once the iterator yields no more paths.
 *
 * NOTE(review): the literal 1 returned when dedup is disabled is not a
 * named error code -- confirm what callers expect here.
 */
static error_t do_dedup(file_find_state_t* s)
{
  if( ! dedup ) return 1;

  for( ;; ) {
    char* path = NULL;
    error_t err = file_find_get_next(s, &path);

    if( err ) return err;
    if( path == NULL ) return ERR_NOERR; // iterator exhausted

    // Dedup this file.
    err = do_dedup_file(path);
    if( err ) return err;

    // path is deliberately NOT freed: hashtable entries are left
    // pointing at it.
  }
}
int its_get_doc(int64_t doc_num, int64_t* doc_len_out, int64_t* doc_info_len_out, unsigned char** doc_info_out, int64_t* num_doc_headers_out, int64_t** doc_header_lens_out, unsigned char*** doc_headers_out, unsigned char** doc_contents_out) { char* got_path; error_t err; int rc; size_t info_len, doc_len; unsigned char* info_data; unsigned char* doc_data; size_t next_offset; // First, move on to the next file if we need to. while( !data || cur_offset >= data_len ) { got_path = NULL; err = file_find_get_next(&file_find_state, &got_path); if( err ) { warn_if_err(err); return -1; } // No more documents! if( !got_path ) return 0; rc = do_mmap(got_path); if( rc ) return rc; } // Now, get some data from the current file. rc = parse_data(data, data_len, cur_offset, &info_len, &info_data, &doc_len, &doc_data, &next_offset); if( rc ) return rc; cur_offset = next_offset; *doc_len_out = doc_len; *doc_info_len_out = info_len; *doc_info_out = info_data; *num_doc_headers_out = 0; *doc_header_lens_out = NULL; *doc_headers_out = NULL; *doc_contents_out = doc_data; return 1; }
/*
 * Call `fun` once for every path produced by the file-find iterator.
 *
 * s      file-find state to iterate over.
 * fun    callback invoked as fun(path, state) for each path.
 * state  opaque pointer passed through to `fun` unchanged.
 *
 * Each path is freed right after the callback returns, so `fun` must not
 * keep the pointer. Returns the first error from file_find_get_next(),
 * or ERR_NOERR once the iterator yields no more paths.
 */
error_t file_find(file_find_state_t* s, file_find_func_t fun, void* state)
{
  error_t err;
  char* path = NULL;

  while( !(err = file_find_get_next(s, &path)) && path != NULL ) {
    fun(path, state);
    free(path);
    path = NULL; // reset before the next fetch
  }

  if( err ) return err;
  return ERR_NOERR;
}