/**
 * Check set_remove() against the string set built by generate_set().
 *
 * Removing a key that is present must succeed and shrink the set by exactly
 * one entry; removing a key that is absent must fail and leave the entry
 * count untouched.
 */
void test_set_remove (void)
{
    Set *set = generate_set();
    unsigned int expected = set_num_entries(set);
    char key[10];
    int n;

    assert(expected == 10000);

    /* Keys "4000" .. "5999": each one is removed exactly once. */
    n = 4000;
    while (n < 6000) {
        sprintf(key, "%i", n);

        assert(set_query(set, key) != 0);               /* present beforehand */
        assert(set_remove(set, key) != 0);              /* removal succeeds */
        assert(set_num_entries(set) == expected - 1);   /* count drops by one */
        assert(set_query(set, key) == 0);               /* gone afterwards */

        --expected;
        ++n;
    }

    /* Negative keys: removal must be rejected and change nothing. */
    n = -1000;
    while (n < -500) {
        sprintf(key, "%i", n);
        assert(set_remove(set, key) == 0);
        assert(set_num_entries(set) == expected);
        ++n;
    }

    /* Keys beyond the generated range: same expectation. */
    n = 50000;
    while (n < 51000) {
        sprintf(key, "%i", n);
        assert(set_remove(set, key) == 0);
        assert(set_num_entries(set) == expected);
        ++n;
    }

    set_free(set);
}
/**
 * Check that set_insert() reports failure, and leaves the entry count
 * unchanged, when the test allocator's limit is set to 0.
 *
 * Two situations are covered: an ordinary insert, and the insert that is
 * expected to trigger a table enlargement.
 */
void test_set_out_of_memory (void)
{
    Set *set = set_new(int_hash, int_equal);
    int storage[66];
    unsigned int count;

    /* Ordinary insert with the allocation limit at 0: must fail cleanly. */
    alloc_test_set_limit(0);
    storage[0] = 0;
    assert(set_insert(set, &storage[0]) == 0);
    assert(set_num_entries(set) == 0);
    alloc_test_set_limit(-1);

    /* Failure while growing the table.
     * The initial table size is 193 entries and the table is enlarged
     * once it is 1/3 full, so the first 65 inserts are expected to
     * succeed and the 66th is the one that needs to grow the table. */
    count = 0;
    while (count < 65) {
        storage[count] = (int) count;
        assert(set_insert(set, &storage[count]) != 0);
        assert(set_num_entries(set) == count + 1);
        ++count;
    }

    assert(set_num_entries(set) == 65);

    /* The 66th insert, attempted with the limit back at 0, must fail. */
    alloc_test_set_limit(0);
    storage[65] = 65;
    assert(set_insert(set, &storage[65]) == 0);
    assert(set_num_entries(set) == 65);

    set_free(set);
}
/**
 * Check set_intersection() on two overlapping integer sets.
 *
 * The intersection of {1..7} and {5..11} must contain exactly 5, 6 and 7.
 * The test then checks that set_intersection() returns NULL under two
 * restricted allocation limits, and that the allocation counter is
 * unchanged after the second failure.
 */
void test_set_intersection (void)
{
    enum { INPUT_COUNT = 7, COMMON_COUNT = 3 };

    int left_values[] = {1, 2, 3, 4, 5, 6, 7};
    int right_values[] = {5, 6, 7, 8, 9, 10, 11};
    int common_values[] = {5, 6, 7};
    Set *left;
    Set *right;
    Set *common;
    size_t baseline;
    int k;

    /* Populate the first operand */
    left = set_new(int_hash, int_equal);
    k = 0;
    while (k < INPUT_COUNT) {
        set_insert(left, &left_values[k]);
        ++k;
    }

    /* Populate the second operand */
    right = set_new(int_hash, int_equal);
    k = 0;
    while (k < INPUT_COUNT) {
        set_insert(right, &right_values[k]);
        ++k;
    }

    /* The intersection holds exactly the shared values */
    common = set_intersection(left, right);
    assert(set_num_entries(common) == 3);

    k = 0;
    while (k < COMMON_COUNT) {
        assert(set_query(common, &common_values[k]) != 0);
        ++k;
    }

    /* Out of memory: limit of 0 */
    alloc_test_set_limit(0);
    assert(set_intersection(left, right) == NULL);

    /* Can allocate set, can't copy all values; the allocation counter
     * must read the same after the failed call as before it. */
    alloc_test_set_limit(2 + 2);
    baseline = alloc_test_get_allocated();
    assert(set_intersection(left, right) == NULL);
    assert(alloc_test_get_allocated() == baseline);

    set_free(left);
    set_free(right);
    set_free(common);
}
/**
 * Check that entries can be removed from a set while it is being iterated.
 *
 * Walks the set built by generate_set(), removing every value whose
 * numeric form is a multiple of 100, and verifies that all 10000 values
 * were visited, 100 were removed, and the entry count fell accordingly.
 */
void test_set_iterating_remove (void)
{
    Set *set = generate_set();
    SetIterator cursor;
    int visited = 0;
    unsigned int removed = 0;
    char *entry;

    /* Visit every value; 'visited' advances once per value seen. */
    for (set_iterate(set, &cursor); set_iter_has_more(&cursor); ++visited) {
        entry = set_iter_next(&cursor);

        if ((atoi(entry) % 100) != 0) {
            continue;
        }

        /* Multiple of 100: remove it mid-iteration */
        set_remove(set, entry);
        ++removed;
    }

    /* Final tallies */
    assert(visited == 10000);
    assert(removed == 100);
    assert(set_num_entries(set) == 10000 - removed);

    set_free(set);
}
/**
 * Check that set_insert() does not store the same value twice.
 *
 * Inserts {1..6} followed by {5..10}; the two inputs share 5 and 6, so
 * the resulting set must report 10 entries rather than 12.
 */
void test_set_insert (void)
{
    int first[] = { 1, 2, 3, 4, 5, 6 };
    int second[] = { 5, 6, 7, 8, 9, 10 };
    int *item;
    Set *set = set_new(int_hash, int_equal);

    /* Build the union: all of 'first', then all of 'second'. */
    for (item = first; item != first + 6; ++item) {
        set_insert(set, item);
    }

    for (item = second; item != second + 6; ++item) {
        set_insert(set, item);
    }

    assert(set_num_entries(set) == 10);

    set_free(set);
}
/**
 * Build a string set holding the decimal representations of 0 .. 9999.
 *
 * Each value is a separately allocated copy (strdup). free() is registered
 * as the set's free function before returning, so the caller only needs
 * to call set_free() on the result.
 *
 * @return The newly created set containing 10000 entries.
 */
Set* generate_set (void)
{
    Set* set;
    char buf[10];
    unsigned int i;
    char* value;

    set = set_new (string_hash, string_equal);

    /* Add 10,000 items sequentially, checking that the counter
     * works properly */
    for (i = 0; i < 10000; ++i) {
        /* 'i' is unsigned, so format it with %u; the size bound keeps
         * the write inside 'buf' regardless of the loop limit. */
        snprintf (buf, sizeof buf, "%u", i);

        value = strdup (buf);
        assert (value != NULL);

        set_insert (set, value);

        assert (set_num_entries (set) == i + 1);
    }

    set_register_free_function (set, free);

    return set;
}
int main(int argc, char *argv[]) { size_t path_len, total_files; off_t bytes_wasted, total_wasted; char path_buffer[PATH_MAX_LEN], *hash_value; struct file_entry_t *file_entry, *trie_entry; SListIterator slist_iterator; SetIterator set_iterator; /* Step 0: Session data */ struct file_info_t file_info; clear_info(&file_info); /* Step 1: Parse arguments */ while (--argc) { /* Being unable to record implies insufficient resources */ if (!record(argv[argc], &file_info)){ fprintf(stderr, "[FATAL] out of memory\n"); destroy_info(&file_info); return (EXIT_FAILURE); } } /* Step 2: Fully explore any directories specified */ #ifndef NDEBUG printf("[DEBUG] Creating file list...\n"); #endif while (slist_length(file_info.file_stack) > 0) { /* Pick off the top of the file stack */ file_entry = (struct file_entry_t *)(slist_data(file_info.file_stack)); slist_remove_entry(&file_info.file_stack, file_info.file_stack); assert(file_entry->type == DIRECTORY); /* Copy the basename to a buffer */ memset(path_buffer, '\0', PATH_MAX_LEN); path_len = strnlen(file_entry->path, PATH_MAX_LEN); memcpy(path_buffer, file_entry->path, path_len); /* Ignore cases that would cause overflow */ if (path_len < PATH_MAX_LEN) { /* Append a trailing slash */ path_buffer[path_len] = '/'; /* Record all contents (may push onto file stack or one of the lists) */ DIR *directory = opendir(file_entry->path); if (traverse(&file_info, directory, path_buffer, ++path_len)) { fprintf(stderr, "[FATAL] out of memory\n"); destroy_info(&file_info); return (EXIT_FAILURE); } else if (closedir(directory)) { fprintf(stderr, "[WARNING] '%s' (close failed)\n", file_entry->path); } } /* Discard this entry */ destroy_entry(file_entry); } /* Step 3: Warn about any ignored files */ if (slist_length(file_info.bad_files) > 0) { slist_iterate(&file_info.bad_files, &slist_iterator); while (slist_iter_has_more(&slist_iterator)) { file_entry = slist_iter_next(&slist_iterator); fprintf(stderr, "[WARNING] '%s' ", file_entry->path); switch 
(file_entry->type) { case INVALID: ++file_info.invalid_files; fprintf(stderr, "(invalid file)\n"); break; case INACCESSIBLE: ++file_info.protected_files; fprintf(stderr, "(protected file)\n"); break; default: ++file_info.irregular_files; fprintf(stderr, "(irregular file)\n"); break; } } fprintf(stderr, "[WARNING] %lu file(s) ignored\n", (long unsigned)(num_errors(&file_info))); } #ifndef NDEBUG if (num_errors(&file_info) > 0) { fprintf(stderr, "[FATAL] cannot parse entire file tree\n"); destroy_info(&file_info); return (EXIT_FAILURE); } printf("[DEBUG] Found %lu / %lu valid files\n", (unsigned long)(num_files(&file_info)), (unsigned long)(file_info.total_files)); #endif /* Step 4: Begin the filtering process */ #ifndef NDEBUG printf("[DEBUG] Creating file table...\n"); #endif if (slist_length(file_info.good_files) > 0) { file_info.hash_trie = trie_new(); file_info.shash_trie = trie_new(); optimize_filter(&file_info); /* Extract each file from the list (they should all be regular) */ slist_iterate(&file_info.good_files, &slist_iterator); while (slist_iter_has_more(&slist_iterator)) { file_entry = slist_iter_next(&slist_iterator); assert(file_entry->type == REGULAR); /* Perform a "shallow" hash of the file */ hash_value = hash_entry(file_entry, SHALLOW); #ifndef NDEBUG printf("[SHASH] %s\t*%s\n", file_entry->path, hash_value); #endif /* Check to see if we might have seen this file before */ if (bloom_filter_query(file_info.shash_filter, hash_value)) { /* Get the full hash of the new file */ hash_value = hash_entry(file_entry, FULL); #ifndef NDEBUG printf("[+HASH] %s\t*%s\n", file_entry->path, hash_value); #endif archive(&file_info, file_entry); /* Check to see if bloom failed us */ trie_entry = trie_lookup(file_info.shash_trie, file_entry->shash); if (trie_entry == TRIE_NULL) { #ifndef NDEBUG printf("[DEBUG] '%s' (false positive)\n", file_entry->path); #endif trie_insert(file_info.shash_trie, file_entry->shash, file_entry); } else { /* Get the full hash of the old 
file */ hash_value = hash_entry(trie_entry, FULL); #ifndef NDEBUG if (hash_value) { printf("[-HASH] %s\t*%s\n", trie_entry->path, hash_value); } #endif archive(&file_info, trie_entry); } } else { /* Add a record of this shash to the filter */ bloom_filter_insert(file_info.shash_filter, hash_value); trie_insert(file_info.shash_trie, hash_value, file_entry); } } persist("bloom_store", &file_info); } /* Step 5: Output results and cleanup before exit */ printf("[EXTRA] Found %lu sets of duplicates...\n", (unsigned long)(slist_length(file_info.duplicates))); slist_iterate(&file_info.duplicates, &slist_iterator); for (total_files = total_wasted = bytes_wasted = 0; slist_iter_has_more(&slist_iterator); total_wasted += bytes_wasted) { Set *set = slist_iter_next(&slist_iterator); int size = set_num_entries(set); if (size < 2) { continue; } printf("[EXTRA] %lu files (w/ same hash):\n", (unsigned long)(size)); set_iterate(set, &set_iterator); for (bytes_wasted = 0; set_iter_has_more(&set_iterator); bytes_wasted += file_entry->size, ++total_files) { file_entry = set_iter_next(&set_iterator); printf("\t%s (%lu bytes)\n", file_entry->path, (unsigned long)(file_entry->size)); } } printf("[EXTRA] %lu bytes in %lu files (wasted)\n", (unsigned long)(total_wasted), (unsigned long)(total_files)); destroy_info(&file_info); return (EXIT_SUCCESS); }