Set *set_union(Set *set1, Set *set2) { SetIterator iterator; Set *new_set; SetValue value; new_set = set_new(set1->hash_func, set1->equal_func); if (new_set == NULL) { return NULL; } /* Add all values from the first set */ set_iterate(set1, &iterator); while (set_iter_has_more(&iterator)) { /* Read the next value */ value = set_iter_next(&iterator); /* Copy the value into the new set */ if (!set_insert(new_set, value)) { /* Failed to insert */ set_free(new_set); return NULL; } } /* Add all values from the second set */ set_iterate(set2, &iterator); while (set_iter_has_more(&iterator)) { /* Read the next value */ value = set_iter_next(&iterator); /* Has this value been put into the new set already? * If so, do not insert this again */ if (set_query(new_set, value) == 0) { if (!set_insert(new_set, value)) { /* Failed to insert */ set_free(new_set); return NULL; } } } return new_set; }
void test_set_iterating (void) { Set* set; SetIterator iterator; int count; set = generate_set(); /* Iterate over all values in the set */ count = 0; set_iterate (set, &iterator); while (set_iter_has_more (&iterator)) { set_iter_next (&iterator); ++count; } /* Test iter_next after iteration has completed. */ assert (set_iter_next (&iterator) == NULL); /* Check final count */ assert (count == 10000); set_free (set); /* Test iterating over an empty set */ set = set_new (int_hash, int_equal); set_iterate (set, &iterator); assert (set_iter_has_more (&iterator) == 0); set_free (set); }
Set *set_intersection(Set *set1, Set *set2) { Set *new_set; SetIterator iterator; SetValue value; new_set = set_new(set1->hash_func, set2->equal_func); if (new_set == NULL) { return NULL; } /* Iterate over all values in set 1. */ set_iterate(set1, &iterator); while (set_iter_has_more(&iterator)) { /* Get the next value */ value = set_iter_next(&iterator); /* Is this value in set 2 as well? If so, it should be * in the new set. */ if (set_query(set2, value) != 0) { /* Copy the value first before inserting, * if necessary */ if (!set_insert(new_set, value)) { set_free(new_set); return NULL; } } } return new_set; }
void test_set_iterating_remove (void) { Set* set; SetIterator iterator; int count; unsigned int removed; char* value; set = generate_set(); count = 0; removed = 0; /* Iterate over all values in the set */ set_iterate (set, &iterator); while (set_iter_has_more (&iterator)) { value = set_iter_next (&iterator); if ( (atoi (value) % 100) == 0) { /* Remove this value */ set_remove (set, value); ++removed; } ++count; } /* Check final counts */ assert (count == 10000); assert (removed == 100); assert (set_num_entries (set) == 10000 - removed); set_free (set); }
int main(int argc, char *argv[]) { size_t path_len, total_files; off_t bytes_wasted, total_wasted; char path_buffer[PATH_MAX_LEN], *hash_value; struct file_entry_t *file_entry, *trie_entry; SListIterator slist_iterator; SetIterator set_iterator; /* Step 0: Session data */ struct file_info_t file_info; clear_info(&file_info); /* Step 1: Parse arguments */ while (--argc) { /* Being unable to record implies insufficient resources */ if (!record(argv[argc], &file_info)){ fprintf(stderr, "[FATAL] out of memory\n"); destroy_info(&file_info); return (EXIT_FAILURE); } } /* Step 2: Fully explore any directories specified */ #ifndef NDEBUG printf("[DEBUG] Creating file list...\n"); #endif while (slist_length(file_info.file_stack) > 0) { /* Pick off the top of the file stack */ file_entry = (struct file_entry_t *)(slist_data(file_info.file_stack)); slist_remove_entry(&file_info.file_stack, file_info.file_stack); assert(file_entry->type == DIRECTORY); /* Copy the basename to a buffer */ memset(path_buffer, '\0', PATH_MAX_LEN); path_len = strnlen(file_entry->path, PATH_MAX_LEN); memcpy(path_buffer, file_entry->path, path_len); /* Ignore cases that would cause overflow */ if (path_len < PATH_MAX_LEN) { /* Append a trailing slash */ path_buffer[path_len] = '/'; /* Record all contents (may push onto file stack or one of the lists) */ DIR *directory = opendir(file_entry->path); if (traverse(&file_info, directory, path_buffer, ++path_len)) { fprintf(stderr, "[FATAL] out of memory\n"); destroy_info(&file_info); return (EXIT_FAILURE); } else if (closedir(directory)) { fprintf(stderr, "[WARNING] '%s' (close failed)\n", file_entry->path); } } /* Discard this entry */ destroy_entry(file_entry); } /* Step 3: Warn about any ignored files */ if (slist_length(file_info.bad_files) > 0) { slist_iterate(&file_info.bad_files, &slist_iterator); while (slist_iter_has_more(&slist_iterator)) { file_entry = slist_iter_next(&slist_iterator); fprintf(stderr, "[WARNING] '%s' ", file_entry->path); switch (file_entry->type) { case INVALID: ++file_info.invalid_files; fprintf(stderr, "(invalid file)\n"); break; case INACCESSIBLE: ++file_info.protected_files; fprintf(stderr, "(protected file)\n"); break; default: ++file_info.irregular_files; fprintf(stderr, "(irregular file)\n"); break; } } fprintf(stderr, "[WARNING] %lu file(s) ignored\n", (long unsigned)(num_errors(&file_info))); } #ifndef NDEBUG if (num_errors(&file_info) > 0) { fprintf(stderr, "[FATAL] cannot parse entire file tree\n"); destroy_info(&file_info); return (EXIT_FAILURE); } printf("[DEBUG] Found %lu / %lu valid files\n", (unsigned long)(num_files(&file_info)), (unsigned long)(file_info.total_files)); #endif /* Step 4: Begin the filtering process */ #ifndef NDEBUG printf("[DEBUG] Creating file table...\n"); #endif if (slist_length(file_info.good_files) > 0) { file_info.hash_trie = trie_new(); file_info.shash_trie = trie_new(); optimize_filter(&file_info); /* Extract each file from the list (they should all be regular) */ slist_iterate(&file_info.good_files, &slist_iterator); while (slist_iter_has_more(&slist_iterator)) { file_entry = slist_iter_next(&slist_iterator); assert(file_entry->type == REGULAR); /* Perform a "shallow" hash of the file */ hash_value = hash_entry(file_entry, SHALLOW); #ifndef NDEBUG printf("[SHASH] %s\t*%s\n", file_entry->path, hash_value); #endif /* Check to see if we might have seen this file before */ if (bloom_filter_query(file_info.shash_filter, hash_value)) { /* Get the full hash of the new file */ hash_value = hash_entry(file_entry, FULL); #ifndef NDEBUG printf("[+HASH] %s\t*%s\n", file_entry->path, hash_value); #endif archive(&file_info, file_entry); /* Check to see if bloom failed us */ trie_entry = trie_lookup(file_info.shash_trie, file_entry->shash); if (trie_entry == TRIE_NULL) { #ifndef NDEBUG printf("[DEBUG] '%s' (false positive)\n", file_entry->path); #endif trie_insert(file_info.shash_trie, file_entry->shash, file_entry); } else { /* Get the full hash of the old file */ hash_value = hash_entry(trie_entry, FULL); #ifndef NDEBUG if (hash_value) { printf("[-HASH] %s\t*%s\n", trie_entry->path, hash_value); } #endif archive(&file_info, trie_entry); } } else { /* Add a record of this shash to the filter */ bloom_filter_insert(file_info.shash_filter, hash_value); trie_insert(file_info.shash_trie, hash_value, file_entry); } } persist("bloom_store", &file_info); } /* Step 5: Output results and cleanup before exit */ printf("[EXTRA] Found %lu sets of duplicates...\n", (unsigned long)(slist_length(file_info.duplicates))); slist_iterate(&file_info.duplicates, &slist_iterator); for (total_files = total_wasted = bytes_wasted = 0; slist_iter_has_more(&slist_iterator); total_wasted += bytes_wasted) { Set *set = slist_iter_next(&slist_iterator); int size = set_num_entries(set); if (size < 2) { continue; } printf("[EXTRA] %lu files (w/ same hash):\n", (unsigned long)(size)); set_iterate(set, &set_iterator); for (bytes_wasted = 0; set_iter_has_more(&set_iterator); bytes_wasted += file_entry->size, ++total_files) { file_entry = set_iter_next(&set_iterator); printf("\t%s (%lu bytes)\n", file_entry->path, (unsigned long)(file_entry->size)); } } printf("[EXTRA] %lu bytes in %lu files (wasted)\n", (unsigned long)(total_wasted), (unsigned long)(total_files)); destroy_info(&file_info); return (EXIT_SUCCESS); }