Beispiel #1
/**
 * Exercise set_remove().
 *
 * Builds the 10,000-entry string set from generate_set(), removes the
 * values "4000".."5999" one at a time (checking membership and the entry
 * count before and after each removal), and then verifies that attempts
 * to remove values that were never inserted fail and leave the count
 * unchanged.
 */
void test_set_remove (void) {
  Set* set;
  char buf[10];
  int i;
  unsigned int num_entries;

  set = generate_set();

  num_entries = set_num_entries (set);
  assert (num_entries == 10000);

  /* Remove some entries */

  for (i = 4000; i < 6000; ++i) {

    snprintf (buf, sizeof buf, "%i", i);

    /* Check this is in the set */

    assert (set_query (set, buf) != 0);

    /* Remove it */

    assert (set_remove (set, buf) != 0);

    /* Check the number of entries decreases */

    assert (set_num_entries (set) == num_entries - 1);

    /* Check it is no longer in the set */

    assert (set_query (set, buf) == 0);

    /* Track the expected size so the next iteration (and the
     * invalid-removal checks below) compare against the right value */

    --num_entries;
  }

  /* Try to remove some invalid entries */

  for (i = -1000; i < -500; ++i) {
    snprintf (buf, sizeof buf, "%i", i);

    assert (set_remove (set, buf) == 0);
    assert (set_num_entries (set) == num_entries);
  }

  for (i = 50000; i < 51000; ++i) {
    snprintf (buf, sizeof buf, "%i", i);

    assert (set_remove (set, buf) == 0);
    assert (set_num_entries (set) == num_entries);
  }

  set_free (set);
}
Beispiel #2
/**
 * Check that set_insert() reports failure, and leaves the entry count
 * unchanged, when the allocation test harness refuses further
 * allocations: once on the very first insert, and once on the insert
 * that would force the table to grow.
 */
void test_set_out_of_memory (void) {
  Set* set;
  int values[66];
  unsigned int i;

  set = set_new (int_hash, int_equal);

  /* Test normal failure */

  alloc_test_set_limit (0);
  values[0] = 0;
  assert (set_insert (set, &values[0]) == 0);
  assert (set_num_entries (set) == 0);

  alloc_test_set_limit (-1);

  /* Test failure when increasing table size.
   * The initial table size is 193 entries.  The table increases in
   * size when 1/3 full, so the 66th entry should cause the insert
   * to fail. */

  for (i = 0; i < 65; ++i) {
    values[i] = (int) i;

    assert (set_insert (set, &values[i]) != 0);
    assert (set_num_entries (set) == i + 1);
  }

  assert (set_num_entries (set) == 65);

  /* Test the 66th insert */

  alloc_test_set_limit (0);

  values[65] = 65;

  assert (set_insert (set, &values[65]) == 0);
  assert (set_num_entries (set) == 65);

  set_free (set);
}
Beispiel #3
/**
 * Exercise set_intersection().
 *
 * Intersects {1..7} with {5..11} and checks the result holds exactly
 * 5, 6 and 7.  Then repeats the call under allocation limits to check
 * that it returns NULL on failure and, in the partial-failure case,
 * that the allocated byte count is unchanged afterwards.
 */
void test_set_intersection (void) {
  int numbers1[] = {1, 2, 3, 4, 5, 6, 7};
  int numbers2[] = {5, 6, 7, 8, 9, 10, 11};
  int result[] = {5, 6, 7};
  int i;
  Set* set1;
  Set* set2;
  Set* result_set;
  size_t allocated;

  /* Create the first set */

  set1 = set_new (int_hash, int_equal);

  for (i = 0; i < 7; ++i) {
    set_insert (set1, &numbers1[i]);
  }

  /* Create the second set */

  set2 = set_new (int_hash, int_equal);

  for (i = 0; i < 7; ++i) {
    set_insert (set2, &numbers2[i]);
  }

  /* Perform the intersection */

  result_set = set_intersection (set1, set2);

  /* Fail here rather than dereference a NULL set in the checks below */

  assert (result_set != NULL);
  assert (set_num_entries (result_set) == 3);

  for (i = 0; i < 3; ++i) {
    assert (set_query (result_set, &result[i]) != 0);
  }

  /* Test out of memory scenario */

  alloc_test_set_limit (0);
  assert (set_intersection (set1, set2) == NULL);

  /* Can allocate set, can't copy all values */

  alloc_test_set_limit (2 + 2);
  allocated = alloc_test_get_allocated();
  assert (set_intersection (set1, set2) == NULL);
  assert (alloc_test_get_allocated() == allocated);

  set_free (set1);
  set_free (set2);
  set_free (result_set);
}
Beispiel #4
/**
 * Check that entries can be removed from a set while it is being
 * iterated.
 *
 * Walks the 10,000-entry set from generate_set(), removing every value
 * that is a multiple of 100, and verifies that the iterator still
 * visits all 10,000 values and that exactly 100 were removed.
 */
void test_set_iterating_remove (void) {
  Set* set;
  SetIterator iterator;
  int count;
  unsigned int removed;
  char* value;

  set = generate_set();

  count = 0;
  removed = 0;

  /* Iterate over all values in the set */

  set_iterate (set, &iterator);

  while (set_iter_has_more (&iterator)) {

    value = set_iter_next (&iterator);

    if ( (atoi (value) % 100) == 0) {

      /* Remove this value */

      set_remove (set, value);

      ++removed;
    }

    /* Every visited value counts, whether or not it was removed */

    ++count;
  }

  /* Check final counts */

  assert (count == 10000);
  assert (removed == 100);
  assert (set_num_entries (set) == 10000 - removed);

  set_free (set);
}
Beispiel #5
/**
 * Check that inserting a value already present does not add a second
 * entry: the two six-element arrays share the values 5 and 6, so the
 * set must end up with 10 entries rather than 12.
 */
void test_set_insert (void) {
  Set* set;
  int numbers1[] = { 1, 2, 3, 4, 5, 6 };
  int numbers2[] = { 5, 6, 7, 8, 9, 10 };
  int i;

  /* Perform a union of numbers1 and numbers2.  Cannot add the same
   * value twice. */

  set = set_new (int_hash, int_equal);

  for (i = 0; i < 6; ++i) {
    set_insert (set, &numbers1[i]);
  }

  for (i = 0; i < 6; ++i) {
    set_insert (set, &numbers2[i]);
  }

  assert (set_num_entries (set) == 10);

  set_free (set);
}
Beispiel #6
/**
 * Build the fixture set shared by the tests: a string set holding the
 * decimal representations of 0..9999.
 *
 * Each value is a heap copy made with strdup(); free() is registered on
 * the set as its free function.
 *
 * @return The newly created set.
 */
Set* generate_set (void) {
  Set* set;
  char buf[10];
  unsigned int i;
  char* value;

  set = set_new (string_hash, string_equal);

  /* Add 10,000 items sequentially, checking that the counter
   * works properly */

  for (i = 0; i < 10000; ++i) {
    /* %u matches the unsigned loop counter; output is the same digits */
    snprintf (buf, sizeof buf, "%u", i);
    value = strdup (buf);
    assert (value != NULL);

    set_insert (set, value);

    assert (set_num_entries (set) == i + 1);
  }

  set_register_free_function (set, free);

  return set;
}
Beispiel #7
/*
 * Program entry point for the duplicate-file finder.
 *
 * Records every command-line path, expands the directories on the file
 * stack, warns about ignored files, hashes the good files (a shallow hash
 * first, a full hash when the bloom filter reports a possible match) and
 * finally prints the sets of duplicates with the bytes they waste.
 *
 * Returns EXIT_FAILURE when record() or traverse() report failure,
 * EXIT_SUCCESS otherwise.
 *
 * NOTE(review): this listing appears to have lost lines (the opening and
 * closing braces, #endif directives, case labels, loop conditions and at
 * least one printf argument), and no return type is written before the
 * function name.  It will not compile as shown; restore the missing lines
 * from the original file before building.
 */
main(int argc, char *argv[])
	size_t path_len, total_files;
	off_t bytes_wasted, total_wasted;
	char path_buffer[PATH_MAX_LEN], *hash_value;
	struct file_entry_t *file_entry, *trie_entry;

	SListIterator slist_iterator;
	SetIterator set_iterator;

	/* Step 0: Session data */
	/* NOTE(review): no initialisation of file_info is visible here;
	 * presumably it happened on a line that was lost -- confirm. */
	struct file_info_t file_info;

	/* Step 1: Parse arguments */
	/* Walks argv from the last argument down to argv[1]; the loop stops
	 * when argc reaches 0, so argv[0] is never recorded. */
	while (--argc) {
		/* Being unable to record implies insufficient resources */
		if (!record(argv[argc], &file_info)){
			fprintf(stderr, "[FATAL] out of memory\n");
			return (EXIT_FAILURE);
	/* NOTE(review): the closing braces of the if and while above are
	 * missing from this listing. */

	/* Step 2: Fully explore any directories specified */
	/* NOTE(review): no #endif matching this #ifndef is visible. */
	#ifndef NDEBUG
	printf("[DEBUG] Creating file list...\n");
	while (slist_length(file_info.file_stack) > 0) {
		/* Pick off the top of the file stack */
		file_entry = (struct file_entry_t *)(slist_data(file_info.file_stack));
		slist_remove_entry(&file_info.file_stack, file_info.file_stack);
		assert(file_entry->type == DIRECTORY);
		/* Copy the basename to a buffer */
		memset(path_buffer, '\0', PATH_MAX_LEN);
		path_len = strnlen(file_entry->path, PATH_MAX_LEN);
		memcpy(path_buffer, file_entry->path, path_len);
		/* Ignore cases that would cause overflow */
		/* NOTE(review): when path_len == PATH_MAX_LEN - 1 the slash written
		 * below fills the last byte of path_buffer, leaving it without a
		 * NUL terminator; confirm traverse() tolerates that. */
		if (path_len < PATH_MAX_LEN) {
			/* Append a trailing slash */
			path_buffer[path_len] = '/';
			/* Record all contents (may push onto file stack or one of the lists) */
			/* NOTE(review): the opendir() result is passed to traverse() and
			 * closedir() without a NULL check. */
			DIR *directory = opendir(file_entry->path);
			if (traverse(&file_info, directory, path_buffer, ++path_len)) {
				fprintf(stderr, "[FATAL] out of memory\n");
				return (EXIT_FAILURE);
			} else if (closedir(directory)) {
				fprintf(stderr, "[WARNING] '%s' (close failed)\n", file_entry->path);
		/* Discard this entry */
		/* NOTE(review): the statement this comment describes, and the
		 * closing braces of the blocks above, are missing here. */

	/* Step 3: Warn about any ignored files */
	if (slist_length(file_info.bad_files) > 0) {
		slist_iterate(&file_info.bad_files, &slist_iterator);
		while (slist_iter_has_more(&slist_iterator)) {
			file_entry = slist_iter_next(&slist_iterator);
			fprintf(stderr, "[WARNING] '%s' ", file_entry->path);
			/* NOTE(review): only one case label is visible for the three
			 * messages below; the other labels and the break statements
			 * appear to have been lost. */
			switch (file_entry->type) {
			case INVALID:
				fprintf(stderr, "(invalid file)\n");
				fprintf(stderr, "(protected file)\n");
				fprintf(stderr, "(irregular file)\n");
		fprintf(stderr, "[WARNING] %lu file(s) ignored\n",
			(long unsigned)(num_errors(&file_info)));
	/* NOTE(review): no #endif matching this #ifndef is visible, so it is
	 * unclear which of the following statements are debug-only. */
	#ifndef NDEBUG
	if (num_errors(&file_info) > 0) {
		fprintf(stderr, "[FATAL] cannot parse entire file tree\n");
		return (EXIT_FAILURE);
	printf("[DEBUG] Found %lu / %lu valid files\n",
		(unsigned long)(num_files(&file_info)),
		(unsigned long)(file_info.total_files));

	/* Step 4: Begin the filtering process */
	#ifndef NDEBUG
	printf("[DEBUG] Creating file table...\n");
	if (slist_length(file_info.good_files) > 0) {
		file_info.hash_trie = trie_new();
		file_info.shash_trie = trie_new();
		/* Extract each file from the list (they should all be regular) */
		slist_iterate(&file_info.good_files, &slist_iterator);
		while (slist_iter_has_more(&slist_iterator)) {
			file_entry = slist_iter_next(&slist_iterator);
			assert(file_entry->type == REGULAR);
			/* Perform a "shallow" hash of the file */
			hash_value = hash_entry(file_entry, SHALLOW);
			#ifndef NDEBUG
			printf("[SHASH] %s\t*%s\n", file_entry->path, hash_value);
			/* Check to see if we might have seen this file before */
			if (bloom_filter_query(file_info.shash_filter, hash_value)) {
				/* Get the full hash of the new file */
				hash_value = hash_entry(file_entry, FULL);
				#ifndef NDEBUG
				printf("[+HASH] %s\t*%s\n", file_entry->path, hash_value);
				archive(&file_info, file_entry);
				/* Check to see if bloom failed us */
				trie_entry = trie_lookup(file_info.shash_trie, file_entry->shash);
				if (trie_entry == TRIE_NULL) {
					#ifndef NDEBUG
					printf("[DEBUG] '%s' (false positive)\n", file_entry->path);
					trie_insert(file_info.shash_trie, file_entry->shash, file_entry);
				} else {
					/* Get the full hash of the old file */
					hash_value = hash_entry(trie_entry, FULL);
					#ifndef NDEBUG
					if (hash_value) {
						printf("[-HASH] %s\t*%s\n", trie_entry->path, hash_value);
					archive(&file_info, trie_entry);
			} else {
				/* Add a record of this shash to the filter */
				bloom_filter_insert(file_info.shash_filter, hash_value);
				trie_insert(file_info.shash_trie, hash_value, file_entry);
		persist("bloom_store", &file_info);

	/* Step 5: Output results and cleanup before exit */
	printf("[EXTRA] Found %lu sets of duplicates...\n",
		(unsigned long)(slist_length(file_info.duplicates)));
	slist_iterate(&file_info.duplicates, &slist_iterator);
	/* NOTE(review): only the init and increment clauses of this for header
	 * are visible; the loop condition and the opening brace appear to have
	 * been lost. */
	for (total_files = total_wasted = bytes_wasted = 0;
		total_wasted += bytes_wasted)
		Set *set = slist_iter_next(&slist_iterator);
		int size = set_num_entries(set);
		/* Sets with fewer than two entries are skipped */
		if (size < 2) { continue; }
		printf("[EXTRA] %lu files (w/ same hash):\n", (unsigned long)(size));
		set_iterate(set, &set_iterator);
		/* NOTE(review): the inner for header is also incomplete (its
		 * condition and the end of its increment clause are missing). */
		for (bytes_wasted = 0;
			bytes_wasted += file_entry->size,
			file_entry = set_iter_next(&set_iterator);
			/* NOTE(review): the format string has %s and %lu but only one
			 * argument is visible; the %s argument appears to be missing. */
			printf("\t%s (%lu bytes)\n",
				(unsigned long)(file_entry->size));
	printf("[EXTRA] %lu bytes in %lu files (wasted)\n",
		(unsigned long)(total_wasted),
		(unsigned long)(total_files));
	return (EXIT_SUCCESS);