/* Count the preferred ("original") files in `directory` and all of its
 * children, and register every file's digest in self->file_checks so the
 * final extraction pass can recognize files already covered by a reported
 * duplicate directory.
 *
 * `acc` is the running count threaded through the recursion; top-level
 * callers pass 0.  Returns `acc` plus the number of preferred files in
 * this directory tree.
 */
static gint64 rm_tm_mark_duplicate_files(RmTreeMerger *self, RmDirectory *directory,
                                         gint64 acc) {
    for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
        RmFile *file = iter->data;
        acc += file->is_prefd;

        /* Remember the digest so rm_tm_extract() can tell that this file
         * was already handled as part of a duplicate directory. */
        g_hash_table_insert(self->file_checks, file->digest, file);
    }

    /* Recursively propagate to children.  The accumulator must be threaded
     * through the calls: the previous code discarded the children's return
     * values, so the count never included files below the top directory. */
    for(GList *iter = directory->children.head; iter; iter = iter->next) {
        RmDirectory *child = iter->data;
        acc = rm_tm_mark_duplicate_files(self, child, acc);
    }

    return acc;
}
/* NOTE(review): this 2-argument definition collides with a 3-argument
 * definition of the same name elsewhere in this file; C permits only one
 * definition per name, so one of the two must be removed before this
 * translation unit compiles.  The call in rm_tm_extract() passes three
 * arguments, i.e. it targets the other variant.
 *
 * Count the preferred ("original") files in `directory` and all of its
 * children.  Also registers every file's digest in self->file_checks so
 * the final extraction pass can recognize files already covered by a
 * duplicate directory.  Returns the total preferred-file count.
 */
static gint64 rm_tm_mark_duplicate_files(RmTreeMerger *self, RmDirectory *directory) {
    gint64 acc = 0;

    for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
        RmFile *file = iter->data;
        acc += file->is_prefd;

        /* Register the digest for the later g_hash_table_contains() check
         * in rm_tm_extract().  This insert was missing here, leaving the
         * extraction pass unable to detect already-covered files. */
        g_hash_table_insert(self->file_checks, file->digest, file);
    }

    /* Recursively accumulate the counts of all child directories. */
    for(GList *iter = directory->children.head; iter; iter = iter->next) {
        RmDirectory *child = iter->data;
        acc += rm_tm_mark_duplicate_files(self, child);
    }

    return acc;
}
/* Final extraction pass of the tree merger: report duplicate directory
 * groups, then report any leftover duplicate files that were not fully
 * covered by a duplicate directory.  Mutates session counters and forwards
 * results to the shredder output stage.
 */
static void rm_tm_extract(RmTreeMerger *self) {
    /* Iterate over all directories per hash (which are same therefore) */
    GList *result_table_values = g_hash_table_get_values(self->result_table);
    result_table_values =
        g_list_sort(result_table_values, (GCompareFunc)rm_tm_cmp_directory_groups);

    for(GList *iter = result_table_values; iter; iter = iter->next) {
        /* Needs at least two directories to be duplicate... */
        GQueue *dir_list = iter->data;

#ifdef _RM_TREEMERGE_DEBUG
        /* Debug-only dump of each directory group with its hex digest. */
        for(GList *i = dir_list->head; i; i = i->next) {
            RmDirectory *d = i->data;
            char buf[512];
            memset(buf, 0, sizeof(buf));
            rm_digest_hexstring(d->digest, buf);
            g_printerr(" mergeups=%" LLU ": %s - %s\n", d->mergeups, d->dirname, buf);
        }
        g_printerr("---\n");
#endif

        if(dir_list->length < 2) {
            continue;
        }

        if(rm_session_was_aborted(self->session)) {
            break;
        }

        /* List of result directories */
        GQueue result_dirs = G_QUEUE_INIT;

        /* Sort the RmDirectory list by their path depth, lowest depth first */
        g_queue_sort(dir_list, (GCompareDataFunc)rm_tm_sort_paths, self);

        /* Output the directories and mark their children to prevent
         * duplicate directory reports in lower levels.
         */
        for(GList *iter = dir_list->head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            if(directory->finished == false) {
                rm_tm_mark_finished(self, directory);
                g_queue_push_head(&result_dirs, directory);
            }
        }

        /* Make sure the original directory lands as first
         * in the result_dirs queue.
         */
        g_queue_sort(&result_dirs, (GCompareDataFunc)rm_tm_sort_orig_criteria, self);

        /* Wrap each directory in an RmFile adaptor so the group can be
         * forwarded through the regular file-output machinery. */
        GQueue file_adaptor_group = G_QUEUE_INIT;

        for(GList *iter = result_dirs.head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            RmFile *mask = rm_directory_as_file(self, directory);
            g_queue_push_tail(&file_adaptor_group, mask);

            if(iter == result_dirs.head) {
                /* First one in the group -> It's the original */
                mask->is_original = true;
                rm_tm_mark_original_files(self, directory);
            } else {
                /* Also registers the files' digests in self->file_checks as
                 * a side effect (used by the leftover-file pass below). */
                if(rm_tm_mark_duplicate_files(self, directory, 0) ==
                   directory->dupe_count) {
                    /* Mark the file as original when all files in it are
                     * preferred. */
                    mask->is_original = true;
                    rm_tm_mark_original_files(self, directory);
                }
            }

            if(self->session->cfg->write_unfinished) {
                rm_tm_write_unfinished_cksums(self, directory);
            }
        }

        /* Only forward groups with at least two members; singletons are
         * not duplicates. */
        if(result_dirs.length >= 2) {
            rm_shred_forward_to_output(self->session, &file_adaptor_group);
        }

        /* The adaptor RmFiles are owned here; free them after forwarding. */
        g_queue_foreach(&file_adaptor_group, (GFunc)g_free, NULL);
        g_queue_clear(&file_adaptor_group);
        g_queue_clear(&result_dirs);
    }

    g_list_free(result_table_values);

    /* Iterate over all non-finished dirs in the tree,
     * and grab unfinished files that must be dupes elsewhise.
     */
    rm_trie_iter(&self->dir_tree, NULL, true, false, rm_tm_iter_unfinished_files,
                 self);

    /* Now here's a problem. Consider an input like this:
     *  /root
     *  ├── a
     *  ├── sub1
     *  │   ├── a
     *  │   └── b
     *  └── sub2
     *      ├── a
     *      └── b
     *
     *  This yields two duplicate dirs (sub1, sub2)
     *  and one duplicate, unmatched file (a).
     *
     *  For outputting files we need groups, which consist of at least 2 files.
     *  So how to group that, so we don't end up deleting a file many times?
     *  We always choose which directories are originals first, so we flag all
     *  files in it as originals.
     */
    GHashTableIter iter;
    g_hash_table_iter_init(&iter, self->file_groups);

    GQueue *file_list = NULL;
    while(g_hash_table_iter_next(&iter, NULL, (void **)&file_list)) {
        /* NOTE(review): has_one_dupe is computed but never read afterwards
         * in this view of the code — confirm whether it can be dropped. */
        bool has_one_dupe = false;
        RmOff file_size_acc = 0;

        /* `next` is cached because g_queue_delete_link() below invalidates
         * the current link while we iterate. */
        GList *next = NULL;
        for(GList *iter = file_list->head; iter; iter = next) {
            RmFile *file = iter->data;
            next = iter->next;

            /* file_checks is populated by rm_tm_mark_duplicate_files(). */
            bool is_duplicate = g_hash_table_contains(self->file_checks, file->digest);
            has_one_dupe |= is_duplicate;

            /* with --partial-hidden we do not want to output */
            if(self->session->cfg->partial_hidden && file->is_hidden) {
                g_queue_delete_link(file_list, iter);
                continue;
            }

            /* Accumulate the sizes of non-head, non-duplicate files so the
             * session totals can be corrected below if needed. */
            if(iter != file_list->head && !is_duplicate) {
                file_size_acc += file->file_size;
            }
        }

        if(file_list->length >= 2) {
            /* If no separate duplicate files are requested, we can stop here */
            if(self->session->cfg->find_duplicates == false) {
                /* Undo the accounting for this group instead of reporting it. */
                self->session->total_lint_size -= file_size_acc;
                self->session->dup_group_counter -= 1;
                self->session->dup_counter -= file_list->length - 1;
            } else {
                rm_shred_group_find_original(self->session, file_list);
                rm_shred_forward_to_output(self->session, file_list);
            }
        }
    }
}