Example #1
static gint64 rm_tm_mark_duplicate_files(RmTreeMerger *self, RmDirectory *directory,
                                         gint64 acc) {
    for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
        RmFile *file = iter->data;
        acc += file->is_prefd;
        g_hash_table_insert(self->file_checks, file->digest, file);
    }

    /* Recursively propagate to children */
    for(GList *iter = directory->children.head; iter; iter = iter->next) {
        RmDirectory *child = iter->data;
        acc = rm_tm_mark_duplicate_files(self, child, acc);
    }

    return acc;
}
Example #2
static gint64 rm_tm_mark_duplicate_files(RmTreeMerger *self, RmDirectory *directory) {
    gint64 acc = 0;

    for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
        RmFile *file = iter->data;
        acc += file->is_prefd;
    }

    /* Recursively propagate to children */
    for(GList *iter = directory->children.head; iter; iter = iter->next) {
        RmDirectory *child = iter->data;
        acc += rm_tm_mark_duplicate_files(self, child);
    }

    return acc;
}
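
The two variants differ in how the count of preferred files is accumulated: Example #1 threads an accumulator argument through the recursion (and also records each digest in self->file_checks), while Example #2 sums the recursive return values. Either way, the caller compares the result against directory->dupe_count, as Example #3 does below. A minimal sketch of that check, assuming the two-argument signature from Example #2; the helper name is made up:

static gboolean rm_tm_directory_is_all_preferred(RmTreeMerger *self,
                                                 RmDirectory *directory) {
    /* If every known file below this directory is preferred ("original"),
     * the directory itself may be reported as an original. */
    return rm_tm_mark_duplicate_files(self, directory) == directory->dupe_count;
}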
Example #3
static void rm_tm_extract(RmTreeMerger *self) {
    /* Iterate over all directories grouped by hash (i.e. identical directories) */
    GList *result_table_values = g_hash_table_get_values(self->result_table);
    result_table_values =
        g_list_sort(result_table_values, (GCompareFunc)rm_tm_cmp_directory_groups);

    for(GList *iter = result_table_values; iter; iter = iter->next) {
        /* It takes at least two directories to form a duplicate group... */
        GQueue *dir_list = iter->data;

#ifdef _RM_TREEMERGE_DEBUG
        for(GList *i = dir_list->head; i; i = i->next) {
            RmDirectory *d = i->data;
            char buf[512];
            memset(buf, 0, sizeof(buf));
            rm_digest_hexstring(d->digest, buf);
            g_printerr("    mergeups=%" LLU ": %s - %s\n", d->mergeups, d->dirname, buf);
        }
        g_printerr("---\n");
#endif
        if(dir_list->length < 2) {
            continue;
        }

        if(rm_session_was_aborted(self->session)) {
            break;
        }

        /* List of result directories */
        GQueue result_dirs = G_QUEUE_INIT;

        /* Sort the RmDirectory list by their path depth, lowest depth first */
        g_queue_sort(dir_list, (GCompareDataFunc)rm_tm_sort_paths, self);

        /* Output the directories and mark their children to prevent
         * duplicate directory reports in lower levels.
         */
        for(GList *iter = dir_list->head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            if(directory->finished == false) {
                rm_tm_mark_finished(self, directory);
                g_queue_push_head(&result_dirs, directory);
            }
        }

        /* Make sure the original directory lands as first
         * in the result_dirs queue.
         */
        g_queue_sort(&result_dirs, (GCompareDataFunc)rm_tm_sort_orig_criteria, self);

        GQueue file_adaptor_group = G_QUEUE_INIT;

        for(GList *iter = result_dirs.head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            RmFile *mask = rm_directory_as_file(self, directory);
            g_queue_push_tail(&file_adaptor_group, mask);

            if(iter == result_dirs.head) {
                /* First one in the group -> It's the original */
                mask->is_original = true;
                rm_tm_mark_original_files(self, directory);
            } else {
                if(rm_tm_mark_duplicate_files(self, directory, 0) ==
                   directory->dupe_count) {
                    /* Mark the directory as original when all files in it are preferred. */
                    mask->is_original = true;
                    rm_tm_mark_original_files(self, directory);
                }
            }

            if(self->session->cfg->write_unfinished) {
                rm_tm_write_unfinished_cksums(self, directory);
            }
        }

        if(result_dirs.length >= 2) {
            rm_shred_forward_to_output(self->session, &file_adaptor_group);
        }

        g_queue_foreach(&file_adaptor_group, (GFunc)g_free, NULL);
        g_queue_clear(&file_adaptor_group);
        g_queue_clear(&result_dirs);
    }

    g_list_free(result_table_values);

    /* Iterate over all non-finished dirs in the tree,
     * and grab unfinished files that must be dupes elsewhere.
     */
    rm_trie_iter(&self->dir_tree, NULL, true, false, rm_tm_iter_unfinished_files, self);

    /* Now here's a problem. Consider an input like this:
     *  /root
     *  ├── a
     *  ├── sub1
     *  │   ├── a
     *  │   └── b
     *  └── sub2
     *      ├── a
     *      └── b
     *
     *  This yields two duplicate dirs (sub1, sub2)
     *  and one duplicate, unmatched file (a).
     *
     *  For outputting files we need groups of at least two files.
     *  So how do we group these so that a file does not end up
     *  being deleted more than once? Directories are always chosen
     *  as originals first, so all files inside them are flagged as
     *  originals as well. (A stand-alone sketch of the file_checks
     *  bookkeeping used here follows this example.)
     */
    GHashTableIter iter;
    g_hash_table_iter_init(&iter, self->file_groups);

    GQueue *file_list = NULL;
    while(g_hash_table_iter_next(&iter, NULL, (void **)&file_list)) {
        bool has_one_dupe = false;
        RmOff file_size_acc = 0;

        GList *next = NULL;
        for(GList *iter = file_list->head; iter; iter = next) {
            RmFile *file = iter->data;
            next = iter->next;

            bool is_duplicate = g_hash_table_contains(self->file_checks, file->digest);
            has_one_dupe |= is_duplicate;

            /* With --partial-hidden we do not want to output hidden files. */
            if(self->session->cfg->partial_hidden && file->is_hidden) {
                g_queue_delete_link(file_list, iter);
                continue;
            }

            if(iter != file_list->head && !is_duplicate) {
                file_size_acc += file->file_size;
            }
        }

        if(file_list->length >= 2) {
            /* If separate duplicate files were not requested, only adjust the statistics */
            if(self->session->cfg->find_duplicates == false) {
                self->session->total_lint_size -= file_size_acc;
                self->session->dup_group_counter -= 1;
                self->session->dup_counter -= file_list->length - 1;
            } else {
                rm_shred_group_find_original(self->session, file_list);
                rm_shred_forward_to_output(self->session, file_list);
            }
        }
    }
}
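
As referenced in the comment inside rm_tm_extract above, here is a minimal, self-contained sketch (plain GLib; the digest strings and the main() driver are made up) of the bookkeeping that ties the two passes together: digests recorded in self->file_checks mark files that are already accounted for by a duplicate directory, and the per-group pass consults that set with g_hash_table_contains():

#include <glib.h>

int main(void) {
    /* In the real code, file_checks maps digest -> RmFile;
     * a plain string set is enough to show the lookup pattern. */
    GHashTable *file_checks = g_hash_table_new(g_str_hash, g_str_equal);

    /* Pretend rm_tm_mark_duplicate_files() saw a file with this digest
     * inside a duplicate directory. */
    g_hash_table_add(file_checks, (gpointer) "digest-of-a");

    /* A later file group containing that same file plus an unrelated one. */
    const char *group[] = {"digest-of-a", "digest-of-b"};
    for(gsize i = 0; i < G_N_ELEMENTS(group); i++) {
        gboolean is_duplicate = g_hash_table_contains(file_checks, group[i]);
        g_print("%s: %s\n", group[i],
                is_duplicate ? "already covered by a duplicate directory"
                             : "stand-alone duplicate file");
    }

    g_hash_table_destroy(file_checks);
    return 0;
}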