Example #1
/* Preprocess files, including embedded hardlinks.  Any embedded hardlinks
 * that are "other lint" types are sent to rm_pp_handle_other_lint.  If the
 * file itself is an "other lint" type, it is likewise sent to rm_pp_handle_other_lint.
 * If there are no files left after this, return TRUE so that the
 * cluster can be deleted from the node_table hash table.
 * NOTE: we rely on rm_file_list_insert to select an RM_LINT_TYPE_DUPE_CANDIDATE as head
 * file (unless ALL the files are "other lint"). */
static gboolean rm_pp_handle_inode_clusters(_UNUSED gpointer key, GQueue *inode_cluster,
                                            RmSession *session) {
    RmCfg *cfg = session->cfg;

    if(inode_cluster->length > 1) {
        /* there is a cluster of inode matches */

        /* remove path doubles */
        session->total_filtered_files -=
            rm_util_queue_foreach_remove(inode_cluster, (RmRFunc)rm_pp_check_path_double,
                                         session->tables->unique_paths_table);
        /* clear the hashtable ready for the next cluster */
        g_hash_table_remove_all(session->tables->unique_paths_table);
    }

    /* process and remove other lint */
    session->total_filtered_files -= rm_util_queue_foreach_remove(
        inode_cluster, (RmRFunc)rm_pp_handle_other_lint, (RmSession *)session);

    if(inode_cluster->length > 1) {
        /* bundle or free the non-head files */
        RmFile *headfile = inode_cluster->head->data;
        if(cfg->find_hardlinked_dupes) {
            /* prepare to bundle files under the hardlink head */
            headfile->hardlinks.files = g_queue_new();
            headfile->hardlinks.is_head = TRUE;
        }

        /* hardlink clusters are counted as filtered files since they are either
         * ignored or treated as automatic duplicates, depending on settings (so
         * no effort either way); rm_pp_handle_hardlink will either free or bundle
         * the hardlinks depending on the value of headfile->hardlinks.is_head.
         */
        session->total_filtered_files -= rm_util_queue_foreach_remove(
            inode_cluster, (RmRFunc)rm_pp_handle_hardlink, headfile);
    }

    /* update counters */
    rm_fmt_set_state(session->formats, RM_PROGRESS_STATE_PREPROCESS);

    rm_assert_gentle(inode_cluster->length <= 1);
    if(inode_cluster->length == 1) {
        session->tables->size_groups->data = g_slist_prepend(
            session->tables->size_groups->data, inode_cluster->head->data);
    }

    return TRUE;
}
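
Both passes above lean on rm_util_queue_foreach_remove to filter a GQueue in place and report how many elements were dropped. The following is a minimal sketch, not the rmlint implementation, of what such a foreach-remove helper plausibly looks like; the RmRFunc typedef is assumed from the casts in the snippet, and only standard GLib calls (g_queue_delete_link) are used. The returned count is what lets the caller adjust running totals such as session->total_filtered_files.

#include <glib.h>

/* Assumed predicate shape, inferred from the (RmRFunc) casts above. */
typedef gboolean (*RmRFunc)(gpointer data, gpointer user_data);

/* Walk the queue, call the predicate on each element and drop the link
 * whenever the predicate returns TRUE; return the number of removed
 * elements so the caller can update its counters. */
static guint queue_foreach_remove_sketch(GQueue *queue, RmRFunc func,
                                         gpointer user_data) {
    guint removed = 0;
    GList *iter = queue->head;
    while(iter) {
        GList *next = iter->next; /* save before the link may be freed */
        if(func(iter->data, user_data)) {
            g_queue_delete_link(queue, iter);
            ++removed;
        }
        iter = next;
    }
    return removed;
}
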
Example #2
static void rm_tm_extract(RmTreeMerger *self) {
    /* Iterate over all directories per hash (which are therefore identical) */
    RmCfg *cfg = self->session->cfg;
    GList *result_table_values = g_hash_table_get_values(self->result_table);
    result_table_values =
        g_list_sort(result_table_values, (GCompareFunc)rm_tm_cmp_directory_groups);

    for(GList *iter = result_table_values; iter; iter = iter->next) {
        /* Needs at least two directories to be duplicates... */
        GQueue *dir_list = iter->data;

#ifdef _RM_TREEMERGE_DEBUG
        for(GList *i = dir_list->head; i; i = i->next) {
            RmDirectory *d = i->data;
            char buf[512];
            memset(buf, 0, sizeof(buf));
            rm_digest_hexstring(d->digest, buf);
            g_printerr("    mergeups=%" LLU ": %s - %s\n", d->mergeups, d->dirname, buf);
        }
        g_printerr("---\n");
#endif
        if(dir_list->length < 2) {
            continue;
        }

        if(rm_session_was_aborted()) {
            break;
        }

        /* List of result directories */
        GQueue result_dirs = G_QUEUE_INIT;

        /* Sort the RmDirectory list by their path depth, lowest depth first */
        g_queue_sort(dir_list, (GCompareDataFunc)rm_tm_sort_paths, self);

        /* Output the directories and mark their children to prevent
         * duplicate directory reports in lower levels.
         */
        for(GList *iter = dir_list->head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            if(directory->finished == false) {
                rm_tm_mark_finished(self, directory);
                g_queue_push_head(&result_dirs, directory);
            }
        }

        /* Make sure the original directory lands first
         * in the result_dirs queue.
         */
        g_queue_sort(&result_dirs, (GCompareDataFunc)rm_tm_sort_orig_criteria, self);

        GQueue file_adaptor_group = G_QUEUE_INIT;

        for(GList *iter = result_dirs.head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            RmFile *mask = rm_directory_as_new_file(self, directory);
            g_queue_push_tail(self->free_list, mask);
            g_queue_push_tail(&file_adaptor_group, mask);

            if(iter == result_dirs.head) {
                /* First one in the group -> It's the original */
                mask->is_original = true;
                rm_tm_mark_original_files(self, directory);
            } else {
                gint64 prefd = rm_tm_mark_duplicate_files(self, directory);
                if(prefd == directory->dupe_count && cfg->keep_all_tagged) {
                    /* Mark the file as original when all files in it are preferred. */
                    mask->is_original = true;
                } else if(prefd == 0 && cfg->keep_all_untagged) {
                    mask->is_original = true;
                }
            }

            if(self->session->cfg->write_unfinished) {
                rm_tm_write_unfinished_cksums(self, directory);
            }
        }

        if(result_dirs.length >= 2) {
            rm_shred_forward_to_output(self->session, &file_adaptor_group);
        } 

        g_queue_clear(&file_adaptor_group);
        g_queue_clear(&result_dirs);
    }

    g_list_free(result_table_values);

    /* Iterate over all non-finished dirs in the tree,
     * and grab unfinished files that must be duplicates elsewhere.
     */
    rm_trie_iter(&self->dir_tree, NULL, true, false, rm_tm_iter_unfinished_files, self);

    /* Now here's a problem. Consider an input like this:
     *  /root
     *  ├── a
     *  ├── sub1
     *  │   ├── a
     *  │   └── b
     *  └── sub2
     *      ├── a
     *      └── b
     *
     *  This yields two duplicate dirs (sub1, sub2)
     *  and one duplicate, unmatched file (a).
     *
     *  For outputting files we need groups, which consist of at least 2 files.
     *  So how do we group them so that we don't end up deleting a file many times?
     *  We always choose which directories are originals first, and flag all
     *  files in them as originals.
     */
    GHashTableIter iter;
    g_hash_table_iter_init(&iter, self->file_groups);

    GQueue *file_list = NULL;
    while(g_hash_table_iter_next(&iter, NULL, (void **)&file_list)) {
        if(self->session->cfg->partial_hidden) {
            /* with --partial-hidden we do not want to output hidden files */
            rm_util_queue_foreach_remove(file_list,
                                         (RmRFunc)rm_tm_hidden_file, NULL);
        }
        }

        if(file_list->length >= 2) {
            /* If no separate duplicate files are requested, we can stop here */
            if(self->session->cfg->find_duplicates == false) {
                self->session->dup_group_counter -= 1;
                self->session->dup_counter -= file_list->length - 1;
            } else {
                rm_shred_group_find_original(self->session, file_list, RM_SHRED_GROUP_FINISHING);
                rm_shred_forward_to_output(self->session, file_list);
            }
        }
    }
}
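
For readers less familiar with the GLib idioms in rm_tm_extract, here is a generic, self-contained illustration of the sort-then-group pattern it follows: pull all values out of a hash table, skip groups with fewer than two entries, sort each group's queue with a GCompareDataFunc (the real code passes the RmTreeMerger as user_data to rm_tm_sort_paths), and treat the head of the sorted queue as the original. The names DemoDir, demo_cmp_depth and demo_extract are made up for this sketch and are not part of rmlint.

#include <glib.h>

typedef struct {
    const char *path;
    guint depth;
} DemoDir;

/* Lowest path depth first; user_data is unused here, but the real code
 * receives its context (self) through this argument. */
static gint demo_cmp_depth(gconstpointer a, gconstpointer b, gpointer user_data) {
    (void)user_data;
    const DemoDir *da = a;
    const DemoDir *db = b;
    return (gint)da->depth - (gint)db->depth;
}

/* result_table maps some key to a GQueue of DemoDir*. */
static void demo_extract(GHashTable *result_table) {
    GList *groups = g_hash_table_get_values(result_table);
    for(GList *iter = groups; iter; iter = iter->next) {
        GQueue *group = iter->data;
        if(group->length < 2) {
            continue; /* need at least two entries to form a duplicate group */
        }
        g_queue_sort(group, demo_cmp_depth, NULL);
        DemoDir *original = group->head->data; /* head of the sorted queue is kept */
        g_printerr("original: %s\n", original->path);
    }
    g_list_free(groups);
}
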