/* Preprocessing pass over all traversed files, including the handling of
 * "other lint" (non-duplicates).
 *
 * The file queue is sorted with rm_file_cmp_full() and then consumed from the
 * head. Consecutive files for which rm_file_cmp_split() reports no difference
 * form one group; within a group, files are collected into per-inode clusters
 * in tables->node_table and each cluster is handed to
 * rm_pp_handle_inode_clusters().
 *
 * After rm_preprocess(), all remaining duplicate candidates are in a jagged
 * GSList of GSLists as follows:
 *
 *   session->tables->size_groups->group1->file1a
 *                                       ->file1b
 *                                       ->file1c
 *                              ->group2->file2a
 *                                       ->file2b
 *                                         etc
 */
void rm_preprocess(RmSession *session) {
    RmFileTables *tables = session->tables;
    GQueue *all_files = tables->all_files;

    session->total_filtered_files = session->total_files;

    /* order the queue so that members of one group are adjacent */
    g_queue_sort(all_files, (GCompareDataFunc)rm_file_cmp_full, session);

    rm_log_debug_line("initial size sort finished at time %.3f; sorted %d files",
                      g_timer_elapsed(session->timer, NULL), session->total_files);

    /* walk the sorted queue; for each group, remove path doubles and bundle
     * hardlinks */
    rm_assert_gentle(all_files->head);

    RmFile *cursor = g_queue_pop_head(all_files);
    guint removed = 0;
    GHashTable *node_table = tables->node_table;

    while(cursor != NULL && !rm_session_was_aborted()) {
        /* file the current entry under its inode cluster, creating the
         * cluster on first sight */
        GQueue *cluster =
            rm_hash_table_setdefault(node_table, cursor, (RmNewFunc)g_queue_new);
        g_queue_push_tail(cluster, cursor);

        /* peek at the successor: the group is complete when the queue is
         * exhausted or the successor differs from the entry just filed */
        RmFile *successor = g_queue_pop_head(all_files);
        gboolean group_complete =
            successor == NULL || rm_file_cmp_split(successor, cursor, session) != 0;

        if(group_complete) {
            /* open a fresh (empty) inner list for this group; the cluster
             * handler prepends the surviving files to it */
            tables->size_groups = g_slist_prepend(tables->size_groups, NULL);

            /* remove path doubles, handle "other" lint, bundle hardlinks */
            removed += g_hash_table_foreach_remove(
                node_table, (GHRFunc)rm_pp_handle_inode_clusters, session);

            /* leave the node table empty for the next group */
            g_hash_table_steal_all(node_table);

            GSList *newest_group = tables->size_groups;
            if(newest_group->data == NULL) {
                /* nothing survived in this group; discard the empty entry */
                tables->size_groups =
                    g_slist_delete_link(tables->size_groups, newest_group);
            }
        }

        cursor = successor;
    }

    session->other_lint_cnt += rm_pp_handler_other_lint(session);

    rm_log_debug_line(
        "path doubles removal/hardlink bundling/other lint finished at %.3f; removed %u "
        "of %d",
        g_timer_elapsed(session->timer, NULL), removed, session->total_files);

    rm_fmt_set_state(session->formats, RM_PROGRESS_STATE_PREPROCESS);
}
/* GHRFunc-style callback: preprocess one inode cluster (a queue of files that
 * were grouped under the same node_table key), including embedded hardlinks.
 *
 * Steps, in order:
 *   1. if the cluster has more than one file, remove path doubles via
 *      rm_pp_check_path_double;
 *   2. pass every remaining file through rm_pp_handle_other_lint, which
 *      removes the "other lint" files from the cluster;
 *   3. if more than one file is still left, pass the cluster through
 *      rm_pp_handle_hardlink with the queue head as head file;
 *   4. prepend the single survivor (if any) to the newest size group.
 *
 * session->total_filtered_files is decreased by the number of files removed
 * in steps 1-3.
 *
 * Always returns TRUE, so g_hash_table_foreach_remove() drops the cluster
 * from the node table whether or not a file survived.
 *
 * NOTE: we rely on rm_file_list_insert to select an RM_LINT_TYPE_DUPE_CANDIDATE
 * as head file (unless ALL the files are "other lint").
 */
static gboolean rm_pp_handle_inode_clusters(_UNUSED gpointer key, GQueue *inode_cluster,
                                            RmSession *session) {
    if(inode_cluster->length > 1) {
        /* several entries share this inode: weed out path doubles */
        GHashTable *seen_paths = session->tables->unique_paths_table;

        session->total_filtered_files -= rm_util_queue_foreach_remove(
            inode_cluster, (RmRFunc)rm_pp_check_path_double, seen_paths);

        /* reset the scratch table so the next cluster starts clean */
        g_hash_table_remove_all(seen_paths);
    }

    /* process and remove other lint */
    session->total_filtered_files -= rm_util_queue_foreach_remove(
        inode_cluster, (RmRFunc)rm_pp_handle_other_lint, session);

    if(inode_cluster->length > 1) {
        /* bundle or free the non-head files */
        RmFile *headfile = g_queue_peek_head(inode_cluster);

        if(session->cfg->find_hardlinked_dupes) {
            /* prepare to bundle files under the hardlink head */
            headfile->hardlinks.files = g_queue_new();
            headfile->hardlinks.is_head = TRUE;
        }

        /* Hardlink clusters count as filtered files: depending on settings
         * they are either ignored or treated as automatic duplicates, so no
         * effort either way. rm_pp_handle_hardlink either frees or bundles
         * each hardlink depending on headfile->hardlinks.is_head. */
        session->total_filtered_files -= rm_util_queue_foreach_remove(
            inode_cluster, (RmRFunc)rm_pp_handle_hardlink, headfile);
    }

    /* update counters */
    rm_fmt_set_state(session->formats, RM_PROGRESS_STATE_PREPROCESS);

    rm_assert_gentle(inode_cluster->length <= 1);

    if(inode_cluster->length == 1) {
        /* hand the survivor over to the group currently being built */
        GSList *newest_group = session->tables->size_groups;
        newest_group->data =
            g_slist_prepend(newest_group->data, g_queue_peek_head(inode_cluster));
    }

    return TRUE;
}
/* Register one traversed path with the session.
 *
 * Paths that are rmlint's own outputs are skipped. When file_type is
 * RM_LINT_TYPE_UNKNOWN the lint type is autodetected, in this order:
 *   - zero-size file: empty-file lint, or skipped if cfg->find_emptyfiles
 *     is off;
 *   - access() with cfg->permissions fails: counted as ignored and skipped;
 *   - bad uid/gid (if cfg->find_badids): type returned by
 *     rm_util_uid_gid_check;
 *   - non-stripped binary (if cfg->find_nonstripped);
 *   - otherwise a duplicate candidate, provided the size is within the
 *     configured limits (else skipped) and the file is not on an "evil"
 *     mount (else counted as ignored and skipped).
 *
 * If rm_file_new() yields a file, its flags are filled in, it is inserted into
 * the session's file list, and session->total_files is incremented atomically.
 */
static void rm_traverse_file(RmTravSession *trav_session, RmStat *statp, char *path,
                             bool is_prefd, unsigned long path_index,
                             RmLintType file_type, bool is_symlink, bool is_hidden,
                             bool is_on_subvol_fs, short depth) {
    RmSession *session = trav_session->session;
    RmCfg *cfg = session->cfg;

    /* ignore files which are rmlint outputs */
    if(rm_fmt_is_a_output(session->formats, path)) {
        return;
    }

    /* Try to autodetect the type of the lint */
    if(file_type == RM_LINT_TYPE_UNKNOWN) {
        RmLintType id_lint;

        if(statp->st_size == 0) {
            if(!cfg->find_emptyfiles) {
                return;
            }
            file_type = RM_LINT_TYPE_EMPTY_FILE;
        } else if(cfg->permissions && access(path, cfg->permissions) == -1) {
            /* bad permissions; ignore file */
            session->ignored_files++;
            return;
        } else if(cfg->find_badids &&
                  (id_lint = rm_util_uid_gid_check(statp, trav_session->userlist)) !=
                      0) {
            file_type = id_lint;
        } else if(cfg->find_nonstripped && rm_util_is_nonstripped(path, statp)) {
            file_type = RM_LINT_TYPE_NONSTRIPPED;
        } else {
            RmOff file_size = statp->st_size;

            /* a limit of (RmOff)-1 means "no limit on this side" */
            bool below_min = cfg->minsize != (RmOff)-1 && file_size < cfg->minsize;
            bool above_max = cfg->maxsize != (RmOff)-1 && cfg->maxsize < file_size;

            if(cfg->limits_specified && (below_min || above_max)) {
                /* outside the requested size window */
                return;
            }

            if(rm_mounts_is_evil(session->mounts, statp->st_dev)) {
                /* A file in an evil fs. Ignore. */
                session->ignored_files++;
                return;
            }

            file_type = RM_LINT_TYPE_DUPE_CANDIDATE;
        }
    }

    RmFile *file =
        rm_file_new(session, path, statp, file_type, is_prefd, path_index, depth);
    if(file == NULL) {
        return;
    }

    file->is_symlink = is_symlink;
    file->is_hidden = is_hidden;
    file->is_on_subvol_fs = is_on_subvol_fs;
    file->link_count = statp->st_nlink;

    rm_file_list_insert_file(file, session);

    g_atomic_int_add(&session->total_files, 1);
    rm_fmt_set_state(session->formats, RM_PROGRESS_STATE_TRAVERSE);

    if(cfg->clear_xattr_fields && file->lint_type == RM_LINT_TYPE_DUPE_CANDIDATE) {
        rm_xattr_clear_hash(session, file);
    }
}