void DCOPY_walk_paths(mfu_flist flist) { int i; for (i = 0; i < num_src_params; i++) { /* get path for step */ const char* target = src_params[i].path; if (DCOPY_global_rank == 0) { MFU_LOG(MFU_LOG_INFO, "Walking %s", target); } /* walk file tree and record stat data for each item */ int walk_stat = 1; int dir_perm = 0; mfu_flist_walk_path(target, walk_stat, dir_perm, flist); } }
int main(int argc, char** argv) { uint64_t i; int status; uint64_t file_size; uint64_t chunk_size = DDUP_CHUNK_SIZE; SHA256_CTX* ctx_ptr; MPI_Init(NULL, NULL); mfu_init(); int rank, ranks; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &ranks); /* pointer to mfu_walk_opts */ mfu_walk_opts_t* walk_opts = mfu_walk_opts_new(); mfu_debug_level = MFU_LOG_VERBOSE; static struct option long_options[] = { {"debug", 0, 0, 'd'}, {"verbose", 0, 0, 'v'}, {"quiet", 0, 0, 'q'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* Parse options */ int usage = 0; int help = 0; int c; int option_index = 0; while ((c = getopt_long(argc, argv, "d:vqh", \ long_options, &option_index)) != -1) { switch (c) { case 'd': if (strncmp(optarg, "fatal", 5) == 0) { mfu_debug_level = MFU_LOG_FATAL; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: fatal"); } else if (strncmp(optarg, "err", 3) == 0) { mfu_debug_level = MFU_LOG_ERR; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: " "errors"); } else if (strncmp(optarg, "warn", 4) == 0) { mfu_debug_level = MFU_LOG_WARN; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: " "warnings"); } else if (strncmp(optarg, "info", 4) == 0) { mfu_debug_level = MFU_LOG_INFO; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: info"); } else if (strncmp(optarg, "dbg", 3) == 0) { mfu_debug_level = MFU_LOG_DBG; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: debug"); } else { if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level `%s' not " "recognized. Defaulting to " "`info'.", optarg); } case 'h': usage = 1; help = 1; case 'v': mfu_debug_level = MFU_LOG_VERBOSE; break; case 'q': mfu_debug_level = MFU_LOG_NONE; break; case '?': usage = 1; help = 1; break; default: usage = 1; break; } } /* check that user gave us one and only one directory */ int numargs = argc - optind; if (numargs != 1) { /* missing the directory, so post a message, and print usage */ if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "You must specify a directory path"); } usage = 1; } /* print usage and bail if needed */ if (usage) { if (rank == 0) { print_usage(); } /* set error code base on whether user requested usage or not */ if (help) { status = 0; } else { status = -1; } MPI_Barrier(MPI_COMM_WORLD); goto out; } /* get the directory name */ const char* dir = argv[optind]; /* create MPI datatypes */ MPI_Datatype key; MPI_Datatype keysat; mpi_type_init(&key, &keysat); /* create DTCMP comparison operation */ DTCMP_Op cmp; mtcmp_cmp_init(&cmp); /* allocate buffer to read data from file */ char* chunk_buf = (char*)MFU_MALLOC(DDUP_CHUNK_SIZE); /* allocate a file list */ mfu_flist flist = mfu_flist_new(); /* Walk the path(s) to build the flist */ mfu_flist_walk_path(dir, walk_opts, flist); /* TODO: spread list among procs? */ /* get local number of items in flist */ uint64_t checking_files = mfu_flist_size(flist); /* allocate memory to hold SHA256 context values */ struct file_item* file_items = (struct file_item*) MFU_MALLOC(checking_files * sizeof(*file_items)); /* Allocate two lists of length size, where each * element has (DDUP_KEY_SIZE + 1) uint64_t values * (id, checksum, index) */ size_t list_bytes = checking_files * (DDUP_KEY_SIZE + 1) * sizeof(uint64_t); uint64_t* list = (uint64_t*) MFU_MALLOC(list_bytes); uint64_t* new_list = (uint64_t*) MFU_MALLOC(list_bytes); /* Initialize the list */ uint64_t* ptr = list; uint64_t new_checking_files = 0; for (i = 0; i < checking_files; i++) { /* check that item is a regular file */ mode_t mode = (mode_t) mfu_flist_file_get_mode(flist, i); if (! S_ISREG(mode)) { continue; } /* get the file size */ file_size = mfu_flist_file_get_size(flist, i); if (file_size == 0) { /* Files with size zero are not interesting at all */ continue; } /* for first pass, group all files with same file size */ ptr[0] = file_size; /* we'll leave the middle part of the key unset */ /* record our index in flist */ ptr[DDUP_KEY_SIZE] = i; /* initialize the SHA256 hash state for this file */ SHA256_Init(&file_items[i].ctx); /* increment our file count */ new_checking_files++; /* advance to next spot in the list */ ptr += DDUP_KEY_SIZE + 1; } /* reduce our list count based on any files filtered out above */ checking_files = new_checking_files; /* allocate arrays to hold result from DTCMP_Rankv call to * assign group and rank values to each item */ uint64_t output_bytes = checking_files * sizeof(uint64_t); uint64_t* group_id = (uint64_t*) MFU_MALLOC(output_bytes); uint64_t* group_ranks = (uint64_t*) MFU_MALLOC(output_bytes); uint64_t* group_rank = (uint64_t*) MFU_MALLOC(output_bytes); /* get total number of items across all tasks */ uint64_t sum_checking_files; MPI_Allreduce(&checking_files, &sum_checking_files, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD); uint64_t chunk_id = 0; while (sum_checking_files > 1) { /* update the chunk id we'll read from all files */ chunk_id++; /* iterate over our list and compute SHA256 value for each */ ptr = list; for (i = 0; i < checking_files; i++) { /* get the flist index for this item */ uint64_t idx = ptr[DDUP_KEY_SIZE]; /* look up file name */ const char* fname = mfu_flist_file_get_name(flist, idx); /* look up file size */ file_size = mfu_flist_file_get_size(flist, idx); /* read a chunk of data from the file into chunk_buf */ uint64_t data_size; status = read_data(fname, chunk_buf, chunk_id, chunk_size, file_size, &data_size); if (status) { /* File size has been changed, TODO: handle */ printf("failed to read file %s, maybe file " "size has been modified during the " "process", fname); } /* update the SHA256 context for this file */ ctx_ptr = &file_items[idx].ctx; SHA256_Update(ctx_ptr, chunk_buf, data_size); /* * Use SHA256 value as key. * This is actually an hack, but SHA256_Final can't * be called multiple times with out changing ctx */ SHA256_CTX ctx_tmp; memcpy(&ctx_tmp, ctx_ptr, sizeof(ctx_tmp)); SHA256_Final((unsigned char*)(ptr + 1), &ctx_tmp); /* move on to next file in the list */ ptr += DDUP_KEY_SIZE + 1; } /* Assign group ids and compute group sizes */ uint64_t groups; DTCMP_Rankv( (int)checking_files, list, &groups, group_id, group_ranks, group_rank, key, keysat, cmp, DTCMP_FLAG_NONE, MPI_COMM_WORLD ); /* any files assigned to a group of size 1 is unique, * any files in groups sizes > 1 for which we've read * all bytes are the same, and filter all other files * into a new list for another iteration */ new_checking_files = 0; ptr = list; uint64_t* new_ptr = new_list; for (i = 0; i < checking_files; i++) { /* Get index into flist for this item */ uint64_t idx = ptr[DDUP_KEY_SIZE]; /* look up file name */ const char* fname = mfu_flist_file_get_name(flist, idx); /* look up file size */ file_size = mfu_flist_file_get_size(flist, idx); /* get a pointer to the SHA256 context for this file */ ctx_ptr = &file_items[idx].ctx; if (group_ranks[i] == 1) { /* * Only one file in this group, * mfu_flist_file_name(flist, idx) is unique */ } else if (file_size <= (chunk_id * chunk_size)) { /* * We've run out of bytes to checksum, and we * still have a group size > 1 * mfu_flist_file_name(flist, idx) is a * duplicate with other files that also have * matching group_id[i] */ unsigned char digest[SHA256_DIGEST_LENGTH]; SHA256_Final(digest, ctx_ptr); char digest_string[SHA256_DIGEST_LENGTH * 2 + 1]; dump_sha256_digest(digest_string, digest); printf("%s %s\n", fname, digest_string); } else { /* Have multiple files with the same checksum, * but still have bytes left to read, so keep * this file */ /* use new group ID to segregate files, * this id will be unique for all files of the * same size and having the same hash up to * this point */ new_ptr[0] = group_id[i]; /* Copy over flist index into new list entry */ new_ptr[DDUP_KEY_SIZE] = idx; /* got one more in the new list */ new_checking_files++; /* move on to next item in new list */ new_ptr += DDUP_KEY_SIZE + 1; MFU_LOG(MFU_LOG_DBG, "checking file " "\"%s\" for chunk index %d of size %" PRIu64"\n", fname, (int)chunk_id, chunk_size); } /* move on to next file in the list */ ptr += DDUP_KEY_SIZE + 1; } /* Swap lists */ uint64_t* tmp_list; tmp_list = list; list = new_list; new_list = tmp_list; /* Update size of current list */ checking_files = new_checking_files; /* Get new global list size */ MPI_Allreduce(&checking_files, &sum_checking_files, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD); } /* free the walk options */ mfu_walk_opts_delete(&walk_opts); mfu_free(&group_rank); mfu_free(&group_ranks); mfu_free(&group_id); mfu_free(&new_list); mfu_free(&list); mfu_free(&file_items); mfu_free(&chunk_buf); mfu_flist_free(&flist); mtcmp_cmp_fini(&cmp); mpi_type_fini(&key, &keysat); status = 0; out: mfu_finalize(); MPI_Finalize(); return status; }