/* Remove the items held in this process's portion of the list without
 * redistributing any work.  Each item is handed to remove_type() with a
 * one-character type code: 'd' for a directory, 'f' for a regular file
 * or symlink, 'u' for anything else.
 *
 *   list    - file list whose local items are removed
 *   rmcount - output; set to the number of local items processed */
static void remove_direct(mfu_flist list, uint64_t* rmcount)
{
    const uint64_t count = mfu_flist_size(list);

    uint64_t i = 0;
    while (i < count) {
        /* look up the path and type of the current item */
        const char* path = mfu_flist_file_get_name(list, i);
        mfu_filetype kind = mfu_flist_file_get_type(list, i);

        /* translate the item type into its single-character code */
        char code = 'u';
        if (kind == MFU_TYPE_DIR) {
            code = 'd';
        } else if (kind == MFU_TYPE_FILE || kind == MFU_TYPE_LINK) {
            code = 'f';
        }

        remove_type(code, path);
        i++;
    }

    /* every local item was handed to remove_type */
    *rmcount = count;
}
/* Split the items of srclist into one list per directory depth.
 *
 *   srclist   - input file list
 *   outlevels - output; number of depth levels (0 if the list is empty)
 *   outmin    - output; minimum depth found (-1 if the list is empty)
 *   outlists  - output; newly allocated array of *outlevels lists, where
 *               entry k holds the items at depth (*outmin + k); NULL if
 *               the list is empty
 *
 * Returns without touching anything if any output pointer is NULL. */
void mfu_flist_array_by_depth(
    mfu_flist srclist,
    int* outlevels,
    int* outmin,
    mfu_flist** outlists)
{
    /* we need somewhere to store every result */
    if (outlevels == NULL || outmin == NULL || outlists == NULL) {
        return;
    }

    /* values reported when the list holds no items */
    *outlevels = 0;
    *outmin    = -1;
    *outlists  = NULL;

    /* an empty global list produces no levels */
    if (mfu_flist_global_size(srclist) == 0) {
        return;
    }

    /* the depth range determines how many lists we need */
    int shallowest = mfu_flist_min_depth(srclist);
    int deepest    = mfu_flist_max_depth(srclist);
    int nlevels    = deepest - shallowest + 1;

    /* one subset list per level */
    mfu_flist* by_depth = (mfu_flist*) MFU_MALLOC((size_t)nlevels * sizeof(mfu_flist));
    int lev;
    for (lev = 0; lev < nlevels; lev++) {
        by_depth[lev] = mfu_flist_subset(srclist);
    }

    /* drop each local item into the list for its depth */
    uint64_t count = mfu_flist_size(srclist);
    uint64_t item;
    for (item = 0; item < count; item++) {
        int slot = mfu_flist_file_get_depth(srclist, item) - shallowest;
        mfu_flist_file_copy(srclist, item, by_depth[slot]);
    }

    /* summarize every level now that it is fully populated */
    for (lev = 0; lev < nlevels; lev++) {
        mfu_flist_summarize(by_depth[lev]);
    }

    /* hand results back to the caller */
    *outlevels = nlevels;
    *outmin    = shallowest;
    *outlists  = by_depth;
}
/* Run the predicate chain p against every local item of flist by
 * calling mfu_pred_execute() once per item index. */
static void mfu_flist_pred(mfu_flist flist, mfu_pred* p)
{
    const uint64_t count = mfu_flist_size(flist);

    uint64_t item = 0;
    while (item < count) {
        mfu_pred_execute(flist, item, p);
        item++;
    }
}
/* Build the subset of list that needs to be restriped: regular files of
 * at least min_size bytes whose current stripe count or stripe size
 * differs from the requested values.
 *
 *   list         - input file list
 *   stripe_count - requested stripe count
 *   stripe_size  - requested stripe size
 *   min_size     - files smaller than this are ignored
 *   total_count  - output; number of selected files summed over all ranks
 *   total_size   - output; bytes in selected files summed over all ranks
 *
 * Returns the new (summarized) subset list.  Files whose striping
 * information cannot be read via mfu_stripe_get() are left out. */
static mfu_flist filter_list(mfu_flist list, int stripe_count, uint64_t stripe_size, uint64_t min_size, uint64_t* total_count, uint64_t* total_size)
{
    /* local tallies of selected files and their bytes */
    uint64_t local_files = 0;
    uint64_t local_bytes = 0;

    /* the result is a subset of the full file list */
    mfu_flist selected = mfu_flist_subset(list);

    uint64_t nitems = mfu_flist_size(list);
    uint64_t i;
    for (i = 0; i < nitems; i++) {
        /* only regular files are candidates */
        if (mfu_flist_file_get_type(list, i) != MFU_TYPE_FILE) {
            continue;
        }

        /* ignore files below the minimum size */
        uint64_t bytes = mfu_flist_file_get_size(list, i);
        if (bytes < min_size) {
            continue;
        }

        /* look up current striping, skipping files we can't query */
        const char* path = mfu_flist_file_get_name(list, i);
        uint64_t cur_size  = 0;
        uint64_t cur_count = 0;
        if (mfu_stripe_get(path, &cur_size, &cur_count) != 0) {
            continue;
        }

        /* TODO: this should probably be better */
        /* a file already striped as requested needs no work */
        if (cur_count == stripe_count && cur_size == stripe_size) {
            continue;
        }

        /* striping differs, so this file must be restriped */
        mfu_flist_file_copy(list, i, selected);
        local_files += 1;
        local_bytes += bytes;
    }

    /* summarize the new list */
    mfu_flist_summarize(selected);

    /* combine the per-rank tallies into global totals */
    MPI_Allreduce(&local_files, total_count, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(&local_bytes, total_size, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);

    return selected;
}
/* Given an input file list, stat each file and enqueue details
 * in output file list, skip entries excluded by skip function
 * and skip args.
 *
 *   input_flist - list whose item names are stat'ed
 *   flist       - output list; gains one entry for each item that is
 *                 not skipped and for which mfu_lstat() succeeds
 *   skip_fn     - optional predicate (may be NULL); a nonzero return
 *                 for a name leaves that item out of the output list
 *   skip_args   - opaque pointer handed through to skip_fn
 *
 * Items for which mfu_lstat() fails are logged and omitted.  The output
 * list is summarized before returning. */
void mfu_flist_stat(
    mfu_flist input_flist,
    mfu_flist flist,
    mfu_flist_skip_fn skip_fn,
    void *skip_args)
{
    flist_t* file_list = (flist_t*)flist;

    /* we will stat all items in output list, so set detail to 1 */
    file_list->detail = 1;

    /* get user data if needed */
    if (file_list->have_users == 0) {
        mfu_flist_usrgrp_get_users(flist);
    }

    /* get groups data if needed */
    if (file_list->have_groups == 0) {
        mfu_flist_usrgrp_get_groups(flist);
    }

    /* step through each item in input list and stat it */
    uint64_t idx;
    uint64_t size = mfu_flist_size(input_flist);
    for (idx = 0; idx < size; idx++) {
        /* get name of item */
        const char* name = mfu_flist_file_get_name(input_flist, idx);

        /* check whether we should skip this item */
        if (skip_fn != NULL && skip_fn(name, skip_args)) {
            /* skip this file, don't include it in new list;
             * the %s conversion needs the item name as its argument */
            MFU_LOG(MFU_LOG_INFO, "skip %s", name);
            continue;
        }

        /* stat the item */
        struct stat st;
        int status = mfu_lstat(name, &st);
        if (status != 0) {
            MFU_LOG(MFU_LOG_ERR, "mfu_lstat() failed: `%s' rc=%d (errno=%d %s)",
                name, status, errno, strerror(errno));
            continue;
        }

        /* insert item into output list */
        mfu_flist_insert_stat(flist, name, st.st_mode, &st);
    }

    /* compute global summary */
    mfu_flist_summarize(flist);
}
/* Print to stdout the file size, stripe count, and stripe size of each
 * regular file in this process's portion of the list.  Rank 0 prints a
 * two-line header first, and all ranks wait at a barrier before any
 * rows are printed.  Files whose striping information cannot be read
 * via mfu_stripe_get() are silently skipped. */
static void stripe_info_report(mfu_flist list)
{
    uint64_t idx;
    uint64_t size = mfu_flist_size(list);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* print header */
    if (rank == 0) {
        printf("%10s %3.3s %8.8s %s\n", "Size", "Cnt", "Str Size", "File Path");
        printf("%10s %3.3s %8.8s %s\n", "----", "---", "--------", "---------");
        fflush(stdout);
    }

    /* hold every rank until the header has been written */
    MPI_Barrier(MPI_COMM_WORLD);

    /* print out file info */
    for (idx = 0; idx < size; idx++) {
        mfu_filetype type = mfu_flist_file_get_type(list, idx);

        /* report striping information for regular files only */
        if (type == MFU_TYPE_FILE) {
            const char* in_path = mfu_flist_file_get_name(list, idx);
            uint64_t stripe_size = 0;
            uint64_t stripe_count = 0;

            /* buffers sized for the 10- and 8-character columns plus NUL */
            char filesize[11];
            char stripesize[9];

            /*
             * attempt to get striping info and print it out,
             * skip the file if we can't get the striping info we seek
             */
            if (mfu_stripe_get(in_path, &stripe_size, &stripe_count) != 0) {
                continue;
            }

            /* format it nicely */
            generate_pretty_size(filesize, sizeof(filesize), mfu_flist_file_get_size(list, idx));
            generate_pretty_size(stripesize, sizeof(stripesize), stripe_size);

            /* print the row; stripe_count is unsigned, so use PRIu64 */
            printf("%10.10s %3" PRIu64 " %8.8s %s\n",
                filesize, stripe_count, stripesize, in_path);
            fflush(stdout);
        }
    }
}
/* libcircle create callback: enqueue one work item for every local
 * entry of circle_list.  Each work item is a string whose first
 * character encodes the item type ('d' directory, 'f' regular file or
 * symlink, 'u' anything else) followed by the item's path.  Entries
 * whose encoded form would not fit in CIRCLE_MAX_STRING_LEN bytes are
 * logged and not enqueued. */
static void remove_create(CIRCLE_handle* handle)
{
    char item[CIRCLE_MAX_STRING_LEN];

    const uint64_t count = mfu_flist_size(circle_list);

    uint64_t i = 0;
    for (; i < count; i++) {
        /* look up path and type of the current entry */
        const char* name = mfu_flist_file_get_name(circle_list, i);
        mfu_filetype type = mfu_flist_file_get_type(circle_list, i);

        /* leading character carries the type */
        char code = 'u';
        if (type == MFU_TYPE_DIR) {
            code = 'd';
        } else if (type == MFU_TYPE_FILE || type == MFU_TYPE_LINK) {
            code = 'f';
        }
        item[0] = code;

        /* we need room for the type character, the name, and the
         * terminating NUL */
        if (strlen(name) + 2 > CIRCLE_MAX_STRING_LEN) {
            MFU_LOG(MFU_LOG_ERR, "Filename longer than %lu",
                (unsigned long)CIRCLE_MAX_STRING_LEN
            );
            continue;
        }

        /* append the name after the type character and enqueue */
        strcpy(&item[1], name);
        handle->enqueue(item);
    }
}
int main(int argc, char** argv) { uint64_t i; int status; uint64_t file_size; uint64_t chunk_size = DDUP_CHUNK_SIZE; SHA256_CTX* ctx_ptr; MPI_Init(NULL, NULL); mfu_init(); int rank, ranks; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &ranks); /* pointer to mfu_walk_opts */ mfu_walk_opts_t* walk_opts = mfu_walk_opts_new(); mfu_debug_level = MFU_LOG_VERBOSE; static struct option long_options[] = { {"debug", 0, 0, 'd'}, {"verbose", 0, 0, 'v'}, {"quiet", 0, 0, 'q'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* Parse options */ int usage = 0; int help = 0; int c; int option_index = 0; while ((c = getopt_long(argc, argv, "d:vqh", \ long_options, &option_index)) != -1) { switch (c) { case 'd': if (strncmp(optarg, "fatal", 5) == 0) { mfu_debug_level = MFU_LOG_FATAL; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: fatal"); } else if (strncmp(optarg, "err", 3) == 0) { mfu_debug_level = MFU_LOG_ERR; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: " "errors"); } else if (strncmp(optarg, "warn", 4) == 0) { mfu_debug_level = MFU_LOG_WARN; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: " "warnings"); } else if (strncmp(optarg, "info", 4) == 0) { mfu_debug_level = MFU_LOG_INFO; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: info"); } else if (strncmp(optarg, "dbg", 3) == 0) { mfu_debug_level = MFU_LOG_DBG; if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level set to: debug"); } else { if (rank == 0) MFU_LOG(MFU_LOG_INFO, "Debug level `%s' not " "recognized. 
Defaulting to " "`info'.", optarg); } case 'h': usage = 1; help = 1; case 'v': mfu_debug_level = MFU_LOG_VERBOSE; break; case 'q': mfu_debug_level = MFU_LOG_NONE; break; case '?': usage = 1; help = 1; break; default: usage = 1; break; } } /* check that user gave us one and only one directory */ int numargs = argc - optind; if (numargs != 1) { /* missing the directory, so post a message, and print usage */ if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "You must specify a directory path"); } usage = 1; } /* print usage and bail if needed */ if (usage) { if (rank == 0) { print_usage(); } /* set error code base on whether user requested usage or not */ if (help) { status = 0; } else { status = -1; } MPI_Barrier(MPI_COMM_WORLD); goto out; } /* get the directory name */ const char* dir = argv[optind]; /* create MPI datatypes */ MPI_Datatype key; MPI_Datatype keysat; mpi_type_init(&key, &keysat); /* create DTCMP comparison operation */ DTCMP_Op cmp; mtcmp_cmp_init(&cmp); /* allocate buffer to read data from file */ char* chunk_buf = (char*)MFU_MALLOC(DDUP_CHUNK_SIZE); /* allocate a file list */ mfu_flist flist = mfu_flist_new(); /* Walk the path(s) to build the flist */ mfu_flist_walk_path(dir, walk_opts, flist); /* TODO: spread list among procs? 
*/ /* get local number of items in flist */ uint64_t checking_files = mfu_flist_size(flist); /* allocate memory to hold SHA256 context values */ struct file_item* file_items = (struct file_item*) MFU_MALLOC(checking_files * sizeof(*file_items)); /* Allocate two lists of length size, where each * element has (DDUP_KEY_SIZE + 1) uint64_t values * (id, checksum, index) */ size_t list_bytes = checking_files * (DDUP_KEY_SIZE + 1) * sizeof(uint64_t); uint64_t* list = (uint64_t*) MFU_MALLOC(list_bytes); uint64_t* new_list = (uint64_t*) MFU_MALLOC(list_bytes); /* Initialize the list */ uint64_t* ptr = list; uint64_t new_checking_files = 0; for (i = 0; i < checking_files; i++) { /* check that item is a regular file */ mode_t mode = (mode_t) mfu_flist_file_get_mode(flist, i); if (! S_ISREG(mode)) { continue; } /* get the file size */ file_size = mfu_flist_file_get_size(flist, i); if (file_size == 0) { /* Files with size zero are not interesting at all */ continue; } /* for first pass, group all files with same file size */ ptr[0] = file_size; /* we'll leave the middle part of the key unset */ /* record our index in flist */ ptr[DDUP_KEY_SIZE] = i; /* initialize the SHA256 hash state for this file */ SHA256_Init(&file_items[i].ctx); /* increment our file count */ new_checking_files++; /* advance to next spot in the list */ ptr += DDUP_KEY_SIZE + 1; } /* reduce our list count based on any files filtered out above */ checking_files = new_checking_files; /* allocate arrays to hold result from DTCMP_Rankv call to * assign group and rank values to each item */ uint64_t output_bytes = checking_files * sizeof(uint64_t); uint64_t* group_id = (uint64_t*) MFU_MALLOC(output_bytes); uint64_t* group_ranks = (uint64_t*) MFU_MALLOC(output_bytes); uint64_t* group_rank = (uint64_t*) MFU_MALLOC(output_bytes); /* get total number of items across all tasks */ uint64_t sum_checking_files; MPI_Allreduce(&checking_files, &sum_checking_files, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD); uint64_t 
chunk_id = 0; while (sum_checking_files > 1) { /* update the chunk id we'll read from all files */ chunk_id++; /* iterate over our list and compute SHA256 value for each */ ptr = list; for (i = 0; i < checking_files; i++) { /* get the flist index for this item */ uint64_t idx = ptr[DDUP_KEY_SIZE]; /* look up file name */ const char* fname = mfu_flist_file_get_name(flist, idx); /* look up file size */ file_size = mfu_flist_file_get_size(flist, idx); /* read a chunk of data from the file into chunk_buf */ uint64_t data_size; status = read_data(fname, chunk_buf, chunk_id, chunk_size, file_size, &data_size); if (status) { /* File size has been changed, TODO: handle */ printf("failed to read file %s, maybe file " "size has been modified during the " "process", fname); } /* update the SHA256 context for this file */ ctx_ptr = &file_items[idx].ctx; SHA256_Update(ctx_ptr, chunk_buf, data_size); /* * Use SHA256 value as key. * This is actually an hack, but SHA256_Final can't * be called multiple times with out changing ctx */ SHA256_CTX ctx_tmp; memcpy(&ctx_tmp, ctx_ptr, sizeof(ctx_tmp)); SHA256_Final((unsigned char*)(ptr + 1), &ctx_tmp); /* move on to next file in the list */ ptr += DDUP_KEY_SIZE + 1; } /* Assign group ids and compute group sizes */ uint64_t groups; DTCMP_Rankv( (int)checking_files, list, &groups, group_id, group_ranks, group_rank, key, keysat, cmp, DTCMP_FLAG_NONE, MPI_COMM_WORLD ); /* any files assigned to a group of size 1 is unique, * any files in groups sizes > 1 for which we've read * all bytes are the same, and filter all other files * into a new list for another iteration */ new_checking_files = 0; ptr = list; uint64_t* new_ptr = new_list; for (i = 0; i < checking_files; i++) { /* Get index into flist for this item */ uint64_t idx = ptr[DDUP_KEY_SIZE]; /* look up file name */ const char* fname = mfu_flist_file_get_name(flist, idx); /* look up file size */ file_size = mfu_flist_file_get_size(flist, idx); /* get a pointer to the SHA256 context for 
this file */ ctx_ptr = &file_items[idx].ctx; if (group_ranks[i] == 1) { /* * Only one file in this group, * mfu_flist_file_name(flist, idx) is unique */ } else if (file_size <= (chunk_id * chunk_size)) { /* * We've run out of bytes to checksum, and we * still have a group size > 1 * mfu_flist_file_name(flist, idx) is a * duplicate with other files that also have * matching group_id[i] */ unsigned char digest[SHA256_DIGEST_LENGTH]; SHA256_Final(digest, ctx_ptr); char digest_string[SHA256_DIGEST_LENGTH * 2 + 1]; dump_sha256_digest(digest_string, digest); printf("%s %s\n", fname, digest_string); } else { /* Have multiple files with the same checksum, * but still have bytes left to read, so keep * this file */ /* use new group ID to segregate files, * this id will be unique for all files of the * same size and having the same hash up to * this point */ new_ptr[0] = group_id[i]; /* Copy over flist index into new list entry */ new_ptr[DDUP_KEY_SIZE] = idx; /* got one more in the new list */ new_checking_files++; /* move on to next item in new list */ new_ptr += DDUP_KEY_SIZE + 1; MFU_LOG(MFU_LOG_DBG, "checking file " "\"%s\" for chunk index %d of size %" PRIu64"\n", fname, (int)chunk_id, chunk_size); } /* move on to next file in the list */ ptr += DDUP_KEY_SIZE + 1; } /* Swap lists */ uint64_t* tmp_list; tmp_list = list; list = new_list; new_list = tmp_list; /* Update size of current list */ checking_files = new_checking_files; /* Get new global list size */ MPI_Allreduce(&checking_files, &sum_checking_files, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD); } /* free the walk options */ mfu_walk_opts_delete(&walk_opts); mfu_free(&group_rank); mfu_free(&group_ranks); mfu_free(&group_id); mfu_free(&new_list); mfu_free(&list); mfu_free(&file_items); mfu_free(&chunk_buf); mfu_flist_free(&flist); mtcmp_cmp_fini(&cmp); mpi_type_fini(&key, &keysat); status = 0; out: mfu_finalize(); MPI_Finalize(); return status; }
/* Removes the items in flist one directory level at a time, starting
 * from the deepest level, with a barrier between levels.  (A pass that
 * would first set write bits on directories from top to bottom is
 * present but compiled out with #if 0.)
 *
 * When mfu_debug_level is at least MFU_LOG_VERBOSE, rank 0 prints the
 * min/max/sum of per-rank remove counts and the rate for each level,
 * and a final total count, elapsed time, and rate. */
void mfu_flist_unlink(mfu_flist flist)
{
    int level;

    /* wait for all tasks and start timer */
    MPI_Barrier(MPI_COMM_WORLD);
    double start_remove = MPI_Wtime();

    /* split files into separate lists by directory depth */
    int levels, minlevel;
    mfu_flist* lists;
    mfu_flist_array_by_depth(flist, &levels, &minlevel, &lists);

#if 0
    /* dive from shallow to deep, ensure all directories have write bit set */
    for (level = 0; level < levels; level++) {
        /* get list of items for this level */
        mfu_flist list = lists[level];

        /* determine whether we have details at this level */
        int detail = mfu_flist_have_detail(list);

        /* iterate over items and set write bit on directories if needed */
        uint64_t idx;
        uint64_t size = mfu_flist_size(list);
        for (idx = 0; idx < size; idx++) {
            /* check whether we have a directory */
            mfu_filetype type = mfu_flist_file_get_type(list, idx);
            if (type == MFU_TYPE_DIR) {
                /* assume we have to set the bit */
                int set_write_bit = 1;
                if (detail) {
                    mode_t mode = (mode_t) mfu_flist_file_get_mode(list, idx);
                    if (mode & S_IWUSR) {
                        /* we have the mode of the file, and the bit is already set */
                        set_write_bit = 0;
                    }
                }

                /* set the bit if needed */
                if (set_write_bit) {
                    const char* name = mfu_flist_file_get_name(list, idx);
                    int rc = chmod(name, S_IRWXU);
                    if (rc != 0) {
                        MFU_LOG(MFU_LOG_ERR, "Failed to chmod directory `%s' (errno=%d %s)",
                            name, errno, strerror(errno)
                        );
                    }
                }
            }
        }

        /* wait for all procs to finish before we start next level */
        MPI_Barrier(MPI_COMM_WORLD);
    }
#endif

    /* now remove files starting from deepest level */
    for (level = levels - 1; level >= 0; level--) {
        double start = MPI_Wtime();

        /* get list of items for this level */
        mfu_flist list = lists[level];

        /* remove the items at this level; remove_spread is the active
         * strategy, the others are alternatives kept for experiments:
         *   remove_direct(list, &count);
         *   remove_map(list, &count);
         *   remove_sort(list, &count);
         *   remove_libcircle(list, &count);
         * TODO: remove sort w/ spread */
        uint64_t count = 0;
        remove_spread(list, &count);

        /* wait for all procs to finish before we start
         * with files at next level */
        MPI_Barrier(MPI_COMM_WORLD);

        double end = MPI_Wtime();

        if (mfu_debug_level >= MFU_LOG_VERBOSE) {
            uint64_t min, max, sum;
            MPI_Allreduce(&count, &min, 1, MPI_UINT64_T, MPI_MIN, MPI_COMM_WORLD);
            MPI_Allreduce(&count, &max, 1, MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD);
            MPI_Allreduce(&count, &sum, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);

            /* avoid dividing by a zero-length interval */
            double rate = 0.0;
            if (end - start > 0.0) {
                rate = (double)sum / (end - start);
            }
            double time_diff = end - start;

            if (mfu_rank == 0) {
                printf("level=%d min=%lu max=%lu sum=%lu rate=%f secs=%f\n",
                    (minlevel + level), (unsigned long)min, (unsigned long)max,
                    (unsigned long)sum, rate, time_diff
                );
                fflush(stdout);
            }
        }
    }

    mfu_flist_array_free(levels, &lists);

    /* wait for all tasks and stop timer */
    MPI_Barrier(MPI_COMM_WORLD);
    double end_remove = MPI_Wtime();

    /* report remove count, time, and rate */
    if (mfu_debug_level >= MFU_LOG_VERBOSE && mfu_rank == 0) {
        uint64_t all_count = mfu_flist_global_size(flist);
        double time_diff = end_remove - start_remove;

        /* avoid dividing by a zero-length interval */
        double rate = 0.0;
        if (time_diff > 0.0) {
            rate = ((double)all_count) / time_diff;
        }

        /* cast to match %lu, as the per-level report above does */
        printf("Removed %lu items in %f seconds (%f items/sec)\n",
            (unsigned long)all_count, time_diff, rate
        );
    }

    return;
}
/* for each depth, sort files by filename and then remove, to test
 * whether it matters to limit the number of directories each process
 * has to reference (e.g., locking)
 *
 *   list    - file list whose items are sorted across ranks and removed
 *   rmcount - output; number of items this process removed (0 when the
 *             global list is empty)
 *
 * Each item is packed as a fixed-size record: `chars` bytes holding the
 * NUL-terminated name followed by one byte encoding the type ('d'
 * directory, 'f' regular file or symlink, 'u' anything else).  Records
 * are sorted by name with DTCMP_Sortz and each process removes the
 * records it receives back. */
static void remove_sort(mfu_flist list, uint64_t* rmcount)
{
    /* nothing removed until we get to the delete loop */
    *rmcount = 0;

    /* bail out if total count is 0 */
    uint64_t all_count = mfu_flist_global_size(list);
    if (all_count == 0) {
        return;
    }

    /* get maximum file name and number of items */
    int chars = (int) mfu_flist_file_max_name(list);
    uint64_t my_count = mfu_flist_size(list);

    /* create key datatype (filename) and comparison op */
    MPI_Datatype dt_key;
    DTCMP_Op op_str;
    DTCMP_Str_create_ascend(chars, &dt_key, &op_str);

    /* create keysat datatype (filename + type) */
    MPI_Datatype types[2], dt_keysat;
    types[0] = dt_key;
    types[1] = MPI_CHAR;
    DTCMP_Type_create_series(2, types, &dt_keysat);

    /* allocate send buffer; widen to size_t before multiplying so the
     * product is not computed (and possibly overflowed) in int */
    int sendcount = (int) my_count;
    size_t sendbufsize = (size_t)sendcount * ((size_t)chars + 1);
    char* sendbuf = (char*) MFU_MALLOC(sendbufsize);

    /* copy data into buffer */
    char* ptr = sendbuf;
    uint64_t idx;
    for (idx = 0; idx < my_count; idx++) {
        /* encode the filename first */
        const char* name = mfu_flist_file_get_name(list, idx);
        strcpy(ptr, name);
        ptr += chars;

        /* last character encodes item type */
        mfu_filetype type = mfu_flist_file_get_type(list, idx);
        if (type == MFU_TYPE_DIR) {
            ptr[0] = 'd';
        } else if (type == MFU_TYPE_FILE || type == MFU_TYPE_LINK) {
            ptr[0] = 'f';
        } else {
            ptr[0] = 'u';
        }
        ptr++;
    }

    /* sort items */
    void* recvbuf;
    int recvcount;
    DTCMP_Handle handle;
    DTCMP_Sortz(
        sendbuf, sendcount, &recvbuf, &recvcount,
        dt_key, dt_keysat, op_str, DTCMP_FLAG_NONE, MPI_COMM_WORLD, &handle
    );

    /* delete data */
    int delcount = 0;
    ptr = (char*)recvbuf;
    while (delcount < recvcount) {
        /* get item name */
        char* name = ptr;
        ptr += chars;

        /* get item type */
        char type = ptr[0];
        ptr++;

        /* delete item */
        remove_type(type, name);
        delcount++;
    }

    /* record number of items we deleted */
    *rmcount = (uint64_t) delcount;

    /* free output data */
    DTCMP_Free(&handle);

    /* free our send buffer */
    mfu_free(&sendbuf);

    /* free key comparison operation */
    DTCMP_Op_free(&op_str);

    /* free datatypes */
    MPI_Type_free(&dt_keysat);
    MPI_Type_free(&dt_key);

    return;
}
/* for given depth, evenly spread the files among processes for
 * improved load balancing
 *
 *   flist   - file list whose items are redistributed and removed
 *   rmcount - output; number of items this process removed
 *
 * The global list of all_count items is divided into contiguous ranges,
 * one per rank, with the first (all_count % ranks) ranks getting one
 * extra item.  Each process packs its items as "<type char><path>\0"
 * strings, grouped by destination rank, exchanges them with
 * MPI_Alltoallv, and removes whatever it receives. */
static void remove_spread(mfu_flist flist, uint64_t* rmcount)
{
    uint64_t idx;

    /* initialize our remove count */
    *rmcount = 0;

    /* get our rank and number of ranks in job */
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* allocate memory for alltoall exchanges */
    size_t bufsize = (size_t)ranks * sizeof(int);
    int* sendcounts = (int*) MFU_MALLOC(bufsize);
    int* sendsizes  = (int*) MFU_MALLOC(bufsize);
    int* senddisps  = (int*) MFU_MALLOC(bufsize);
    int* recvsizes  = (int*) MFU_MALLOC(bufsize);
    int* recvdisps  = (int*) MFU_MALLOC(bufsize);

    /* get number of items */
    uint64_t my_count  = mfu_flist_size(flist);
    uint64_t all_count = mfu_flist_global_size(flist);
    uint64_t offset    = mfu_flist_global_offset(flist);

    /* compute number of bytes we'll send:
     * type character + name + terminating NUL for each item */
    size_t sendbytes = 0;
    for (idx = 0; idx < my_count; idx++) {
        const char* name = mfu_flist_file_get_name(flist, idx);
        size_t len = strlen(name) + 2;
        sendbytes += len;
    }

    /* compute the number of items that each rank should have */
    uint64_t low = all_count / (uint64_t)ranks;
    uint64_t extra = all_count - low * (uint64_t)ranks;

    /* compute number that we'll send to each rank and initialize sendsizes and offsets */
    uint64_t i;
    for (i = 0; i < (uint64_t)ranks; i++) {
        /* compute starting element id and count for given rank */
        uint64_t start, num;
        if (i < extra) {
            num = low + 1;
            start = i * num;
        } else {
            num = low;
            start = (i - extra) * num + extra * (low + 1);
        }

        /* compute the number of items we'll send to this task */
        uint64_t sendcnt = 0;
        if (my_count > 0) {
            if (start <= offset && offset < start + num) {
                /* this rank overlaps our range,
                 * and its first element comes at or before our first element */
                sendcnt = num - (offset - start);
                if (my_count < sendcnt) {
                    /* the number the rank could receive from us
                     * is more than we have left */
                    sendcnt = my_count;
                }
            } else if (offset < start && start < offset + my_count) {
                /* this rank overlaps our range,
                 * and our first element comes strictly before its first element */
                sendcnt = my_count - (start - offset);
                if (num < sendcnt) {
                    /* the number the rank can receive from us
                     * is less than we have left */
                    sendcnt = num;
                }
            }
        }

        /* record the number of items we'll send to this task */
        sendcounts[i] = (int) sendcnt;

        /* set sizes and displacements to 0, we'll fix this later */
        sendsizes[i] = 0;
        senddisps[i] = 0;
    }

    /* allocate space */
    char* sendbuf = (char*) MFU_MALLOC(sendbytes);

    /* copy data into buffer */
    int dest = -1;
    int disp = 0;
    for (idx = 0; idx < my_count; idx++) {
        /* get name and type of item */
        const char* name = mfu_flist_file_get_name(flist, idx);
        mfu_filetype type = mfu_flist_file_get_type(flist, idx);

        /* get rank that we're packing data for */
        if (dest == -1) {
            dest = get_first_nonzero(sendcounts, ranks);
            if (dest == -1) {
                /* error: no rank is left to receive items although we
                 * still have some; stop packing rather than index the
                 * per-rank arrays with -1 */
                break;
            }

            /* about to copy first item for this rank,
             * record its displacement */
            senddisps[dest] = disp;
        }

        /* identify region to be sent to rank */
        char* path = sendbuf + disp;

        /* first character encodes item type */
        if (type == MFU_TYPE_DIR) {
            path[0] = 'd';
        } else if (type == MFU_TYPE_FILE || type == MFU_TYPE_LINK) {
            path[0] = 'f';
        } else {
            path[0] = 'u';
        }

        /* now copy in the path */
        strcpy(&path[1], name);

        /* TODO: check that we don't overflow the int */
        /* add bytes to sendsizes and increase displacement */
        size_t count = strlen(name) + 2;
        sendsizes[dest] += (int) count;
        disp += (int) count;

        /* decrement the count for this rank */
        sendcounts[dest]--;
        if (sendcounts[dest] == 0) {
            dest = -1;
        }
    }

    /* compute displacements */
    senddisps[0] = 0;
    for (i = 1; i < (uint64_t)ranks; i++) {
        senddisps[i] = senddisps[i - 1] + sendsizes[i - 1];
    }

    /* alltoall to specify incoming counts */
    MPI_Alltoall(sendsizes, 1, MPI_INT, recvsizes, 1, MPI_INT, MPI_COMM_WORLD);

    /* compute size of recvbuf and displacements */
    size_t recvbytes = 0;
    recvdisps[0] = 0;
    for (i = 0; i < (uint64_t)ranks; i++) {
        recvbytes += (size_t) recvsizes[i];
        if (i > 0) {
            recvdisps[i] = recvdisps[i - 1] + recvsizes[i - 1];
        }
    }

    /* allocate recvbuf */
    char* recvbuf = (char*) MFU_MALLOC(recvbytes);

    /* alltoallv to send data */
    MPI_Alltoallv(
        sendbuf, sendsizes, senddisps, MPI_CHAR,
        recvbuf, recvsizes, recvdisps, MPI_CHAR, MPI_COMM_WORLD
    );

    /* delete data */
    char* item = recvbuf;
    while (item < recvbuf + recvbytes) {
        /* get item name and type */
        char type = item[0];
        char* name = &item[1];

        /* delete item */
        remove_type(type, name);

        /* keep tally of number of items we deleted; the parentheses
         * matter, since *rmcount++ would advance the pointer instead
         * of incrementing the count */
        (*rmcount)++;

        /* go to next item */
        size_t item_size = strlen(item) + 1;
        item += item_size;
    }

    /* free memory */
    mfu_free(&recvbuf);
    mfu_free(&recvdisps);
    mfu_free(&recvsizes);
    mfu_free(&sendbuf);
    mfu_free(&senddisps);
    mfu_free(&sendsizes);
    mfu_free(&sendcounts);

    return;
}
/* Create an archive file from the items in flist using libcircle to
 * copy file data.
 *
 *   flist       - list of items to archive
 *   archivefile - path of the archive file to create
 *   opts        - archive options; copied into DTAR_user_opts
 *
 * Steps: open the archive file, compute the size each local item takes
 * in the archive and from that its offset (an MPI_Scan over per-rank
 * byte totals gives each rank its base offset), write headers for
 * regular files, directories, and symlinks, then run a libcircle job
 * with DTAR_enqueue_copy / DTAR_perform_copy, and finally have rank 0
 * log timing and size statistics.  Failure to open the archive file or
 * the libarchive writer is reported and handed to DTAR_abort(). */
static void mfu_flist_archive_create_libcircle(mfu_flist flist, const char* archivefile, mfu_archive_options_t* opts)
{
    DTAR_flist = flist;
    DTAR_user_opts = *opts;

    MPI_Comm_rank(MPI_COMM_WORLD, &DTAR_rank);

    /* TODO: stripe the archive file if on parallel file system */

    /* init statistics */
    DTAR_statistics.total_dirs  = 0;
    DTAR_statistics.total_files = 0;
    DTAR_statistics.total_links = 0;
    DTAR_statistics.total_size  = 0;
    DTAR_statistics.total_bytes_copied = 0;

    time(&(DTAR_statistics.time_started));
    DTAR_statistics.wtime_started = MPI_Wtime();

    /* create the archive file */
    DTAR_writer.name = archivefile;
    DTAR_writer.flags = O_WRONLY | O_CREAT | O_CLOEXEC | O_LARGEFILE;
    DTAR_writer.fd_tar = open(archivefile, DTAR_writer.flags, 0664);
    if (DTAR_writer.fd_tar < 0) {
        /* without a valid descriptor nothing below can succeed */
        MFU_LOG(MFU_LOG_ERR, "Failed to open archive file `%s' (errno=%d %s)",
            archivefile, errno, strerror(errno));
        DTAR_abort(EXIT_FAILURE);
    }

    /* get number of items in our portion of the list */
    DTAR_count = mfu_flist_size(DTAR_flist);

    /* allocate memory for file sizes and offsets */
    uint64_t* fsizes = (uint64_t*) MFU_MALLOC(DTAR_count * sizeof(uint64_t));
    DTAR_offsets     = (uint64_t*) MFU_MALLOC(DTAR_count * sizeof(uint64_t));

    /* compute local offsets for each item and total
     * bytes we're contributing to the archive */
    uint64_t idx;
    uint64_t offset = 0;
    for (idx = 0; idx < DTAR_count; idx++) {
        /* assume the item takes no space */
        fsizes[idx] = 0;

        /* identify item type to compute its size in the archive */
        mfu_filetype type = mfu_flist_file_get_type(DTAR_flist, idx);
        if (type == MFU_TYPE_DIR || type == MFU_TYPE_LINK) {
            /* directories and symlinks only need the header */
            fsizes[idx] = DTAR_HDR_LENGTH;
        } else if (type == MFU_TYPE_FILE) {
            /* regular file requires a header, plus file content,
             * and things are packed into blocks of 512 bytes */
            uint64_t fsize = mfu_flist_file_get_size(DTAR_flist, idx);

            /* determine whether file size is integer multiple of 512 bytes */
            uint64_t rem = fsize % 512;
            if (rem == 0) {
                /* file content is multiple of 512 bytes, so perfect fit */
                fsizes[idx] = fsize + DTAR_HDR_LENGTH;
            } else {
                /* TODO: check and explain this math */
                /* NOTE(review): presumably content rounded up to the
                 * next 512-byte block plus a 3-block header, which
                 * would require DTAR_HDR_LENGTH == 1536 -- confirm */
                fsizes[idx] = (fsize / 512 + 4) * 512;
            }
        }

        /* increment our local offset for this item */
        DTAR_offsets[idx] = offset;
        offset += fsizes[idx];
    }

    /* execute scan to figure our global base offset in the archive file;
     * MPI_Scan is inclusive, so subtract our own contribution */
    uint64_t global_offset = 0;
    MPI_Scan(&offset, &global_offset, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
    global_offset -= offset;

    /* update offsets for each of our file to their global offset */
    for (idx = 0; idx < DTAR_count; idx++) {
        DTAR_offsets[idx] += global_offset;
    }

    /* create an archive */
    struct archive* ar = archive_write_new();

    archive_write_set_format_pax(ar);

    int r = archive_write_open_fd(ar, DTAR_writer.fd_tar);
    if (r != ARCHIVE_OK) {
        MFU_LOG(MFU_LOG_ERR, "archive_write_open_fd(): %s", archive_error_string(ar));
        DTAR_abort(EXIT_FAILURE);
    }

    /* write headers for our files */
    for (idx = 0; idx < DTAR_count; idx++) {
        mfu_filetype type = mfu_flist_file_get_type(DTAR_flist, idx);
        if (type == MFU_TYPE_FILE || type == MFU_TYPE_DIR || type == MFU_TYPE_LINK) {
            DTAR_write_header(ar, idx, DTAR_offsets[idx]);
        }
    }

    /* prepare libcircle */
    CIRCLE_init(0, NULL, CIRCLE_SPLIT_EQUAL | CIRCLE_CREATE_GLOBAL);
    CIRCLE_loglevel loglevel = CIRCLE_LOG_WARN;
    CIRCLE_enable_logging(loglevel);

    /* register callbacks */
    CIRCLE_cb_create(&DTAR_enqueue_copy);
    CIRCLE_cb_process(&DTAR_perform_copy);

    /* run the libcircle job to copy data into archive file */
    CIRCLE_begin();
    CIRCLE_finalize();

    /* compute total bytes copied */
    uint64_t archive_size = 0;
    MPI_Allreduce(&offset, &archive_size, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
    DTAR_statistics.total_size = archive_size;

    DTAR_statistics.wtime_ended = MPI_Wtime();
    time(&(DTAR_statistics.time_ended));

    /* print stats */
    double rel_time = DTAR_statistics.wtime_ended -
                      DTAR_statistics.wtime_started;
    if (DTAR_rank == 0) {
        char starttime_str[256];
        struct tm* localstart = localtime(&(DTAR_statistics.time_started));
        strftime(starttime_str, 256, "%b-%d-%Y, %H:%M:%S", localstart);

        char endtime_str[256];
        struct tm* localend = localtime(&(DTAR_statistics.time_ended));
        strftime(endtime_str, 256, "%b-%d-%Y, %H:%M:%S", localend);

        /* add two 512 blocks at the end */
        DTAR_statistics.total_size += 512 * 2;

        /* convert bandwidth to unit */
        double agg_rate_tmp;
        double agg_rate = (double) DTAR_statistics.total_size / rel_time;
        const char* agg_rate_units;
        mfu_format_bytes(agg_rate, &agg_rate_tmp, &agg_rate_units);

        MFU_LOG(MFU_LOG_INFO, "Started:    %s", starttime_str);
        MFU_LOG(MFU_LOG_INFO, "Completed:  %s", endtime_str);
        MFU_LOG(MFU_LOG_INFO, "Total archive size: %" PRIu64, DTAR_statistics.total_size);
        MFU_LOG(MFU_LOG_INFO, "Rate: %.3lf %s "
                "(%.3" PRIu64 " bytes in %.3lf seconds)",
                agg_rate_tmp, agg_rate_units, DTAR_statistics.total_size, rel_time);
    }

    /* clean up */
    mfu_free(&fsizes);
    mfu_free(&DTAR_offsets);

    /* close archive file */
    archive_write_free(ar);
    mfu_close(DTAR_writer.name, DTAR_writer.fd_tar);
}
/* Sort the items of *pflist across MPI_COMM_WORLD with DTCMP_Sortz and
 * replace *pflist with the sorted list; the input list is freed.
 *
 * sortfields is a comma-separated list of keys.  The keys handled here are
 * "name" (compared with my_strcmp) and "-name" (compared with
 * my_strcmp_rev).  Unknown keys are reported by rank 0 and skipped.  At most
 * MAXFIELDS keys are used; further valid keys are reported and ignored.
 *
 * Each sort element is laid out as the key fields (one fixed-width name
 * slot of mfu_flist_file_max_name() bytes per key) followed by the packed
 * file item.
 *
 * Always returns MFU_SUCCESS; failures while building the DTCMP types/ops
 * or while sorting go through MFU_ABORT. */
static int sort_files_readdir(const char* sortfields, mfu_flist* pflist)
{
    /* maximum number of sort keys supported,
     * an enum keeps the arrays below from being variable-length arrays */
    enum { MAXFIELDS = 1 };

    /* get list from caller */
    mfu_flist flist = *pflist;

    /* create a new list as subset of original list */
    mfu_flist flist2 = mfu_flist_subset(flist);

    uint64_t incount = mfu_flist_size(flist);
    uint64_t chars   = mfu_flist_file_max_name(flist);

    /* create datatype for packed file list element, this type is only
     * used as a building block for dt_keysat below */
    MPI_Datatype dt_sat;
    size_t bytes = mfu_flist_file_pack_size(flist);
    MPI_Type_contiguous((int)bytes, MPI_BYTE, &dt_sat);

    /* get our rank, only rank 0 prints messages */
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* build type for file path */
    MPI_Datatype dt_filepath;
    MPI_Type_contiguous((int)chars, MPI_CHAR, &dt_filepath);
    MPI_Type_commit(&dt_filepath);

    /* build comparison op for filenames */
    DTCMP_Op op_filepath;
    if (DTCMP_Op_create(dt_filepath, my_strcmp, &op_filepath) != DTCMP_SUCCESS) {
        MFU_ABORT(1, "Failed to create sorting operation for filepath");
    }

    /* build reverse comparison op for filenames */
    DTCMP_Op op_filepath_rev;
    if (DTCMP_Op_create(dt_filepath, my_strcmp_rev, &op_filepath_rev) != DTCMP_SUCCESS) {
        MFU_ABORT(1, "Failed to create reverse sorting operation for filepath");
    }

    /* per-key type, comparison op, field id, and width in bytes */
    MPI_Datatype types[MAXFIELDS];
    DTCMP_Op     ops[MAXFIELDS];
    sort_field   fields[MAXFIELDS];
    size_t       lengths[MAXFIELDS];

    int nfields;
    for (nfields = 0; nfields < MAXFIELDS; nfields++) {
        types[nfields] = MPI_DATATYPE_NULL;
        ops[nfields]   = DTCMP_OP_NULL;
    }
    nfields = 0;

    /* strtok modifies its input, so tokenize a private copy */
    char* sortfields_copy = MFU_STRDUP(sortfields);
    char* token = strtok(sortfields_copy, ",");
    while (token != NULL) {
        /* translate the token into the comparison op to use */
        int valid = 1;
        DTCMP_Op token_op = DTCMP_OP_NULL;
        if (strcmp(token, "name") == 0) {
            token_op = op_filepath;
        } else if (strcmp(token, "-name") == 0) {
            token_op = op_filepath_rev;
        } else {
            /* invalid token */
            valid = 0;
            if (rank == 0) {
                MFU_LOG(MFU_LOG_ERR, "Invalid sort field: %s\n", token);
            }
        }

        if (valid) {
            /* check for room BEFORE storing the key, otherwise an extra
             * valid key would be written past the end of the arrays */
            if (nfields >= MAXFIELDS) {
                if (rank == 0) {
                    MFU_LOG(MFU_LOG_ERR,
                        "Too many sort fields (max %d), ignoring '%s' and any fields after it",
                        (int)MAXFIELDS, token);
                }
                break;
            }

            types[nfields]   = dt_filepath;
            ops[nfields]     = token_op;
            fields[nfields]  = FILENAME;
            lengths[nfields] = chars;
            nfields++;
        }

        token = strtok(NULL, ",");
    }
    mfu_free(&sortfields_copy);

    /* build key type */
    MPI_Datatype dt_key;
    if (DTCMP_Type_create_series(nfields, types, &dt_key) != DTCMP_SUCCESS) {
        MFU_ABORT(1, "Failed to create type for key");
    }

    /* create sort op */
    DTCMP_Op op_key;
    if (DTCMP_Op_create_series(nfields, ops, &op_key) != DTCMP_SUCCESS) {
        MFU_ABORT(1, "Failed to create sorting operation for key");
    }

    /* build keysat type (key followed by packed file item) */
    MPI_Datatype dt_keysat, keysat_types[2];
    keysat_types[0] = dt_key;
    keysat_types[1] = dt_sat;
    if (DTCMP_Type_create_series(2, keysat_types, &dt_keysat) != DTCMP_SUCCESS) {
        MFU_ABORT(1, "Failed to create type for keysat");
    }

    /* get extent of key type */
    MPI_Aint key_lb, key_extent;
    MPI_Type_get_extent(dt_key, &key_lb, &key_extent);

    /* get extent of keysat type */
    MPI_Aint keysat_lb, keysat_extent;
    MPI_Type_get_extent(dt_keysat, &keysat_lb, &keysat_extent);

    /* compute size of sort element and allocate buffer */
    size_t sortbufsize = (size_t)keysat_extent * incount;
    void* sortbuf = MFU_MALLOC(sortbufsize);

    /* copy data into sort elements */
    uint64_t idx = 0;
    char* sortptr = (char*) sortbuf;
    while (idx < incount) {
        /* copy in the key fields, each occupies lengths[i] bytes */
        int i;
        for (i = 0; i < nfields; i++) {
            if (fields[i] == FILENAME) {
                const char* name = mfu_flist_file_get_name(flist, idx);
                strcpy(sortptr, name);
            }
            sortptr += lengths[i];
        }

        /* pack file element */
        sortptr += mfu_flist_file_pack(sortptr, flist, idx);

        idx++;
    }

    /* sort data */
    void* outsortbuf;
    int outsortcount;
    DTCMP_Handle handle;
    int sort_rc = DTCMP_Sortz(
        sortbuf, (int)incount, &outsortbuf, &outsortcount,
        dt_key, dt_keysat, op_key, DTCMP_FLAG_NONE, MPI_COMM_WORLD, &handle
    );
    if (sort_rc != DTCMP_SUCCESS) {
        MFU_ABORT(1, "Failed to sort data");
    }

    /* step through sorted elements, skip the key and unpack the item */
    idx = 0;
    sortptr = (char*) outsortbuf;
    while (idx < (uint64_t)outsortcount) {
        sortptr += key_extent;
        sortptr += mfu_flist_file_unpack(sortptr, flist2);
        idx++;
    }

    /* build summary of new list */
    mfu_flist_summarize(flist2);

    /* free memory associated with the sort */
    DTCMP_Free(&handle);

    /* free ops */
    DTCMP_Op_free(&op_key);
    DTCMP_Op_free(&op_filepath_rev);
    DTCMP_Op_free(&op_filepath);

    /* free types */
    MPI_Type_free(&dt_keysat);
    MPI_Type_free(&dt_key);
    MPI_Type_free(&dt_filepath);

    /* free input buffer holding sort elements */
    mfu_free(&sortbuf);

    /* free the satellite type */
    MPI_Type_free(&dt_sat);

    /* return new list and free old one */
    *pflist = flist2;
    mfu_flist_free(&flist);

    return MFU_SUCCESS;
}
int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); mfu_init(); /* get our rank and number of ranks in the job */ int rank, ranks; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &ranks); /* pointer to mfu_walk_opts */ mfu_walk_opts_t* walk_opts = mfu_walk_opts_new(); uint64_t idx; int option_index = 0; int usage = 0; int report = 0; unsigned int numpaths = 0; mfu_param_path* paths = NULL; unsigned long long bytes; /* verbose by default */ mfu_debug_level = MFU_LOG_VERBOSE; /* default to 1MB stripe size, stripe across all OSTs, and all files are candidates */ int stripes = -1; uint64_t stripe_size = 1048576; uint64_t min_size = 0; static struct option long_options[] = { {"count", 1, 0, 'c'}, {"size", 1, 0, 's'}, {"minsize", 1, 0, 'm'}, {"report", 0, 0, 'r'}, {"progress", 1, 0, 'P'}, {"verbose", 0, 0, 'v'}, {"quiet", 0, 0, 'q'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while (1) { int c = getopt_long(argc, argv, "c:s:m:rvqh", long_options, &option_index); if (c == -1) { break; } switch (c) { case 'c': /* stripe count */ stripes = atoi(optarg); break; case 's': /* stripe size in bytes */ if (mfu_abtoull(optarg, &bytes) != MFU_SUCCESS) { if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "Failed to parse stripe size: %s", optarg); } MPI_Abort(MPI_COMM_WORLD, 1); } stripe_size = (uint64_t)bytes; break; case 'm': /* min file size in bytes */ if (mfu_abtoull(optarg, &bytes) != MFU_SUCCESS) { if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "Failed to parse minimum file size: %s", optarg); } MPI_Abort(MPI_COMM_WORLD, 1); } min_size = (uint64_t)bytes; break; case 'r': /* report striping info */ report = 1; break; case 'P': mfu_progress_timeout = atoi(optarg); break; case 'v': mfu_debug_level = MFU_LOG_VERBOSE; break; case 'q': mfu_debug_level = MFU_LOG_NONE; break; case 'h': /* display usage */ usage = 1; break; case '?': /* display usage */ usage = 1; break; default: if (rank == 0) { printf("?? 
getopt returned character code 0%o ??\n", c); } } } /* check that we got a valid progress value */ if (mfu_progress_timeout < 0) { if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "Seconds in --progress must be non-negative: %d invalid", mfu_progress_timeout); } usage = 1; } /* paths to walk come after the options */ if (optind < argc) { /* determine number of paths specified by user */ numpaths = argc - optind; /* allocate space for each path */ paths = (mfu_param_path*) MFU_MALLOC((size_t)numpaths * sizeof(mfu_param_path)); /* process each path */ char** p = &argv[optind]; mfu_param_path_set_all((uint64_t)numpaths, (const char**)p, paths); optind += numpaths; } else { usage = 1; } /* if we need to print usage, print it and exit */ if (usage) { if (rank == 0) { print_usage(); } mfu_finalize(); MPI_Finalize(); return 1; } /* nothing to do if lustre support is disabled */ #ifndef LUSTRE_SUPPORT if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "Lustre support is disabled."); } MPI_Abort(MPI_COMM_WORLD, 1); #endif /* stripe count must be -1 for all available or greater than 0 */ if (stripes < -1) { if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "Stripe count must be -1 for all servers, 0 for lustre file system default, or a positive value"); } MPI_Abort(MPI_COMM_WORLD, 1); } /* lustre requires stripe sizes to be aligned */ if (stripe_size > 0 && stripe_size % 65536 != 0) { if (rank == 0) { MFU_LOG(MFU_LOG_ERR, "Stripe size must be a multiple of 65536"); } MPI_Abort(MPI_COMM_WORLD, 1); } /* TODO: verify that source / target are on Lustre */ /* walk list of input paths and stat as we walk */ mfu_flist flist = mfu_flist_new(); mfu_flist_walk_param_paths(numpaths, paths, walk_opts, flist); /* filter down our list to files which don't meet our striping requirements */ mfu_flist filtered = filter_list(flist, stripes, stripe_size, min_size, &create_prog_count_total, &stripe_prog_bytes_total); mfu_flist_free(&flist); MPI_Barrier(MPI_COMM_WORLD); /* report the file size and stripe count of all files we 
found */ if (report) { /* report the files in our filtered list */ stripe_info_report(filtered); /* free the paths and our list */ mfu_flist_free(&filtered); mfu_param_path_free_all(numpaths, paths); mfu_free(&paths); /* finalize */ mfu_finalize(); MPI_Finalize(); return 0; } /* generate a global suffix for our temp files and have each node check it's list */ char suffix[8]; uint64_t retry; /* seed our random number generator */ srand(time(NULL)); /* keep trying to make a valid random suffix...*/ do { uint64_t attempt = 0; /* make rank 0 responsible for generating a random suffix */ if (rank == 0) { generate_suffix(suffix, sizeof(suffix)); } /* broadcast the random suffix to all ranks */ MPI_Bcast(suffix, sizeof(suffix), MPI_CHAR, 0, MPI_COMM_WORLD); /* check that the file doesn't already exist */ uint64_t size = mfu_flist_size(filtered); for (idx = 0; idx < size; idx++) { char temp_path[PATH_MAX]; strcpy(temp_path, mfu_flist_file_get_name(filtered, idx)); strcat(temp_path, suffix); if(!mfu_access(temp_path, F_OK)) { /* the file already exists */ attempt = 1; break; } } /* do a reduce to figure out if a rank has a file collision */ MPI_Allreduce(&attempt, &retry, 1, MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD); } while(retry != 0); /* initialize progress messages while creating files */ create_prog_count = 0; create_prog = mfu_progress_start(mfu_progress_timeout, 1, MPI_COMM_WORLD, create_progress_fn); /* create new files so we can restripe */ uint64_t size = mfu_flist_size(filtered); for (idx = 0; idx < size; idx++) { char temp_path[PATH_MAX]; strcpy(temp_path, mfu_flist_file_get_name(filtered, idx)); strcat(temp_path, suffix); /* create a striped file at the temp file path */ mfu_stripe_set(temp_path, stripe_size, stripes); /* update our status for file create progress */ create_prog_count++; mfu_progress_update(&create_prog_count, create_prog); } /* finalize file create progress messages */ mfu_progress_complete(&create_prog_count, &create_prog); 
MPI_Barrier(MPI_COMM_WORLD); /* initialize progress messages while copying data */ stripe_prog_bytes = 0; stripe_prog = mfu_progress_start(mfu_progress_timeout, 1, MPI_COMM_WORLD, stripe_progress_fn); /* found a suffix, now we need to break our files into chunks based on stripe size */ mfu_file_chunk* file_chunks = mfu_file_chunk_list_alloc(filtered, stripe_size); mfu_file_chunk* p = file_chunks; while (p != NULL) { /* build path to temp file */ char temp_path[PATH_MAX]; strcpy(temp_path, p->name); strcat(temp_path, suffix); /* write each chunk in our list */ write_file_chunk(p, temp_path); /* move on to next file chunk */ p = p->next; } mfu_file_chunk_list_free(&file_chunks); /* finalize progress messages */ mfu_progress_complete(&stripe_prog_bytes, &stripe_prog); MPI_Barrier(MPI_COMM_WORLD); /* remove input file and rename temp file */ for (idx = 0; idx < size; idx++) { /* build path to temp file */ const char *in_path = mfu_flist_file_get_name(filtered, idx); char out_path[PATH_MAX]; strcpy(out_path, in_path); strcat(out_path, suffix); /* change the mode of the newly restriped file to be the same as the old one */ mode_t mode = (mode_t) mfu_flist_file_get_mode(filtered, idx); if (mfu_chmod(out_path, mode) != 0) { MFU_LOG(MFU_LOG_ERR, "Failed to chmod file %s (%s)", out_path, strerror(errno)); MPI_Abort(MPI_COMM_WORLD, 1); } /* rename the new, restriped file to the old name */ if (rename(out_path, in_path) != 0) { MFU_LOG(MFU_LOG_ERR, "Failed to rename file %s to %s", out_path, in_path); MPI_Abort(MPI_COMM_WORLD, 1); } } /* wait for everyone to finish */ MPI_Barrier(MPI_COMM_WORLD); /* free the walk options */ mfu_walk_opts_delete(&walk_opts); /* free filtered list, path parameters */ mfu_flist_free(&filtered); mfu_param_path_free_all(numpaths, paths); mfu_free(&paths); mfu_finalize(); MPI_Finalize(); return 0; }
/* Count the files of *pflist by size bin, sum the counts across
 * MPI_COMM_WORLD, and have rank 0 print one "[ low - high ) count" row per
 * bin.
 *
 * file_histogram - when non-zero, the separators are set up by
 *                  create_default_separators(); otherwise
 *                  option->separator_number separators from
 *                  option->separators are used
 * option         - holds the separator values bounding the bins
 * pflist         - pointer to the list whose local items are counted
 * rank           - caller's rank, only rank 0 prints
 *
 * A file goes into the first bin j with file_size <= option->separators[j];
 * files larger than every separator go into the extra last bin.
 * Always returns 0. */
static int print_flist_distribution(int file_histogram,
                                    struct distribute_option *option,
                                    mfu_flist* pflist, int rank)
{
    /* file list to use */
    mfu_flist flist = *pflist;

    /* get local size for each rank, and max file sizes */
    uint64_t size = mfu_flist_size(flist);
    uint64_t global_max_file_size;

    int separators = 0;
    if (file_histogram) {
        /* create default separators */
        create_default_separators(option, &flist, &size, &separators, &global_max_file_size);
    } else {
        separators = option->separator_number;
    }

    /* there is one bin per separator plus a final bin for everything
     * above the last separator */
    int bins = separators + 1;

    /* allocate a count for each bin and initialize the counts to 0 */
    uint64_t* dist = (uint64_t*) MFU_MALLOC((size_t)bins * sizeof(uint64_t));
    for (int i = 0; i < bins; i++) {
        dist[i] = 0;
    }

    /* for each file, identify appropriate bin and increment its count,
     * the index is 64-bit to match the type of the list size */
    for (uint64_t i = 0; i < size; i++) {
        /* get the size of the file */
        uint64_t file_size = mfu_flist_file_get_size(flist, i);

        /* assume the last bin until a separator at or above
         * the file size is found */
        int bin = separators;
        for (int j = 0; j < separators; j++) {
            if (file_size <= option->separators[j]) {
                bin = j;
                break;
            }
        }
        dist[bin]++;
    }

    /* get the total sum across all of the bins */
    uint64_t* disttotal = (uint64_t*) MFU_MALLOC((size_t)bins * sizeof(uint64_t));
    MPI_Allreduce(dist, disttotal, bins, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);

    /* Print the file distribution */
    if (rank == 0) {
        double size_tmp;
        const char* size_units;

        printf("%-27s %s\n", "Range", "Number");
        for (int i = 0; i < bins; i++) {
            /* number of files in this bin */
            uint64_t number = disttotal[i];

            /* lower bound, the first bin starts at zero */
            printf("%s", "[ ");
            if (i == 0) {
                printf("%7.3lf %2s", 0.000, "B");
            } else {
                mfu_format_bytes((uint64_t)option->separators[i - 1], &size_tmp, &size_units);
                printf("%7.3lf %2s", size_tmp, size_units);
            }

            /* upper bound and count */
            printf("%s", " - ");
            if (!file_histogram && i == separators) {
                /* last bin of a user-defined distribution has no upper bound */
                printf("%10s ) %"PRIu64"\n", "MAX", number);
            } else {
                /* NOTE(review): in histogram mode the last row reads
                 * option->separators[separators]; presumably
                 * create_default_separators provides that entry -- confirm */
                mfu_format_bytes((uint64_t)option->separators[i], &size_tmp, &size_units);
                printf("%7.3lf %2s ) %"PRIu64"\n", size_tmp, size_units, number);
            }
        }
    }

    /* free the memory used to hold bin counts */
    mfu_free(&disttotal);
    mfu_free(&dist);

    return 0;
}
/* For each item in list, call map to pick a destination rank and encode to
 * serialize the item, then exchange the encoded bytes among all ranks of
 * MPI_COMM_WORLD with MPI_Alltoallv.
 *
 * encode is called as encode(NULL, list, idx, args) to get the number of
 * bytes an item needs, and as encode(ptr, list, idx, args) to write the
 * item at ptr.
 *
 * On return *buffer points to a newly allocated buffer holding the bytes
 * received from all ranks, ordered by sending rank, and the function
 * returns the size of that buffer in bytes.  The caller owns *buffer. */
size_t mfu_flist_distribute_map(mfu_flist list, char** buffer,
                                mfu_flist_name_encode_fn encode,
                                mfu_flist_map_fn map, void* args)
{
    uint64_t idx;

    /* get number of ranks in job */
    int ranks;
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* allocate arrays for alltoall */
    size_t bufsize = (size_t)ranks * sizeof(int);
    int* sendsizes  = (int*) MFU_MALLOC(bufsize);
    int* senddisps  = (int*) MFU_MALLOC(bufsize);
    int* sendoffset = (int*) MFU_MALLOC(bufsize);
    int* recvsizes  = (int*) MFU_MALLOC(bufsize);
    int* recvdisps  = (int*) MFU_MALLOC(bufsize);

    /* initialize sendsizes and offsets */
    int i;
    for (i = 0; i < ranks; i++) {
        sendsizes[i]  = 0;
        sendoffset[i] = 0;
    }

    /* get number of elements in our local list */
    uint64_t size = mfu_flist_size(list);

    /* record the destination of each item so map is evaluated once per
     * item and the packing pass below uses exactly the ranks that the
     * sizing pass accounted for */
    int* item2rank = (int*) MFU_MALLOC((size_t)size * sizeof(int));

    /* compute number of bytes we'll send to each rank */
    size_t sendbytes = 0;
    for (idx = 0; idx < size; idx++) {
        int dest = map(list, idx, ranks, args);
        item2rank[idx] = dest;

        /* TODO: check that pack size doesn't overflow int */
        /* total number of bytes we'll send to each rank and the total overall */
        size_t count = encode(NULL, list, idx, args);
        sendsizes[dest] += (int) count;
        sendbytes += count;
    }

    /* compute displacements */
    senddisps[0] = 0;
    for (i = 1; i < ranks; i++) {
        senddisps[i] = senddisps[i - 1] + sendsizes[i - 1];
    }

    /* allocate space */
    char* sendbuf = (char*) MFU_MALLOC(sendbytes);

    /* copy data into buffer */
    for (idx = 0; idx < size; idx++) {
        int dest = item2rank[idx];

        /* identify region to be sent to rank */
        char* path = sendbuf + senddisps[dest] + sendoffset[dest];
        size_t count = encode(path, list, idx, args);

        /* TODO: check that pack size doesn't overflow int */
        /* bump up the offset for this rank */
        sendoffset[dest] += (int) count;
    }

    /* alltoall to specify incoming counts */
    MPI_Alltoall(sendsizes, 1, MPI_INT, recvsizes, 1, MPI_INT, MPI_COMM_WORLD);

    /* compute size of recvbuf and displacements */
    size_t recvbytes = 0;
    recvdisps[0] = 0;
    for (i = 0; i < ranks; i++) {
        recvbytes += (size_t) recvsizes[i];
        if (i > 0) {
            recvdisps[i] = recvdisps[i - 1] + recvsizes[i - 1];
        }
    }

    /* allocate recvbuf */
    char* recvbuf = (char*) MFU_MALLOC(recvbytes);

    /* alltoallv to send data */
    MPI_Alltoallv(
        sendbuf, sendsizes, senddisps, MPI_CHAR,
        recvbuf, recvsizes, recvdisps, MPI_CHAR, MPI_COMM_WORLD
    );

    /* free memory */
    mfu_free(&item2rank);
    mfu_free(&recvdisps);
    mfu_free(&recvsizes);
    mfu_free(&sendbuf);
    mfu_free(&sendoffset);
    mfu_free(&senddisps);
    mfu_free(&sendsizes);

    /* hand the receive buffer to the caller */
    *buffer = recvbuf;
    return recvbytes;
}
/* given an input flist, return a newly allocated flist consisting of
 * a filtered set by finding all items that match/don't match a given
 * regular expression
 *
 * flist     - list to filter
 * regex_exp - expression handed to regcomp() with cflags 0; if NULL the
 *             new list is returned as created, without copying any items
 *             and without calling mfu_flist_summarize()
 * exclude   - non-zero: keep items for which regexec() reports REG_NOMATCH,
 *             zero: keep items for which regexec() reports a match
 * name      - non-zero: test the basename of each item,
 *             zero: test the full path
 *
 * aborts through MFU_ABORT if the expression cannot be compiled */
mfu_flist mfu_flist_filter_regex(mfu_flist flist, const char* regex_exp, int exclude, int name)
{
    /* create our list to return */
    mfu_flist dest = mfu_flist_subset(flist);

    /* nothing to filter with if the user did not pass in an expression */
    if (regex_exp == NULL) {
        return dest;
    }

    /* compile regular expression, if it fails print error */
    regex_t regex;
    int regex_return = regcomp(&regex, regex_exp, 0);
    if (regex_return) {
        MFU_ABORT(-1, "Could not compile regex: `%s' rc=%d\n", regex_exp, regex_return);
    }

    /* copy the things that don't or do (based on input) match the regex
     * into a filtered list */
    uint64_t size = mfu_flist_size(flist);
    for (uint64_t idx = 0; idx < size; idx++) {
        /* get full path of item */
        const char* file_name = mfu_flist_file_get_name(flist, idx);

        /* execute regex on item, either against the basename or
         * the full path depending on name flag */
        if (name) {
            /* get basename of item (exclude the path) */
            mfu_path* pathname = mfu_path_from_str(file_name);
            mfu_path_basename(pathname);
            char* base = mfu_path_strdup(pathname);

            /* run regex on basename */
            regex_return = regexec(&regex, base, 0, NULL, 0);

            /* free the basename */
            mfu_free(&base);
            mfu_path_delete(&pathname);
        } else {
            /* run regex on full path */
            regex_return = regexec(&regex, file_name, 0, NULL, 0);
        }

        /* when excluding, keep only definite non-matches; otherwise keep
         * only matches (any other regexec result keeps nothing) */
        int keep = exclude ? (regex_return == REG_NOMATCH) : (regex_return == 0);
        if (keep) {
            mfu_flist_file_copy(flist, idx, dest);
        }
    }

    /* release memory held by the compiled expression */
    regfree(&regex);

    /* summarize the filtered list */
    mfu_flist_summarize(dest);

    /* return the filtered list */
    return dest;
}
/* Redistribute the items of a list among the ranks of MPI_COMM_WORLD.
 *
 * The map function is called once for every local item and returns the
 * rank that item should move to.  Items are packed, exchanged with
 * MPI_Alltoallv, unpacked into a new list created with mfu_flist_subset(),
 * and that new list is summarized and returned.  The input list is left
 * untouched. */
mfu_flist mfu_flist_remap(mfu_flist list, mfu_flist_map_fn map, const void* args)
{
    /* output list, filled from whatever we receive below */
    mfu_flist remapped = mfu_flist_subset(list);

    /* number of ranks taking part in the exchange */
    int nranks;
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    /* per-rank bookkeeping for the exchange: byte counts, displacements,
     * and how much of each send region has been filled so far */
    size_t table_bytes = (size_t)nranks * sizeof(int);
    int* send_counts = (int*) MFU_MALLOC(table_bytes);
    int* send_displs = (int*) MFU_MALLOC(table_bytes);
    int* send_filled = (int*) MFU_MALLOC(table_bytes);
    int* recv_counts = (int*) MFU_MALLOC(table_bytes);
    int* recv_displs = (int*) MFU_MALLOC(table_bytes);

    int r;
    for (r = 0; r < nranks; r++) {
        send_counts[r] = 0;
        send_filled[r] = 0;
    }

    /* number of items we hold locally, and the rank chosen for each */
    uint64_t nitems = mfu_flist_size(list);
    int* target = (int*) MFU_MALLOC(nitems * sizeof(int));

    /* first pass: pick a destination for every item and total up
     * the bytes headed to each rank */
    size_t send_total = 0;
    uint64_t item;
    for (item = 0; item < nitems; item++) {
        int dest = map(list, item, nranks, args);

        /* remember the choice so the packing pass can reuse it */
        target[item] = dest;

        /* TODO: check that pack size doesn't overflow int */
        size_t packed = mfu_flist_file_pack_size(list);
        send_counts[dest] += (int) packed;
        send_total += packed;
    }

    /* each rank's send region starts where the previous one ends */
    send_displs[0] = 0;
    for (r = 1; r < nranks; r++) {
        send_displs[r] = send_displs[r - 1] + send_counts[r - 1];
    }

    /* second pass: pack every item into the region of its destination */
    char* sendbuf = (char*) MFU_MALLOC(send_total);
    for (item = 0; item < nitems; item++) {
        int dest = target[item];
        char* slot = sendbuf + send_displs[dest] + send_filled[dest];
        size_t packed = mfu_flist_file_pack(slot, list, item);

        /* TODO: check that pack size doesn't overflow int */
        send_filled[dest] += (int) packed;
    }

    /* learn how many bytes each rank is going to send us */
    MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, MPI_COMM_WORLD);

    /* lay out the receive buffer in rank order */
    recv_displs[0] = 0;
    size_t recv_total = (size_t) recv_counts[0];
    for (r = 1; r < nranks; r++) {
        recv_displs[r] = recv_displs[r - 1] + recv_counts[r - 1];
        recv_total += (size_t) recv_counts[r];
    }

    /* exchange the packed items */
    char* recvbuf = (char*) MFU_MALLOC(recv_total);
    MPI_Alltoallv(
        sendbuf, send_counts, send_displs, MPI_CHAR,
        recvbuf, recv_counts, recv_displs, MPI_CHAR, MPI_COMM_WORLD
    );

    /* unpack everything we received, one item at a time */
    char* cursor = recvbuf;
    char* recv_end = recvbuf + recv_total;
    while (cursor < recv_end) {
        size_t consumed = mfu_flist_file_unpack(cursor, remapped);
        cursor += consumed;
    }
    mfu_flist_summarize(remapped);

    /* release all temporary buffers */
    mfu_free(&target);
    mfu_free(&recvbuf);
    mfu_free(&recv_displs);
    mfu_free(&recv_counts);
    mfu_free(&sendbuf);
    mfu_free(&send_filled);
    mfu_free(&send_displs);
    mfu_free(&send_counts);

    return remapped;
}
/* returns 1 if the item at the given global index is among the first
 * `range` or the last `range` items of a list with `total` items */
static int flist_print_selected(uint64_t global, uint64_t total, uint64_t range)
{
    if (global < range) {
        return 1;
    }
    if ((total - global) <= range) {
        return 1;
    }
    return 0;
}

/* Print the leading and trailing items of a list.
 *
 * Every rank packs those of its items whose global index falls within the
 * first or last 10 entries, the packed items are gathered to rank 0, and
 * rank 0 unpacks them into a temporary list and prints each one with
 * print_file().  A "<snip>" marker is printed after the first 10 items
 * when the list holds more than 20 items in total. */
void mfu_flist_print(mfu_flist flist)
{
    /* how many items to show from each end of the list */
    uint64_t range = 10;

    /* send and receive buffers are sized for 2 * range packed items */
    size_t pack_size = mfu_flist_file_pack_size(flist);
    size_t bufsize = 2 * range * pack_size;
    void* sendbuf = MFU_MALLOC(bufsize);
    void* recvbuf = MFU_MALLOC(bufsize);

    /* who we are and how many ranks there are */
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* local item count, global item count, and the global index
     * of our first item */
    uint64_t count  = mfu_flist_size(flist);
    uint64_t total  = mfu_flist_global_size(flist);
    uint64_t offset = mfu_flist_global_offset(flist);

    /* count how many of our items are going to be printed */
    int num = 0;
    uint64_t idx;
    for (idx = 0; idx < count; idx++) {
        if (flist_print_selected(offset + idx, total, range)) {
            num++;
        }
    }

    /* per-rank byte counts and displacements for the gather */
    int* counts = (int*) MFU_MALLOC((size_t)ranks * sizeof(int));
    int* disps  = (int*) MFU_MALLOC((size_t)ranks * sizeof(int));

    /* let rank 0 know how many bytes each rank contributes */
    int bytes = num * (int)pack_size;
    MPI_Gather(&bytes, 1, MPI_INT, counts, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* pack the selected items back to back */
    char* ptr = (char*) sendbuf;
    for (idx = 0; idx < count; idx++) {
        if (flist_print_selected(offset + idx, total, range)) {
            ptr += mfu_flist_file_pack(ptr, flist, idx);
        }
    }

    /* rank 0 works out where each contribution lands and the total size */
    int recvbytes = 0;
    if (rank == 0) {
        disps[0] = 0;
        recvbytes = counts[0];

        int i;
        for (i = 1; i < ranks; i++) {
            disps[i] = disps[i - 1] + counts[i - 1];
            recvbytes += counts[i];
        }
    }

    /* collect the packed items on rank 0 */
    MPI_Gatherv(sendbuf, bytes, MPI_BYTE, recvbuf, counts, disps, MPI_BYTE, 0, MPI_COMM_WORLD);

    /* temporary list that holds the items to print */
    mfu_flist tmplist = mfu_flist_subset(flist);

    /* only rank 0 has received anything to unpack */
    if (rank == 0) {
        char* end = (char*) recvbuf + recvbytes;
        for (ptr = (char*) recvbuf; ptr < end; ptr += pack_size) {
            mfu_flist_file_unpack(ptr, tmplist);
        }
    }

    /* summarize list */
    mfu_flist_summarize(tmplist);

    /* print the items, noting where the middle of the list was left out */
    if (rank == 0) {
        printf("\n");

        uint64_t tmpsize = mfu_flist_size(tmplist);
        uint64_t tmpidx;
        for (tmpidx = 0; tmpidx < tmpsize; tmpidx++) {
            print_file(tmplist, tmpidx);

            if (tmpidx + 1 == range && total > 2 * range) {
                /* going to have to leave some out */
                printf("\n<snip>\n\n");
            }
        }

        printf("\n");
    }

    /* free our temporary list */
    mfu_flist_free(&tmplist);

    /* free memory */
    mfu_free(&disps);
    mfu_free(&counts);
    mfu_free(&sendbuf);
    mfu_free(&recvbuf);

    return;
}