/* Map function that assigns list items to ranks in contiguous blocks so
 * the list ends up evenly spread.  When the global size is not a multiple
 * of the rank count, the lowest-numbered ranks each hold one extra item.
 *
 *   flist - list being remapped
 *   idx   - local index of the item on the calling rank
 *   ranks - number of ranks to spread across
 *   args  - unused
 *
 * Returns the rank that item idx maps to. */
static int map_spread(mfu_flist flist, uint64_t idx, int ranks, void* args)
{
    /* position of this item within the global list */
    uint64_t pos = mfu_flist_global_offset(flist) + idx;

    /* total number of items across all ranks */
    uint64_t total = mfu_flist_global_size(flist);

    /* every rank gets at least "base" items; "leftover" items remain
     * when the total does not divide evenly */
    uint64_t nranks   = (uint64_t) ranks;
    uint64_t base     = total / nranks;
    uint64_t leftover = total - base * nranks;

    /* the first "leftover" ranks each hold (base + 1) items, so together
     * they cover global positions [0, boundary) */
    uint64_t big      = base + 1;
    uint64_t boundary = leftover * big;

    if (pos < boundary) {
        /* item lands on one of the ranks holding an extra item */
        return (int) (pos / big);
    }

    /* item lands past the ranks holding an extra item */
    return (int) (leftover + (pos - boundary) / base);
}
/* Split the items of srclist into one list per directory depth.
 *
 *   srclist   - input list
 *   outlevels - set to the number of depth levels (0 if the list is empty)
 *   outmin    - set to the minimum depth (-1 if the list is empty)
 *   outlists  - set to a newly allocated array of outlevels lists, where
 *               entry i holds the items at depth (*outmin + i); NULL if
 *               the list is empty
 *
 * If any output pointer is NULL the function returns without doing
 * anything. */
void mfu_flist_array_by_depth(
    mfu_flist srclist,
    int* outlevels,
    int* outmin,
    mfu_flist** outlists)
{
    /* nothing to do unless the caller gave us somewhere to put results */
    if (outlevels == NULL || outmin == NULL || outlists == NULL) {
        return;
    }

    /* defaults reported for an empty list */
    *outlevels = 0;
    *outmin    = -1;
    *outlists  = NULL;

    /* an empty global list has no levels */
    if (mfu_flist_global_size(srclist) == 0) {
        return;
    }

    /* depth range determines how many lists we need */
    int shallowest = mfu_flist_min_depth(srclist);
    int deepest    = mfu_flist_max_depth(srclist);
    int nlevels    = deepest - shallowest + 1;

    /* one subset list per level */
    mfu_flist* by_depth = (mfu_flist*) MFU_MALLOC((size_t)nlevels * sizeof(mfu_flist));
    int lev;
    for (lev = 0; lev < nlevels; lev++) {
        by_depth[lev] = mfu_flist_subset(srclist);
    }

    /* route each local item to the list matching its depth */
    uint64_t nlocal = mfu_flist_size(srclist);
    uint64_t item;
    for (item = 0; item < nlocal; item++) {
        int depth = mfu_flist_file_get_depth(srclist, item);
        mfu_flist_file_copy(srclist, item, by_depth[depth - shallowest]);
    }

    /* finalize each per-level list */
    for (lev = 0; lev < nlevels; lev++) {
        mfu_flist_summarize(by_depth[lev]);
    }

    /* hand results back to the caller */
    *outlevels = nlevels;
    *outmin    = shallowest;
    *outlists  = by_depth;
}
/* Sort flist by the specified fields, given as a comma-delimited list.
 * Precede a field name with a '-' character to reverse its sort order:
 *   name,user,group,uid,gid,atime,mtime,ctime,size
 * For example, to sort by size in descending order, followed by name:
 *   char fields[] = "size,-name";
 *
 *   sortfields - comma-delimited field list (must not be NULL)
 *   pflist     - pointer to the list handle (must not be NULL); passed by
 *                pointer to the sort helpers, which may update it
 *
 * Returns MFU_FAILURE if either argument is NULL, otherwise the return
 * code of the sort helper that was invoked. */
int mfu_flist_sort(const char* sortfields, mfu_flist* pflist)
{
    if (sortfields == NULL || pflist == NULL) {
        return MFU_FAILURE;
    }

    /* start timer */
    double start_sort = MPI_Wtime();

    /* sort list, picking the helper based on whether the list carries
     * detailed (stat) information */
    int rc;
    if (mfu_flist_have_detail(*pflist)) {
        rc = sort_files_stat(sortfields, pflist);
    }
    else {
        rc = sort_files_readdir(sortfields, pflist);
    }

    /* end timer */
    double end_sort = MPI_Wtime();

    /* report sort count, time, and rate */
    if (mfu_debug_level >= MFU_LOG_VERBOSE && mfu_rank == 0) {
        /* Read the handle only now: the sort helpers receive pflist by
         * pointer and may replace the list, in which case a handle
         * captured before the sort would be stale.
         * NOTE(review): helper bodies are not visible here -- confirm. */
        uint64_t all_count = mfu_flist_global_size(*pflist);

        double secs = end_sort - start_sort;
        double rate = 0.0;
        if (secs > 0.0) {
            rate = ((double)all_count) / secs;
        }

        /* cast so the argument type matches %lu on every platform */
        printf("Sorted %lu files in %f seconds (%f files/sec)\n",
               (unsigned long) all_count, secs, rate
              );
    }

    return rc;
}
/* Removes every item in flist.  Items are split into one list per
 * directory depth and removed one level at a time starting from the
 * deepest, with a barrier between levels.
 *
 * The pass that would set the write bit on directories from top to
 * bottom before removal is currently compiled out (#if 0).
 *
 * When verbose logging is enabled, per-level and overall counts, times,
 * and rates are printed by rank 0. */
void mfu_flist_unlink(mfu_flist flist)
{
    int level;

    /* wait for all tasks and start timer */
    MPI_Barrier(MPI_COMM_WORLD);
    double start_remove = MPI_Wtime();

    /* split files into separate lists by directory depth */
    int levels, minlevel;
    mfu_flist* lists;
    mfu_flist_array_by_depth(flist, &levels, &minlevel, &lists);

#if 0
    /* dive from shallow to deep, ensure all directories have write bit set */
    for (level = 0; level < levels; level++) {
        /* get list of items for this level */
        mfu_flist list = lists[level];

        /* determine whether we have details at this level */
        int detail = mfu_flist_have_detail(list);

        /* iterate over items and set write bit on directories if needed */
        uint64_t idx;
        uint64_t size = mfu_flist_size(list);
        for (idx = 0; idx < size; idx++) {
            /* check whether we have a directory */
            mfu_filetype type = mfu_flist_file_get_type(list, idx);
            if (type == MFU_TYPE_DIR) {
                /* assume we have to set the bit */
                int set_write_bit = 1;
                if (detail) {
                    mode_t mode = (mode_t) mfu_flist_file_get_mode(list, idx);
                    if (mode & S_IWUSR) {
                        /* we have the mode of the file, and the bit is already set */
                        set_write_bit = 0;
                    }
                }

                /* set the bit if needed */
                if (set_write_bit) {
                    const char* name = mfu_flist_file_get_name(list, idx);
                    int rc = chmod(name, S_IRWXU);
                    if (rc != 0) {
                        MFU_LOG(MFU_LOG_ERR, "Failed to chmod directory `%s' (errno=%d %s)",
                                name, errno, strerror(errno)
                               );
                    }
                }
            }
        }

        /* wait for all procs to finish before we start next level */
        MPI_Barrier(MPI_COMM_WORLD);
    }
#endif

    /* now remove files starting from deepest level */
    for (level = levels - 1; level >= 0; level--) {
        double start = MPI_Wtime();

        /* get list of items for this level */
        mfu_flist list = lists[level];

        /* remove the items at this level; alternative strategies are
         * kept here for reference */
        uint64_t count = 0;
        //remove_direct(list, &count);
        remove_spread(list, &count);
//        remove_map(list, &count);
//        remove_sort(list, &count);
//        remove_libcircle(list, &count);
//        TODO: remove sort w/ spread

        /* wait for all procs to finish before we start
         * with files at next level */
        MPI_Barrier(MPI_COMM_WORLD);

        double end = MPI_Wtime();

        if (mfu_debug_level >= MFU_LOG_VERBOSE) {
            /* gather min/max/sum of the per-rank removal counts */
            uint64_t min, max, sum;
            MPI_Allreduce(&count, &min, 1, MPI_UINT64_T, MPI_MIN, MPI_COMM_WORLD);
            MPI_Allreduce(&count, &max, 1, MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD);
            MPI_Allreduce(&count, &sum, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);

            double time_diff = end - start;
            double rate = 0.0;
            if (time_diff > 0.0) {
                rate = (double)sum / time_diff;
            }

            if (mfu_rank == 0) {
                printf("level=%d min=%lu max=%lu sum=%lu rate=%f secs=%f\n",
                       (minlevel + level), (unsigned long)min, (unsigned long)max, (unsigned long)sum, rate, time_diff
                      );
                fflush(stdout);
            }
        }
    }

    /* release the per-level lists */
    mfu_flist_array_free(levels, &lists);

    /* wait for all tasks and stop timer */
    MPI_Barrier(MPI_COMM_WORLD);
    double end_remove = MPI_Wtime();

    /* report remove count, time, and rate */
    if (mfu_debug_level >= MFU_LOG_VERBOSE && mfu_rank == 0) {
        uint64_t all_count = mfu_flist_global_size(flist);
        double time_diff = end_remove - start_remove;
        double rate = 0.0;
        if (time_diff > 0.0) {
            rate = ((double)all_count) / time_diff;
        }

        /* cast so the argument type matches %lu, as in the per-level
         * report above */
        printf("Removed %lu items in %f seconds (%f items/sec)\n",
               (unsigned long) all_count, time_diff, rate
              );
    }

    return;
}
/* For one depth level, sort items by filename across all ranks and then
 * remove them, to test whether it matters to limit the number of
 * directories each process has to reference (e.g., locking).
 *
 *   list    - items at a single depth
 *   rmcount - set to the number of items this rank passed to
 *             remove_type() (0 if the global list is empty) */
static void remove_sort(mfu_flist list, uint64_t* rmcount)
{
    /* initialize our remove count so it is defined on every path */
    *rmcount = 0;

    /* bail out if total count is 0 */
    uint64_t all_count = mfu_flist_global_size(list);
    if (all_count == 0) {
        return;
    }

    /* get maximum file name and number of items;
     * NOTE(review): the packing below assumes the max-name value leaves
     * room for the terminating NUL of the longest name -- confirm */
    int chars = (int) mfu_flist_file_max_name(list);
    uint64_t my_count = mfu_flist_size(list);

    /* create key datatype (filename) and comparison op */
    MPI_Datatype dt_key;
    DTCMP_Op op_str;
    DTCMP_Str_create_ascend(chars, &dt_key, &op_str);

    /* create keysat datatype (filename + type) */
    MPI_Datatype types[2], dt_keysat;
    types[0] = dt_key;
    types[1] = MPI_CHAR;
    DTCMP_Type_create_series(2, types, &dt_keysat);

    /* allocate send buffer; widen to size_t before multiplying so the
     * product cannot overflow int */
    /* TODO: check that my_count fits in an int */
    int sendcount = (int) my_count;
    size_t recsize = (size_t)chars + 1;
    size_t sendbufsize = (size_t)sendcount * recsize;
    char* sendbuf = (char*) MFU_MALLOC(sendbufsize);

    /* copy data into buffer, one fixed-size record per item */
    char* ptr = sendbuf;
    uint64_t idx;
    for (idx = 0; idx < my_count; idx++) {
        /* encode the filename first */
        const char* name = mfu_flist_file_get_name(list, idx);
        strcpy(ptr, name);
        ptr += chars;

        /* last character encodes item type */
        mfu_filetype type = mfu_flist_file_get_type(list, idx);
        if (type == MFU_TYPE_DIR) {
            ptr[0] = 'd';
        }
        else if (type == MFU_TYPE_FILE || type == MFU_TYPE_LINK) {
            ptr[0] = 'f';
        }
        else {
            ptr[0] = 'u';
        }
        ptr++;
    }

    /* sort items */
    void* recvbuf;
    int recvcount;
    DTCMP_Handle handle;
    DTCMP_Sortz(
        sendbuf, sendcount, &recvbuf, &recvcount,
        dt_key, dt_keysat, op_str, DTCMP_FLAG_NONE, MPI_COMM_WORLD, &handle
    );

    /* delete data, walking the received records in order */
    int delcount = 0;
    ptr = (char*)recvbuf;
    while (delcount < recvcount) {
        /* get item name */
        char* name = ptr;
        ptr += chars;

        /* get item type */
        char type = ptr[0];
        ptr++;

        /* delete item */
        remove_type(type, name);
        delcount++;
    }

    /* record number of items we deleted */
    *rmcount = (uint64_t) delcount;

    /* free output data */
    DTCMP_Free(&handle);

    /* free our send buffer */
    mfu_free(&sendbuf);

    /* free key comparison operation */
    DTCMP_Op_free(&op_str);

    /* free datatypes */
    MPI_Type_free(&dt_keysat);
    MPI_Type_free(&dt_key);

    return;
}
/* For one depth level, evenly spread the items among processes for
 * improved load balancing, then have each process remove the items it
 * received.
 *
 * Each item is packed as a type character ('d', 'f', or 'u') followed by
 * its NUL-terminated path.  Items are redistributed in contiguous blocks
 * of the global list with MPI_Alltoall / MPI_Alltoallv.
 *
 *   flist   - items at a single depth
 *   rmcount - set to the number of items this rank passed to
 *             remove_type() */
static void remove_spread(mfu_flist flist, uint64_t* rmcount)
{
    uint64_t idx;

    /* initialize our remove count */
    *rmcount = 0;

    /* get our rank and number of ranks in job */
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* allocate memory for alltoall exchanges */
    size_t bufsize = (size_t)ranks * sizeof(int);
    int* sendcounts = (int*) MFU_MALLOC(bufsize);
    int* sendsizes  = (int*) MFU_MALLOC(bufsize);
    int* senddisps  = (int*) MFU_MALLOC(bufsize);
    int* recvsizes  = (int*) MFU_MALLOC(bufsize);
    int* recvdisps  = (int*) MFU_MALLOC(bufsize);

    /* get number of items */
    uint64_t my_count  = mfu_flist_size(flist);
    uint64_t all_count = mfu_flist_global_size(flist);
    uint64_t offset    = mfu_flist_global_offset(flist);

    /* compute number of bytes we'll send:
     * type character + path + terminating NUL for each item */
    size_t sendbytes = 0;
    for (idx = 0; idx < my_count; idx++) {
        const char* name = mfu_flist_file_get_name(flist, idx);
        size_t len = strlen(name) + 2;
        sendbytes += len;
    }

    /* compute the number of items that each rank should have */
    uint64_t low = all_count / (uint64_t)ranks;
    uint64_t extra = all_count - low * (uint64_t)ranks;

    /* compute number that we'll send to each rank and initialize sendsizes and offsets */
    uint64_t i;
    for (i = 0; i < (uint64_t)ranks; i++) {
        /* compute starting element id and count for given rank */
        uint64_t start, num;
        if (i < extra) {
            num = low + 1;
            start = i * num;
        }
        else {
            num = low;
            start = (i - extra) * num + extra * (low + 1);
        }

        /* compute the number of items we'll send to this task */
        uint64_t sendcnt = 0;
        if (my_count > 0) {
            if (start <= offset && offset < start + num) {
                /* this rank overlaps our range,
                 * and its first element comes at or before our first element */
                sendcnt = num - (offset - start);
                if (my_count < sendcnt) {
                    /* the number the rank could receive from us
                     * is more than we have left */
                    sendcnt = my_count;
                }
            }
            else if (offset < start && start < offset + my_count) {
                /* this rank overlaps our range,
                 * and our first element comes strictly before its first element */
                sendcnt = my_count - (start - offset);
                if (num < sendcnt) {
                    /* the number the rank can receive from us
                     * is less than we have left */
                    sendcnt = num;
                }
            }
        }

        /* record the number of items we'll send to this task */
        sendcounts[i] = (int) sendcnt;

        /* set sizes and displacements to 0, we'll fix this later */
        sendsizes[i] = 0;
        senddisps[i] = 0;
    }

    /* allocate space */
    char* sendbuf = (char*) MFU_MALLOC(sendbytes);

    /* copy data into buffer */
    int dest = -1;
    int disp = 0;
    for (idx = 0; idx < my_count; idx++) {
        /* get name and type of item */
        const char* name = mfu_flist_file_get_name(flist, idx);
        mfu_filetype type = mfu_flist_file_get_type(flist, idx);

        /* get rank that we're packing data for */
        if (dest == -1) {
            dest = get_first_nonzero(sendcounts, ranks);
            if (dest == -1) {
                /* no rank is left to receive this item, which means the
                 * send counts do not add up to my_count; stop packing
                 * rather than index the arrays at -1 */
                break;
            }
            /* about to copy first item for this rank,
             * record its displacement */
            senddisps[dest] = disp;
        }

        /* identify region to be sent to rank */
        char* path = sendbuf + disp;

        /* first character encodes item type */
        if (type == MFU_TYPE_DIR) {
            path[0] = 'd';
        }
        else if (type == MFU_TYPE_FILE || type == MFU_TYPE_LINK) {
            path[0] = 'f';
        }
        else {
            path[0] = 'u';
        }

        /* now copy in the path */
        strcpy(&path[1], name);

        /* TODO: check that we don't overflow the int */
        /* add bytes to sendsizes and increase displacement */
        size_t count = strlen(name) + 2;
        sendsizes[dest] += (int) count;
        disp += (int) count;

        /* decrement the count for this rank */
        sendcounts[dest]--;
        if (sendcounts[dest] == 0) {
            dest = -1;
        }
    }

    /* compute displacements */
    senddisps[0] = 0;
    for (i = 1; i < (uint64_t)ranks; i++) {
        senddisps[i] = senddisps[i - 1] + sendsizes[i - 1];
    }

    /* alltoall to specify incoming counts */
    MPI_Alltoall(sendsizes, 1, MPI_INT, recvsizes, 1, MPI_INT, MPI_COMM_WORLD);

    /* compute size of recvbuf and displacements */
    size_t recvbytes = 0;
    recvdisps[0] = 0;
    for (i = 0; i < (uint64_t)ranks; i++) {
        recvbytes += (size_t) recvsizes[i];
        if (i > 0) {
            recvdisps[i] = recvdisps[i - 1] + recvsizes[i - 1];
        }
    }

    /* allocate recvbuf */
    char* recvbuf = (char*) MFU_MALLOC(recvbytes);

    /* alltoallv to send data */
    MPI_Alltoallv(
        sendbuf, sendsizes, senddisps, MPI_CHAR,
        recvbuf, recvsizes, recvdisps, MPI_CHAR, MPI_COMM_WORLD
    );

    /* delete data */
    char* item = recvbuf;
    while (item < recvbuf + recvbytes) {
        /* get item name and type */
        char type = item[0];
        char* name = &item[1];

        /* delete item */
        remove_type(type, name);

        /* keep tally of number of items we deleted; the parentheses are
         * required, since *rmcount++ would advance the pointer instead of
         * incrementing the count */
        (*rmcount)++;

        /* go to next item: type character + path + terminating NUL */
        size_t item_size = strlen(item) + 1;
        item += item_size;
    }

    /* free memory */
    mfu_free(&recvbuf);
    mfu_free(&recvdisps);
    mfu_free(&recvsizes);
    mfu_free(&sendbuf);
    mfu_free(&senddisps);
    mfu_free(&sendsizes);
    mfu_free(&sendcounts);

    return;
}
/* Set up and execute a directory walk with libcircle, adding the items
 * found to bflist.
 *
 *   num_paths - number of entries in paths
 *   paths     - paths to start walking from
 *   walk_opts - walk options; the dir_perms, remove, and use_stat fields
 *               are read here
 *   bflist    - list handle that receives the walked items
 *
 * The walk is driven through file-scope globals (SET_DIR_PERMS,
 * REMOVE_FILES, CURRENT_NUM_DIRS, CURRENT_DIRS, CURRENT_LIST,
 * reduce_items) that are set here before the libcircle job starts.  The
 * list is summarized after the walk, and all ranks synchronize on a
 * barrier before returning. */
void mfu_flist_walk_paths(uint64_t num_paths, const char** paths,
                          mfu_walk_opts_t* walk_opts, mfu_flist bflist)
{
    /* start timer for the walk report printed at the end */
    double start_walk = MPI_Wtime();

    /* if dir_permission is set to 1 then set global variable */
    SET_DIR_PERMS = 0;
    if (walk_opts->dir_perms) {
        SET_DIR_PERMS = 1;
    }

    /* if remove is set to 1 then set global variable */
    REMOVE_FILES  = 0;
    if (walk_opts->remove) {
        REMOVE_FILES = 1;
    }

    /* convert handle to flist_t */
    flist_t* flist = (flist_t*) bflist;

    /* get our rank and number of ranks in job */
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* print message to user that we're starting */
    if (mfu_debug_level >= MFU_LOG_VERBOSE && mfu_rank == 0) {
        uint64_t i;
        for (i = 0; i < num_paths; i++) {
            MFU_LOG(MFU_LOG_INFO, "Walking %s", paths[i]);
        }
    }

    /* initialize libcircle */
    CIRCLE_init(0, NULL, CIRCLE_SPLIT_EQUAL);

    /* set libcircle verbosity level */
    enum CIRCLE_loglevel loglevel = CIRCLE_LOG_WARN;
    CIRCLE_enable_logging(loglevel);

    /* TODO: check that paths is not NULL */
    /* TODO: check that each path is within limits */

    /* set some global variables to do the file walk */
    CURRENT_NUM_DIRS = num_paths;
    CURRENT_DIRS     = paths;
    CURRENT_LIST     = flist;

    /* we lookup users and groups first in case we can use
     * them to filter the walk */
    flist->detail = 0;
    if (walk_opts->use_stat) {
        flist->detail = 1;
        if (flist->have_users == 0) {
            mfu_flist_usrgrp_get_users(flist);
        }
        if (flist->have_groups == 0) {
            mfu_flist_usrgrp_get_groups(flist);
        }
    }

    /* register callbacks */
    if (walk_opts->use_stat) {
        /* walk directories by calling stat on every item */
        CIRCLE_cb_create(&walk_stat_create);
        CIRCLE_cb_process(&walk_stat_process);
        //        CIRCLE_cb_create(&walk_lustrestat_create);
        //        CIRCLE_cb_process(&walk_lustrestat_process);
    }
    else {
        /* walk directories using file types in readdir */
        CIRCLE_cb_create(&walk_readdir_create);
        CIRCLE_cb_process(&walk_readdir_process);
        //        CIRCLE_cb_create(&walk_getdents_create);
        //        CIRCLE_cb_process(&walk_getdents_process);
    }

    /* prepare callbacks and initialize variables for reductions */
    reduce_items = 0;
    CIRCLE_cb_reduce_init(&reduce_init);
    CIRCLE_cb_reduce_op(&reduce_exec);
    CIRCLE_cb_reduce_fini(&reduce_fini);

    /* run the libcircle job */
    CIRCLE_begin();
    CIRCLE_finalize();

    /* compute global summary */
    mfu_flist_summarize(bflist);

    double end_walk = MPI_Wtime();

    /* report walk count, time, and rate */
    if (mfu_debug_level >= MFU_LOG_VERBOSE && mfu_rank == 0) {
        uint64_t all_count = mfu_flist_global_size(bflist);
        double time_diff = end_walk - start_walk;
        double rate = 0.0;
        if (time_diff > 0.0) {
            rate = ((double)all_count) / time_diff;
        }

        /* cast so the argument type matches %lu on every platform */
        MFU_LOG(MFU_LOG_INFO, "Walked %lu items in %f seconds (%f files/sec)",
               (unsigned long) all_count, time_diff, rate
              );
    }

    /* hold procs here until summary is printed */
    MPI_Barrier(MPI_COMM_WORLD);

    return;
}
/* Print the leading and trailing entries of a file list.  Each rank packs
 * whichever of its items fall within the first or last "range" positions
 * of the global list, the packed items are gathered on rank 0, and rank 0
 * prints them, inserting a "<snip>" marker when entries in the middle of
 * the list are left out. */
void mfu_flist_print(mfu_flist flist)
{
    /* how many items to show from each end of the list */
    uint64_t range = 10;

    /* buffers large enough to hold up to 2*range packed items */
    size_t pack_size = mfu_flist_file_pack_size(flist);
    size_t bufsize   = 2 * range * pack_size;
    void* sendbuf = MFU_MALLOC(bufsize);
    void* recvbuf = MFU_MALLOC(bufsize);

    /* who we are and how many ranks there are */
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* local count, global count, and where our items sit globally */
    uint64_t count  = mfu_flist_size(flist);
    uint64_t total  = mfu_flist_global_size(flist);
    uint64_t offset = mfu_flist_global_offset(flist);

    /* count local items that fall in the head or tail window */
    int num = 0;
    uint64_t idx;
    for (idx = 0; idx < count; idx++) {
        uint64_t global = offset + idx;
        int in_head = (global < range);
        int in_tail = ((total - global) <= range);
        if (in_head || in_tail) {
            num++;
        }
    }

    /* per-rank byte counts and displacements for the gather */
    int* counts = (int*) MFU_MALLOC((size_t)ranks * sizeof(int));
    int* disps  = (int*) MFU_MALLOC((size_t)ranks * sizeof(int));

    /* let rank 0 know how many bytes each rank contributes */
    int bytes = num * (int)pack_size;
    MPI_Gather(&bytes, 1, MPI_INT, counts, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* pack the selected items back to back into sendbuf */
    char* ptr = (char*) sendbuf;
    for (idx = 0; idx < count; idx++) {
        uint64_t global = offset + idx;
        if (global < range || (total - global) <= range) {
            ptr += mfu_flist_file_pack(ptr, flist, idx);
        }
    }

    /* on rank 0, turn byte counts into displacements and a total */
    int recvbytes = 0;
    if (rank == 0) {
        int r;
        disps[0] = 0;
        recvbytes += counts[0];
        for (r = 1; r < ranks; r++) {
            disps[r] = disps[r - 1] + counts[r - 1];
            recvbytes += counts[r];
        }
    }

    /* collect the packed items on rank 0 */
    MPI_Gatherv(sendbuf, bytes, MPI_BYTE, recvbuf, counts, disps, MPI_BYTE, 0, MPI_COMM_WORLD);

    /* temporary list that receives the unpacked items */
    mfu_flist tmplist = mfu_flist_subset(flist);

    /* rank 0 unpacks one fixed-size record at a time */
    if (rank == 0) {
        char* end;
        ptr = (char*) recvbuf;
        end = ptr + recvbytes;
        for (; ptr < end; ptr += pack_size) {
            mfu_flist_file_unpack(ptr, tmplist);
        }
    }

    /* summarize list */
    mfu_flist_summarize(tmplist);

    /* rank 0 prints the collected entries */
    if (rank == 0) {
        printf("\n");

        uint64_t tmpsize = mfu_flist_size(tmplist);
        uint64_t shown;
        for (shown = 1; shown <= tmpsize; shown++) {
            print_file(tmplist, shown - 1);
            if (shown == range && total > 2 * range) {
                /* entries between the head and tail windows are omitted */
                printf("\n<snip>\n\n");
            }
        }

        printf("\n");
    }

    /* release the temporary list */
    mfu_flist_free(&tmplist);

    /* release buffers */
    mfu_free(&disps);
    mfu_free(&counts);
    mfu_free(&sendbuf);
    mfu_free(&recvbuf);

    return;
}