int split_bin_2level( int bins, int bin, const lwgrp_ring* lev1_ring, const lwgrp_logring* lev1_logring, const lwgrp_ring* lev2_ring, const lwgrp_logring* lev2_logring, lwgrp_ring* new_lev1_ring, lwgrp_logring* new_lev1_logring, lwgrp_ring* new_lev2_ring, lwgrp_logring* new_lev2_logring) { int i; /* initialize new rings and logrings to empty groups, * we'll overwrite these if proc is really in a group */ lwgrp_ring_set_null(new_lev1_ring); lwgrp_ring_set_null(new_lev2_ring); lwgrp_logring_build_from_ring(new_lev1_ring, new_lev1_logring); lwgrp_logring_build_from_ring(new_lev2_ring, new_lev2_logring); if (bins <= 0) { return 0; } /* get our rank within and the size of the parent communicator */ int comm_size; int comm_rank = lev1_ring->comm_rank; MPI_Comm_size(lev1_ring->comm, &comm_size); /* allocate memory to execute collectives */ int* reduce_inbuf = (int*) malloc(bins * sizeof(int)); int* reduce_outbuf = (int*) malloc(bins * sizeof(int)); int* scan_inbuf = (int*) malloc(2 * bins * sizeof(int)); int* scan_recvleft = (int*) malloc(2 * bins * sizeof(int)); int* scan_recvright = (int*) malloc(2 * bins * sizeof(int)); /* intiaize all bins to MPI_PROC_NULL, except for our * bin in which case we list our rank within comm */ for (i = 0; i < bins; i++) { /* initialize all bins to size(lev1), would like MPI_PROC_NULL, * but we use size instead so that reduce(min) does the right thing */ reduce_inbuf[i] = comm_size; } if (bin >= 0) { reduce_inbuf[bin] = comm_rank; } /* reduce to node leader to find lowest rank in each bin */ lwgrp_logring_reduce( reduce_inbuf, reduce_outbuf, bins, MPI_INT, MPI_MIN, 0, lev1_ring, lev1_logring ); /* create the scan type (a rank and a count pair) */ MPI_Datatype scan_type; MPI_Type_contiguous(2, MPI_INT, &scan_type); MPI_Type_commit(&scan_type); /* double exscan across node leaders to * build info for new node leader chains */ int lev1_rank = lev1_ring->group_rank; if (lev1_rank == 0) { /* prepare data for input to double scan, for each bin * record the lowest rank and a count of either 0 or 1 */ for (i = 0; i < bins; i++) { if (reduce_outbuf[i] != comm_size) { scan_inbuf[i*2 + SCAN_RANK] = reduce_outbuf[i]; scan_inbuf[i*2 + SCAN_COUNT] = 1; } else { scan_inbuf[i*2 + SCAN_RANK] = MPI_PROC_NULL; scan_inbuf[i*2 + SCAN_COUNT] = 0; } } /* create the scan operation */ MPI_Op scan_op; int commutative = 0; MPI_Op_create(scan_chain, commutative, &scan_op); /* execute the double exclusive scan to get next rank and * count of ranks to either side for each bin */ lwgrp_logring_double_exscan( scan_inbuf, scan_recvright, scan_inbuf, scan_recvleft, bins, scan_type, scan_op, lev2_ring, lev2_logring ); /* if we're on the end of the level 2 group, need to initialize * the recv values */ int lev2_rank = lev2_ring->group_rank; int lev2_size = lev2_ring->group_size; if (lev2_rank == 0) { /* we're on the left end of lev2 group, so we didn't get * anything from the left side */ for (i = 0; i < bins; i++) { scan_recvleft[i*2 + SCAN_RANK] = MPI_PROC_NULL; scan_recvleft[i*2 + SCAN_COUNT] = 0; } } if (lev2_rank == lev2_size-1) { /* we're on the right end of lev2 group, so we didn't get * anything from the right side */ for (i = 0; i < bins; i++) { scan_recvright[i*2 + SCAN_RANK] = MPI_PROC_NULL; scan_recvright[i*2 + SCAN_COUNT] = 0; } } /* free the scan op */ MPI_Op_free(&scan_op); } /* broadcast scan results to local comm */ lwgrp_logring_bcast(scan_recvleft, bins, scan_type, 0, lev1_ring, lev1_logring); lwgrp_logring_bcast(scan_recvright, bins, scan_type, 0, lev1_ring, lev1_logring); /* free the scan type */ MPI_Type_free(&scan_type); /* call bin_split on local chain */ lwgrp_ring_split_bin_radix(bins, bin, lev1_ring, new_lev1_ring); lwgrp_logring_build_from_ring(new_lev1_ring, new_lev1_logring); /* for each valid bin, all rank 0 procs of new lev1 groups form new lev2 groups */ if (bin >= 0) { int new_lev1_rank = new_lev1_ring->group_rank; if (new_lev1_rank == 0) { /* extract chain values from scan results */ MPI_Comm comm = new_lev1_ring->comm; int left = scan_recvleft[2*bin + SCAN_RANK]; int right = scan_recvright[2*bin + SCAN_RANK]; int size = scan_recvleft[2*bin + SCAN_COUNT] + scan_recvright[2*bin + SCAN_COUNT] + 1; int rank = scan_recvleft[2*bin + SCAN_COUNT]; /* build chain, then ring, then logring, and finally free chain */ lwgrp_chain tmp_chain; lwgrp_chain_build_from_vals(comm, left, right, size, rank, &tmp_chain); lwgrp_ring_build_from_chain(&tmp_chain, new_lev2_ring); lwgrp_logring_build_from_ring(new_lev2_ring, new_lev2_logring); lwgrp_chain_free(&tmp_chain); } } /* free our temporary memory */ free(scan_recvright); free(scan_recvleft); free(scan_inbuf); free(reduce_outbuf); free(reduce_inbuf); return 0; }
/* given a specified number of bins, an index into those bins, and a * input group, create and return a new group consisting of all ranks * belonging to the same bin, runs in O(num_bins * log N) time */ int lwgrp_ring_split_bin(int num_bins, int my_bin, const lwgrp_ring* in, lwgrp_ring* out) { /* With this function, we split the "in" group into up to "num_bins" * subgroups. A process is grouped with all other processes that * specify the same value for "my_bin". The descriptor for the new * group is returned in "out". If my_bin is less than 0, an empty * (NULL) group is returned. * * Implementation: * We run two exclusive scans, one scanning from left to right, and * another scanning from right to left. As the output of the * left-going scan, a process acquires the number of ranks to its * left that are in its bin as well as the rank of the process that * is immediately to its left that is also in its bin. Similarly, * the right-going scan provides the process with the number of ranks * to the right and the rank of the process immediately to its right * that is in the same bin. With this info, a process can determine * its rank and the number of ranks in its group, as well as, the * ranks of its left and right partners, which is sufficient to fully * define the "out" group. */ int i; if (my_bin >= num_bins) { /* TODO: fail */ } /* define some frequently used indicies into our arrays */ int my_bin_index = 2 * my_bin; int rank_index = 2 * num_bins; /* allocate space for our send and receive buffers */ int elements = 2 * num_bins + 1; int* bins = (int*) lwgrp_malloc(4 * elements * sizeof(int), sizeof(int), __FILE__, __LINE__); if (bins == NULL) { /* TODO: fail */ } /* set up pointers to our send and receive buffers */ int* send_left_bins = bins + (0 * elements); int* recv_left_bins = bins + (1 * elements); int* send_right_bins = bins + (2 * elements); int* recv_right_bins = bins + (3 * elements); /* initialize our send buffers, * set all ranks to MPI_PROC_NULL and set all counts to 0 */ for(i = 0; i < 2*num_bins; i += 2) { send_left_bins[i+INDEX_COUNT] = 0; send_right_bins[i+INDEX_COUNT] = 0; send_left_bins[i+INDEX_CLOSEST] = MPI_PROC_NULL; send_right_bins[i+INDEX_CLOSEST] = MPI_PROC_NULL; } /* for the bin we are in, set the rank to our rank and set the count to 1 */ if (my_bin >= 0) { send_left_bins[my_bin_index+INDEX_COUNT] = 1; send_right_bins[my_bin_index+INDEX_COUNT] = 1; send_left_bins[my_bin_index+INDEX_CLOSEST] = in->comm_rank; send_right_bins[my_bin_index+INDEX_CLOSEST] = in->comm_rank; } /* execute double, inclusive scan, one going left-to-right, * and another right-to-left */ MPI_Request request[4]; MPI_Status status[4]; MPI_Comm comm = in->comm; int comm_rank = in->comm_rank; int left_rank = in->comm_left; int right_rank = in->comm_right; int rank = in->group_rank; int ranks = in->group_size; int my_left = MPI_PROC_NULL; int my_right = MPI_PROC_NULL; int dist = 1; while (dist < ranks) { /* left-to-right shift: * inform rank to our right about the rank on our left, * recv data from left and send data to the right */ send_right_bins[rank_index] = left_rank; MPI_Irecv(recv_left_bins, elements, MPI_INT, left_rank, LWGRP_MSG_TAG_0, comm, &request[0]); MPI_Isend(send_right_bins, elements, MPI_INT, right_rank, LWGRP_MSG_TAG_0, comm, &request[1]); /* right-to-left shift: * inform rank to our left about the rank on our right * recv data from right and send data to the left */ send_left_bins[rank_index] = right_rank; MPI_Irecv(recv_right_bins, elements, MPI_INT, right_rank, LWGRP_MSG_TAG_0, comm, &request[2]); MPI_Isend(send_left_bins, elements, MPI_INT, left_rank, LWGRP_MSG_TAG_0, comm, &request[3]); /* wait for all communication to complete */ MPI_Waitall(4, request, status); /* make note of the rightmost rank in our bin * to the left if we haven't already found one */ if (my_left == MPI_PROC_NULL && my_bin >= 0) { my_left = recv_left_bins[my_bin_index+INDEX_CLOSEST]; } /* make note of the leftmost rank in our bin to the * right if we haven't already found one */ if (my_right == MPI_PROC_NULL && my_bin >= 0) { my_right = recv_right_bins[my_bin_index+INDEX_CLOSEST]; } /* merge data from left into our right-going data */ for(i = 0; i < 2*num_bins; i += 2) { /* if we haven't wrapped, add the counts for this bin */ if (rank - dist >= 0) { send_right_bins[i+INDEX_COUNT] += recv_left_bins[i+INDEX_COUNT]; } /* if we haven't already defined the rightmost rank for this bin, * set it to the value defined in the message from the left */ if (send_right_bins[i+INDEX_CLOSEST] == MPI_PROC_NULL) { send_right_bins[i+INDEX_CLOSEST] = recv_left_bins[i+INDEX_CLOSEST]; } } /* merge data from right into our left-going data */ for(i = 0; i < 2*num_bins; i += 2) { /* if we haven't wrapped, add the counts for this bin */ if (rank + dist < ranks) { send_left_bins[i+INDEX_COUNT] += recv_right_bins[i+INDEX_COUNT]; } /* if we haven't already defined the leftmost rank for this bin, * set it to the value defined in the message from the left */ if (send_left_bins[i+INDEX_CLOSEST] == MPI_PROC_NULL) { send_left_bins[i+INDEX_CLOSEST] = recv_right_bins[i+INDEX_CLOSEST]; } } /* get next processes on the left and right sides */ left_rank = recv_left_bins[rank_index]; right_rank = recv_right_bins[rank_index]; dist <<= 1; } /* if we're the only rank, set our ourself as our left and right neighbor */ if (ranks == 1) { my_left = comm_rank; my_right = comm_rank; } if (my_bin >= 0) { /* get count of number of ranks in our bin to our left and right sides */ int count_left = send_right_bins[my_bin_index + INDEX_COUNT] - 1; int count_right = send_left_bins[my_bin_index + INDEX_COUNT] - 1; /* the number of ranks to our left defines our rank, while we add * the number of ranks to our left with the number of ranks to our * right plus ourselves to get the total number of ranks in our bin */ out->comm = in->comm; out->comm_rank = in->comm_rank; out->comm_left = my_left; out->comm_right = my_right; out->group_rank = count_left; out->group_size = count_left + count_right + 1; } else { /* create an empty group */ lwgrp_ring_set_null(out); } /* free memory */ lwgrp_free(&bins); return LWGRP_SUCCESS; }